├── .gitattributes ├── markdown ├── extensions │ ├── nl2br.py │ ├── sane_lists.py │ ├── smart_strong.py │ ├── __init__.py │ ├── extra.py │ ├── html_tidy.py │ ├── meta.py │ ├── abbr.py │ ├── tables.py │ ├── def_list.py │ ├── rss.py │ ├── attr_list.py │ ├── fenced_code.py │ ├── wikilinks.py │ ├── toc.py │ ├── headerid.py │ ├── codehilite.py │ └── footnotes.py ├── etree_loader.py ├── __main__.py ├── postprocessors.py ├── blockparser.py ├── util.py ├── odict.py ├── serializers.py ├── preprocessors.py ├── treeprocessors.py ├── __init__.py ├── inlinepatterns.py └── blockprocessors.py ├── README.md ├── Markdown.py └── .gitignore /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | 4 | # Custom for Visual Studio 5 | *.cs diff=csharp 6 | *.sln merge=union 7 | *.csproj merge=union 8 | *.vbproj merge=union 9 | *.fsproj merge=union 10 | *.dbproj merge=union 11 | 12 | # Standard to msysgit 13 | *.doc diff=astextplain 14 | *.DOC diff=astextplain 15 | *.docx diff=astextplain 16 | *.DOCX diff=astextplain 17 | *.dot diff=astextplain 18 | *.DOT diff=astextplain 19 | *.pdf diff=astextplain 20 | *.PDF diff=astextplain 21 | *.rtf diff=astextplain 22 | *.RTF diff=astextplain 23 | -------------------------------------------------------------------------------- /markdown/extensions/nl2br.py: -------------------------------------------------------------------------------- 1 | """ 2 | NL2BR Extension 3 | =============== 4 | 5 | A Python-Markdown extension to treat newlines as hard breaks; like 6 | GitHub-flavored Markdown does. 7 | 8 | Usage: 9 | 10 | >>> import markdown 11 | >>> print markdown.markdown('line 1\\nline 2', extensions=['nl2br']) 12 |
line 1
13 | line 2
<p>Text with double__underscore__words.</p>
13 | >>> print markdown.markdown('__Strong__ still works.', 14 | ... extensions=['smart_strong']) 15 | <p><strong>Strong</strong> still works.</p>
16 | >>> print markdown.markdown('__this__works__too__.', 17 | ... extensions=['smart_strong']) 18 | <p><strong>this__works__too</strong>.</p>
19 | 20 | Copyright 2011 21 | [Waylan Limberg](http://achinghead.com) 22 | 23 | ''' 24 | 25 | import markdown 26 | from markdown.inlinepatterns import SimpleTagPattern 27 | 28 | SMART_STRONG_RE = r'(?emphasis2') 38 | 39 | def makeExtension(configs={}): 40 | return SmartEmphasisExtension(configs=dict(configs)) 41 | 42 | if __name__ == '__main__': 43 | import doctest 44 | doctest.testmod() 45 | -------------------------------------------------------------------------------- /markdown/extensions/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Extensions 3 | ----------------------------------------------------------------------------- 4 | """ 5 | 6 | class Extension: 7 | """ Base class for extensions to subclass. """ 8 | def __init__(self, configs = {}): 9 | """Create an instance of an Extention. 10 | 11 | Keyword arguments: 12 | 13 | * configs: A dict of configuration setting used by an Extension. 14 | """ 15 | self.config = configs 16 | 17 | def getConfig(self, key, default=''): 18 | """ Return a setting for the given key or an empty string. """ 19 | if key in self.config: 20 | return self.config[key][0] 21 | else: 22 | return default 23 | 24 | def getConfigs(self): 25 | """ Return all configs settings as a dict. """ 26 | return dict([(key, self.getConfig(key)) for key in self.config.keys()]) 27 | 28 | def getConfigInfo(self): 29 | """ Return all config descriptions as a list of tuples. """ 30 | return [(key, self.config[key][1]) for key in self.config.keys()] 31 | 32 | def setConfig(self, key, value): 33 | """ Set a config setting for `key` with the given `value`. """ 34 | self.config[key][0] = value 35 | 36 | def extendMarkdown(self, md, md_globals): 37 | """ 38 | Add the various proccesors and patterns to the Markdown Instance. 39 | 40 | This method must be overriden by every extension. 41 | 42 | Keyword arguments: 43 | 44 | * md: The Markdown instance. 
45 | 46 | * md_globals: Global variables in the markdown module namespace. 47 | 48 | """ 49 | raise NotImplementedError, 'Extension "%s.%s" must define an "extendMarkdown"' \ 50 | 'method.' % (self.__class__.__module__, self.__class__.__name__) 51 | 52 | -------------------------------------------------------------------------------- /markdown/extensions/extra.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | Python-Markdown Extra Extension 4 | =============================== 5 | 6 | A compilation of various Python-Markdown extensions that imitates 7 | [PHP Markdown Extra](http://michelf.com/projects/php-markdown/extra/). 8 | 9 | Note that each of the individual extensions still need to be available 10 | on your PYTHONPATH. This extension simply wraps them all up as a 11 | convenience so that only one extension needs to be listed when 12 | initiating Markdown. See the documentation for each individual 13 | extension for specifics about that extension. 14 | 15 | In the event that one or more of the supported extensions are not 16 | available for import, Markdown will issue a warning and simply continue 17 | without that extension. 18 | 19 | There may be additional extensions that are distributed with 20 | Python-Markdown that are not included here in Extra. Those extensions 21 | are not part of PHP Markdown Extra, and therefore, not part of 22 | Python-Markdown Extra. If you really would like Extra to include 23 | additional extensions, we suggest creating your own clone of Extra 24 | under a differant name. You could also edit the `extensions` global 25 | variable defined below, but be aware that such changes may be lost 26 | when you upgrade to any future version of Python-Markdown. 
27 | 28 | """ 29 | 30 | import markdown 31 | 32 | extensions = ['smart_strong', 33 | 'fenced_code', 34 | 'footnotes', 35 | 'attr_list', 36 | 'def_list', 37 | 'tables', 38 | 'abbr', 39 | ] 40 | 41 | 42 | class ExtraExtension(markdown.Extension): 43 | """ Add various extensions to Markdown class.""" 44 | 45 | def extendMarkdown(self, md, md_globals): 46 | """ Register extension instances. """ 47 | md.registerExtensions(extensions, self.config) 48 | # Turn on processing of markdown text within raw html 49 | md.preprocessors['html_block'].markdown_in_raw = True 50 | 51 | def makeExtension(configs={}): 52 | return ExtraExtension(configs=dict(configs)) 53 | -------------------------------------------------------------------------------- /markdown/extensions/html_tidy.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | """ 4 | HTML Tidy Extension for Python-Markdown 5 | ======================================= 6 | 7 | Runs [HTML Tidy][] on the output of Python-Markdown using the [uTidylib][] 8 | Python wrapper. Both libtidy and uTidylib must be installed on your system. 9 | 10 | Note than any Tidy [options][] can be passed in as extension configs. So, 11 | for example, to output HTML rather than XHTML, set ``output_xhtml=0``. To 12 | indent the output, set ``indent=auto`` and to have Tidy wrap the output in 13 | ```` and ```` tags, set ``show_body_only=0``. 
14 | 15 | [HTML Tidy]: http://tidy.sourceforge.net/ 16 | [uTidylib]: http://utidylib.berlios.de/ 17 | [options]: http://tidy.sourceforge.net/docs/quickref.html 18 | 19 | Copyright (c)2008 [Waylan Limberg](http://achinghead.com) 20 | 21 | License: [BSD](http://www.opensource.org/licenses/bsd-license.php) 22 | 23 | Dependencies: 24 | * [Python2.3+](http://python.org) 25 | * [Markdown 2.0+](http://packages.python.org/Markdown/) 26 | * [HTML Tidy](http://utidylib.berlios.de/) 27 | * [uTidylib](http://utidylib.berlios.de/) 28 | 29 | """ 30 | 31 | import markdown 32 | try: 33 | import tidy 34 | except ImportError: 35 | tidy = None 36 | 37 | class TidyExtension(markdown.Extension): 38 | 39 | def __init__(self, configs): 40 | # Set defaults to match typical markdown behavior. 41 | self.config = dict(output_xhtml=1, 42 | show_body_only=1, 43 | char_encoding='utf8' 44 | ) 45 | # Merge in user defined configs overriding any present if nessecary. 46 | for c in configs: 47 | self.config[c[0]] = c[1] 48 | 49 | def extendMarkdown(self, md, md_globals): 50 | # Save options to markdown instance 51 | md.tidy_options = self.config 52 | # Add TidyProcessor to postprocessors 53 | if tidy: 54 | md.postprocessors['tidy'] = TidyProcessor(md) 55 | 56 | 57 | class TidyProcessor(markdown.postprocessors.Postprocessor): 58 | 59 | def run(self, text): 60 | # Pass text to Tidy. As Tidy does not accept unicode we need to encode 61 | # it and decode its return value. 
62 | enc = self.markdown.tidy_options.get('char_encoding', 'utf8') 63 | return unicode(tidy.parseString(text.encode(enc), 64 | **self.markdown.tidy_options), 65 | encoding=enc) 66 | 67 | 68 | def makeExtension(configs=None): 69 | return TidyExtension(configs=configs) 70 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | ################# 2 | ## Eclipse 3 | ################# 4 | 5 | *.pydevproject 6 | .project 7 | .metadata 8 | bin/ 9 | tmp/ 10 | *.tmp 11 | *.bak 12 | *.swp 13 | *~.nib 14 | local.properties 15 | .classpath 16 | .settings/ 17 | .loadpath 18 | 19 | # External tool builders 20 | .externalToolBuilders/ 21 | 22 | # Locally stored "Eclipse launch configurations" 23 | *.launch 24 | 25 | # CDT-specific 26 | .cproject 27 | 28 | # PDT-specific 29 | .buildpath 30 | 31 | 32 | ################# 33 | ## Visual Studio 34 | ################# 35 | 36 | ## Ignore Visual Studio temporary files, build results, and 37 | ## files generated by popular Visual Studio add-ons. 
38 | 39 | # User-specific files 40 | *.suo 41 | *.user 42 | *.sln.docstates 43 | 44 | # Build results 45 | [Dd]ebug/ 46 | [Rr]elease/ 47 | *_i.c 48 | *_p.c 49 | *.ilk 50 | *.meta 51 | *.obj 52 | *.pch 53 | *.pdb 54 | *.pgc 55 | *.pgd 56 | *.rsp 57 | *.sbr 58 | *.tlb 59 | *.tli 60 | *.tlh 61 | *.tmp 62 | *.vspscc 63 | .builds 64 | *.dotCover 65 | 66 | ## TODO: If you have NuGet Package Restore enabled, uncomment this 67 | #packages/ 68 | 69 | # Visual C++ cache files 70 | ipch/ 71 | *.aps 72 | *.ncb 73 | *.opensdf 74 | *.sdf 75 | 76 | # Visual Studio profiler 77 | *.psess 78 | *.vsp 79 | 80 | # ReSharper is a .NET coding add-in 81 | _ReSharper* 82 | 83 | # Installshield output folder 84 | [Ee]xpress 85 | 86 | # DocProject is a documentation generator add-in 87 | DocProject/buildhelp/ 88 | DocProject/Help/*.HxT 89 | DocProject/Help/*.HxC 90 | DocProject/Help/*.hhc 91 | DocProject/Help/*.hhk 92 | DocProject/Help/*.hhp 93 | DocProject/Help/Html2 94 | DocProject/Help/html 95 | 96 | # Click-Once directory 97 | publish 98 | 99 | # Others 100 | [Bb]in 101 | [Oo]bj 102 | sql 103 | TestResults 104 | *.Cache 105 | ClientBin 106 | stylecop.* 107 | ~$* 108 | *.dbmdl 109 | Generated_Code #added for RIA/Silverlight projects 110 | 111 | # Backup & report files from converting an old project file to a newer 112 | # Visual Studio version. 
Backup files are not needed, because we have git ;-) 113 | _UpgradeReport_Files/ 114 | Backup*/ 115 | UpgradeLog*.XML 116 | 117 | 118 | 119 | ############ 120 | ## Windows 121 | ############ 122 | 123 | # Windows image file caches 124 | Thumbs.db 125 | 126 | # Folder config file 127 | Desktop.ini 128 | 129 | 130 | ############# 131 | ## Python 132 | ############# 133 | 134 | *.py[co] 135 | 136 | # Packages 137 | *.egg 138 | *.egg-info 139 | dist 140 | build 141 | eggs 142 | parts 143 | bin 144 | var 145 | sdist 146 | develop-eggs 147 | .installed.cfg 148 | 149 | # Installer logs 150 | pip-log.txt 151 | 152 | # Unit test / coverage reports 153 | .coverage 154 | .tox 155 | 156 | #Translations 157 | *.mo 158 | 159 | #Mr Developer 160 | .mr.developer.cfg 161 | 162 | # Mac crap 163 | .DS_Store 164 | -------------------------------------------------------------------------------- /markdown/extensions/meta.py: -------------------------------------------------------------------------------- 1 | #!usr/bin/python 2 | 3 | """ 4 | Meta Data Extension for Python-Markdown 5 | ======================================= 6 | 7 | This extension adds Meta Data handling to markdown. 8 | 9 | Basic Usage: 10 | 11 | >>> import markdown 12 | >>> text = '''Title: A Test Doc. 13 | ... Author: Waylan Limberg 14 | ... John Doe 15 | ... Blank_Data: 16 | ... 17 | ... The body. This is paragraph one. 18 | ... ''' 19 | >>> md = markdown.Markdown(['meta']) 20 | >>> print md.convert(text) 21 |The body. This is paragraph one.
22 | >>> print md.Meta 23 | {u'blank_data': [u''], u'author': [u'Waylan Limberg', u'John Doe'], u'title': [u'A Test Doc.']} 24 | 25 | Make sure text without Meta Data still works (markdown < 1.6b returns a). 26 | 27 | >>> text = ' Some Code - not extra lines of meta data.' 28 | >>> md = markdown.Markdown(['meta']) 29 | >>> print md.convert(text) 30 |
Some Code - not extra lines of meta data.
31 |
32 | >>> md.Meta
33 | {}
34 |
35 | Copyright 2007-2008 [Waylan Limberg](http://achinghead.com).
36 |
37 | Project website: Some text with an ABBR and a REF. Ignore REFERENCE and ref.
18 | 19 | Copyright 2007-2008 20 | * [Waylan Limberg](http://achinghead.com/) 21 | * [Seemant Kulleen](http://www.kulleen.org/) 22 | 23 | 24 | ''' 25 | 26 | import re 27 | import markdown 28 | from markdown.util import etree 29 | 30 | # Global Vars 31 | ABBR_REF_RE = re.compile(r'[*]\[(?P[^\]]*)\][ ]?:\s*(?PA paragraph before a fenced code block:
20 |Fenced code block
21 |
22 |
23 | Works with safe_mode also (we check this because we are using the HtmlStash):
24 |
25 | >>> print markdown.markdown(text, extensions=['fenced_code'], safe_mode='replace')
26 | A paragraph before a fenced code block:
27 |Fenced code block
28 |
29 |
30 | Include tilde's in a code block and wrap with blank lines:
31 |
32 | >>> text = '''
33 | ... ~~~~~~~~
34 | ...
35 | ... ~~~~
36 | ... ~~~~~~~~'''
37 | >>> print markdown.markdown(text, extensions=['fenced_code'])
38 |
39 | ~~~~
40 |
41 |
42 | Language tags:
43 |
44 | >>> text = '''
45 | ... ~~~~{.python}
46 | ... # Some python code
47 | ... ~~~~'''
48 | >>> print markdown.markdown(text, extensions=['fenced_code'])
49 | # Some python code
50 |
51 |
52 | Optionally backticks instead of tildes as per how github's code block markdown is identified:
53 |
54 | >>> text = '''
55 | ... `````
56 | ... # Arbitrary code
57 | ... ~~~~~ # these tildes will not close the block
58 | ... `````'''
59 | >>> print markdown.markdown(text, extensions=['fenced_code'])
60 | # Arbitrary code
61 | ~~~~~ # these tildes will not close the block
62 |
63 |
64 | Copyright 2007-2008 [Waylan Limberg](http://achinghead.com/).
65 |
66 | Project website: .*?)(?<=\n)(?P=fence)[ ]*$',
85 | re.MULTILINE|re.DOTALL
86 | )
# Wrapper emitted for a fenced block when CodeHilite is not active.
# First %s receives the (possibly empty) LANG_TAG, second the escaped code.
# NOTE(review): the literals were corrupted in this copy (the HTML tags
# were stripped); restored to the conventional <pre><code> wrapper.
CODE_WRAP = '<pre><code%s>%s</code></pre>'
LANG_TAG = ' class="%s"'
89 |
class FencedCodeExtension(markdown.Extension):
    """ Wires fenced-code-block handling into a Markdown instance. """

    def extendMarkdown(self, md, md_globals):
        """ Add FencedBlockPreprocessor to the Markdown instance. """
        # Register so this extension is reset along with the md instance.
        md.registerExtension(self)
        preprocessor = FencedBlockPreprocessor(md)
        md.preprocessors.add('fenced_code_block', preprocessor, "_begin")
99 |
100 |
class FencedBlockPreprocessor(markdown.preprocessors.Preprocessor):
    """ Find fenced code blocks, render them to HTML and stash the result
    so the rest of the pipeline leaves the code untouched. """

    def __init__(self, md):
        markdown.preprocessors.Preprocessor.__init__(self, md)

        # CodeHilite config is looked up lazily on the first run() call.
        self.checked_for_codehilite = False
        self.codehilite_conf = {}

    def run(self, lines):
        """ Match and store Fenced Code Blocks in the HtmlStash. """

        # Check for the codehilite extension once per instance.
        if not self.checked_for_codehilite:
            for ext in self.markdown.registeredExtensions:
                if isinstance(ext, CodeHiliteExtension):
                    self.codehilite_conf = ext.config
                    break

            self.checked_for_codehilite = True

        text = "\n".join(lines)
        while 1:
            m = FENCED_BLOCK_RE.search(text)
            if not m:
                break
            lang = ''
            if m.group('lang'):
                lang = LANG_TAG % m.group('lang')

            # If config is not empty, the codehilite extension is
            # enabled, so we let it highlight the code.
            if self.codehilite_conf:
                highliter = CodeHilite(m.group('code'),
                        linenos=self.codehilite_conf['force_linenos'][0],
                        guess_lang=self.codehilite_conf['guess_lang'][0],
                        css_class=self.codehilite_conf['css_class'][0],
                        style=self.codehilite_conf['pygments_style'][0],
                        lang=(m.group('lang') or None),
                        noclasses=self.codehilite_conf['noclasses'][0])

                code = highliter.hilite()
            else:
                code = CODE_WRAP % (lang, self._escape(m.group('code')))

            # Stash the rendered block; the placeholder survives the
            # remaining processing and is swapped back at the end.
            placeholder = self.markdown.htmlStash.store(code, safe=True)
            text = '%s\n%s\n%s' % (text[:m.start()], placeholder, text[m.end():])
        return text.split("\n")

    def _escape(self, txt):
        """ Basic html escaping.

        FIX: the previous replacements mapped each character to itself
        (e.g. replace('&', '&')), making this a no-op -- the HTML
        entities were evidently stripped from this copy. Restored.
        """
        txt = txt.replace('&', '&amp;')
        txt = txt.replace('<', '&lt;')
        txt = txt.replace('>', '&gt;')
        txt = txt.replace('"', '&quot;')
        return txt
157 |
158 |
def makeExtension(configs=None):
    """ Return an instance of the FencedCodeExtension. """
    extension = FencedCodeExtension(configs=configs)
    return extension


if __name__ == "__main__":
    # Run the module's doctests when executed directly.
    import doctest
    doctest.testmod()
166 |
--------------------------------------------------------------------------------
/markdown/extensions/wikilinks.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | '''
4 | WikiLinks Extension for Python-Markdown
5 | ======================================
6 |
7 | Converts [[WikiLinks]] to relative links. Requires Python-Markdown 2.0+
8 |
9 | Basic usage:
10 |
11 | >>> import markdown
12 | >>> text = "Some text with a [[WikiLink]]."
13 | >>> html = markdown.markdown(text, ['wikilinks'])
14 | >>> print html
15 | Some text with a WikiLink.
16 |
17 | Whitespace behavior:
18 |
19 | >>> print markdown.markdown('[[ foo bar_baz ]]', ['wikilinks'])
20 |
21 | >>> print markdown.markdown('foo [[ ]] bar', ['wikilinks'])
22 | foo bar
23 |
24 | To define custom settings the simple way:
25 |
26 | >>> print markdown.markdown(text,
27 | ... ['wikilinks(base_url=/wiki/,end_url=.html,html_class=foo)']
28 | ... )
29 | Some text with a WikiLink.
30 |
31 | Custom settings the complex way:
32 |
33 | >>> md = markdown.Markdown(
34 | ... extensions = ['wikilinks'],
35 | ... extension_configs = {'wikilinks': [
36 | ... ('base_url', 'http://example.com/'),
37 | ... ('end_url', '.html'),
38 | ... ('html_class', '') ]},
39 | ... safe_mode = True)
40 | >>> print md.convert(text)
41 | Some text with a WikiLink.
42 |
43 | Use MetaData with mdx_meta.py (Note the blank html_class in MetaData):
44 |
45 | >>> text = """wiki_base_url: http://example.com/
46 | ... wiki_end_url: .html
47 | ... wiki_html_class:
48 | ...
49 | ... Some text with a [[WikiLink]]."""
50 | >>> md = markdown.Markdown(extensions=['meta', 'wikilinks'])
51 | >>> print md.convert(text)
52 | Some text with a WikiLink.
53 |
54 | MetaData should not carry over to next document:
55 |
56 | >>> print md.convert("No [[MetaData]] here.")
57 | No MetaData here.
58 |
59 | Define a custom URL builder:
60 |
61 | >>> def my_url_builder(label, base, end):
62 | ... return '/bar/'
63 | >>> md = markdown.Markdown(extensions=['wikilinks'],
64 | ... extension_configs={'wikilinks' : [('build_url', my_url_builder)]})
65 | >>> print md.convert('[[foo]]')
66 |
67 |
68 | From the command line:
69 |
70 | python markdown.py -x wikilinks(base_url=http://example.com/,end_url=.html,html_class=foo) src.txt
71 |
72 | By [Waylan Limberg](http://achinghead.com/).
73 |
74 | License: [BSD](http://www.opensource.org/licenses/bsd-license.php)
75 |
76 | Dependencies:
77 | * [Python 2.3+](http://python.org)
78 | * [Markdown 2.0+](http://packages.python.org/Markdown/)
79 | '''
80 |
81 | import markdown
82 | import re
83 |
def build_url(label, base, end):
    """ Build a url from the label, a base, and an end. """
    # Collapse each run of spaces (optionally touching an underscore)
    # into a single underscore.
    slug = re.sub(r'([ ]+_)|(_[ ]+)|([ ]+)', '_', label)
    return base + slug + end
88 |
89 |
90 | class WikiLinkExtension(markdown.Extension):
91 | def __init__(self, configs):
92 | # set extension defaults
93 | self.config = {
94 | 'base_url' : ['/', 'String to append to beginning or URL.'],
95 | 'end_url' : ['/', 'String to append to end of URL.'],
96 | 'html_class' : ['wikilink', 'CSS hook. Leave blank for none.'],
97 | 'build_url' : [build_url, 'Callable formats URL from label.'],
98 | }
99 |
100 | # Override defaults with user settings
101 | for key, value in configs :
102 | self.setConfig(key, value)
103 |
104 | def extendMarkdown(self, md, md_globals):
105 | self.md = md
106 |
107 | # append to end of inline patterns
108 | WIKILINK_RE = r'\[\[([\w0-9_ -]+)\]\]'
109 | wikilinkPattern = WikiLinks(WIKILINK_RE, self.getConfigs())
110 | wikilinkPattern.md = md
111 | md.inlinePatterns.add('wikilink', wikilinkPattern, "'):
132 | i = self.index(location[1:])
133 | if location.startswith('>'):
134 | if i >= len(self):
135 | # last item
136 | i = None
137 | else:
138 | i += 1
139 | else:
140 | raise ValueError('Not a valid location: "%s". Location key '
141 | 'must start with a ">" or "<".' % location)
142 | return i
143 |
144 | def add(self, key, value, location):
145 | """ Insert by key location. """
146 | i = self.index_for_location(location)
147 | if i is not None:
148 | self.insert(i, key, value)
149 | else:
150 | self.__setitem__(key, value)
151 |
152 | def link(self, key, location):
153 | """ Change location of an existing item. """
154 | n = self.keyOrder.index(key)
155 | del self.keyOrder[n]
156 | try:
157 | i = self.index_for_location(location)
158 | if i is not None:
159 | self.keyOrder.insert(i, key)
160 | else:
161 | self.keyOrder.append(key)
162 | except Exception, e:
163 | # restore to prevent data loss and reraise
164 | self.keyOrder.insert(n, key)
165 | raise e
166 |
--------------------------------------------------------------------------------
/markdown/extensions/toc.py:
--------------------------------------------------------------------------------
1 | """
2 | Table of Contents Extension for Python-Markdown
3 | * * *
4 |
5 | (c) 2008 [Jack Miller](http://codezen.org)
6 |
7 | Dependencies:
8 | * [Markdown 2.1+](http://packages.python.org/Markdown/)
9 |
10 | """
11 | import markdown
12 | from markdown.util import etree
13 | from markdown.extensions.headerid import slugify, unique, itertext
14 |
15 | import re
16 |
17 |
class TocTreeprocessor(markdown.treeprocessors.Treeprocessor):
    """ Build a table of contents from the document's headers.

    If the marker element (default text "[TOC]") is found, it is replaced
    by the generated TOC <div>; otherwise the serialized TOC is attached
    to the Markdown instance as `md.toc`.
    """

    def iterparent(self, root):
        """ Iterator wrapper to get parent and child all at once. """
        for parent in root.getiterator():
            for child in parent:
                yield parent, child

    def run(self, doc):
        """ Walk the tree, assign ids to headers and build the TOC div. """
        marker_found = False

        # The container that will hold the generated table of contents.
        div = etree.Element("div")
        div.attrib["class"] = "toc"
        last_li = None

        # Add title to the div, if configured.
        if self.config["title"]:
            header = etree.SubElement(div, "span")
            header.attrib["class"] = "toctitle"
            header.text = self.config["title"]

        # `level` tracks the current header depth; `list_stack` holds the
        # chain of nested <ul> elements being filled.
        level = 0
        list_stack=[div]
        header_rgx = re.compile("[Hh][123456]")

        # Get a list of id attributes already present, so generated ids
        # never collide with them.
        used_ids = []
        for c in doc.getiterator():
            if "id" in c.attrib:
                used_ids.append(c.attrib["id"])

        for (p, c) in self.iterparent(doc):
            text = ''.join(itertext(c)).strip()
            if not text:
                continue

            # To keep the output from screwing up the validation by
            # putting a <div> inside of a <p>, we actually replace the
            # <p> in its entirety.
            # We do not allow the marker inside a header as that
            # would cause an endless loop of placing a new TOC
            # inside a previously generated TOC.

            if c.text and c.text.strip() == self.config["marker"] and \
                    not header_rgx.match(c.tag) and c.tag not in ['pre', 'code']:
                # Swap the marker element for the TOC div in its parent.
                for i in range(len(p)):
                    if p[i] == c:
                        p[i] = div
                        break
                marker_found = True

            if header_rgx.match(c.tag):
                try:
                    tag_level = int(c.tag[-1])

                    # Header is shallower than current depth: unwind.
                    while tag_level < level:
                        list_stack.pop()
                        level -= 1

                    # Header is deeper: open a new nested <ul>.
                    if tag_level > level:
                        newlist = etree.Element("ul")
                        if last_li:
                            last_li.append(newlist)
                        else:
                            list_stack[-1].append(newlist)
                        list_stack.append(newlist)
                        if level == 0:
                            level = tag_level
                        else:
                            level += 1

                    # Do not override pre-existing ids.
                    if not "id" in c.attrib:
                        id = unique(self.config["slugify"](text, '-'), used_ids)
                        c.attrib["id"] = id
                    else:
                        id = c.attrib["id"]

                    # List item link, to be inserted into the toc div.
                    last_li = etree.Element("li")
                    link = etree.SubElement(last_li, "a")
                    link.text = text
                    link.attrib["href"] = '#' + id

                    # Optionally turn the header itself into a self-link.
                    if self.config["anchorlink"] in [1, '1', True, 'True', 'true']:
                        anchor = etree.Element("a")
                        anchor.text = c.text
                        anchor.attrib["href"] = "#" + id
                        anchor.attrib["class"] = "toclink"
                        c.text = ""
                        # Move the header's children inside the anchor.
                        for elem in c.getchildren():
                            anchor.append(elem)
                            c.remove(elem)
                        c.append(anchor)

                    list_stack[-1].append(last_li)
                except IndexError:
                    # We have bad ordering of headers. Just move on.
                    pass
        if not marker_found:
            # Serialize and attach to the markdown instance.
            prettify = self.markdown.treeprocessors.get('prettify')
            if prettify: prettify.run(div)
            toc = self.markdown.serializer(div)
            for pp in self.markdown.postprocessors.values():
                toc = pp.run(toc)
            self.markdown.toc = toc
124 |
125 | class TocExtension(markdown.Extension):
126 | def __init__(self, configs):
127 | self.config = { "marker" : ["[TOC]",
128 | "Text to find and replace with Table of Contents -"
129 | "Defaults to \"[TOC]\""],
130 | "slugify" : [slugify,
131 | "Function to generate anchors based on header text-"
132 | "Defaults to the headerid ext's slugify function."],
133 | "title" : [None,
134 | "Title to insert into TOC
- "
135 | "Defaults to None"],
136 | "anchorlink" : [0,
137 | "1 if header should be a self link"
138 | "Defaults to 0"]}
139 |
140 | for key, value in configs:
141 | self.setConfig(key, value)
142 |
143 | def extendMarkdown(self, md, md_globals):
144 | tocext = TocTreeprocessor(md)
145 | tocext.config = self.getConfigs()
146 | # Headerid ext is set to '>inline'. With this set to '>> import markdown
12 | >>> text = "# Some Header #"
13 | >>> md = markdown.markdown(text, ['headerid'])
14 | >>> print md
15 | Some Header
16 |
17 | All header IDs are unique:
18 |
19 | >>> text = '''
20 | ... #Header
21 | ... #Header
22 | ... #Header'''
23 | >>> md = markdown.markdown(text, ['headerid'])
24 | >>> print md
25 | Header
26 | Header
27 | Header
28 |
29 | To fit within a html template's hierarchy, set the header base level:
30 |
31 | >>> text = '''
32 | ... #Some Header
33 | ... ## Next Level'''
34 | >>> md = markdown.markdown(text, ['headerid(level=3)'])
35 | >>> print md
36 | Some Header
37 | Next Level
38 |
39 | Works with inline markup.
40 |
41 | >>> text = '#Some *Header* with [markup](http://example.com).'
42 | >>> md = markdown.markdown(text, ['headerid'])
43 | >>> print md
44 | Some Header with markup.
45 |
46 | Turn off auto generated IDs:
47 |
48 | >>> text = '''
49 | ... # Some Header
50 | ... # Another Header'''
51 | >>> md = markdown.markdown(text, ['headerid(forceid=False)'])
52 | >>> print md
53 | Some Header
54 | Another Header
55 |
56 | Use with MetaData extension:
57 |
58 | >>> text = '''header_level: 2
59 | ... header_forceid: Off
60 | ...
61 | ... # A Header'''
62 | >>> md = markdown.markdown(text, ['headerid', 'meta'])
63 | >>> print md
64 | A Header
65 |
66 | Copyright 2007-2011 [Waylan Limberg](http://achinghead.com/).
67 |
68 | Project website:
69 | Contact: markdown@freewisdom.org
70 |
71 | License: BSD (see ../docs/LICENSE for details)
72 |
73 | Dependencies:
74 | * [Python 2.3+](http://python.org)
75 | * [Markdown 2.0+](http://packages.python.org/Markdown/)
76 |
77 | """
78 |
79 | import markdown
80 | import re
81 | import logging
82 | import unicodedata
83 |
84 | logger = logging.getLogger('MARKDOWN')
85 |
# Matches an id that already carries a numeric suffix, e.g. "header_2".
IDCOUNT_RE = re.compile(r'^(.*)_([0-9]+)$')


def slugify(value, separator):
    """ Slugify a string, to make it URL friendly. """
    # Strip accents by decomposing to NFKD and dropping non-ASCII bytes.
    ascii_text = unicodedata.normalize('NFKD', value).encode('ascii', 'ignore')
    # Keep only word characters, whitespace and dashes; normalize case.
    cleaned = re.sub(r'[^\w\s-]', '', ascii_text.decode('ascii'))
    cleaned = cleaned.strip().lower()
    # Collapse whitespace (and repeated separators) into one separator.
    return re.sub(r'[%s\s]+' % separator, separator, cleaned)


def unique(id, ids):
    """ Ensure id is unique in set of ids. Append '_1', '_2'... if not """
    while id in ids or not id:
        m = IDCOUNT_RE.match(id)
        if m is None:
            id = '%s_%d' % (id, 1)
        else:
            id = '%s_%d' % (m.group(1), int(m.group(2)) + 1)
    ids.append(id)
    return id
106 |
107 |
def itertext(elem):
    """ Loop through all children and return text only.

    Reimplements method of same name added to ElementTree in Python 2.7

    """
    head = elem.text
    if head:
        yield head
    for child in elem:
        for chunk in itertext(child):
            yield chunk
        tail = child.tail
        if tail:
            yield tail
121 |
122 |
class HeaderIdTreeprocessor(markdown.treeprocessors.Treeprocessor):
    """ Assign IDs to headers. """

    # Ids assigned so far. FIX: was `set()`, but unique() calls
    # `ids.append(...)` and the extension's reset() assigns a list --
    # a set here would raise AttributeError on first use.
    IDs = []

    def run(self, doc):
        """ Walk the tree assigning unique ids (and adjusted levels) to
        h1-h6 elements, honoring meta-data overrides. """
        start_level, force_id = self._get_meta()
        slugify = self.config['slugify']
        sep = self.config['separator']
        for elem in doc.getiterator():
            if elem.tag in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
                if force_id:
                    if "id" in elem.attrib:
                        # FIX: was `elem.id` -- ElementTree Elements have
                        # no `.id` attribute, so this raised
                        # AttributeError for any pre-assigned id.
                        id = elem.get('id')
                    else:
                        id = slugify(u''.join(itertext(elem)), sep)
                    elem.set('id', unique(id, self.IDs))
                if start_level:
                    # Shift the header level, capping at h6.
                    level = int(elem.tag[-1]) + start_level
                    if level > 6:
                        level = 6
                    elem.tag = 'h%d' % level


    def _get_meta(self):
        """ Return meta data supported by this ext as a tuple """
        level = int(self.config['level']) - 1
        force = self._str2bool(self.config['forceid'])
        if hasattr(self.md, 'Meta'):
            # Document meta-data overrides the extension config.
            if self.md.Meta.has_key('header_level'):
                level = int(self.md.Meta['header_level'][0]) - 1
            if self.md.Meta.has_key('header_forceid'):
                force = self._str2bool(self.md.Meta['header_forceid'][0])
        return level, force

    def _str2bool(self, s, default=False):
        """ Convert a string to a boolean value. """
        s = str(s)
        if s.lower() in ['0', 'f', 'false', 'off', 'no', 'n']:
            return False
        elif s.lower() in ['1', 't', 'true', 'on', 'yes', 'y']:
            return True
        return default
166 |
167 |
class HeaderIdExtension(markdown.Extension):
    """ Markdown extension that assigns an id attribute to headers. """

    def __init__(self, configs):
        """ Declare default settings, then apply user overrides. """
        self.config = {
            'level': ['1', 'Base level for headers.'],
            'forceid': ['True', 'Force all headers to have an id.'],
            'separator': ['-', 'Word separator.'],
            'slugify': [slugify, 'Callable to generate anchors'],
        }
        for key, value in configs:
            self.setConfig(key, value)

    def extendMarkdown(self, md, md_globals):
        """ Insert the HeaderIdTreeprocessor into the pipeline. """
        md.registerExtension(self)
        processor = HeaderIdTreeprocessor()
        processor.md = md
        processor.config = self.getConfigs()
        self.processor = processor
        # Placed after inline processing, replacing hasheader in place.
        md.treeprocessors.add('headerid', self.processor, '>inline')

    def reset(self):
        """ Forget ids seen in the previous document. """
        self.processor.IDs = []
191 |
192 |
def makeExtension(configs=None):
    """ Return an instance of the HeaderIdExtension. """
    extension = HeaderIdExtension(configs=configs)
    return extension

if __name__ == "__main__":
    # Execute the docstring examples when run as a script.
    import doctest
    doctest.testmod()
199 |
200 |
--------------------------------------------------------------------------------
/markdown/extensions/codehilite.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 |
3 | """
4 | CodeHilite Extension for Python-Markdown
5 | ========================================
6 |
7 | Adds code/syntax highlighting to standard Python-Markdown code blocks.
8 |
9 | Copyright 2006-2008 [Waylan Limberg](http://achinghead.com/).
10 |
11 | Project website:
12 | Contact: markdown@freewisdom.org
13 |
14 | License: BSD (see ../LICENSE.md for details)
15 |
16 | Dependencies:
17 | * [Python 2.3+](http://python.org/)
18 | * [Markdown 2.0+](http://packages.python.org/Markdown/)
19 | * [Pygments](http://pygments.org/)
20 |
21 | """
22 |
23 | import markdown
24 | try:
25 | from pygments import highlight
26 | from pygments.lexers import get_lexer_by_name, guess_lexer, TextLexer
27 | from pygments.formatters import HtmlFormatter
28 | pygments = True
29 | except ImportError:
30 | pygments = False
31 |
32 | # ------------------ The Main CodeHilite Class ----------------------
class CodeHilite:
    """
    Determine language of source code, and pass it into the pygments
    highlighter.

    Basic Usage:
        >>> code = CodeHilite(src = 'some text')
        >>> html = code.hilite()

    * src: Source string or any object with a .readline attribute.

    * linenos: (Boolean) Turn line numbering 'on' or 'off' (off by default).

    * guess_lang: (Boolean) Turn language auto-detection 'on' or 'off' (on by default).

    * css_class: Set class name of wrapper div ('codehilite' by default).

    Low Level Usage:
        >>> code = CodeHilite()
        >>> code.src = 'some text' # String or anything with a .readline attr.
        >>> code.linenos = True # True or False; Turns line numbering on or off.
        >>> html = code.hilite()

    """

    def __init__(self, src=None, linenos=False, guess_lang=True,
                 css_class="codehilite", lang=None, style='default',
                 noclasses=False, tab_length=4):
        self.src = src
        self.lang = lang
        self.linenos = linenos
        self.guess_lang = guess_lang
        self.css_class = css_class
        self.style = style
        self.noclasses = noclasses
        self.tab_length = tab_length

    def hilite(self):
        """
        Pass code to the [Pygments](http://pygments.pocoo.org/) highlighter
        with optional line numbers. The output should then be styled with
        css to your liking. No styles are applied by default - only styling
        hooks (i.e.: <span class="k">).

        returns : A string of html.

        """

        self.src = self.src.strip('\n')

        # Detect the language from a (mock) shebang when not given explicitly.
        if self.lang is None:
            self._getLang()

        if pygments:
            try:
                lexer = get_lexer_by_name(self.lang)
            except ValueError:
                try:
                    # Fall back to auto-detection, then to plain text.
                    if self.guess_lang:
                        lexer = guess_lexer(self.src)
                    else:
                        lexer = TextLexer()
                except ValueError:
                    lexer = TextLexer()
            formatter = HtmlFormatter(linenos=self.linenos,
                                      cssclass=self.css_class,
                                      style=self.style,
                                      noclasses=self.noclasses)
            return highlight(self.src, lexer, formatter)
        else:
            # Pygments is unavailable: just escape and build markup usable
            # by JS highlighting libs.
            txt = self.src.replace('&', '&amp;')
            txt = txt.replace('<', '&lt;')
            txt = txt.replace('>', '&gt;')
            txt = txt.replace('"', '&quot;')
            classes = []
            if self.lang:
                classes.append('language-%s' % self.lang)
            if self.linenos:
                classes.append('linenums')
            class_str = ''
            if classes:
                class_str = ' class="%s"' % ' '.join(classes)
            # BUG FIX: this format string was a broken (split) literal in
            # the source; reassembled into one valid string.
            return '<pre class="%s"><code%s>%s</code></pre>\n' % \
                   (self.css_class, class_str, txt)

    def _getLang(self):
        """
        Determine language of a code block from a shebang line and whether
        said line should be removed or left in place. If the shebang line
        contains a path (even a single /) then it is assumed to be a real
        shebang and left alone. However, if no path is given (e.g.: #!python
        or :::python) then it is assumed to be a mock shebang for language
        identification of a code fragment and removed from the code block
        prior to processing for code highlighting. When a mock shebang
        (e.g.: #!python) is found, line numbering is turned on. When colons
        are found in place of a shebang (e.g.: :::python), line numbering is
        left in the current state - off by default.

        """

        import re

        # split text into lines
        lines = self.src.split("\n")
        # pull first line to examine
        fl = lines.pop(0)

        c = re.compile(r'''
            (?:(?:^::+)|(?P<shebang>^[#]!)) # Shebang or 2 or more colons.
            (?P<path>(?:/\w+)*[/ ])?        # Zero or 1 path
            (?P<lang>[\w+-]*)               # The language
            ''', re.VERBOSE)
        # search first line for shebang
        m = c.search(fl)
        if m:
            # we have a match
            try:
                self.lang = m.group('lang').lower()
            except IndexError:
                self.lang = None
            if m.group('path'):
                # path exists - restore first line
                lines.insert(0, fl)
            if m.group('shebang'):
                # shebang exists - use line numbers
                self.linenos = True
        else:
            # No match
            lines.insert(0, fl)

        self.src = "\n".join(lines).strip("\n")
164 |
165 |
166 |
167 | # ------------------ The Markdown Extension -------------------------------
class HiliteTreeprocessor(markdown.treeprocessors.Treeprocessor):
    """ Highlight source code found in <pre><code> blocks. """

    def run(self, root):
        """ Replace each code block with a stashed, highlighted copy. """
        for pre in root.getiterator('pre'):
            kids = pre.getchildren()
            # Only act on the canonical <pre><code>...</code></pre> shape.
            if len(kids) != 1 or kids[0].tag != 'code':
                continue
            hilited = CodeHilite(kids[0].text,
                                 linenos=self.config['force_linenos'],
                                 guess_lang=self.config['guess_lang'],
                                 css_class=self.config['css_class'],
                                 style=self.config['pygments_style'],
                                 noclasses=self.config['noclasses'],
                                 tab_length=self.markdown.tab_length)
            placeholder = self.markdown.htmlStash.store(hilited.hilite(),
                                                        safe=True)
            # Empty the element and turn it into a <p> holding only the
            # placeholder; the raw-html step removes the <p> later.
            pre.clear()
            pre.tag = 'p'
            pre.text = placeholder
192 |
193 |
194 | class CodeHiliteExtension(markdown.Extension):
195 | """ Add source code hilighting to markdown codeblocks. """
196 |
197 | def __init__(self, configs):
198 | # define default configs
199 | self.config = {
200 | 'force_linenos' : [False, "Force line numbers - Default: False"],
201 | 'guess_lang' : [True, "Automatic language detection - Default: True"],
202 | 'css_class' : ["codehilite",
203 | "Set class name for wrapper - Default: codehilite"],
204 | 'pygments_style' : ['default', 'Pygments HTML Formatter Style (Colorscheme) - Default: default'],
205 | 'noclasses': [False, 'Use inline styles instead of CSS classes - Default false']
206 | }
207 |
208 | # Override defaults with user settings
209 | for key, value in configs:
210 | # convert strings to booleans
211 | if value == 'True': value = True
212 | if value == 'False': value = False
213 | self.setConfig(key, value)
214 |
215 | def extendMarkdown(self, md, md_globals):
216 | """ Add HilitePostprocessor to Markdown instance. """
217 | hiliter = HiliteTreeprocessor(md)
218 | hiliter.config = self.getConfigs()
219 | md.treeprocessors.add("hilite", hiliter, "" in text:
96 | text = text.replace(">", ">")
97 | return text
98 | except (TypeError, AttributeError):
99 | _raise_serialization_error(text)
100 |
101 |
102 | def _escape_attrib(text):
103 | # escape attribute value
104 | try:
105 | if "&" in text:
106 | text = text.replace("&", "&")
107 | if "<" in text:
108 | text = text.replace("<", "<")
109 | if ">" in text:
110 | text = text.replace(">", ">")
111 | if "\"" in text:
112 | text = text.replace("\"", """)
113 | if "\n" in text:
114 | text = text.replace("\n", "
")
115 | return text
116 | except (TypeError, AttributeError):
117 | _raise_serialization_error(text)
118 |
119 | def _escape_attrib_html(text):
120 | # escape attribute value
121 | try:
122 | if "&" in text:
123 | text = text.replace("&", "&")
124 | if "<" in text:
125 | text = text.replace("<", "<")
126 | if ">" in text:
127 | text = text.replace(">", ">")
128 | if "\"" in text:
129 | text = text.replace("\"", """)
130 | return text
131 | except (TypeError, AttributeError):
132 | _raise_serialization_error(text)
133 |
134 |
def _serialize_html(write, elem, qnames, namespaces, format):
    """ Recursively serialize *elem* by calling *write* with markup chunks.

    * write: callable taking one string (e.g. list.append).
    * qnames: map of qualified names to encoded prefix:local names.
    * namespaces: uri -> prefix map, emitted on the outermost element only.
    * format: 'html' or 'xhtml' (controls empty-tag and boolean-attr forms).
    """
    tag = elem.tag
    text = elem.text
    if tag is Comment:
        write("<!--%s-->" % _escape_cdata(text))
    elif tag is ProcessingInstruction:
        write("<?%s?>" % _escape_cdata(text))
    else:
        tag = qnames[tag]
        if tag is None:
            # Unnamed wrapper: emit only its text and children.
            if text:
                write(_escape_cdata(text))
            for e in elem:
                _serialize_html(write, e, qnames, None, format)
        else:
            write("<" + tag)
            items = elem.items()
            if items or namespaces:
                items.sort() # lexical order
                for k, v in items:
                    if isinstance(k, QName):
                        k = k.text
                    if isinstance(v, QName):
                        v = qnames[v.text]
                    else:
                        v = _escape_attrib_html(v)
                    if qnames[k] == v and format == 'html':
                        # handle boolean attributes
                        write(" %s" % v)
                    else:
                        write(" %s=\"%s\"" % (qnames[k], v))
                if namespaces:
                    items = namespaces.items()
                    items.sort(key=lambda x: x[1]) # sort on prefix
                    for v, k in items:
                        if k:
                            k = ":" + k
                        write(" xmlns%s=\"%s\"" % (k, _escape_attrib(v)))
            if format == "xhtml" and tag in HTML_EMPTY:
                # XHTML empty elements are self-closed.
                write(" />")
            else:
                write(">")
            tag = tag.lower()
            if text:
                # script/style contents must not be entity-escaped.
                if tag == "script" or tag == "style":
                    write(text)
                else:
                    write(_escape_cdata(text))
            for e in elem:
                _serialize_html(write, e, qnames, None, format)
            if tag not in HTML_EMPTY:
                write("</" + tag + ">")
    if elem.tail:
        write(_escape_cdata(elem.tail))
189 |
def _write_html(root,
                encoding=None,
                default_namespace=None,
                format="html"):
    """ Serialize *root* to an (x)html string, optionally encoded. """
    assert root is not None
    buf = []
    qnames, namespaces = _namespaces(root, default_namespace)
    _serialize_html(buf.append, root, qnames, namespaces, format)
    markup = "".join(buf)
    if encoding is None:
        return markup
    return _encode(markup)
203 |
204 |
205 | # --------------------------------------------------------------------
206 | # serialization support
207 |
def _namespaces(elem, default_namespace=None):
    """ Identify all namespaces used in this tree.

    Returns (qnames, namespaces): qnames maps each qualified name to its
    encoded "prefix:local" form; namespaces maps uri -> prefix.
    """
    # identify namespaces used in this tree

    # maps qnames to *encoded* prefix:local names
    qnames = {None: None}

    # maps uri:s to prefixes
    namespaces = {}
    if default_namespace:
        namespaces[default_namespace] = ""

    def add_qname(qname):
        # calculate serialized qname representation
        try:
            if qname[:1] == "{":
                # "{uri}tag" form: resolve/assign a prefix for the uri.
                uri, tag = qname[1:].split("}", 1)
                prefix = namespaces.get(uri)
                if prefix is None:
                    prefix = _namespace_map.get(uri)
                    if prefix is None:
                        prefix = "ns%d" % len(namespaces)
                    if prefix != "xml":
                        namespaces[uri] = prefix
                if prefix:
                    qnames[qname] = "%s:%s" % (prefix, tag)
                else:
                    qnames[qname] = tag # default element
            else:
                if default_namespace:
                    raise ValueError(
                        "cannot use non-qualified names with "
                        "default_namespace option"
                        )
                qnames[qname] = qname
        except TypeError:
            _raise_serialization_error(qname)

    # populate qname and namespaces table
    try:
        iterate = elem.iter
    except AttributeError:
        iterate = elem.getiterator # cET compatibility
    for elem in iterate():
        tag = elem.tag
        if isinstance(tag, QName) and tag.text not in qnames:
            add_qname(tag.text)
        elif isinstance(tag, basestring):
            if tag not in qnames:
                add_qname(tag)
        elif tag is not None and tag is not Comment and tag is not PI:
            _raise_serialization_error(tag)
        # Attribute keys and QName-valued attributes also need entries.
        for key, value in elem.items():
            if isinstance(key, QName):
                key = key.text
            if key not in qnames:
                add_qname(key)
            if isinstance(value, QName) and value.text not in qnames:
                add_qname(value.text)
        text = elem.text
        if isinstance(text, QName) and text.text not in qnames:
            add_qname(text.text)
    return qnames, namespaces
270 |
def to_html_string(element):
    """ Serialize *element* and its children as an HTML string. """
    root = ElementTree(element).getroot()
    return _write_html(root, format="html")
273 |
def to_xhtml_string(element):
    """ Serialize *element* and its children as an XHTML string. """
    root = ElementTree(element).getroot()
    return _write_html(root, format="xhtml")
276 |
--------------------------------------------------------------------------------
/markdown/extensions/footnotes.py:
--------------------------------------------------------------------------------
1 | """
2 | ========================= FOOTNOTES =================================
3 |
4 | This section adds footnote handling to markdown. It can be used as
5 | an example for extending python-markdown with relatively complex
6 | functionality. While in this case the extension is included inside
7 | the module itself, it could just as easily be added from outside the
module. Note that all markdown classes above are ignorant about
9 | footnotes. All footnote functionality is provided separately and
10 | then added to the markdown instance at the run time.
11 |
12 | Footnote functionality is attached by calling extendMarkdown()
13 | method of FootnoteExtension. The method also registers the
extension to allow its state to be reset by a call to reset()
15 | method.
16 |
17 | Example:
18 | Footnotes[^1] have a label[^label] and a definition[^!DEF].
19 |
20 | [^1]: This is a footnote
21 | [^label]: A footnote on "label"
22 | [^!DEF]: The footnote for definition
23 |
24 | """
25 |
26 | import re
27 | import markdown
28 | from markdown.util import etree
29 |
# Unique placeholder strings swapped for the real backlink text and a
# non-breaking space by FootnotePostprocessor after serialization.
FN_BACKLINK_TEXT = "zz1337820767766393qq"
NBSP_PLACEHOLDER = "qq3936677670287331zz"
# Matches a footnote definition line: [^label]: text
DEF_RE = re.compile(r'[ ]{0,3}\[\^([^\]]*)\]:\s*(.*)')
# Matches a line indented by one tab or four spaces.
TABBED_RE = re.compile(r'((\t)|(    ))(.*)')
34 |
35 | class FootnoteExtension(markdown.Extension):
36 | """ Footnote Extension. """
37 |
38 | def __init__ (self, configs):
39 | """ Setup configs. """
40 | self.config = {'PLACE_MARKER':
41 | ["///Footnotes Go Here///",
42 | "The text string that marks where the footnotes go"],
43 | 'UNIQUE_IDS':
44 | [False,
45 | "Avoid name collisions across "
46 | "multiple calls to reset()."],
47 | "BACKLINK_TEXT":
48 | ["↩",
49 | "The text string that links from the footnote to the reader's place."]
50 | }
51 |
52 | for key, value in configs:
53 | self.config[key][0] = value
54 |
55 | # In multiple invocations, emit links that don't get tangled.
56 | self.unique_prefix = 0
57 |
58 | self.reset()
59 |
60 | def extendMarkdown(self, md, md_globals):
61 | """ Add pieces to Markdown. """
62 | md.registerExtension(self)
63 | self.parser = md.parser
64 | self.md = md
65 | # Insert a preprocessor before ReferencePreprocessor
66 | md.preprocessors.add("footnote", FootnotePreprocessor(self),
67 | "amp_substitute")
80 |
    def reset(self):
        """ Clear the footnotes on reset, and prepare for a distinct document. """
        # Ordered so footnotes are rendered in definition order.
        self.footnotes = markdown.odict.OrderedDict()
        # Bump the prefix so ids from a previous run cannot collide.
        self.unique_prefix += 1
85 |
86 | def findFootnotesPlaceholder(self, root):
87 | """ Return ElementTree Element that contains Footnote placeholder. """
88 | def finder(element):
89 | for child in element:
90 | if child.text:
91 | if child.text.find(self.getConfig("PLACE_MARKER")) > -1:
92 | return child, element, True
93 | if child.tail:
94 | if child.tail.find(self.getConfig("PLACE_MARKER")) > -1:
95 | return child, element, False
96 | finder(child)
97 | return None
98 |
99 | res = finder(root)
100 | return res
101 |
    def setFootnote(self, id, text):
        """ Store a footnote for later retrieval.

        * id: the footnote label (text between [^ and ]).
        * text: the footnote body (markdown source).
        """
        self.footnotes[id] = text
105 |
106 | def makeFootnoteId(self, id):
107 | """ Return footnote link id. """
108 | if self.getConfig("UNIQUE_IDS"):
109 | return 'fn:%d-%s' % (self.unique_prefix, id)
110 | else:
111 | return 'fn:%s' % id
112 |
113 | def makeFootnoteRefId(self, id):
114 | """ Return footnote back-link id. """
115 | if self.getConfig("UNIQUE_IDS"):
116 | return 'fnref:%d-%s' % (self.unique_prefix, id)
117 | else:
118 | return 'fnref:%s' % id
119 |
    def makeFootnotesDiv(self, root):
        """ Return div of footnotes as et Element.

        Returns None when no footnotes have been collected.
        """

        if not self.footnotes.keys():
            return None

        div = etree.Element("div")
        div.set('class', 'footnote')
        etree.SubElement(div, "hr")
        ol = etree.SubElement(div, "ol")

        # One <li> (parsed as markdown) per footnote, in definition order.
        for id in self.footnotes.keys():
            li = etree.SubElement(ol, "li")
            li.set("id", self.makeFootnoteId(id))
            self.parser.parseChunk(li, self.footnotes[id])
            backlink = etree.Element("a")
            backlink.set("href", "#" + self.makeFootnoteRefId(id))
            if self.md.output_format not in ['html5', 'xhtml5']:
                backlink.set("rev", "footnote") # Invalid in HTML5
            backlink.set("class", "footnote-backref")
            backlink.set("title", "Jump back to footnote %d in the text" % \
                (self.footnotes.index(id)+1))
            # Placeholder text; replaced by FootnotePostprocessor.
            backlink.text = FN_BACKLINK_TEXT

            # Attach the backlink to the footnote's last paragraph, or wrap
            # it in a new <p> when the footnote produced no children.
            if li.getchildren():
                node = li[-1]
                if node.tag == "p":
                    node.text = node.text + NBSP_PLACEHOLDER
                    node.append(backlink)
                else:
                    p = etree.SubElement(li, "p")
                    p.append(backlink)
        return div
153 |
154 |
class FootnotePreprocessor(markdown.preprocessors.Preprocessor):
    """ Find all footnote references and store for later use. """

    def __init__ (self, footnotes):
        # The owning FootnoteExtension (receives the found definitions).
        self.footnotes = footnotes

    def run(self, lines):
        """
        Loop through lines and find, set, and remove footnote definitions.

        Keywords:

        * lines: A list of lines of text

        Return: A list of lines of text with footnote definitions removed.

        """
        newlines = []
        i = 0
        while True:
            m = DEF_RE.match(lines[i])
            if m:
                # Definition found: collect its (possibly multi-line,
                # indented) body before storing it on the extension.
                fn, _i = self.detectTabbed(lines[i+1:])
                fn.insert(0, m.group(2))
                i += _i-1 # skip past footnote
                self.footnotes.setFootnote(m.group(1), "\n".join(fn))
            else:
                newlines.append(lines[i])
            if len(lines) > i+1:
                i += 1
            else:
                break
        return newlines

    def detectTabbed(self, lines):
        """ Find indented text and remove indent before further processing.

        Keyword arguments:

        * lines: an array of strings

        Returns: a list of post processed items and the index of last line.

        """
        items = []
        blank_line = False # have we encountered a blank line yet?
        i = 0 # to keep track of where we are

        def detab(line):
            # Strip one level of indent (tab or 4 spaces); None otherwise.
            match = TABBED_RE.match(line)
            if match:
                return match.group(4)

        for line in lines:
            if line.strip(): # Non-blank line
                detabbed_line = detab(line)
                if detabbed_line:
                    items.append(detabbed_line)
                    i += 1
                    continue
                elif not blank_line and not DEF_RE.match(line):
                    # not tabbed but still part of first par.
                    items.append(line)
                    i += 1
                    continue
                else:
                    return items, i+1

            else: # Blank line: _maybe_ we are done.
                blank_line = True
                i += 1 # advance

                # Find the next non-blank line
                for j in range(i, len(lines)):
                    if lines[j].strip():
                        next_line = lines[j]; break
                else:
                    break # There is no more text; we are done.

                # Check if the next non-blank line is tabbed
                if detab(next_line): # Yes, more work to do.
                    items.append("")
                    continue
                else:
                    break # No, we are done.
        else:
            # for/else: the loop consumed every line without breaking.
            i += 1

        return items, i
245 |
246 |
class FootnotePattern(markdown.inlinepatterns.Pattern):
    """ InlinePattern for footnote markers in a document's body text. """

    def __init__(self, pattern, footnotes):
        markdown.inlinepatterns.Pattern.__init__(self, pattern)
        # The owning FootnoteExtension (holds definitions and id helpers).
        self.footnotes = footnotes

    def handleMatch(self, m):
        """ Return a <sup><a/></sup> marker for a known footnote label,
        or None when the label has no stored definition. """
        id = m.group(2)
        if id in self.footnotes.footnotes.keys():
            sup = etree.Element("sup")
            a = etree.SubElement(sup, "a")
            sup.set('id', self.footnotes.makeFootnoteRefId(id))
            a.set('href', '#' + self.footnotes.makeFootnoteId(id))
            if self.footnotes.md.output_format not in ['html5', 'xhtml5']:
                a.set('rel', 'footnote') # invalid in HTML5
            a.set('class', 'footnote-ref')
            # Link text is the footnote's 1-based position in definition order.
            a.text = unicode(self.footnotes.footnotes.index(id) + 1)
            return sup
        else:
            return None
268 |
269 |
class FootnoteTreeprocessor(markdown.treeprocessors.Treeprocessor):
    """ Build and append footnote div to end of document. """

    def __init__ (self, footnotes):
        self.footnotes = footnotes

    def run(self, root):
        div = self.footnotes.makeFootnotesDiv(root)
        if not div:
            return
        located = self.footnotes.findFootnotesPlaceholder(root)
        if located:
            child, parent, isText = located
            ind = parent.getchildren().index(child)
            if isText:
                # Marker was the child's text: replace the child itself.
                parent.remove(child)
                parent.insert(ind, div)
            else:
                # Marker was in the tail: insert right after the child.
                parent.insert(ind + 1, div)
                child.tail = None
        else:
            # No placeholder: footnotes go at the end of the document.
            root.append(div)
291 |
class FootnotePostprocessor(markdown.postprocessors.Postprocessor):
    """ Replace placeholders with html entities. """

    def __init__(self, footnotes):
        self.footnotes = footnotes

    def run(self, text):
        # Swap in the configured backlink text, then the literal &nbsp;.
        backlink = self.footnotes.getConfig("BACKLINK_TEXT")
        return text.replace(FN_BACKLINK_TEXT, backlink) \
                   .replace(NBSP_PLACEHOLDER, "&#160;")
300 |
def makeExtension(configs=None):
    """ Return an instance of the FootnoteExtension.

    * configs: iterable of (key, value) overrides; None for all defaults.
    """
    # FIX: avoid the shared-mutable-default pitfall (configs=[]); default
    # to None and normalize here. Explicit callers are unaffected.
    return FootnoteExtension(configs=configs if configs is not None else [])
304 |
305 |
--------------------------------------------------------------------------------
/markdown/preprocessors.py:
--------------------------------------------------------------------------------
1 | """
2 | PRE-PROCESSORS
3 | =============================================================================
4 |
5 | Preprocessors work on source text before we start doing anything too
6 | complicated.
7 | """
8 |
9 | import re
10 | import util
11 | import odict
12 |
13 |
def build_preprocessors(md_instance, **kwargs):
    """ Build the default set of preprocessors used by Markdown. """
    ordered = odict.OrderedDict()
    # In 'escape' safe mode raw html is escaped later, so the html-block
    # preprocessor is skipped entirely.
    if md_instance.safeMode != 'escape':
        ordered["html_block"] = HtmlBlockPreprocessor(md_instance)
    ordered["reference"] = ReferencePreprocessor(md_instance)
    return ordered
21 |
22 |
class Preprocessor(util.Processor):
    """
    Preprocessors are run after the text is broken into lines.

    Each preprocessor implements a "run" method that takes a pointer to a
    list of lines of the document, modifies it as necessary and returns
    either the same pointer or a pointer to a new list.

    Preprocessors must extend markdown.Preprocessor.

    """

    def run(self, lines):
        """
        Override in subclasses: accept the document as a list of strings
        (split on newlines) and return the, possibly modified, list.

        """
        pass
42 |
43 |
class HtmlBlockPreprocessor(Preprocessor):
    """Remove html blocks from the text and store them for later retrieval."""

    # Closing-tag templates tried in order: real close tag, then the
    # malformed "tag>" form.
    right_tag_patterns = ["</%s>", "%s>"]
    attrs_pattern = r"""
        \s+(?P<attr>[^>"'/= ]+)=(?P<q>['"])(?P<value>.*?)(?P=q) # attr="value"
        |                                                       # OR
        \s+(?P<attr1>[^>"'/= ]+)=(?P<value1>[^> ]+)             # attr=value
        |                                                       # OR
        \s+(?P<attr2>[^>"'/= ]+)                                # attr
        """
    left_tag_pattern = r'^\<(?P<tag>[^> ]+)(?P<attrs>(%s)*)\s*\/?\>?' % attrs_pattern
    attrs_re = re.compile(attrs_pattern, re.VERBOSE)
    left_tag_re = re.compile(left_tag_pattern, re.VERBOSE)
    # Set True by extensions that want markdown processed inside raw
    # blocks carrying a "markdown" attribute.
    markdown_in_raw = False

    def _get_left_tag(self, block):
        """ Return (tag_name, end_index, attrs_dict) for the opening tag. """
        m = self.left_tag_re.match(block)
        if m:
            tag = m.group('tag')
            raw_attrs = m.group('attrs')
            attrs = {}
            if raw_attrs:
                for ma in self.attrs_re.finditer(raw_attrs):
                    # Each alternative in attrs_pattern fills different
                    # group names; check them in declaration order.
                    if ma.group('attr'):
                        if ma.group('value'):
                            attrs[ma.group('attr').strip()] = ma.group('value')
                        else:
                            attrs[ma.group('attr').strip()] = ""
                    elif ma.group('attr1'):
                        if ma.group('value1'):
                            attrs[ma.group('attr1').strip()] = ma.group('value1')
                        else:
                            attrs[ma.group('attr1').strip()] = ""
                    elif ma.group('attr2'):
                        attrs[ma.group('attr2').strip()] = ""
            return tag, len(m.group(0)), attrs
        else:
            tag = block[1:].split(">", 1)[0].lower()
            return tag, len(tag)+2, {}

    def _recursive_tagfind(self, ltag, rtag, start_index, block):
        """ Return the index just past the rtag matching ltag, or -1. """
        while 1:
            i = block.find(rtag, start_index)
            if i == -1:
                return -1
            j = block.find(ltag, start_index)
            # if no ltag, or rtag found before another ltag, return index
            if (j > i or j == -1):
                return i + len(rtag)
            # another ltag found before rtag, use end of ltag as starting
            # point and search again
            j = block.find('>', j)
            start_index = self._recursive_tagfind(ltag, rtag, j + 1, block)
            if start_index == -1:
                # HTML potentially malformed- ltag has no corresponding
                # rtag
                return -1

    def _get_right_tag(self, left_tag, left_index, block):
        """ Return (closing_tag_name, index_past_it) for left_tag in block. """
        for p in self.right_tag_patterns:
            tag = p % left_tag
            i = self._recursive_tagfind("<%s" % left_tag, tag, left_index, block)
            if i > 2:
                return tag.lstrip("<").rstrip(">"), i
        # No closing tag found: treat the end of the block as the close.
        return block.rstrip()[-left_index:-1].lower(), len(block)

    def _equal_tags(self, left_tag, right_tag):
        """ Does right_tag close left_tag? """
        if left_tag[0] in ['?', '@', '%']: # handle PHP, etc.
            return True
        if ("/" + left_tag) == right_tag:
            return True
        if (right_tag == "--" and left_tag == "--"):
            return True
        elif left_tag == right_tag[1:] \
            and right_tag[0] == "/":
            return True
        else:
            return False

    def _is_oneliner(self, tag):
        # Tags that are always complete on a single line.
        return (tag in ['hr', 'hr/'])

    def run(self, lines):
        """ Stash raw html blocks; return the remaining text as lines. """
        text = "\n".join(lines)
        new_blocks = []
        text = text.split("\n\n")
        items = []
        left_tag = ''
        right_tag = ''
        in_tag = False # flag

        while text:
            block = text[0]
            if block.startswith("\n"):
                block = block[1:]
            text = text[1:]

            if block.startswith("\n"):
                block = block[1:]

            if not in_tag:
                if block.startswith("<") and len(block.strip()) > 1:

                    if block[1] == "!":
                        # is a comment block
                        left_tag, left_index, attrs = "--", 2, {}
                    else:
                        left_tag, left_index, attrs = self._get_left_tag(block)
                    right_tag, data_index = self._get_right_tag(left_tag,
                                                                left_index,
                                                                block)
                    # keep checking conditions below and maybe just append

                    if data_index < len(block) \
                        and (util.isBlockLevel(left_tag)
                        or left_tag == '--'):
                        # Trailing text after the close: push it back.
                        text.insert(0, block[data_index:])
                        block = block[:data_index]

                    if not (util.isBlockLevel(left_tag) \
                        or block[1] in ["!", "?", "@", "%"]):
                        # Inline-level html: leave for inline processing.
                        new_blocks.append(block)
                        continue

                    if self._is_oneliner(left_tag):
                        new_blocks.append(block.strip())
                        continue

                    if block.rstrip().endswith(">") \
                        and self._equal_tags(left_tag, right_tag):
                        if self.markdown_in_raw and 'markdown' in attrs.keys():
                            # Stash only the wrapper tags; keep the body
                            # visible so markdown still processes it.
                            start = re.sub(r'\smarkdown(=[\'"]?[^> ]*[\'"]?)?',
                                           '', block[:left_index])
                            end = block[-len(right_tag)-2:]
                            block = block[left_index:-len(right_tag)-2]
                            new_blocks.append(
                                self.markdown.htmlStash.store(start))
                            new_blocks.append(block)
                            new_blocks.append(
                                self.markdown.htmlStash.store(end))
                        else:
                            new_blocks.append(
                                self.markdown.htmlStash.store(block.strip()))
                        continue
                    else:
                        # if is block level tag and is not complete

                        if util.isBlockLevel(left_tag) or left_tag == "--" \
                            and not block.rstrip().endswith(">"):
                            # Open tag spans multiple blocks: start collecting.
                            items.append(block.strip())
                            in_tag = True
                        else:
                            new_blocks.append(
                                self.markdown.htmlStash.store(block.strip()))

                        continue

                new_blocks.append(block)

            else:
                items.append(block)

                right_tag, data_index = self._get_right_tag(left_tag, 0, block)

                if self._equal_tags(left_tag, right_tag):
                    # if find closing tag

                    if data_index < len(block):
                        # we have more text after right_tag
                        items[-1] = block[:data_index]
                        text.insert(0, block[data_index:])

                    in_tag = False
                    if self.markdown_in_raw and 'markdown' in attrs.keys():
                        start = re.sub(r'\smarkdown(=[\'"]?[^> ]*[\'"]?)?',
                                       '', items[0][:left_index])
                        items[0] = items[0][left_index:]
                        end = items[-1][-len(right_tag)-2:]
                        items[-1] = items[-1][:-len(right_tag)-2]
                        new_blocks.append(
                            self.markdown.htmlStash.store(start))
                        new_blocks.extend(items)
                        new_blocks.append(
                            self.markdown.htmlStash.store(end))
                    else:
                        new_blocks.append(
                            self.markdown.htmlStash.store('\n\n'.join(items)))
                    items = []

        # Flush any unclosed raw block collected at end of input.
        if items:
            if self.markdown_in_raw and 'markdown' in attrs.keys():
                start = re.sub(r'\smarkdown(=[\'"]?[^> ]*[\'"]?)?',
                               '', items[0][:left_index])
                items[0] = items[0][left_index:]
                end = items[-1][-len(right_tag)-2:]
                items[-1] = items[-1][:-len(right_tag)-2]
                new_blocks.append(
                    self.markdown.htmlStash.store(start))
                new_blocks.extend(items)
                if end.strip():
                    new_blocks.append(
                        self.markdown.htmlStash.store(end))
            else:
                new_blocks.append(
                    self.markdown.htmlStash.store('\n\n'.join(items)))
            new_blocks.append('\n')

        new_text = "\n\n".join(new_blocks)
        return new_text.split("\n")
255 |
256 |
class ReferencePreprocessor(Preprocessor):
    """ Remove reference definitions from text and store for later use. """

    TITLE = r'[ ]*(\"(.*)\"|\'(.*)\'|\((.*)\))[ ]*'
    RE = re.compile(r'^[ ]{0,3}\[([^\]]*)\]:\s*([^ ]*)[ ]*(%s)?$' % TITLE, re.DOTALL)
    TITLE_RE = re.compile(r'^%s$' % TITLE)

    def run (self, lines):
        """ Pop reference definitions into self.markdown.references.

        Returns the remaining lines with all definitions removed.
        """
        new_text = []
        while lines:
            line = lines.pop(0)
            m = self.RE.match(line)
            if m:
                id = m.group(1).strip().lower()
                link = m.group(2).lstrip('<').rstrip('>')
                t = m.group(5) or m.group(6) or m.group(7)
                # The title may sit on the following line.
                # BUG FIX: guard against a definition on the very last
                # line, which used to raise IndexError on lines[0].
                if not t and lines:
                    tm = self.TITLE_RE.match(lines[0])
                    if tm:
                        lines.pop(0)
                        t = tm.group(2) or tm.group(3) or tm.group(4)
                self.markdown.references[id] = (link, t)
            else:
                new_text.append(line)

        return new_text #+ "\n"
284 |
--------------------------------------------------------------------------------
/markdown/treeprocessors.py:
--------------------------------------------------------------------------------
1 | import inlinepatterns
2 | import util
3 | import odict
4 |
5 |
def build_treeprocessors(md_instance, **kwargs):
    """ Build the default treeprocessors for Markdown. """
    processors = odict.OrderedDict()
    # Inline patterns run first; prettify cleans the tree up afterwards.
    processors["inline"] = InlineProcessor(md_instance)
    processors["prettify"] = PrettifyTreeprocessor(md_instance)
    return processors
12 |
13 |
def isString(s):
    """ Return True for plain strings, False otherwise.

    AtomicStrings are deliberately excluded: they must not be re-processed.
    """
    return isinstance(s, basestring) and not isinstance(s, util.AtomicString)
19 |
20 |
class Processor:
    """ Base class; optionally remembers the owning Markdown instance. """

    def __init__(self, markdown_instance=None):
        # Only set the attribute when an instance is actually supplied,
        # matching the historical behaviour.
        if markdown_instance:
            self.markdown = markdown_instance
25 |
26 |
class Treeprocessor(Processor):
    """
    Treeprocessors are run on the ElementTree object before serialization.

    Each Treeprocessor implements a "run" method that takes a pointer to an
    ElementTree, modifies it as necessary and returns an ElementTree
    object.

    Treeprocessors must extend markdown.Treeprocessor.

    """

    def run(self, root):
        """
        Override in subclasses: take the root ElementTree and either return
        a replacement tree, or mutate it in place and return None.
        """
        pass
46 |
47 |
class InlineProcessor(Treeprocessor):
    """
    A Treeprocessor that traverses a tree, applying inline patterns.

    Matched inline markup is replaced in the text by STX/ETX-delimited
    placeholder strings; the generated Elements are kept in
    ``self.stashed_nodes`` and spliced into the tree afterwards.
    """

    def __init__(self, md):
        self.__placeholder_prefix = util.INLINE_PLACEHOLDER_PREFIX
        self.__placeholder_suffix = util.ETX
        # 4 == width of the zero-padded id embedded in each placeholder.
        self.__placeholder_length = 4 + len(self.__placeholder_prefix) \
                                      + len(self.__placeholder_suffix)
        self.__placeholder_re = util.INLINE_PLACEHOLDER_RE
        self.markdown = md

    def __makePlaceholder(self, type):
        """ Generate a placeholder string and its id.

        Ids are sequential, derived from the current stash size; the `type`
        argument is accepted but unused here.
        """
        id = "%04d" % len(self.stashed_nodes)
        hash = util.INLINE_PLACEHOLDER % id
        return hash, id

    def __findPlaceholder(self, data, index):
        """
        Extract id from data string, start from index

        Keyword arguments:

        * data: string
        * index: index, from which we start search

        Returns: placeholder id and string index, after the found placeholder.

        """
        m = self.__placeholder_re.search(data, index)
        if m:
            return m.group(1), m.end()
        else:
            # No placeholder found; advance one char so callers can continue.
            return None, index + 1

    def __stashNode(self, node, type):
        """ Add node to stash; return the placeholder string standing in
        for it. """
        placeholder, id = self.__makePlaceholder(type)
        self.stashed_nodes[id] = node
        return placeholder

    def __handleInline(self, data, patternIndex=0):
        """
        Process string with inline patterns and replace it
        with placeholders

        Keyword arguments:

        * data: A line of Markdown text
        * patternIndex: The index of the inlinePattern to start with

        Returns: String with placeholders.

        """
        # AtomicStrings are deliberately left untouched (opt-out mechanism).
        if not isinstance(data, util.AtomicString):
            startIndex = 0
            while patternIndex < len(self.markdown.inlinePatterns):
                data, matched, startIndex = self.__applyPattern(
                    self.markdown.inlinePatterns.value_for_index(patternIndex),
                    data, patternIndex, startIndex)
                if not matched:
                    # Only advance to the next pattern once the current one
                    # no longer matches anywhere in the remaining data.
                    patternIndex += 1
        return data

    def __processElementText(self, node, subnode, isText=True):
        """
        Process placeholders in Element.text or Element.tail
        of Elements popped from self.stashed_nodes.

        Keywords arguments:

        * node: parent node
        * subnode: processing node
        * isText: bool variable, True - it's text, False - it's tail

        Returns: None

        """
        # Detach the text being processed so it is not duplicated when the
        # expanded Elements are inserted.
        if isText:
            text = subnode.text
            subnode.text = None
        else:
            text = subnode.tail
            subnode.tail = None

        childResult = self.__processPlaceholders(text, subnode)

        if not isText and node is not subnode:
            # Tail content belongs to the parent: re-insert the expanded
            # Elements at subnode's former position in `node`.
            pos = node.getchildren().index(subnode)
            node.remove(subnode)
        else:
            pos = 0

        # Insert in reverse so the final order matches childResult's order.
        childResult.reverse()
        for newChild in childResult:
            node.insert(pos, newChild)

    def __processPlaceholders(self, data, parent):
        """
        Process string with placeholders and generate ElementTree tree.

        Keyword arguments:

        * data: string with placeholders instead of ElementTree elements.
        * parent: Element, which contains processing inline data

        Returns: list with ElementTree elements with applied inline patterns.

        """
        def linkText(text):
            # Attach plain text to the right place: the tail of the last
            # generated Element if any, otherwise the parent's text.
            if text:
                if result:
                    if result[-1].tail:
                        result[-1].tail += text
                    else:
                        result[-1].tail = text
                else:
                    if parent.text:
                        parent.text += text
                    else:
                        parent.text = text
        result = []
        # NOTE(review): "strartIndex" (sic) is the current scan position.
        strartIndex = 0
        while data:
            index = data.find(self.__placeholder_prefix, strartIndex)
            if index != -1:
                id, phEndIndex = self.__findPlaceholder(data, index)

                if id in self.stashed_nodes:
                    node = self.stashed_nodes.get(id)

                    if index > 0:
                        # Flush the literal text preceding the placeholder.
                        text = data[strartIndex:index]
                        linkText(text)

                    if not isString(node): # it's Element
                        # Recursively expand placeholders inside the stashed
                        # Element's own text and tails.
                        for child in [node] + node.getchildren():
                            if child.tail:
                                if child.tail.strip():
                                    self.__processElementText(node, child,False)
                            if child.text:
                                if child.text.strip():
                                    self.__processElementText(child, child)
                    else: # it's just a string
                        linkText(node)
                        strartIndex = phEndIndex
                        continue

                    strartIndex = phEndIndex
                    result.append(node)

                else: # wrong placeholder
                    # Prefix matched but id unknown: emit the prefix as
                    # literal text and keep scanning after it.
                    end = index + len(self.__placeholder_prefix)
                    linkText(data[strartIndex:end])
                    strartIndex = end
            else:
                # No more placeholders; the rest is literal text.
                text = data[strartIndex:]
                if isinstance(data, util.AtomicString):
                    # We don't want to lose the AtomicString
                    text = util.AtomicString(text)
                linkText(text)
                data = ""

        return result

    def __applyPattern(self, pattern, data, patternIndex, startIndex=0):
        """
        Check if the line fits the pattern, create the necessary
        elements, add it to stashed_nodes.

        Keyword arguments:

        * data: the text to be processed
        * pattern: the pattern to be checked
        * patternIndex: index of current pattern
        * startIndex: string index, from which we start searching

        Returns: String with placeholders instead of ElementTree elements.

        """
        # Only match from startIndex on; text before it is already settled.
        match = pattern.getCompiledRegExp().match(data[startIndex:])
        leftData = data[:startIndex]

        if not match:
            return data, False, 0

        node = pattern.handleMatch(match)

        if node is None:
            # Pattern matched but produced nothing: report matched=True and
            # move the scan position past the match start.
            return data, True, len(leftData)+match.span(len(match.groups()))[0]

        if not isString(node):
            if not isinstance(node.text, util.AtomicString):
                # We need to process current node too
                for child in [node] + node.getchildren():
                    if not isString(node):
                        if child.text:
                            # Children restart at the NEXT pattern...
                            child.text = self.__handleInline(child.text,
                                                            patternIndex + 1)
                        if child.tail:
                            # ...but tails re-run the CURRENT pattern.
                            child.tail = self.__handleInline(child.tail,
                                                            patternIndex)

        placeholder = self.__stashNode(node, pattern.type())

        # Reassemble: untouched left part + pre-match capture + placeholder
        # + post-match capture (groups 1 and -1 come from Pattern's wrapper).
        return "%s%s%s%s" % (leftData,
                             match.group(1),
                             placeholder, match.groups()[-1]), True, 0

    def run(self, tree):
        """Apply inline patterns to a parsed Markdown tree.

        Iterate over ElementTree, find elements with inline tag, apply inline
        patterns and append newly created Elements to tree. If you don't
        want to process your data with inline patterns, instead of normal
        string, use subclass AtomicString:

            node.text = markdown.AtomicString("This will not be processed.")

        Arguments:

        * tree: ElementTree object, representing Markdown tree.

        Returns: ElementTree object with applied inline patterns.

        """
        self.stashed_nodes = {}

        # Iterative depth traversal; children produced by placeholder
        # expansion are pushed back on the stack for further processing.
        stack = [tree]

        while stack:
            currElement = stack.pop()
            insertQueue = []
            for child in currElement.getchildren():
                if child.text and not isinstance(child.text, util.AtomicString):
                    text = child.text
                    child.text = None
                    lst = self.__processPlaceholders(self.__handleInline(
                                                    text), child)
                    stack += lst
                    insertQueue.append((child, lst))
                if child.tail:
                    # Tails are expanded via a dummy element, then spliced
                    # into the parent right after `child`.
                    tail = self.__handleInline(child.tail)
                    dumby = util.etree.Element('d')
                    tailResult = self.__processPlaceholders(tail, dumby)
                    if dumby.text:
                        child.tail = dumby.text
                    else:
                        child.tail = None
                    pos = currElement.getchildren().index(child) + 1
                    tailResult.reverse()
                    for newChild in tailResult:
                        currElement.insert(pos, newChild)
                if child.getchildren():
                    stack.append(child)

            # Splice the expanded Elements into their parents, applying
            # {@key=value} attribute definitions when enabled.
            for element, lst in insertQueue:
                if self.markdown.enable_attributes:
                    if element.text:
                        element.text = \
                            inlinepatterns.handleAttributes(element.text,
                                                            element)
                i = 0
                for newChild in lst:
                    if self.markdown.enable_attributes:
                        # Processing attributes
                        if newChild.tail:
                            # Tail attributes apply to the PARENT element.
                            newChild.tail = \
                                inlinepatterns.handleAttributes(newChild.tail,
                                                                element)
                        if newChild.text:
                            newChild.text = \
                                inlinepatterns.handleAttributes(newChild.text,
                                                                newChild)
                    element.insert(i, newChild)
                    i += 1
        return tree
327 |
328 |
class PrettifyTreeprocessor(Treeprocessor):
    """ Add linebreaks to the html document. """

    def _prettifyETree(self, elem):
        """ Recursively add linebreaks to ElementTree children. """

        i = "\n"
        # Only reformat inside block-level elements; never inside
        # code/pre, where whitespace is significant.
        if util.isBlockLevel(elem.tag) and elem.tag not in ['code', 'pre']:
            if (not elem.text or not elem.text.strip()) \
                    and len(elem) and util.isBlockLevel(elem[0].tag):
                elem.text = i
            for e in elem:
                if util.isBlockLevel(e.tag):
                    self._prettifyETree(e)
            if not elem.tail or not elem.tail.strip():
                elem.tail = i
        if not elem.tail or not elem.tail.strip():
            elem.tail = i

    def run(self, root):
        """ Add linebreaks to ElementTree root object. """

        self._prettifyETree(root)
        # Do <br />'s separately as they are often in the middle of
        # inline content and missed by _prettifyETree.
        brs = root.getiterator('br')
        for br in brs:
            if not br.tail or not br.tail.strip():
                br.tail = '\n'
            else:
                br.tail = '\n%s' % br.tail
360 |
--------------------------------------------------------------------------------
/markdown/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | Python Markdown
3 | ===============
4 |
5 | Python Markdown converts Markdown to HTML and can be used as a library or
6 | called from the command line.
7 |
8 | ## Basic usage as a module:
9 |
10 | import markdown
11 | html = markdown.markdown(your_text_string)
12 |
13 | See for more
14 | information and instructions on how to extend the functionality of
15 | Python Markdown. Read that before you try modifying this file.
16 |
17 | ## Authors and License
18 |
19 | Started by [Manfred Stienstra](http://www.dwerg.net/). Continued and
20 | maintained by [Yuri Takhteyev](http://www.freewisdom.org), [Waylan
21 | Limberg](http://achinghead.com/) and [Artem Yunusov](http://blog.splyer.com).
22 |
23 | Contact: markdown@freewisdom.org
24 |
25 | Copyright 2007-2012 The Python Markdown Project (v. 1.7 and later)
26 | Copyright 200? Django Software Foundation (OrderedDict implementation)
27 | Copyright 2004, 2005, 2006 Yuri Takhteyev (v. 0.2-1.6b)
28 | Copyright 2004 Manfred Stienstra (the original version)
29 |
30 | License: BSD (see LICENSE for details).
31 | """
32 |
# Package version, both human-readable and as a comparable tuple.
version = "2.2.1"
version_info = (2,2,1, "final")

import re
import codecs
import sys
import logging
import util
from preprocessors import build_preprocessors
from blockprocessors import build_block_parser
from treeprocessors import build_treeprocessors
from inlinepatterns import build_inlinepatterns
from postprocessors import build_postprocessors
from extensions import Extension
from serializers import to_html_string, to_xhtml_string

# Public API of the package.
__all__ = ['Markdown', 'markdown', 'markdownFromFile']

# Module-level logger; handlers/levels are left to the application.
logger = logging.getLogger('MARKDOWN')
52 |
53 |
class Markdown:
    """Convert Markdown to HTML."""

    doc_tag = "div" # Element used to wrap document - later removed

    # Defaults applied for any keyword argument not passed to __init__.
    option_defaults = {
        'html_replacement_text' : '[HTML_REMOVED]',
        'tab_length' : 4,
        'enable_attributes' : True,
        'smart_emphasis' : True,
        'lazy_ol' : True,
    }

    # Maps each accepted output_format name to its serializer callable.
    output_formats = {
        'html' : to_html_string,
        'html4' : to_html_string,
        'html5' : to_html_string,
        'xhtml' : to_xhtml_string,
        'xhtml1': to_xhtml_string,
        'xhtml5': to_xhtml_string,
    }

    # Characters a backslash may escape in Markdown source.
    ESCAPED_CHARS = ['\\', '`', '*', '_', '{', '}', '[', ']',
                     '(', ')', '>', '#', '+', '-', '.', '!']

    def __init__(self, *args, **kwargs):
        """
        Creates a new Markdown instance.

        Keyword arguments:

        * extensions: A list of extensions.
            If they are of type string, the module mdx_name.py will be loaded.
            If they are a subclass of markdown.Extension, they will be used
            as-is.
        * extension_configs: Configuration settings for extensions.
        * output_format: Format of output. Supported formats are:
            * "xhtml1": Outputs XHTML 1.x. Default.
            * "xhtml5": Outputs XHTML style tags of HTML 5
            * "xhtml": Outputs latest supported version of XHTML (currently XHTML 1.1).
            * "html4": Outputs HTML 4
            * "html5": Outputs HTML style tags of HTML 5
            * "html": Outputs latest supported version of HTML (currently HTML 4).
            Note that it is suggested that the more specific formats ("xhtml1"
            and "html4") be used as "xhtml" or "html" may change in the future
            if it makes sense at that time.
        * safe_mode: Disallow raw html. One of "remove", "replace" or "escape".
        * html_replacement_text: Text used when safe_mode is set to "replace".
        * tab_length: Length of tabs in the source. Default: 4
        * enable_attributes: Enable the conversion of attributes. Default: True
        * smart_emphasis: Treat `_connected_words_` intelligently Default: True
        * lazy_ol: Ignore number of first item of ordered lists. Default: True

        """

        # For backward compatibility, loop through old positional args
        pos = ['extensions', 'extension_configs', 'safe_mode', 'output_format']
        c = 0
        for arg in args:
            # NOTE(review): dict.has_key() is Python-2-only; `pos[c] not in
            # kwargs` would be equivalent and forward-portable.
            if not kwargs.has_key(pos[c]):
                kwargs[pos[c]] = arg
            c += 1
            if c == len(pos):
                # ignore any additional args
                break

        # Loop through kwargs and assign defaults
        for option, default in self.option_defaults.items():
            setattr(self, option, kwargs.get(option, default))

        self.safeMode = kwargs.get('safe_mode', False)
        if self.safeMode and not kwargs.has_key('enable_attributes'):
            # Disable attributes in safeMode when not explicitly set
            self.enable_attributes = False

        self.registeredExtensions = []
        self.docType = ""
        # When True, convert() strips the wrapping <div> from output.
        self.stripTopLevelTags = True

        self.build_parser()

        self.references = {}
        self.htmlStash = util.HtmlStash()
        self.registerExtensions(extensions=kwargs.get('extensions', []),
                                configs=kwargs.get('extension_configs', {}))
        self.set_output_format(kwargs.get('output_format', 'xhtml1'))
        self.reset()

    def build_parser(self):
        """ Build the parser from the various parts. """
        # Order mirrors the processing pipeline described in convert().
        self.preprocessors = build_preprocessors(self)
        self.parser = build_block_parser(self)
        self.inlinePatterns = build_inlinepatterns(self)
        self.treeprocessors = build_treeprocessors(self)
        self.postprocessors = build_postprocessors(self)
        return self

    def registerExtensions(self, extensions, configs):
        """
        Register extensions with this instance of Markdown.

        Keyword arguments:

        * extensions: A list of extensions, which can either
           be strings or objects.  See the docstring on Markdown.
        * configs: A dictionary mapping module names to config options.

        Raises TypeError for any extension that is neither a string, an
        Extension instance, nor None.
        """
        for ext in extensions:
            if isinstance(ext, basestring):
                ext = self.build_extension(ext, configs.get(ext, []))
            if isinstance(ext, Extension):
                ext.extendMarkdown(self, globals())
            elif ext is not None:
                raise TypeError(
                    'Extension "%s.%s" must be of type: "markdown.Extension"'
                    % (ext.__class__.__module__, ext.__class__.__name__))

        return self

    def build_extension(self, ext_name, configs = []):
        """Build extension by name, then return the module.

        The extension name may contain arguments as part of the string in the
        following format: "extname(key1=value1,key2=value2)"

        """

        # Parse extensions config params (ignore the order)
        configs = dict(configs)
        pos = ext_name.find("(") # find the first "("
        if pos > 0:
            ext_args = ext_name[pos+1:-1]
            ext_name = ext_name[:pos]
            pairs = [x.split("=") for x in ext_args.split(",")]
            configs.update([(x.strip(), y.strip()) for (x, y) in pairs])

        # Setup the module name
        module_name = ext_name
        if '.' not in ext_name:
            module_name = '.'.join(['markdown.extensions', ext_name])

        # Try loading the extension first from one place, then another
        try: # New style (markdown.extensions.<extension>)
            module = __import__(module_name, {}, {}, [module_name.rpartition('.')[0]])
        except ImportError:
            module_name_old_style = '_'.join(['mdx', ext_name])
            try: # Old style (mdx_<extension>)
                module = __import__(module_name_old_style)
            except ImportError, e:
                # Prepend context to the error while keeping the traceback.
                message = "Failed loading extension '%s' from '%s' or '%s'" \
                    % (ext_name, module_name, module_name_old_style)
                e.args = (message,) + e.args[1:]
                raise

        # If the module is loaded successfully, we expect it to define a
        # function called makeExtension()
        try:
            return module.makeExtension(configs.items())
        except AttributeError, e:
            message = e.args[0]
            message = "Failed to initiate extension " \
                      "'%s': %s" % (ext_name, message)
            e.args = (message,) + e.args[1:]
            raise

    def registerExtension(self, extension):
        """ This gets called by the extension """
        self.registeredExtensions.append(extension)
        return self

    def reset(self):
        """
        Resets all state variables so that we can start with a new text.
        """
        self.htmlStash.reset()
        self.references.clear()

        # Give extensions a chance to clear their own per-document state.
        for extension in self.registeredExtensions:
            if hasattr(extension, 'reset'):
                extension.reset()

        return self

    def set_output_format(self, format):
        """ Set the output format for the class instance.

        Raises KeyError (with an enriched message) for unknown formats.
        """
        self.output_format = format.lower()
        try:
            self.serializer = self.output_formats[self.output_format]
        except KeyError, e:
            valid_formats = self.output_formats.keys()
            valid_formats.sort()
            message = 'Invalid Output Format: "%s". Use one of %s.' \
                       % (self.output_format,
                          '"' + '", "'.join(valid_formats) + '"')
            e.args = (message,) + e.args[1:]
            raise
        return self

    def convert(self, source):
        """
        Convert markdown to serialized XHTML or HTML.

        Keyword arguments:

        * source: Source text as a Unicode string.

        Markdown processing takes place in five steps:

        1. A bunch of "preprocessors" munge the input text.
        2. BlockParser() parses the high-level structural elements of the
           pre-processed text into an ElementTree.
        3. A bunch of "treeprocessors" are run against the ElementTree. One
           such treeprocessor runs InlinePatterns against the ElementTree,
           detecting inline markup.
        4. Some post-processors are run against the text after the ElementTree
           has been serialized into text.
        5. The output is written to a string.

        """

        # Fixup the source text
        if not source.strip():
            return u"" # a blank unicode string

        try:
            source = unicode(source)
        except UnicodeDecodeError, e:
            # Customise error message while maintaining original traceback
            e.reason += '. -- Note: Markdown only accepts unicode input!'
            raise

        # STX/ETX are reserved as placeholder delimiters, so strip any
        # occurrences from the input; then normalize line endings.
        source = source.replace(util.STX, "").replace(util.ETX, "")
        source = source.replace("\r\n", "\n").replace("\r", "\n") + "\n\n"
        source = re.sub(r'\n\s+\n', '\n\n', source)
        source = source.expandtabs(self.tab_length)

        # Split into lines and run the line preprocessors.
        self.lines = source.split("\n")
        for prep in self.preprocessors.values():
            self.lines = prep.run(self.lines)

        # Parse the high-level elements.
        root = self.parser.parseDocument(self.lines).getroot()

        # Run the tree-processors
        for treeprocessor in self.treeprocessors.values():
            newRoot = treeprocessor.run(root)
            if newRoot:
                root = newRoot

        # Serialize _properly_.  Strip top-level tags.
        output = self.serializer(root)
        if self.stripTopLevelTags:
            try:
                start = output.index('<%s>'%self.doc_tag)+len(self.doc_tag)+2
                end = output.rindex('</%s>'%self.doc_tag)
                output = output[start:end].strip()
            except ValueError:
                if output.strip().endswith('<%s />'%self.doc_tag):
                    # We have an empty document
                    output = ''
                else:
                    # We have a serious problem
                    raise ValueError('Markdown failed to strip top-level tags. Document=%r' % output.strip())

        # Run the text post-processors
        for pp in self.postprocessors.values():
            output = pp.run(output)

        return output.strip()

    def convertFile(self, input=None, output=None, encoding=None):
        """Converts a markdown file and returns the HTML as a unicode string.

        Decodes the file using the provided encoding (defaults to utf-8),
        passes the file content to markdown, and outputs the html to either
        the provided stream or the file with provided name, using the same
        encoding as the source file. The 'xmlcharrefreplace' error handler is
        used when encoding the output.

        **Note:** This is the only place that decoding and encoding of unicode
        takes place in Python-Markdown.  (All other code is unicode-in /
        unicode-out.)

        Keyword arguments:

        * input: File object or path. Reads from stdin if `None`.
        * output: File object or path. Writes to stdout if `None`.
        * encoding: Encoding of input and output files. Defaults to utf-8.

        """

        encoding = encoding or "utf-8"

        # Read the source
        if input:
            if isinstance(input, str):
                input_file = codecs.open(input, mode="r", encoding=encoding)
            else:
                input_file = codecs.getreader(encoding)(input)
            text = input_file.read()
            input_file.close()
        else:
            text = sys.stdin.read()
            if not isinstance(text, unicode):
                text = text.decode(encoding)

        # NOTE(review): in Python 2 a plain '\ufeff' literal is NOT a BOM
        # escape (that needs u'\ufeff'), so this may strip the wrong
        # characters -- confirm against upstream.
        text = text.lstrip('\ufeff') # remove the byte-order mark

        # Convert
        html = self.convert(text)

        # Write to file or stdout
        if output:
            if isinstance(output, str):
                output_file = codecs.open(output, "w",
                                          encoding=encoding,
                                          errors="xmlcharrefreplace")
                output_file.write(html)
                output_file.close()
            else:
                writer = codecs.getwriter(encoding)
                output_file = writer(output, errors="xmlcharrefreplace")
                output_file.write(html)
                # Don't close here. User may want to write more.
        else:
            # Encode manually and write bytes to stdout.
            html = html.encode(encoding, "xmlcharrefreplace")
            try:
                # Write bytes directly to buffer (Python 3).
                sys.stdout.buffer.write(html)
            except AttributeError:
                # Probably Python 2, which works with bytes by default.
                sys.stdout.write(html)

        return self
391 |
392 |
393 | """
394 | EXPORTED FUNCTIONS
395 | =============================================================================
396 |
397 | Those are the two functions we really mean to export: markdown() and
398 | markdownFromFile().
399 | """
400 |
def markdown(text, *args, **kwargs):
    """Convert a markdown string to HTML and return HTML as a unicode string.

    This is a shortcut function for the `Markdown` class covering the most
    basic use case: a fresh `Markdown` instance is created (loading any
    requested extensions) and immediately run over the given text.

    Keyword arguments:

    * text: Markdown formatted text as Unicode or ASCII string.
    * Any arguments accepted by the Markdown class.

    Returns: An HTML document as a string.

    """
    return Markdown(*args, **kwargs).convert(text)
418 |
419 |
def markdownFromFile(*args, **kwargs):
    """Read markdown code from a file and write it to a file or a stream.

    This is a shortcut function which initializes an instance of Markdown,
    and calls the convertFile method rather than convert.

    Keyword arguments:

    * input: a file name or readable object.
    * output: a file name or writable object.
    * encoding: Encoding of input and output.
    * Any arguments accepted by the Markdown class.

    """
    # For backward compatibility loop through positional args.
    pos = ['input', 'output', 'extensions', 'encoding']
    c = 0
    for arg in args:
        # `in` replaces the deprecated dict.has_key() (removed in Python 3);
        # behavior is identical: explicit keyword arguments take precedence.
        if pos[c] not in kwargs:
            kwargs[pos[c]] = arg
        c += 1
        if c == len(pos):
            # Ignore any additional positional args.
            break

    md = Markdown(**kwargs)
    md.convertFile(kwargs.get('input', None),
                   kwargs.get('output', None),
                   kwargs.get('encoding', None))
448 |
449 |
--------------------------------------------------------------------------------
/markdown/inlinepatterns.py:
--------------------------------------------------------------------------------
1 | """
2 | INLINE PATTERNS
3 | =============================================================================
4 |
5 | Inline patterns such as *emphasis* are handled by means of auxiliary
6 | objects, one per pattern. Pattern objects must be instances of classes
7 | that extend markdown.Pattern. Each pattern object uses a single regular
8 | expression and needs support the following methods:
9 |
10 | pattern.getCompiledRegExp() # returns a regular expression
11 |
12 | pattern.handleMatch(m) # takes a match object and returns
13 | # an ElementTree element or just plain text
14 |
15 | All of python markdown's built-in patterns subclass from Pattern,
16 | but you can add additional patterns that don't.
17 |
18 | Also note that all the regular expressions used by inline must
19 | capture the whole block. For this reason, they all start with
20 | '^(.*)' and end with '(.*)!'. In case with built-in expression
21 | Pattern takes care of adding the "^(.*)" and "(.*)!".
22 |
23 | Finally, the order in which regular expressions are applied is very
24 | important - e.g. if we first replace http://.../ links with tags
25 | and _then_ try to replace inline html, we would end up with a mess.
26 | So, we apply the expressions in the following order:
27 |
28 | * escape and backticks have to go before everything else, so
29 | that we can preempt any markdown patterns by escaping them.
30 |
31 | * then we handle auto-links (must be done before inline html)
32 |
33 | * then we handle inline HTML. At this point we will simply
34 | replace all inline HTML strings with a placeholder and add
35 | the actual HTML to a hash.
36 |
37 | * then inline images (must be done before links)
38 |
39 | * then bracketed links, first regular then reference-style
40 |
41 | * finally we apply strong and emphasis
42 | """
43 |
44 | import util
45 | import odict
46 | import re
47 | from urlparse import urlparse, urlunparse
48 | # If you see an ImportError for htmlentitydefs after using 2to3 to convert for
49 | # use by Python3, then you are probably using the buggy version from Python 3.0.
50 | # We recommend using the tool from Python 3.1 even if you will be running the
51 | # code on Python 3.0. The following line should be converted by the tool to:
52 | # `from html import entities` and later calls to `htmlentitydefs` should be
53 | # changed to call `entities`. Python 3.1's tool does this but 3.0's does not.
54 | import htmlentitydefs
55 |
56 |
def build_inlinepatterns(md_instance, **kwargs):
    """ Build the default set of inline patterns for Markdown.

    Returned as an OrderedDict because application ORDER is significant:
    escapes/backticks must run before everything else, references before
    links, autolinks before inline HTML, etc. (see the module docstring).
    """
    inlinePatterns = odict.OrderedDict()
    inlinePatterns["backtick"] = BacktickPattern(BACKTICK_RE)
    inlinePatterns["escape"] = EscapePattern(ESCAPE_RE, md_instance)
    inlinePatterns["reference"] = ReferencePattern(REFERENCE_RE, md_instance)
    inlinePatterns["link"] = LinkPattern(LINK_RE, md_instance)
    inlinePatterns["image_link"] = ImagePattern(IMAGE_LINK_RE, md_instance)
    inlinePatterns["image_reference"] = \
            ImageReferencePattern(IMAGE_REFERENCE_RE, md_instance)
    inlinePatterns["short_reference"] = \
            ReferencePattern(SHORT_REF_RE, md_instance)
    inlinePatterns["autolink"] = AutolinkPattern(AUTOLINK_RE, md_instance)
    inlinePatterns["automail"] = AutomailPattern(AUTOMAIL_RE, md_instance)
    inlinePatterns["linebreak"] = SubstituteTagPattern(LINE_BREAK_RE, 'br')
    # In "escape" safe mode raw HTML is escaped elsewhere, so the html
    # pattern is omitted; entities are always handled.
    if md_instance.safeMode != 'escape':
        inlinePatterns["html"] = HtmlPattern(HTML_RE, md_instance)
    inlinePatterns["entity"] = HtmlPattern(ENTITY_RE, md_instance)
    inlinePatterns["not_strong"] = SimpleTextPattern(NOT_STRONG_RE)
    inlinePatterns["strong_em"] = DoubleTagPattern(STRONG_EM_RE, 'strong,em')
    inlinePatterns["strong"] = SimpleTagPattern(STRONG_RE, 'strong')
    inlinePatterns["emphasis"] = SimpleTagPattern(EMPHASIS_RE, 'em')
    if md_instance.smart_emphasis:
        inlinePatterns["emphasis2"] = SimpleTagPattern(SMART_EMPHASIS_RE, 'em')
    else:
        inlinePatterns["emphasis2"] = SimpleTagPattern(EMPHASIS_2_RE, 'em')
    return inlinePatterns
84 |
85 | """
86 | The actual regular expressions for patterns
87 | -----------------------------------------------------------------------------
88 | """
89 |
90 | NOBRACKET = r'[^\]\[]*'
91 | BRK = ( r'\[('
92 | + (NOBRACKET + r'(\[')*6
93 | + (NOBRACKET+ r'\])*')*6
94 | + NOBRACKET + r')\]' )
95 | NOIMG = r'(?|((?:(?:\(.*?\))|[^\(\)]))*?)\s*((['"])(.*?)\12\s*)?\)'''
106 | # [text](url) or [text]() or [text](url "title")
107 |
108 | IMAGE_LINK_RE = r'\!' + BRK + r'\s*\((<.*?>|([^\)]*))\)'
109 | #  or 
110 | REFERENCE_RE = NOIMG + BRK+ r'\s?\[([^\]]*)\]' # [Google][3]
111 | SHORT_REF_RE = NOIMG + r'\[([^\]]+)\]' # [Google]
112 | IMAGE_REFERENCE_RE = r'\!' + BRK + '\s?\[([^\]]*)\]' # ![alt text][2]
113 | NOT_STRONG_RE = r'((^| )(\*|_)( |$))' # stand-alone * or _
114 | AUTOLINK_RE = r'<((?:[Ff]|[Hh][Tt])[Tt][Pp][Ss]?://[^>]*)>' #
115 | AUTOMAIL_RE = r'<([^> \!]*@[^> ]*)>' #
116 |
117 | HTML_RE = r'(\<([a-zA-Z/][^\>]*?|\!--.*?--)\>)' # <...>
118 | ENTITY_RE = r'(&[\#a-zA-Z0-9]*;)' # &
119 | LINE_BREAK_RE = r' \n' # two spaces at end of line
120 |
121 |
def dequote(string):
    """Strip one pair of matching surrounding quotes, if present.

    Only a matched pair (both double or both single) is removed; mixed or
    absent quotes leave the string untouched.
    """
    for quote in ('"', "'"):
        if string.startswith(quote) and string.endswith(quote):
            return string[1:-1]
    return string
129 |
ATTR_RE = re.compile("\{@([^\}]*)=([^\}]*)}") # {@id=123}

def handleAttributes(text, parent):
    """Set values of an element based on attribute definitions ({@id=123}).

    Each ``{@key=value}`` occurrence in `text` sets the attribute on
    `parent` (newlines in the value become spaces) and is stripped from the
    returned text.
    """
    def attributeCallback(match):
        parent.set(match.group(1), match.group(2).replace('\n', ' '))
        # A callable repl passed to re.sub must return a string; returning
        # None (as before) raised TypeError whenever an attribute matched.
        # Returning '' strips the definition from the text, as documented.
        return ''
    return ATTR_RE.sub(attributeCallback, text)
137 |
138 |
139 | """
140 | The pattern classes
141 | -----------------------------------------------------------------------------
142 | """
143 |
class Pattern:
    """Base class that inline patterns subclass. """

    def __init__(self, pattern, markdown_instance=None):
        """
        Create an instance of an inline pattern.

        Keyword arguments:

        * pattern: A regular expression that matches a pattern

        """
        self.pattern = pattern
        # Wrap the pattern so every match also captures the text before
        # (group 1) and after (last group) the pattern itself.
        self.compiled_re = re.compile("^(.*?)%s(.*?)$" % pattern,
                                      re.DOTALL | re.UNICODE)

        # Api for Markdown to pass safe_mode into instance
        self.safe_mode = False
        if markdown_instance:
            self.markdown = markdown_instance

    def getCompiledRegExp(self):
        """ Return a compiled regular expression. """
        return self.compiled_re

    def handleMatch(self, m):
        """Return a ElementTree element from the given match.

        Subclasses should override this method.

        Keyword arguments:

        * m: A re match object containing a match of the pattern.

        """
        pass

    def type(self):
        """ Return class name, to define pattern type """
        return self.__class__.__name__

    def unescape(self, text):
        """ Return unescaped text given text with an inline placeholder. """
        try:
            stash = self.markdown.treeprocessors['inline'].stashed_nodes
        except KeyError:
            # No inline treeprocessor registered; nothing to unescape.
            return text
        def itertext(el):
            ' Reimplement Element.itertext for older python versions '
            tag = el.tag
            if not isinstance(tag, basestring) and tag is not None:
                # Skip comments / processing instructions (non-string tags).
                return
            if el.text:
                yield el.text
            for e in el:
                for s in itertext(e):
                    yield s
                if e.tail:
                    yield e.tail
        def get_stash(m):
            # NOTE(review): falls through returning None for an unknown id,
            # which re.sub would reject -- presumably ids always resolve;
            # confirm before relying on this path.
            id = m.group(1)
            if id in stash:
                value = stash.get(id)
                if isinstance(value, basestring):
                    return value
                else:
                    # An etree Element - return text content only
                    return ''.join(itertext(value))
        return util.INLINE_PLACEHOLDER_RE.sub(get_stash, text)
213 |
214 |
class SimpleTextPattern(Pattern):
    """Return the plain text captured in group(2) of the pattern.

    A bare inline-placeholder prefix is rejected (``None``) so stash
    placeholders never leak out as literal text.
    """
    def handleMatch(self, m):
        captured = m.group(2)
        return None if captured == util.INLINE_PLACEHOLDER_PREFIX else captured
222 |
223 |
class EscapePattern(Pattern):
    """Resolve a backslash escape.

    Escapable characters become an STX-ord-ETX placeholder; anything else
    keeps its literal backslash.
    """

    def handleMatch(self, m):
        escaped = m.group(2)
        if escaped not in self.markdown.ESCAPED_CHARS:
            return '\\%s' % escaped
        return '%s%s%s' % (util.STX, ord(escaped), util.ETX)
233 |
234 |
class SimpleTagPattern(Pattern):
    """Wrap the text of group(3) in an element of type ``tag``."""

    def __init__(self, pattern, tag):
        Pattern.__init__(self, pattern)
        # Tag name used for every element this pattern produces.
        self.tag = tag

    def handleMatch(self, m):
        node = util.etree.Element(self.tag)
        node.text = m.group(3)
        return node
249 |
250 |
class SubstituteTagPattern(SimpleTagPattern):
    """Replace the whole match with an empty element of type ``tag``."""
    def handleMatch(self, m):
        # The matched text is discarded; only the bare element remains.
        return util.etree.Element(self.tag)
255 |
256 |
class BacktickPattern(Pattern):
    """Wrap the matched (stripped) text in a ``code`` element."""

    def __init__(self, pattern):
        Pattern.__init__(self, pattern)
        self.tag = "code"

    def handleMatch(self, m):
        node = util.etree.Element(self.tag)
        # AtomicString keeps further inline processing out of code spans.
        node.text = util.AtomicString(m.group(3).strip())
        return node
267 |
268 |
class DoubleTagPattern(SimpleTagPattern):
    """Nest group(3)'s text inside ``tag2`` inside ``tag1``.

    ``self.tag`` holds both names as a comma-separated pair (e.g.
    ``"strong,em"``) - useful for combined strong emphasis.
    """

    def handleMatch(self, m):
        outer_tag, inner_tag = self.tag.split(",")
        outer = util.etree.Element(outer_tag)
        inner = util.etree.SubElement(outer, inner_tag)
        inner.text = m.group(3)
        return outer
281 |
282 |
class HtmlPattern(Pattern):
    """ Store raw inline html and return a placeholder. """

    def handleMatch(self, m):
        """Stash the (unescaped) raw HTML and return its placeholder."""
        rawhtml = self.unescape(m.group(2))
        place_holder = self.markdown.htmlStash.store(rawhtml)
        return place_holder

    def unescape(self, text):
        """ Return unescaped text given text with an inline placeholder.

        Unlike ``Pattern.unescape``, stashed etree elements are
        serialized back to raw HTML rather than reduced to text.
        """
        try:
            stash = self.markdown.treeprocessors['inline'].stashed_nodes
        except KeyError:
            # Inline treeprocessor not registered - nothing stashed.
            return text
        def get_stash(m):
            id = m.group(1)
            value = stash.get(id)
            if value is not None:
                try:
                    return self.markdown.serializer(value)
                except Exception:
                    # Was a bare ``except:``; narrowed so SystemExit and
                    # KeyboardInterrupt are no longer swallowed.
                    return r'\%s' % value

        return util.INLINE_PLACEHOLDER_RE.sub(get_stash, text)
306 |
307 |
class LinkPattern(Pattern):
    """ Return a link element from the given match. """
    def handleMatch(self, m):
        # Wrapped-pattern group numbers: 2 = link text, 9 = href
        # (possibly wrapped in <...>), 13 = optional quoted title.
        el = util.etree.Element("a")
        el.text = m.group(2)
        title = m.group(13)
        href = m.group(9)

        if href:
            if href[0] == "<":
                # Strip the angle brackets of a <url> style target.
                href = href[1:-1]
            el.set("href", self.sanitize_url(self.unescape(href.strip())))
        else:
            # Empty target, e.g. ``[text]()``.
            el.set("href", "")

        if title:
            title = dequote(self.unescape(title))
            el.set("title", title)
        return el

    def sanitize_url(self, url):
        """
        Sanitize a url against xss attacks in "safe_mode".

        Rather than specifically blacklisting `javascript:alert("XSS")` and all
        its aliases (see <http://ha.ckers.org/xss.html>), we whitelist known
        safe url formats. Most urls contain a network location, however some
        are known not to (i.e.: mailto links). Script urls do not contain a
        location. Additionally, for `javascript:...`, the scheme would be
        "javascript" but some aliases will appear to `urlparse()` to have no
        scheme. On top of that relative links (i.e.: "foo/bar.html") have no
        scheme. Therefore we must check "path", "parameters", "query" and
        "fragment" for any literal colons. We don't check "scheme" for colons
        because it *should* never have any and "netloc" must allow the form:
        `username:password@host:port`.

        """
        url = url.replace(' ', '%20')
        if not self.markdown.safeMode:
            # Return immediately, bypassing parsing.
            return url

        try:
            # Note: ``url`` is rebound to the 6-tuple from urlparse().
            scheme, netloc, path, params, query, fragment = url = urlparse(url)
        except ValueError:
            # Bad url - so bad it couldn't be parsed.
            return ''

        locless_schemes = ['', 'mailto', 'news']
        if netloc == '' and scheme not in locless_schemes:
            # This fails regardless of anything else.
            # Return immediately to save additional processing
            return ''

        # A literal colon in path/params/query/fragment marks an unsafe url.
        for part in url[2:]:
            if ":" in part:
                # Not a safe url
                return ''

        # Url passes all tests. Return url as-is.
        return urlunparse(url)
369 |
class ImagePattern(LinkPattern):
    """Build an ``img`` element from an inline image match."""

    def handleMatch(self, m):
        el = util.etree.Element("img")
        src_parts = m.group(9).split()
        if not src_parts:
            el.set('src', "")
        else:
            src = src_parts[0]
            if src.startswith("<") and src.endswith(">"):
                # Strip angle brackets of a <url> style src.
                src = src[1:-1]
            el.set('src', self.sanitize_url(self.unescape(src)))
        if len(src_parts) > 1:
            # Everything after the src is treated as a (quoted) title.
            el.set('title', dequote(self.unescape(" ".join(src_parts[1:]))))

        if self.markdown.enable_attributes:
            # May strip {@key=value} definitions out of the alt text.
            truealt = handleAttributes(m.group(2), el)
        else:
            truealt = m.group(2)

        el.set('alt', self.unescape(truealt))
        return el
392 |
class ReferencePattern(LinkPattern):
    """ Match to a stored reference and return link element. """

    NEWLINE_CLEANUP_RE = re.compile(r'[ ]?\n', re.MULTILINE)

    def handleMatch(self, m):
        """Resolve the reference id; return ``None`` if it is undefined."""
        try:
            id = m.group(9).lower()
        except IndexError:
            id = None
        if not id:
            # "[Google][]" or "[Google]": the link text doubles as the id.
            id = m.group(2).lower()

        # Collapse line breaks (and a preceding space) inside the id.
        id = self.NEWLINE_CLEANUP_RE.sub(' ', id)
        if id not in self.markdown.references:
            # Undefined reference - leave the source text untouched.
            return None
        href, title = self.markdown.references[id]
        return self.makeTag(href, title, m.group(2))

    def makeTag(self, href, title, text):
        """Build the ``a`` element for a resolved reference."""
        el = util.etree.Element('a')
        el.set('href', self.sanitize_url(href))
        if title:
            el.set('title', title)
        el.text = text
        return el
426 |
427 |
class ImageReferencePattern(ReferencePattern):
    """ Match to a stored reference and return img element. """

    def makeTag(self, href, title, text):
        """Build the ``img`` element for a resolved image reference."""
        img = util.etree.Element("img")
        img.set("src", self.sanitize_url(href))
        if title:
            img.set("title", title)
        img.set("alt", self.unescape(text))
        return img
437 |
438 |
class AutolinkPattern(Pattern):
    """Turn an autolink (``<http://example.com>``) into an ``a`` element."""

    def handleMatch(self, m):
        url = m.group(2)
        link = util.etree.Element("a")
        link.set('href', self.unescape(url))
        # AtomicString: the visible url text must not be re-processed.
        link.text = util.AtomicString(url)
        return link
446 |
class AutomailPattern(Pattern):
    """Turn an automail link (``<foo@bar.com>``) into an obfuscated
    mailto link.

    Every character of both the visible address and the ``mailto:`` href
    is emitted as an HTML entity (named where one exists, numeric
    otherwise) to make address harvesting harder.
    """

    def handleMatch(self, m):
        el = util.etree.Element('a')
        email = self.unescape(m.group(2))
        if email.startswith("mailto:"):
            email = email[len("mailto:"):]

        def entity(code):
            """Entity reference for a codepoint: named if defined, else numeric."""
            name = htmlentitydefs.codepoint2name.get(code)
            if name is not None:
                return "%s%s;" % (util.AMP_SUBSTITUTE, name)
            return "%s#%d;" % (util.AMP_SUBSTITUTE, code)

        el.text = util.AtomicString(
            ''.join(entity(ord(ch)) for ch in email))

        mailto = "mailto:" + email
        el.set('href', ''.join('%s#%d;' % (util.AMP_SUBSTITUTE, ord(ch))
                               for ch in mailto))
        return el
473 |
474 |
--------------------------------------------------------------------------------
/markdown/blockprocessors.py:
--------------------------------------------------------------------------------
1 | """
2 | CORE MARKDOWN BLOCKPARSER
3 | =============================================================================
4 |
5 | This parser handles basic parsing of Markdown blocks. It doesn't concern itself
6 | with inline elements such as **bold** or *italics*, but rather just catches
7 | blocks, lists, quotes, etc.
8 |
9 | The BlockParser is made up of a bunch of BlockProcessors, each handling a
10 | different type of block. Extensions may add/replace/remove BlockProcessors
11 | as they need to alter how markdown blocks are parsed.
12 |
13 | """
14 |
15 | import logging
16 | import re
17 | import util
18 | from blockparser import BlockParser
19 |
20 | logger = logging.getLogger('MARKDOWN')
21 |
22 |
def build_block_parser(md_instance, **kwargs):
    """ Build the default block parser used by Markdown. """
    parser = BlockParser(md_instance)
    # Registration order matters: the parser tries processors in the
    # order they were registered.
    default_processors = [
        ('empty', EmptyBlockProcessor),
        ('indent', ListIndentProcessor),
        ('code', CodeBlockProcessor),
        ('hashheader', HashHeaderProcessor),
        ('setextheader', SetextHeaderProcessor),
        ('hr', HRProcessor),
        ('olist', OListProcessor),
        ('ulist', UListProcessor),
        ('quote', BlockQuoteProcessor),
        ('paragraph', ParagraphProcessor),
    ]
    for name, proc_class in default_processors:
        parser.blockprocessors[name] = proc_class(parser)
    return parser
37 |
38 |
class BlockProcessor:
    """ Base class for block processors.

    Each subclass provides a ``test`` method (should this processor
    handle the given block?) and a ``run`` method (do the work). The
    parser calls ``test`` on each registered processor in turn and runs
    the first one that returns True.
    """

    def __init__(self, parser):
        self.parser = parser
        self.tab_length = parser.markdown.tab_length

    def lastChild(self, parent):
        """ Return the last child of an etree element, or None. """
        return parent[-1] if len(parent) else None

    def detab(self, text):
        """ Remove a tab from the front of each line of the given text.

        Stops at the first non-blank, non-indented line. Returns a
        ``(detabbed_text, remainder)`` pair.
        """
        tab = ' ' * self.tab_length
        detabbed = []
        all_lines = text.split('\n')
        for line in all_lines:
            if line.startswith(tab):
                detabbed.append(line[self.tab_length:])
            elif not line.strip():
                # Blank lines are kept (as empty) and do not end the run.
                detabbed.append('')
            else:
                break
        return '\n'.join(detabbed), '\n'.join(all_lines[len(detabbed):])

    def looseDetab(self, text, level=1):
        """ Remove ``level`` tabs from indented lines; keep others as-is. """
        indent = ' ' * self.tab_length * level
        return '\n'.join(
            line[len(indent):] if line.startswith(indent) else line
            for line in text.split('\n'))

    def test(self, parent, block):
        """ Test for block type. Must be overridden by subclasses.

        Must return a boolean: True if this processor should handle the
        block. The test may be as simple as ``block.startswith(...)`` or
        a complex regular expression, and may also inspect ``parent``
        (the block type can depend on context, e.g. inside a list).

        Keywords:

        * ``parent``: A etree element which will be the parent of the block.
        * ``block``: A block of text from the source which has been split at
            blank lines.
        """
        pass

    def run(self, parent, blocks):
        """ Run processor. Must be overridden by subclasses.

        Both ``parent`` and ``blocks`` are edited in place: add
        SubElements/text to ``parent``, and ``pop``/``insert`` items on
        ``blocks`` - there is no mechanism to return replacements.

        Keywords:

        * ``parent``: A etree element which is the parent of the current block.
        * ``blocks``: A list of all remaining blocks of the document.
        """
        pass
125 |
126 |
class ListIndentProcessor(BlockProcessor):
    """ Process children of list items.

    Example:
        * a list item
            process this part

            or this part

    """

    ITEM_TYPES = ['li']        # tags that act as list items
    LIST_TYPES = ['ul', 'ol']  # tags that act as lists

    def __init__(self, *args):
        BlockProcessor.__init__(self, *args)
        # One or more full tabs of leading spaces; group(1) is the run.
        self.INDENT_RE = re.compile(r'^(([ ]{%s})+)'% self.tab_length)

    def test(self, parent, block):
        # Indented block, not already detabbed, whose parent is a list
        # item - or whose parent's last child is a list.
        return block.startswith(' '*self.tab_length) and \
               not self.parser.state.isstate('detabbed') and \
               (parent.tag in self.ITEM_TYPES or \
                   (len(parent) and parent[-1] and \
                       (parent[-1].tag in self.LIST_TYPES)
                   )
               )

    def run(self, parent, blocks):
        block = blocks.pop(0)
        level, sibling = self.get_level(parent, block)
        block = self.looseDetab(block, level)

        # Guard against re-entering this processor for the same text.
        self.parser.state.set('detabbed')
        if parent.tag in self.ITEM_TYPES:
            # It's possible that this parent has a 'ul' or 'ol' child list
            # with a member. If that is the case, then that should be the
            # parent. This is intended to catch the edge case of an indented
            # list whose first member was parsed previous to this point
            # see OListProcessor
            if len(parent) and parent[-1].tag in self.LIST_TYPES:
                self.parser.parseBlocks(parent[-1], [block])
            else:
                # The parent is already a li. Just parse the child block.
                self.parser.parseBlocks(parent, [block])
        elif sibling.tag in self.ITEM_TYPES:
            # The sibling is a li. Use it as parent.
            self.parser.parseBlocks(sibling, [block])
        elif len(sibling) and sibling[-1].tag in self.ITEM_TYPES:
            # The parent is a list (``ol`` or ``ul``) which has children.
            # Assume the last child li is the parent of this block.
            if sibling[-1].text:
                # If the parent li has text, that text needs to be moved to a p
                # The p must be 'inserted' at beginning of list in the event
                # that other children already exist i.e.; a nested sublist.
                p = util.etree.Element('p')
                p.text = sibling[-1].text
                sibling[-1].text = ''
                sibling[-1].insert(0, p)
            self.parser.parseChunk(sibling[-1], block)
        else:
            self.create_item(sibling, block)
        self.parser.state.reset()

    def create_item(self, parent, block):
        """ Create a new li and parse the block with it as the parent. """
        li = util.etree.SubElement(parent, 'li')
        self.parser.parseBlocks(li, [block])

    def get_level(self, parent, block):
        """ Get level of indent based on list level.

        Returns ``(level, parent)`` where ``parent`` is the list/item
        element matching that indent depth.
        """
        # Get indent level (Python 2 integer division: whole tabs only).
        m = self.INDENT_RE.match(block)
        if m:
            indent_level = len(m.group(1))/self.tab_length
        else:
            indent_level = 0
        if self.parser.state.isstate('list'):
            # We're in a tightlist - so we already are at correct parent.
            level = 1
        else:
            # We're in a looselist - so we need to find parent.
            level = 0
        # Step through children of tree to find matching indent level.
        while indent_level > level:
            child = self.lastChild(parent)
            if child and (child.tag in self.LIST_TYPES or child.tag in self.ITEM_TYPES):
                if child.tag in self.LIST_TYPES:
                    # Only descending into a list raises the level; an li
                    # is descended into without counting.
                    level += 1
                parent = child
            else:
                # No more child levels. If we're short of indent_level,
                # we have a code block. So we stop here.
                break
        return level, parent
221 |
222 |
class CodeBlockProcessor(BlockProcessor):
    """ Process code blocks (blocks indented by one tab width). """

    def test(self, parent, block):
        return block.startswith(' ' * self.tab_length)

    def run(self, parent, blocks):
        sibling = self.lastChild(parent)
        block = blocks.pop(0)
        theRest = ''
        continues_code = (sibling is not None and sibling.tag == "pre"
                          and len(sibling) and sibling[0].tag == "code")
        if continues_code:
            # The previous block was a code block. Blank lines do not end
            # a code block, so append to it, restoring the linebreaks
            # that were removed when the source was split into blocks.
            code = sibling[0]
            block, theRest = self.detab(block)
            code.text = util.AtomicString(
                '%s\n%s\n' % (code.text, block.rstrip()))
        else:
            # Start a new pre/code pair for this block.
            pre = util.etree.SubElement(parent, 'pre')
            code = util.etree.SubElement(pre, 'code')
            block, theRest = self.detab(block)
            code.text = util.AtomicString('%s\n' % block.rstrip())
        if theRest:
            # Unindented line(s) followed the indented ones; they are not
            # code, so push them back as the next block to process.
            blocks.insert(0, theRest)
252 |
253 |
class BlockQuoteProcessor(BlockProcessor):
    """ Process blockquotes (lines starting with ``>``). """

    RE = re.compile(r'(^|\n)[ ]{0,3}>[ ]?(.*)')

    def test(self, parent, block):
        return bool(self.RE.search(block))

    def run(self, parent, blocks):
        block = blocks.pop(0)
        m = self.RE.search(block)
        if m:
            # Anything before the quote marker is parsed first, recursively.
            self.parser.parseBlocks(parent, [block[:m.start()]])
            # Strip the leading ``> `` from every quoted line.
            quoted = '\n'.join(self.clean(line)
                               for line in block[m.start():].split('\n'))
            sibling = self.lastChild(parent)
            if sibling and sibling.tag == "blockquote":
                # Previous block was a blockquote - continue it.
                quote = sibling
            else:
                # New blockquote: create a fresh parent element.
                quote = util.etree.SubElement(parent, 'blockquote')
            # Recursively parse the quoted text with the blockquote as
            # parent; the 'blockquote' state makes quotes embedded in
            # lists wrap their content in p tags.
            self.parser.state.set('blockquote')
            self.parser.parseChunk(quote, quoted)
            self.parser.state.reset()

    def clean(self, line):
        """ Remove ``>`` from beginning of a line. """
        m = self.RE.match(line)
        if line.strip() == ">":
            return ""
        elif m:
            return m.group(2)
        return line
293 |
class OListProcessor(BlockProcessor):
    """ Process ordered list blocks. """

    TAG = 'ol'
    # Detect an item (``1. item``). ``group(1)`` contains contents of item.
    RE = re.compile(r'^[ ]{0,3}\d+\.[ ]+(.*)')
    # Detect items on secondary lines. They can be of either list type.
    CHILD_RE = re.compile(r'^[ ]{0,3}((\d+\.)|[*+-])[ ]+(.*)')
    # Detect indented (nested) items of either type
    INDENT_RE = re.compile(r'^[ ]{4,7}((\d+\.)|[*+-])[ ]+.*')
    # The integer (as a python string) with which the list starts (default=1)
    # Eg: If the list is initialized as
    #   3. Item
    # The ol tag will get a start="3" attribute
    STARTSWITH = '1'
    # List of allowed sibling tags.
    SIBLING_TAGS = ['ol', 'ul']

    def test(self, parent, block):
        return bool(self.RE.match(block))

    def run(self, parent, blocks):
        # Check for multiple items in one block.
        items = self.get_items(blocks.pop(0))
        sibling = self.lastChild(parent)

        if sibling and sibling.tag in self.SIBLING_TAGS:
            # Previous block was a list item, so set that as parent
            lst = sibling
            # make sure previous item is in a p - if the item has text,
            # then it isn't in a p
            if lst[-1].text:
                # since it's possible there are other children for this
                # sibling, we can't just SubElement the p, we need to
                # insert it as the first item
                p = util.etree.Element('p')
                p.text = lst[-1].text
                lst[-1].text = ''
                lst[-1].insert(0, p)
            # if the last item has a tail, then the tail needs to be put in a p
            # likely only when a header is not followed by a blank line
            lch = self.lastChild(lst[-1])
            if lch is not None and lch.tail:
                p = util.etree.SubElement(lst[-1], 'p')
                p.text = lch.tail.lstrip()
                lch.tail = ''

            # parse first block differently as it gets wrapped in a p.
            li = util.etree.SubElement(lst, 'li')
            self.parser.state.set('looselist')
            firstitem = items.pop(0)
            self.parser.parseBlocks(li, [firstitem])
            self.parser.state.reset()
        elif parent.tag in ['ol', 'ul']:
            # this catches the edge case of a multi-item indented list whose
            # first item is in a blank parent-list item:
            # * * subitem1
            #     * subitem2
            # see also ListIndentProcessor
            lst = parent
        else:
            # This is a new list so create parent with appropriate tag.
            lst = util.etree.SubElement(parent, self.TAG)
            # Check if a custom start integer is set
            if not self.parser.markdown.lazy_ol and self.STARTSWITH !='1':
                lst.attrib['start'] = self.STARTSWITH

        self.parser.state.set('list')
        # Loop through items in block, recursively parsing each with the
        # appropriate parent.
        for item in items:
            if item.startswith(' '*self.tab_length):
                # Item is indented. Parse with last item as parent
                self.parser.parseBlocks(lst[-1], [item])
            else:
                # New item. Create li and parse with it as parent
                li = util.etree.SubElement(lst, 'li')
                self.parser.parseBlocks(li, [item])
        self.parser.state.reset()

    def get_items(self, block):
        """ Break a block into list items. """
        items = []
        for line in block.split('\n'):
            m = self.CHILD_RE.match(line)
            if m:
                # This is a new list item
                # Check first item for the start index
                if not items and self.TAG=='ol':
                    # Detect the integer value of first list item
                    INTEGER_RE = re.compile('(\d+)')
                    self.STARTSWITH = INTEGER_RE.match(m.group(1)).group()
                # Append to the list
                items.append(m.group(3))
            elif self.INDENT_RE.match(line):
                # This is an indented (possibly nested) item.
                if items[-1].startswith(' '*self.tab_length):
                    # Previous item was indented. Append to that item.
                    items[-1] = '%s\n%s' % (items[-1], line)
                else:
                    items.append(line)
            else:
                # This is another line of previous item. Append to that item.
                items[-1] = '%s\n%s' % (items[-1], line)
        return items
399 |
400 |
class UListProcessor(OListProcessor):
    """ Process unordered list blocks. """

    TAG = 'ul'
    # Detect an unordered item (``* item``, ``+ item`` or ``- item``);
    # group(1) contains the item's contents.
    RE = re.compile(r'^[ ]{0,3}[*+-][ ]+(.*)')
406 |
407 |
class HashHeaderProcessor(BlockProcessor):
    """ Process Hash Headers (``# H1`` through ``###### H6``). """

    # Detect a header at start of any line in block.
    # NOTE: the named groups (``level``/``header``) were stripped by the
    # source dump; reconstructed from the ``m.group('level')`` /
    # ``m.group('header')`` uses below. Trailing #'s are discarded.
    RE = re.compile(r'(^|\n)(?P<level>#{1,6})(?P<header>.*?)#*(\n|$)')

    def test(self, parent, block):
        return bool(self.RE.search(block))

    def run(self, parent, blocks):
        block = blocks.pop(0)
        m = self.RE.search(block)
        if m:
            before = block[:m.start()]  # All lines before header
            after = block[m.end():]     # All lines after header
            if before:
                # As the header was not the first line of the block and the
                # lines before the header must be parsed first,
                # recursively parse those lines as a block.
                self.parser.parseBlocks(parent, [before])
            # Create header using named groups from RE
            h = util.etree.SubElement(parent, 'h%d' % len(m.group('level')))
            h.text = m.group('header').strip()
            if after:
                # Insert remaining lines as first block for future parsing.
                blocks.insert(0, after)
        else:
            # This should never happen, but just in case...
            # (``warn`` is a deprecated alias of ``warning``.)
            logger.warning("We've got a problem header: %r" % block)
437 |
438 |
class SetextHeaderProcessor(BlockProcessor):
    """ Process Setext-style Headers (text underlined with ``=`` or ``-``). """

    # Detect Setext-style header. Must be first 2 lines of block.
    RE = re.compile(r'^.*?\n[=-]+[ ]*(\n|$)', re.MULTILINE)

    def test(self, parent, block):
        return bool(self.RE.match(block))

    def run(self, parent, blocks):
        lines = blocks.pop(0).split('\n')
        # ``=`` underlines produce h1; ``-`` underlines produce h2.
        level = 1 if lines[1].startswith('=') else 2
        heading = util.etree.SubElement(parent, 'h%d' % level)
        heading.text = lines[0].strip()
        if len(lines) > 2:
            # Any lines after the underline go back on the block list
            # for later parsing.
            blocks.insert(0, '\n'.join(lines[2:]))
460 |
461 |
class HRProcessor(BlockProcessor):
    """ Process Horizontal Rules. """

    RE = r'^[ ]{0,3}((-+[ ]{0,2}){3,}|(_+[ ]{0,2}){3,}|(\*+[ ]{0,2}){3,})[ ]*'
    # Detect hr on any line of a block.
    SEARCH_RE = re.compile(RE, re.MULTILINE)

    def test(self, parent, block):
        # Python has no atomic grouping, so simulate it for performance:
        # the regex matches only what the atomic group would (the HR);
        # then require that the match ends the block or precedes a newline.
        m = self.SEARCH_RE.search(block)
        if m is None:
            return False
        if m.end() != len(block) and block[m.end()] != '\n':
            return False
        # Stash the match object so ``run`` need not search again.
        self.match = m
        return True

    def run(self, parent, blocks):
        block = blocks.pop(0)
        # Lines before the hr are parsed first, recursively.
        prelines = block[:self.match.start()].rstrip('\n')
        if prelines:
            self.parser.parseBlocks(parent, [prelines])
        # create hr
        util.etree.SubElement(parent, 'hr')
        # Lines after the hr go back on the block list for later parsing.
        postlines = block[self.match.end():].lstrip('\n')
        if postlines:
            blocks.insert(0, postlines)
494 |
495 |
496 |
class EmptyBlockProcessor(BlockProcessor):
    """ Process blocks that start with an empty line. """

    # Detect a block that only contains whitespace
    # or only whitespace on the first line.
    RE = re.compile(r'^\s*\n')

    def test(self, parent, block):
        return bool(self.RE.match(block))

    def run(self, parent, blocks):
        block = blocks.pop(0)
        m = self.RE.match(block)
        if m:
            # Add remaining lines to master blocks for later.
            blocks.insert(0, block[m.end():])
            sibling = self.lastChild(parent)
            # Bug fixes: the original tested truthiness of ``sibling[0]``,
            # which is False for a childless element (a ``code`` element
            # never has children) and raises IndexError on an empty
            # ``pre`` - test ``len(sibling)`` instead. It also appended
            # the literal text ``/n/n/n`` (forward slashes) instead of
            # newlines.
            if sibling is not None and sibling.tag == 'pre' and \
               len(sibling) and sibling[0].tag == 'code':
                # Last block is a codeblock. Append blank lines to it to
                # preserve whitespace between code chunks.
                sibling[0].text = util.AtomicString(
                    '%s\n\n\n' % sibling[0].text)
518 |
519 |
class ParagraphProcessor(BlockProcessor):
    """ Process Paragraph blocks. """

    def test(self, parent, block):
        # Fallback processor: matches any block that reaches it.
        return True

    def run(self, parent, blocks):
        block = blocks.pop(0)
        if block.strip():
            # Not a blank block. Add to parent, otherwise throw it away.
            if self.parser.state.isstate('list'):
                # The parent is a tight-list.
                #
                # Check for any children. This will likely only happen in a
                # tight-list when a header isn't followed by a blank line.
                # For example:
                #
                #     * # Header
                #     Line 2 of list item - not part of header.
                sibling = self.lastChild(parent)
                if sibling is not None:
                    # Insert after sibling (as its tail text).
                    if sibling.tail:
                        sibling.tail = '%s\n%s' % (sibling.tail, block)
                    else:
                        sibling.tail = '\n%s' % block
                else:
                    # Append to parent.text
                    if parent.text:
                        parent.text = '%s\n%s' % (parent.text, block)
                    else:
                        parent.text = block.lstrip()
            else:
                # Create a regular paragraph
                p = util.etree.SubElement(parent, 'p')
                p.text = block.lstrip()
556 |
--------------------------------------------------------------------------------