├── .gitignore
├── .hgignore
├── .hgtags
├── LICENSE
├── MANIFEST.in
├── README
├── bin
└── html2rest
├── html2rest.py
├── pkg
└── setup.py
/.gitignore:
--------------------------------------------------------------------------------
1 | *.pyc
2 | *.swp
3 | build/
4 | pip-log.txt
5 | *~
6 | dist/
7 | *.egg-info/
8 | example.db
9 | demo/media/
10 |
11 |
12 |
--------------------------------------------------------------------------------
/.hgignore:
--------------------------------------------------------------------------------
1 |
2 | syntax: glob
3 |
4 | *.pyc
5 | *~
6 | *.swp
7 | *.tmp
8 | tests/out
9 | dist/*
10 | *.egg-info*
11 | *bak
12 | tmp/*
13 | *.orig
14 |
15 |
16 |
--------------------------------------------------------------------------------
/.hgtags:
--------------------------------------------------------------------------------
1 | daf39519f7373ff77a22407532dbb6f3d7f76d45 0.2
2 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Copyright (c) 2006-2011 Gerard Flanagan
2 | All rights reserved.
3 |
4 | Redistribution and use in source and binary forms, with or without modification,
5 | are permitted provided that the following conditions are met:
6 |
7 | 1. Redistributions of source code must retain the above copyright notice,
8 | this list of conditions and the following disclaimer.
9 |
10 | 2. Redistributions in binary form must reproduce the above copyright
11 | notice, this list of conditions and the following disclaimer in the
12 | documentation and/or other materials provided with the distribution.
13 |
14 | 3. Neither the name of html2rest nor the names of its contributors may be
15 | used to endorse or promote products derived from this software without
16 | specific prior written permission.
17 |
18 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
19 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
20 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
21 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
22 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
23 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
24 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
25 | ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
27 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 |
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 |
2 | include bin/*
3 | include LICENSE
4 |
5 |
--------------------------------------------------------------------------------
/README:
--------------------------------------------------------------------------------
1 |
2 | html2rest.py
3 | ============
4 |
5 | Convert HTML to reStructuredText. Very limited, but intended as a "50%" tool, to be
6 | followed by manual editing.
7 |
8 | Install
9 | #######
10 |
11 | ::
12 |
13 |     easy_install html2rest
14 |
15 | Or::
16 |
17 |     pip install html2rest
18 |
19 | Usage
20 | #####
21 |
22 | From the command line::
23 |
24 |     html2rest http://sphinx.pocoo.org/templating.html > templating.rst
25 |
26 | Or programmatically::
27 |
28 |     from html2rest import html2rest
29 |
30 |     stream = StringIO()
31 |
32 |     html2rest('<html><body><p>Hello</p></body></html>', writer=stream)
33 |
34 | Specify input encoding (default is 'utf8') and a preprocessor::
35 |
36 |     def strip_chars(html, encoding='utf8'):
37 |         return html.replace('¶', '')
38 |
39 |     html2rest(html, writer=stream, encoding='latin1', preprocess=strip_chars)
40 |
41 |
--------------------------------------------------------------------------------
/bin/html2rest:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | import sys
4 | import urllib
5 | import codecs
6 | import locale
7 |
8 | from html2rest import html2rest
9 |
# Command-line entry point: html2rest [URL-or-path [encoding]]
#
# With no arguments the HTML is read from standard input using the locale's
# preferred encoding. Otherwise the first argument is opened as a URL when it
# contains '://' and as a local file if not; the optional second argument is
# the input encoding (default 'utf8').
fileobj = None
args = sys.argv[1:]
if args:
    arg = args[0]
    is_url = '://' in arg
    if is_url:
        fileobj = urllib.urlopen(arg)
    else:
        fileobj = open(arg, 'rb')
    if len(args) > 1:
        encoding = args[1]
    else:
        encoding = 'utf8'
    if is_url:
        # Relative links are resolved against the URL with its last path
        # segment removed (a single trailing '/' is dropped first).
        if arg[-1] == '/':
            arg = arg[:-1]
        relto = arg.rpartition('/')[0]
    else:
        # A local path has no scheme or host; passing it as `relto` would make
        # html2rest() prefix every relative link with a bogus '://'.
        relto = None
else:
    fileobj = sys.stdin
    encoding = locale.getpreferredencoding() or 'utf-8'
    relto = None
try:
    html2rest(fileobj.read(), encoding=encoding, relto=relto)
finally:
    # Closing is best-effort, but must not hide KeyboardInterrupt/SystemExit
    # the way a bare ``except:`` would.
    try:
        fileobj.close()
    except Exception:
        pass
36 |
37 |
--------------------------------------------------------------------------------
/html2rest.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | #-----------------------------------------------------------------------------
3 | # Copyright (c) 2006-2011 Gerard Flanagan
4 | #
5 | # Permission is hereby granted, free of charge, to any person obtaining
6 | # a copy of this software and associated documentation files (the "Software"),
7 | # to deal in the Software without restriction, including without limitation
8 | # the rights to use, copy, modify, merge, publish, distribute, sublicense,
9 | # and/or sell copies of the Software, and to permit persons to whom the
10 | # Software is furnished to do so, subject to the following conditions:
11 | #
12 | # The above copyright notice and this permission notice shall be included
13 | # in all copies or substantial portions of the Software.
14 | #
15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21 | # THE SOFTWARE.
22 | #-----------------------------------------------------------------------------
23 |
24 | __version__ = '0.2.2'
25 |
26 | import sys
27 | import os
28 | import re
29 | from sgmllib import SGMLParser
30 | from StringIO import StringIO
31 | from textwrap import TextWrapper
32 | from urllib2 import urlparse
33 |
# Text written to open a literal block when a <pre> element starts.
CODEBLOCK = '::'
# Block-level tags; only referenced from disabled code in this module.
BLOCKTAGS = ['div', 'blockquote']
# Character data inside these tags is discarded.
IGNORETAGS = ['title', 'style', 'script']
# Underline characters for headings: index 0 is used for <h1>, 1 for <h2>, ...
UNDERLINES = ['=', '-', '~', '`', '+', ';']
38 |
39 | # Fredrik Lundh, http://effbot.org/zone/re-sub.html
def unescape(text, to_encoding='utf8'):
    """Replace HTML character references in *text* and encode the result.

    Numeric references (``&#65;``, ``&#x41;``) and named entities
    (``&amp;``) are replaced by the characters they denote. Anything that
    cannot be resolved (unknown name, malformed or out-of-range number) is
    left exactly as written.

    :param text: text possibly containing character references.
    :param to_encoding: codec used to encode the result (default ``'utf8'``).
    :returns: the unescaped text encoded with *to_encoding*.
    """
    # Resolve the version-specific names once, outside the per-match callback.
    try:
        to_char = unichr  # Python 2
    except NameError:
        to_char = chr  # Python 3
    try:
        from htmlentitydefs import name2codepoint  # Python 2
    except ImportError:
        from html.entities import name2codepoint  # Python 3

    def fixup(m):
        text = m.group(0)
        if text[:2] == "&#":
            # character reference
            try:
                if text[:3].lower() == "&#x":
                    return to_char(int(text[3:-1], 16))
                return to_char(int(text[2:-1]))
            except (ValueError, OverflowError):
                # not a number, or not a valid code point
                pass
        else:
            # named entity
            try:
                return to_char(name2codepoint[text[1:-1]])
            except KeyError:
                pass
        return text  # leave as is

    return re.sub(r"&#?\w+;", fixup, text).encode(to_encoding)
61 |
try:
    from BeautifulSoup import BeautifulSoup
except ImportError:
    # BeautifulSoup is optional. When it cannot be imported, this stand-in
    # returns the markup it is given unchanged and ignores all other
    # arguments.
    def BeautifulSoup(text, *args, **kw):
        return text
67 |
def readsoup(html, convert='html', encoding='utf8'):
    """Run *html* through ``BeautifulSoup`` and return ``str()`` of the result.

    *convert* is passed on as ``convertEntities`` and *encoding* as
    ``fromEncoding``.
    """
    # Disabled <br> normalisation (tag spellings reconstructed):
    #for br in ['<br>', '<br/>', '<br />']:
    #    text = text.replace(br, '\n')
    #    text = text.replace(br.upper(), '\n')
    soup = BeautifulSoup(html, convertEntities=convert, fromEncoding=encoding)
    return str(soup)
74 |
def html2rest(html, writer=sys.stdout, encoding='utf8', relto=None, preprocess=None):
    """Convert *html* to reStructuredText and write the result to *writer*.

    :param html: the HTML document, as a byte string in *encoding* or as
        already-decoded text.
    :param writer: object with a ``write`` method that receives the output.
    :param encoding: encoding of *html*; also handed to the parser, which
        uses it to encode the output.
    :param relto: optional base URL. When given, the parser receives its
        ``scheme://host`` part and its full path (with a trailing ``/``) so
        that it can rewrite relative links.
    :param preprocess: optional callable invoked as
        ``preprocess(html, encoding=encoding)`` before parsing; its return
        value replaces *html*.
    """
    relroot = relpath = None
    if relto:
        parsed = urlparse.urlparse(relto)
        relroot = parsed.scheme + '://' + parsed.netloc
        relpath = relroot + parsed.path
        if relpath[-1] != '/':
            relpath += '/'
    if preprocess:
        html = preprocess(html, encoding=encoding)
    parser = Parser(writer, encoding, relroot, relpath)
    # Decode byte strings only: calling .decode() on text that is already
    # unicode makes Python 2 encode it to ASCII first, which fails for any
    # non-ASCII character.
    if isinstance(html, bytes):
        html = html.decode(encoding)
    parser.feed(html)
    parser.close()
89 |
class LineBuffer(object):
    """A list of output lines with helpers for wrapping and indenting."""

    def __init__(self):
        self._wrapper = TextWrapper()
        self._lines = []

    def __len__(self):
        """Return the number of buffered lines."""
        return len(self._lines)

    def __getitem__(self, i):
        """Return line *i*."""
        return self._lines[i]

    def __setitem__(self, i, value):
        """Replace line *i* with *value*."""
        self._lines[i] = value

    def clear(self):
        """Remove every line, keeping the same underlying list object."""
        del self._lines[:]

    def read(self):
        """Return all lines joined with newlines."""
        return '\n'.join(self._lines)

    def write(self, s):
        """Collapse whitespace runs in *s*, wrap it and append the lines."""
        collapsed = ' '.join(s.split())
        wrapped = self._wrapper.wrap(collapsed)
        self._lines.extend(wrapped)

    def rawwrite(self, s):
        """Append the lines of *s* as they are, without wrapping."""
        self._lines.extend(s.splitlines())

    def indent(self, numspaces=4, start=0):
        """Prefix each line from index *start* onwards with *numspaces* spaces."""
        pad = ' ' * numspaces
        lines = self._lines
        for pos in range(start, len(lines)):
            lines[pos] = pad + lines[pos]

    def lstrip(self):
        """Strip leading whitespace from every line, in place."""
        self._lines[:] = [line.lstrip() for line in self._lines]
131 |
class Parser(SGMLParser):
    """SGML event handler that writes HTML out as reStructuredText.

    Character data is collected in ``stringbuffer``. Finished text is moved
    to ``linebuffer`` and from there written to ``writer``, after character
    references have been unescaped and the text encoded with ``encoding``.
    """

    def __init__(self, writer=sys.stdout, encoding='utf8', relroot=None, relpath=None):
        """Set up the output target and the conversion state.

        :param writer: object with a ``write`` method receiving the output.
        :param encoding: codec used to encode everything that is written.
        :param relroot: prefix for links that start with ``/``.
        :param relpath: prefix for other links that contain no ``://``.
        """
        SGMLParser.__init__(self)
        self.writer = writer
        self.encoding = encoding
        self.relroot = relroot
        self.relpath = relpath
        self.stringbuffer = StringIO()   # text of the line being built
        self.linebuffer = LineBuffer()   # finished lines not yet written
        self.verbatim = False            # True between <pre> and </pre>
        self.lists = []                  # stack of markers: '+ ' or '#. '
        self.ignoredata = False          # True inside IGNORETAGS elements
        self.inblock = 0                 # nesting depth of ul/ol/dl
        self.nobreak = False             # set inside <dl>; not read here
        # Maps href -> link text. The extra key '#pending' holds the href of
        # the <a> element that is currently open.
        self.hrefs = {}

    @staticmethod
    def _heading_level(tag):
        """Return 1-6 for the tags ``h1``-``h6`` and None for anything else."""
        if len(tag) == 2 and tag[0] == 'h' and tag[1] in '123456':
            return int(tag[1])
        return None

    def close(self):
        """Write out whatever is still buffered, then close the parser."""
        self.writeline()
        SGMLParser.close(self)

    def flush(self):
        """Write the buffered lines to the writer and empty the line buffer."""
        if self.linebuffer:
            if self.inblock > 1:
                # nested blocks: 4 extra spaces per level beyond the first
                indent = 4 * (self.inblock - 1)
                self.linebuffer.indent(indent)
            self.writer.write(unescape(self.linebuffer.read(), self.encoding))
            self.linebuffer.clear()

    def flush_stringbuffer(self):
        """Move pending character data into the line buffer.

        The text is appended to the last buffered line if there is one;
        otherwise it is written (and wrapped) as new lines.
        """
        sbuf = self.stringbuffer.getvalue()
        if not sbuf:
            return
        elif self.linebuffer:
            self.linebuffer[-1] += sbuf
        else:
            self.linebuffer.write(sbuf)
        self.clear_stringbuffer()

    def clear_stringbuffer(self):
        """Discard the pending character data."""
        self.stringbuffer.seek(0)
        self.stringbuffer.truncate()

    def data(self, text):
        """Append *text* to the pending character data."""
        self.stringbuffer.write(text)

    def pending(self):
        """Return a true value if any text is buffered and not yet written."""
        return self.stringbuffer.tell() or self.linebuffer

    def write(self, text=''):
        """Flush all buffered text, then write *text* to the writer."""
        self.flush_stringbuffer()
        self.flush()
        # Use the configured encoding, as flush() does; relying on
        # unescape()'s default would mix encodings in the output.
        self.writer.write(unescape(text, self.encoding))

    def writeline(self, text=''):
        """Write *text* followed by a newline."""
        self.write(text + '\n')

    def writestartblock(self, text=''):
        """End the current line if text is pending, then write a blank line
        and *text*."""
        if self.pending():
            self.writeline()
        self.writeline()
        self.writeline(text)

    def writeendblock(self, text=''):
        """Write *text* followed by a blank line."""
        self.writeline(text)
        self.writeline()

    def writeblock(self, text=''):
        """Start a block with *text* and follow it with a blank line."""
        self.writestartblock(text)
        self.writeline()

    def handle_data(self, data):
        """Collect character data, unless inside an ignored element."""
        if self.ignoredata:
            return
        elif self.verbatim:
            self.data(data)
        else:
            if '#pending' in self.hrefs:
                # remember the link text for the target list in end_body()
                self.hrefs[self.hrefs['#pending']] = data
            self.data(' '.join(data.splitlines()))

    def unknown_starttag(self, tag, attrs):
        """Handle start tags that have no dedicated ``start_*`` method."""
        if tag in IGNORETAGS:
            self.ignoredata = True
        elif len(tag) == 2 and tag[0] == 'h':
            # h1-h6; this test also matches other two-letter 'h' tags such
            # as <hr>, which therefore start a new block as well.
            self.writestartblock()
        elif tag == 'br':
            if self.verbatim:
                self.data('\n')
            elif not self.inblock:
                self.writeline()
            else:
                self.data(' ')
        elif not self.verbatim:
            self.data(' ')

    def unknown_endtag(self, tag):
        """Handle end tags that have no dedicated ``end_*`` method.

        For ``h1``-``h6`` the heading text is underlined with the matching
        character from ``UNDERLINES``.
        """
        self.ignoredata = False
        # Only real heading levels are underlined. Matching any two-letter
        # 'h' tag made </hr> fail in int() and h7-h9 index past UNDERLINES.
        level = self._heading_level(tag)
        if level is not None:
            self.flush_stringbuffer()
            if self.linebuffer:
                linebuf = self.linebuffer
                linebuf[-1] = linebuf[-1].strip()
                char = UNDERLINES[level - 1]
                linebuf.write(char * len(linebuf[-1]))
                self.writeline()
        elif not self.verbatim:
            self.data(' ')

    def start_a(self, attrs):
        """Open a hyperlink reference, unless there is no usable ``href``."""
        href = dict(attrs).get('href', None)
        if not href or href.startswith('#'):
            return
        elif self.relroot and self.relpath and 'mailto:' not in href:
            if href.startswith('/'):
                href = self.relroot + href
            elif '://' not in href:
                href = self.relpath + href
        self.data('`')
        self.hrefs['#pending'] = href

    def end_a(self):
        """Close the hyperlink reference opened by ``start_a``, if any."""
        if '#pending' in self.hrefs:
            self.data('`_')
            del self.hrefs['#pending']

    def start_pre(self, attrs):
        """Begin a literal block and switch to verbatim mode."""
        if self.lists:
            self.end_li()
            self.writeline()
        self.verbatim = True
        self.writeblock(CODEBLOCK)

    def end_pre(self):
        """Write the collected verbatim text indented by 4 spaces."""
        sbuf = self.stringbuffer.getvalue()
        if sbuf:
            self.linebuffer.rawwrite(sbuf)
            self.linebuffer.indent(4)
        self.clear_stringbuffer()
        self.writeendblock()
        self.verbatim = False

    def start_ul(self, attrs):
        """Begin a bullet list (marker ``'+ '``)."""
        if self.lists:
            # a nested list ends the text of the enclosing item
            self.end_li()
            self.writeline()
        else:
            self.writeline()
        self.lists.append('+ ')
        self.inblock += 1

    def end_ul(self):
        """Finish the current bullet list."""
        self.end_li()
        self.lists.pop()
        self.inblock -= 1
        if self.inblock:
            self.writeline()
        else:
            self.writeendblock()

    def start_ol(self, attrs):
        """Begin an enumerated list (marker ``'#. '``)."""
        if self.lists:
            # a nested list ends the text of the enclosing item
            self.end_li()
            self.writeline()
        else:
            self.writeline()
        self.lists.append('#. ')
        self.inblock += 1

    def end_ol(self):
        """Finish the current enumerated list."""
        self.end_li()
        self.lists.pop()
        self.inblock -= 1
        if self.inblock:
            self.writeline()
        else:
            self.writeendblock()

    def start_p(self, attrs):
        """Start a paragraph: write a newline in verbatim mode or when not
        inside a list/definition block."""
        if self.verbatim:
            self.writeline()
        elif not self.inblock:
            self.writeline()

    def end_p(self):
        """End a paragraph.

        Inside a block a newline is written only in verbatim mode. Outside
        a block the buffered lines are left-stripped and written out.
        """
        if self.inblock:
            if self.verbatim:
                self.writeline()
            else:
                return
        else:
            self.linebuffer.lstrip()
            self.writeline()

    def start_li(self, attrs):
        """Start a list item with the marker of the innermost list."""
        self.writeline()
        self.data(self.lists[-1])

    def end_li(self):
        """Indent the continuation lines of the current item and write it."""
        self.flush_stringbuffer()
        linebuf = self.linebuffer
        if linebuf and linebuf[0] and linebuf[0].lstrip()[:2] in ['+ ', '#.']:
            # the first line carries the marker; indent the lines after it
            start = 1
        else:
            # the start of the <li> has already been written, perhaps because
            # there was a <pre> block
            start = 0
        self.linebuffer.indent(len(self.lists[-1]), start=start)
        self.write()

    def start_dl(self, attrs):
        """Begin a definition list."""
        self.writeline()
        self.inblock += 1
        self.nobreak = True

    def end_dl(self):
        """Finish a definition list."""
        self.nobreak = False
        self.writeline()
        self.inblock -= 1

    def start_dt(self, attrs):
        """Open a definition term; the term is wrapped in colons."""
        self.data(':')

    def end_dt(self):
        """Close a definition term."""
        self.data(':')

    def start_dd(self, attrs):
        """Separate the definition from its term with a space."""
        self.data(' ')

    def end_dd(self):
        """Write the definition, indenting lines after the first by 2 spaces."""
        self.flush_stringbuffer()
        self.linebuffer.indent(2, start=1)
        self.writeline()

    def start_em(self, attrs):
        """Open emphasis."""
        self.data(' *')

    def end_em(self):
        """Close emphasis."""
        self.data('*')

    def start_b(self, attrs):
        """Open strong emphasis."""
        self.data(' **')

    def end_b(self):
        """Close strong emphasis."""
        self.data('**')

    def start_code(self, attrs):
        """Open inline code, written as single backquotes."""
        self.data(' `')

    def end_code(self):
        """Close inline code."""
        self.data('`')

    def start_span(self, attrs):
        """Ignore <span>: its content passes through unchanged."""
        pass

    def end_span(self):
        """Ignore </span>."""
        pass

    def start_body(self, attrs):
        """Ignore <body>."""
        pass

    def end_body(self):
        """Finish the document and write one link target per collected href."""
        self.end_p()
        for href, link in self.hrefs.items():
            # keys starting with '#' (such as '#pending') are bookkeeping
            if href[0] != '#':
                self.writeline('.. _%s: %s' % (link, href))
        self.end_p()
409 |
--------------------------------------------------------------------------------
/pkg:
--------------------------------------------------------------------------------
#!/bin/sh

# Build the source distributions. They are also registered and uploaded,
# unless html2rest.py has a __version__ line ending in dev' or dev".
if grep "__version__.*dev['\"]$" html2rest.py >/dev/null 2>&1; then
    # dev
    python setup.py sdist --formats=gztar,zip
else
    # not dev
    python setup.py sdist --formats=gztar,zip register upload
fi
12 |
13 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import os
3 | from os.path import join as pathjoin, exists as pathexists, dirname, basename, abspath
4 | from distutils.core import setup
5 |
6 | import re
# The release version is read from the ``__version__ = '...'`` line of
# html2rest.py, so it is defined in one place only.
version_rx = r"^__version__ = '(.*)'$"
version_pattern = re.compile(version_rx)

# Directory containing this setup script. Both the version file and the
# command-line script are located relative to it rather than to the current
# working directory.
srcdir = dirname(abspath(__file__))

with open(pathjoin(srcdir, 'html2rest.py')) as fd:
    for line in fd:
        m = version_pattern.match(line)
        if m:
            break
    else:
        # the loop finished without finding a matching line
        raise Exception("couldn't find __version__")

__version__ = m.group(1)

# A single parenthesised argument prints the same text on Python 2.
print("running setup for html2rest version %s" % __version__)


setup(
    name="html2rest",
    version=__version__,
    description="Convert HTML to restructuredText",
    author="Gerard Flanagan",
    author_email="grflanagan@gmail.com",
    classifiers=["Development Status :: 4 - Beta",
                 "Intended Audience :: Developers",
                 "License :: OSI Approved :: BSD License",
                 "Programming Language :: Python",
                 "Topic :: Software Development :: Libraries",
                 "Topic :: Software Development :: Libraries :: Python Modules",
                 ],
    url="https://github.com/podados/python-html2rest",
    license="BSD",
    download_url="http://pypi.python.org/packages/source/h/html2rest/html2rest-%s.tar.gz" % __version__,
    py_modules=['html2rest'],
    scripts=[
        pathjoin(srcdir, 'bin', 'html2rest'),
    ],
)
51 |
52 |
--------------------------------------------------------------------------------