├── .gitignore
├── .hgignore
├── .hgtags
├── LICENSE
├── MANIFEST.in
├── README
├── bin
    └── html2rest
├── html2rest.py
├── pkg
└── setup.py


/.gitignore:
--------------------------------------------------------------------------------
 1 | *.pyc
 2 | *.swp
 3 | build/
 4 | pip-log.txt
 5 | *~
 6 | dist/
 7 | *.egg-info/
 8 | example.db
 9 | demo/media/
10 | 
11 | 
12 | 


--------------------------------------------------------------------------------
/.hgignore:
--------------------------------------------------------------------------------
 1 | 
 2 | syntax: glob
 3 | 
 4 | *.pyc
 5 | *~
 6 | *.swp
 7 | *.tmp
 8 | tests/out
 9 | dist/*
10 | *.egg-info*
11 | *bak
12 | tmp/*
13 | *.orig
14 | 
15 | 
16 | 


--------------------------------------------------------------------------------
/.hgtags:
--------------------------------------------------------------------------------
1 | daf39519f7373ff77a22407532dbb6f3d7f76d45 0.2
2 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | Copyright (c) 2006-2011 Gerard Flanagan
 2 | All rights reserved.
 3 | 
 4 | Redistribution and use in source and binary forms, with or without modification,
 5 | are permitted provided that the following conditions are met:
 6 | 
 7 |     1. Redistributions of source code must retain the above copyright notice, 
 8 |        this list of conditions and the following disclaimer.
 9 |     
10 |     2. Redistributions in binary form must reproduce the above copyright 
11 |        notice, this list of conditions and the following disclaimer in the
12 |        documentation and/or other materials provided with the distribution.
13 | 
14 |     3. Neither the name of html2rest nor the names of its contributors may be
15 |        used to endorse or promote products derived from this software without
16 |        specific prior written permission.
17 | 
18 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
19 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
20 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
21 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
22 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
23 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
24 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
25 | ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
27 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 | 


--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | 
2 | include bin/*
3 | include LICENSE
4 | 
5 | 


--------------------------------------------------------------------------------
/README:
--------------------------------------------------------------------------------
 1 | 
 2 | html2rest.py
 3 | ============
 4 | 
 5 | Convert HTML to reStructuredText. Very limited, but intended as a "50%" tool: its
 6 | output is meant to be finished by manual editing.
 7 | 
 8 | Install
 9 | #######
10 | 
11 | ::
12 | 
13 |     easy_install html2rest
14 | 
15 | Or::
16 | 
17 |     pip install html2rest
18 | 
19 | Usage
20 | #####
21 | 
22 | From the command line::
23 | 
24 |     html2rest http://sphinx.pocoo.org/templating.html > templating.rst
25 | 
26 | Or programmatically::
27 | 
28 |     from html2rest import html2rest
29 | 
30 |     stream = StringIO()
31 | 
32 |     html2rest('<ul><li>one</li><li>two</li></ul>', writer=stream)
33 | 
34 | Specify input encoding (default is 'utf8') and a preprocessor, which is called as ``preprocess(html, encoding=encoding)``::
35 | 
36 |     def strip_chars(html, encoding='utf8'):
37 |         return html.replace('¶', '')
38 |     
39 |     html2rest(html, writer=stream, encoding='latin1', preprocess=strip_chars)
40 | 
41 | 


--------------------------------------------------------------------------------
/bin/html2rest:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | import sys
 4 | import urllib
 5 | import codecs
 6 | import locale
 7 | 
 8 | from html2rest import html2rest
 9 | 
10 | fileobj = None
11 | args = sys.argv[1:]
12 | if args:
13 |     arg = args[0]
14 |     if '://' in arg:
15 |         fileobj = urllib.urlopen(arg)
16 |     else:
17 |         fileobj = open(arg, 'rb')
18 |     if len(args) > 1:
19 |         encoding = args[1]
20 |     else:
21 |         encoding = 'utf8'
22 |     if arg[-1] == '/':
23 |         arg = arg[:-1]
24 |     relto = arg.rpartition('/')[0]
25 | else:
26 |     fileobj = sys.stdin
27 |     encoding = locale.getpreferredencoding() or 'utf-8'
28 |     relto = None
29 | try:
30 |     html2rest(fileobj.read(), encoding=encoding, relto=relto)
31 | finally:
32 |     try:
33 |         fileobj.close()
34 |     except:
35 |         pass
36 | 
37 | 


--------------------------------------------------------------------------------
/html2rest.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | #-----------------------------------------------------------------------------
  3 | # Copyright (c) 2006-2011  Gerard Flanagan
  4 | #
  5 | # Permission is hereby granted, free of charge, to any person obtaining
  6 | # a copy of this software and associated documentation files (the "Software"),
  7 | # to deal in the Software without restriction, including without limitation
  8 | # the rights to use, copy, modify, merge, publish, distribute, sublicense,
  9 | # and/or sell copies of the Software, and to permit persons to whom the
 10 | # Software is furnished to do so, subject to the following conditions:
 11 | #
 12 | #    The above copyright notice and this permission notice shall be included
 13 | #    in all copies or substantial portions of the Software.
 14 | #
 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 21 | # THE SOFTWARE.
 22 | #-----------------------------------------------------------------------------
 23 | 
 24 | __version__ = '0.2.2'
 25 | 
 26 | import sys
 27 | import os
 28 | import re
 29 | from sgmllib import SGMLParser
 30 | from StringIO import StringIO
 31 | from textwrap import TextWrapper
 32 | from urllib2 import urlparse
 33 | 
 34 | CODEBLOCK = '::'
 35 | BLOCKTAGS = ['div', 'blockquote']
 36 | IGNORETAGS = ['title', 'style', 'script']
 37 | UNDERLINES = list('=-~`+;')
 38 | 
 39 | # Fredrik Lundh, http://effbot.org/zone/re-sub.html
 40 | def unescape(text, to_encoding='utf8'):
 41 |     def fixup(m):
 42 |         text = m.group(0)
 43 |         if text[:2] == "&#":
 44 |             # character reference
 45 |             try:
 46 |                 if text[:3].lower() == "&#x":
 47 |                     return unichr(int(text[3:-1], 16))
 48 |                 else:
 49 |                     return unichr(int(text[2:-1]))
 50 |             except ValueError:
 51 |                 pass
 52 |         else:
 53 |             # named entity
 54 |             import htmlentitydefs
 55 |             try:
 56 |                 text = unichr(htmlentitydefs.name2codepoint[text[1:-1]])
 57 |             except KeyError:
 58 |                 pass
 59 |         return text # leave as is
 60 |     return re.sub("&#?\w+;", fixup, text).encode(to_encoding)
 61 | 
 62 | # BeautifulSoup is optional: without it, fall back to a stand-in that
 63 | # returns the markup unchanged and ignores every other argument.
 64 | try:
 65 |     from BeautifulSoup import BeautifulSoup
 66 | except ImportError:
 67 |     def BeautifulSoup(text, *args, **kw):
 68 |         return text
 67 | 
 68 | def readsoup(html, convert='html', encoding='utf8'):
 69 |     #for br in ['<br>', '<br/>', '<br />']:
 70 |     #    text = text.replace(br, '\n')
 71 |     #    text = text.replace(br.upper(), '\n')
 72 |     return str(BeautifulSoup(html, convertEntities=convert,
 73 |                                             fromEncoding=encoding))
 74 | 
 75 | def html2rest(html, writer=sys.stdout, encoding='utf8', relto=None, preprocess=None):
 76 |     relroot = relpath = None
 77 |     if relto:
 78 |         parsed = urlparse.urlparse(relto)
 79 |         relroot = parsed.scheme + '://' + parsed.netloc
 80 |         relpath = relroot + parsed.path
 81 |         if relpath[-1] != '/':
 82 |             relpath += '/'
 83 |     if preprocess:
 84 |         html = preprocess(html, encoding=encoding)
 85 |     parser = Parser(writer, encoding, relroot, relpath)
 86 |     #parser.feed(readsoup(html))
 87 |     parser.feed(html.decode(encoding))
 88 |     parser.close()
 89 | 
 90 | class LineBuffer(object):
 91 | 
 92 |     def __init__(self):
 93 |         self._lines = []
 94 |         self._wrapper = TextWrapper()
 95 | 
 96 |     def __len__(self):
 97 |         return len(self._lines)
 98 | 
 99 |     def __getitem__(self, i):
100 |         return self._lines[i]
101 | 
102 |     def __setitem__(self, i, value):
103 |         self._lines[i] = value
104 | 
105 |     def clear(self):
106 |         self._lines[:] = []
107 | 
108 |     def read(self):
109 |         return '\n'.join(self._lines)
110 | 
111 |     def write(self, s):
112 |         #normalise whitespace
113 |         s = ' '.join(s.split())
114 |         self._lines.extend(self._wrapper.wrap(s))
115 | 
116 |     def rawwrite(self, s):
117 |         self._lines.extend(s.splitlines())
118 | 
119 |     def indent(self, numspaces=4, start=0):
120 |         linebuf = self._lines
121 |         n = len(linebuf)
122 |         if n > start:
123 |             indent = ' ' * numspaces
124 |             for i in range(start, n):
125 |                 linebuf[i] = indent + linebuf[i]
126 | 
127 |     def lstrip(self):
128 |         linebuf = self._lines
129 |         for i in range(len(linebuf)):
130 |             linebuf[i] = linebuf[i].lstrip()
131 | 
132 | class Parser(SGMLParser):
133 | 
134 |     def __init__(self, writer=sys.stdout, encoding='utf8', relroot=None, relpath=None):
135 |         SGMLParser.__init__(self)
136 |         self.writer = writer
137 |         self.encoding = encoding
138 |         self.relroot = relroot
139 |         self.relpath = relpath
140 |         self.stringbuffer = StringIO()
141 |         self.linebuffer = LineBuffer()
142 |         self.verbatim = False
143 |         self.lists = []
144 |         self.ignoredata = False
145 |         self.inblock = 0
146 |         self.nobreak = False
147 |         self.hrefs = {}
148 | 
149 |     def close(self):
150 |         self.writeline()
151 |         SGMLParser.close(self)
152 | 
153 |     def flush(self):
154 |         if self.linebuffer:
155 |             if self.inblock > 1:
156 |                 indent = 4 * (self.inblock - 1)
157 |                 self.linebuffer.indent(indent)
158 |             self.writer.write(unescape(self.linebuffer.read(), self.encoding))
159 |             self.linebuffer.clear()
160 | 
161 |     def flush_stringbuffer(self):
162 |         sbuf = self.stringbuffer.getvalue()
163 |         if not sbuf:
164 |             return
165 |         elif self.linebuffer:
166 |             self.linebuffer[-1] += sbuf
167 |         else:
168 |             self.linebuffer.write(sbuf)
169 |         self.clear_stringbuffer()
170 | 
171 |     def clear_stringbuffer(self):
172 |         #self.stringbuffer.reset()
173 |         self.stringbuffer.seek(0)
174 |         self.stringbuffer.truncate()
175 | 
176 |     def data(self, text):
177 |         self.stringbuffer.write(text)
178 | 
179 |     def pending(self):
180 |         return self.stringbuffer.tell() or self.linebuffer
181 | 
182 |     def write(self, text=''):
183 |         self.flush_stringbuffer()
184 |         self.flush()
185 |         self.writer.write(unescape(text))
186 | 
187 |     def writeline(self, text=''):
188 |         self.write(text + '\n')
189 | 
190 |     def writestartblock(self, text=''):
191 |         if self.pending():
192 |             self.writeline()
193 |         self.writeline()
194 |         self.writeline(text)
195 | 
196 |     def writeendblock(self, text=''):
197 |         self.writeline(text)
198 |         self.writeline()
199 | 
200 |     def writeblock(self, text=''):
201 |         self.writestartblock(text)
202 |         self.writeline()
203 | 
204 |     def handle_data(self, data):
205 |         if self.ignoredata:
206 |             return
207 |         elif self.verbatim:
208 |             self.data(data)
209 |         else:
210 |             if '#pending' in self.hrefs:
211 |                 self.hrefs[self.hrefs['#pending']] = data
212 |             self.data(' '.join(data.splitlines()))
213 | 
214 |     def unknown_starttag(self, tag, attrs):
215 |         if tag in IGNORETAGS:
216 |             self.ignoredata = True
217 |         elif len(tag) == 2 and tag[0] == 'h':
218 |             self.writestartblock()
219 |         elif tag == 'br':
220 |             if self.verbatim:
221 |                 self.data('\n')
222 |             elif not self.inblock:
223 |                 self.writeline()
224 |             else:
225 |                 self.data(' ')
226 |         elif not self.verbatim:
227 |             self.data(' ')
228 | 
229 |     def unknown_endtag(self, tag):
230 |         self.ignoredata = False
231 |         if len(tag) == 2 and tag[0] == 'h':
232 |             self.flush_stringbuffer()
233 |             if self.linebuffer:
234 |                 linebuf = self.linebuffer
235 |                 linebuf[-1] = linebuf[-1].strip()
236 |                 char = UNDERLINES[int(tag[1])-1]
237 |                 linebuf.write(char * len(linebuf[-1]))
238 |                 self.writeline()
239 |         #elif tag in BLOCKTAGS and self.pending():
240 |         #    if self.lists:
241 |         #        self.end_li()
242 |         #    else:
243 |         #        self.writeline()
244 |         elif not self.verbatim:
245 |             self.data(' ')
246 | 
247 |     def start_a(self, attrs):
248 |         href = dict(attrs).get('href', None)
249 |         if not href or href.startswith('#'):
250 |             return
251 |         elif self.relroot and self.relpath and 'mailto:' not in href:
252 |             if href.startswith('/'):
253 |                 href = self.relroot + href
254 |             elif '://' not in href:
255 |                 href = self.relpath + href
256 |         self.data('`')
257 |         self.hrefs['#pending'] = href
258 | 
259 |     def end_a(self):
260 |         if '#pending' in self.hrefs:
261 |             self.data('`_')
262 |             del self.hrefs['#pending']
263 | 
264 |     def start_pre(self, attrs):
265 |         if self.lists:
266 |             self.end_li()
267 |             self.writeline()
268 |         #self.inblock += 1
269 |         self.verbatim = True
270 |         self.writeblock(CODEBLOCK)
271 | 
272 |     def end_pre(self):
273 |         sbuf = self.stringbuffer.getvalue()
274 |         if sbuf:
275 |             self.linebuffer.rawwrite(sbuf)
276 |             self.linebuffer.indent(4)
277 |         self.clear_stringbuffer()
278 |         self.writeendblock()
279 |         #self.inblock -= 1
280 |         self.verbatim = False
281 | 
282 |     def start_ul(self, attrs):
283 |         if self.lists:
284 |             self.end_li()
285 |             self.writeline()
286 |         else:
287 |             self.writeline()
288 |         self.lists.append('+ ')
289 |         self.inblock += 1
290 | 
291 |     def end_ul(self):
292 |         self.end_li()
293 |         self.lists.pop()
294 |         self.inblock -= 1
295 |         if self.inblock:
296 |             self.writeline()
297 |         else:
298 |             self.writeendblock()
299 | 
300 |     def start_ol(self, attrs):
301 |         if self.lists:
302 |             self.end_li()
303 |             self.writeline()
304 |         else:
305 |             self.writeline()
306 |         self.lists.append('#. ')
307 |         self.inblock += 1
308 | 
309 |     def end_ol(self):
310 |         self.end_li()
311 |         self.lists.pop()
312 |         self.inblock -= 1
313 |         if self.inblock:
314 |             self.writeline()
315 |         else:
316 |             self.writeendblock()
317 | 
318 |     def start_p(self, attrs):
319 |         if self.verbatim:
320 |             self.writeline()
321 |         elif not self.inblock:
322 |             self.writeline()
323 | 
324 |     def end_p(self):
325 |         if self.inblock:
326 |         #self.flush_stringbuffer()
327 |             if self.verbatim:
328 |                 self.writeline()
329 |             else:
330 |                 return
331 |         else:
332 |             self.linebuffer.lstrip()
333 |             self.writeline()
334 | 
335 |     def start_li(self, attrs):
336 |         self.writeline()
337 |         self.data(self.lists[-1])
338 |     
339 |     def end_li(self):
340 |         self.flush_stringbuffer()
341 |         linebuf = self.linebuffer
342 |         if linebuf and linebuf[0] and linebuf[0].lstrip()[:2] in ['+ ', '#.']:
343 |             start=1
344 |         else:
345 |             # the start of the <li> has already been written, perhaps because
346 |             # there was a <pre> block
347 |             start = 0
348 |         self.linebuffer.indent(len(self.lists[-1]), start=start)
349 |         self.write()
350 | 
351 |     def start_dl(self, attrs):
352 |         self.writeline()
353 |         self.inblock += 1
354 |         self.nobreak = True
355 | 
356 |     def end_dl(self):
357 |         self.nobreak = False
358 |         self.writeline()
359 |         self.inblock -= 1
360 | 
361 |     def start_dt(self, attrs):
362 |         self.data(':')
363 | 
364 |     def end_dt(self):
365 |         self.data(':')
366 | 
367 |     def start_dd(self, attrs):
368 |         self.data(' ')
369 | 
370 |     def end_dd(self):
371 |         self.flush_stringbuffer()
372 |         self.linebuffer.indent(2, start=1)
373 |         self.writeline()
374 | 
375 |     def start_em(self, attrs):
376 |         self.data(' *')
377 | 
378 |     def end_em(self):
379 |         self.data('*')
380 | 
381 |     def start_b(self, attrs):
382 |         self.data(' **')
383 | 
384 |     def end_b(self):
385 |         self.data('**')
386 | 
387 |     def start_code(self, attrs):
388 |         self.data(' `')
389 | 
390 |     def end_code(self):
391 |         self.data('`')
392 | 
393 |     def start_span(self, attrs):
394 |         pass
395 | 
396 |     def end_span(self):
397 |         pass
398 | 
399 |     def start_body(self, attrs):
400 |         pass
401 | 
402 |     def end_body(self):
403 |         self.end_p()
404 |         for href, link in self.hrefs.items():
405 |             if href[0] != '#':
406 |                 self.writeline('.. _%s: %s' % (link, href))
407 |         self.end_p()
408 | 
409 | 


--------------------------------------------------------------------------------
/pkg:
--------------------------------------------------------------------------------
 1 | #!/bin/sh
 2 | 
 3 | # A version line ending in dev plus a quote is only packaged; any other
 4 | # version is also registered and uploaded.
 5 | if grep "__version__.*dev['\"]$" html2rest.py >/dev/null 2>&1; then
 6 |     # dev
 7 |     python setup.py sdist --formats=gztar,zip
 8 | else
 9 |     # not dev
10 |     python setup.py sdist --formats=gztar,zip register upload
11 | fi
12 | 
13 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | import os
 3 | from os.path import join as pathjoin, exists as pathexists, dirname, basename, abspath
 4 | from distutils.core import setup
 5 | 
 6 | import re
 7 | version_rx = r"^__version__ = '(.*)'$"
 8 | version_pattern = re.compile(version_rx)
 9 | 
10 | 
11 | fd = open('html2rest.py')
12 | try:
13 |     for line in fd:
14 |         m = version_pattern.match(line)
15 |         if m:
16 |             break
17 |     else:
18 |         raise Exception("couldn't find __version__")
19 | finally:
20 |     fd.close()
21 | 
22 | __version__ = m.group(1)
23 | 
24 | srcdir = dirname(abspath(__file__))
25 | 
26 | print "running setup for html2rest version %s" % __version__
27 | 
28 | 
29 | 
30 | setup(
31 |         name="html2rest",
32 |         version=__version__,
33 |         description="Convert HTML to restructuredText",
34 |         author="Gerard Flanagan",
35 |         author_email = "grflanagan@gmail.com",
36 |         classifiers=["Development Status :: 4 - Beta",
37 |                     "Intended Audience :: Developers",
38 |                     "License :: OSI Approved :: BSD License",
39 |                     "Programming Language :: Python",
40 |                     "Topic :: Software Development :: Libraries",
41 |                     "Topic :: Software Development :: Libraries :: Python Modules",
42 |                     ],
43 |         url="https://github.com/podados/python-html2rest",
44 |         license="BSD",
45 |         download_url="http://pypi.python.org/packages/source/h/html2rest/html2rest-%s.tar.gz" % __version__,
46 |         py_modules=['html2rest'],
47 |         scripts = [
48 |             pathjoin(srcdir, 'bin', 'html2rest'),
49 |         ],
50 | )
51 |     
52 | 


--------------------------------------------------------------------------------