├── .gitignore ├── .hgignore ├── .hgtags ├── LICENSE ├── MANIFEST.in ├── README ├── bin └── html2rest ├── html2rest.py ├── pkg └── setup.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | *.swp 3 | build/ 4 | pip-log.txt 5 | *~ 6 | dist/ 7 | *.egg-info/ 8 | example.db 9 | demo/media/ 10 | 11 | 12 | -------------------------------------------------------------------------------- /.hgignore: -------------------------------------------------------------------------------- 1 | 2 | syntax: glob 3 | 4 | *.pyc 5 | *~ 6 | *.swp 7 | *.tmp 8 | tests/out 9 | dist/* 10 | *.egg-info* 11 | *bak 12 | tmp/* 13 | *.orig 14 | 15 | 16 | -------------------------------------------------------------------------------- /.hgtags: -------------------------------------------------------------------------------- 1 | daf39519f7373ff77a22407532dbb6f3d7f76d45 0.2 2 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2006-2011 Gerard Flanagan 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without modification, 5 | are permitted provided that the following conditions are met: 6 | 7 | 1. Redistributions of source code must retain the above copyright notice, 8 | this list of conditions and the following disclaimer. 9 | 10 | 2. Redistributions in binary form must reproduce the above copyright 11 | notice, this list of conditions and the following disclaimer in the 12 | documentation and/or other materials provided with the distribution. 13 | 14 | 3. Neither the name of html2rest nor the names of its contributors may be 15 | used to endorse or promote products derived from this software without 16 | specific prior written permission. 
17 | 18 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 19 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 20 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 21 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR 22 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 23 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 24 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON 25 | ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 26 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 27 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 28 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | 2 | include bin/* 3 | include LICENSE 4 | 5 | -------------------------------------------------------------------------------- /README: -------------------------------------------------------------------------------- 1 | 2 | html2rest.py 3 | ============ 4 | 5 | Convert HTML to restructuredText. Very limited, but intended as a "50%" tool, to be 6 | followed by manual editing. 
7 | 8 | Install 9 | ####### 10 | 11 | :: 12 | 13 | easy_install html2rest 14 | 15 | Or:: 16 | 17 | pip install html2rest 18 | 19 | Usage 20 | ##### 21 | 22 | From the command line:: 23 | 24 | html2rest http://sphinx.pocoo.org/templating.html > templating.rst 25 | 26 | Or programmatically:: 27 | 28 | from html2rest import html2rest 29 | 30 | stream = StringIO() 31 | 32 | html2rest('', writer=stream) 33 | 34 | Specify input encoding (default is 'utf8') and a preprocessor:: 35 | 36 | def strip_chars(html): 37 | return html.replace('¶', '') 38 | 39 | html2rest(html, writer=stream, encoding='latin1', preprocess=strip_chars) 40 | 41 | -------------------------------------------------------------------------------- /bin/html2rest: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import sys 4 | import urllib 5 | import codecs 6 | import locale 7 | 8 | from html2rest import html2rest 9 | 10 | fileobj = None 11 | args = sys.argv[1:] 12 | if args: 13 | arg = args[0] 14 | if '://' in arg: 15 | fileobj = urllib.urlopen(arg) 16 | else: 17 | fileobj = open(arg, 'rb') 18 | if len(args) > 1: 19 | encoding = args[1] 20 | else: 21 | encoding = 'utf8' 22 | if arg[-1] == '/': 23 | arg = arg[:-1] 24 | relto = arg.rpartition('/')[0] 25 | else: 26 | fileobj = sys.stdin 27 | encoding = locale.getpreferredencoding() or 'utf-8' 28 | relto = None 29 | try: 30 | html2rest(fileobj.read(), encoding=encoding, relto=relto) 31 | finally: 32 | try: 33 | fileobj.close() 34 | except: 35 | pass 36 | 37 | -------------------------------------------------------------------------------- /html2rest.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | #----------------------------------------------------------------------------- 3 | # Copyright (c) 2006-2011 Gerard Flanagan 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining 6 | # a copy of this 
software and associated documentation files (the "Software"), 7 | # to deal in the Software without restriction, including without limitation 8 | # the rights to use, copy, modify, merge, publish, distribute, sublicense, 9 | # and/or sell copies of the Software, and to permit persons to whom the 10 | # Software is furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included 13 | # in all copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | # THE SOFTWARE. 
#-----------------------------------------------------------------------------
"""Convert HTML to reStructuredText.

A deliberately limited converter: the output is meant to be tidied up by
hand afterwards.  The public entry point is :func:`html2rest`.
"""

__version__ = '0.2.2'

import sys
import os
import re
import inspect
import htmlentitydefs
from sgmllib import SGMLParser
from StringIO import StringIO
from textwrap import TextWrapper
from urllib2 import urlparse

# Marker line written in front of a literal block (see Parser.start_pre).
CODEBLOCK = '::'
# Not referenced by any live code in this module.
BLOCKTAGS = ['div', 'blockquote']
# Character data inside these tags is dropped.
IGNORETAGS = ['title', 'style', 'script']
# Underline character per heading level: UNDERLINES[0] is used for <h1>.
UNDERLINES = list('=-~`+;')

# Maps 'h1' .. 'h6' to the heading level.  Used instead of int(tag[1]) so
# that other two-letter tags starting with 'h' (e.g. 'hr') are not mistaken
# for headings.
_HEADING_LEVELS = dict(('h%d' % n, n) for n in range(1, len(UNDERLINES) + 1))

# Bullet used for an <li> that appears outside any <ul>/<ol>.
_DEFAULT_MARKER = '+ '

_ENTITY_RX = re.compile(r"&#?\w+;")


# Fredrik Lundh, http://effbot.org/zone/re-sub.html
def unescape(text, to_encoding='utf8'):
    """Replace HTML character references and named entities in *text*.

    Numeric references (``&#65;``, ``&#x41;``) and entities listed in
    ``htmlentitydefs.name2codepoint`` are replaced by the corresponding
    character; anything that cannot be resolved is left as it is.

    Returns the result encoded with *to_encoding*.
    """
    def fixup(m):
        text = m.group(0)
        if text[:2] == "&#":
            # character reference
            try:
                if text[:3].lower() == "&#x":
                    return unichr(int(text[3:-1], 16))
                else:
                    return unichr(int(text[2:-1]))
            except (ValueError, OverflowError):
                # not a number, or a code point unichr() cannot represent
                pass
        else:
            # named entity
            try:
                text = unichr(htmlentitydefs.name2codepoint[text[1:-1]])
            except KeyError:
                pass
        return text  # leave as is
    return _ENTITY_RX.sub(fixup, text).encode(to_encoding)


try:
    from BeautifulSoup import BeautifulSoup
except ImportError:
    def BeautifulSoup(text, *args, **kw):
        """Fallback used when BeautifulSoup is not installed: return *text*."""
        return text


def readsoup(html, convert='html', encoding='utf8'):
    """Pass *html* through ``BeautifulSoup`` and return ``str()`` of the result.

    *convert* and *encoding* are forwarded as ``convertEntities`` and
    ``fromEncoding``.  Not called by :func:`html2rest`.
    """
    return str(BeautifulSoup(html, convertEntities=convert,
                             fromEncoding=encoding))


def _apply_preprocess(preprocess, html, encoding):
    """Call *preprocess* on *html* and return its result.

    The preprocessor is called as ``preprocess(html, encoding=encoding)``
    when its signature accepts that keyword (or cannot be inspected), and
    as ``preprocess(html)`` otherwise, so the one-argument form shown in
    the README works as well.
    """
    try:
        spec = inspect.getargspec(preprocess)
    except TypeError:
        # Not a plain Python function/method: keep the keyword call.
        return preprocess(html, encoding=encoding)
    if spec.keywords or 'encoding' in spec.args:
        return preprocess(html, encoding=encoding)
    return preprocess(html)


def html2rest(html, writer=sys.stdout, encoding='utf8', relto=None, preprocess=None):
    """Convert *html* to reStructuredText and write it to *writer*.

    html
        The markup as a byte string; it is decoded with *encoding* before
        being parsed.
    writer
        Object with a ``write`` method receiving the output.
    encoding
        Used to decode *html* and to encode the text that is written.
    relto
        Optional URL.  When given, hrefs starting with ``/`` are prefixed
        with its ``scheme://netloc`` and hrefs without ``://`` with its
        full path (``mailto:`` links are left alone).
    preprocess
        Optional callable applied to *html* before parsing; it may take
        just the markup or the markup plus an ``encoding`` keyword.
    """
    relroot = relpath = None
    if relto:
        parsed = urlparse.urlparse(relto)
        relroot = parsed.scheme + '://' + parsed.netloc
        relpath = relroot + parsed.path
        if relpath[-1] != '/':
            relpath += '/'
    if preprocess:
        html = _apply_preprocess(preprocess, html, encoding)
    parser = Parser(writer, encoding, relroot, relpath)
    parser.feed(html.decode(encoding))
    parser.close()


class LineBuffer(object):
    """A list of output lines with wrapping and indentation helpers."""

    def __init__(self):
        self._lines = []
        self._wrapper = TextWrapper()

    def __len__(self):
        return len(self._lines)

    def __getitem__(self, i):
        return self._lines[i]

    def __setitem__(self, i, value):
        self._lines[i] = value

    def clear(self):
        """Remove all lines."""
        self._lines[:] = []

    def read(self):
        """Return the buffered lines joined with newlines."""
        return '\n'.join(self._lines)

    def write(self, s):
        """Append *s* after collapsing whitespace and wrapping it."""
        # normalise whitespace
        s = ' '.join(s.split())
        self._lines.extend(self._wrapper.wrap(s))

    def rawwrite(self, s):
        """Append the lines of *s* unchanged (no wrapping)."""
        self._lines.extend(s.splitlines())

    def indent(self, numspaces=4, start=0):
        """Prefix every line from index *start* on with *numspaces* spaces."""
        lines = self._lines
        prefix = ' ' * numspaces
        for i in range(start, len(lines)):
            lines[i] = prefix + lines[i]

    def lstrip(self):
        """Strip leading whitespace from every line."""
        self._lines[:] = [line.lstrip() for line in self._lines]


class Parser(SGMLParser):
    """SGML parser that emits reStructuredText to ``writer`` as it goes.

    Character data is collected in ``stringbuffer``; completed text is
    moved to ``linebuffer`` and from there written to ``writer``.
    """

    def __init__(self, writer=sys.stdout, encoding='utf8', relroot=None, relpath=None):
        SGMLParser.__init__(self)
        self.writer = writer
        self.encoding = encoding        # encoding of everything written
        self.relroot = relroot          # prefix for hrefs starting with '/'
        self.relpath = relpath          # prefix for other relative hrefs
        self.stringbuffer = StringIO()  # text not yet moved to linebuffer
        self.linebuffer = LineBuffer()  # lines not yet written
        self.verbatim = False           # True between <pre> and </pre>
        self.lists = []                 # item marker of each open list
        self.ignoredata = False         # True inside an IGNORETAGS element
        self.inblock = 0                # nesting depth of lists and <dl>
        self.nobreak = False            # set inside <dl>; not read here
        self.hrefs = {}                 # href -> link text ('#pending' = open <a>)

    def close(self):
        """Flush remaining output, end it with a newline, close the parser."""
        self.writeline()
        SGMLParser.close(self)

    def flush(self):
        """Write the line buffer to ``writer`` and empty it.

        Nested blocks (``inblock`` > 1) are indented four spaces per
        extra level.
        """
        if self.linebuffer:
            if self.inblock > 1:
                indent = 4 * (self.inblock - 1)
                self.linebuffer.indent(indent)
            self.writer.write(unescape(self.linebuffer.read(), self.encoding))
            self.linebuffer.clear()

    def flush_stringbuffer(self):
        """Move pending character data into the line buffer.

        The text is appended to the last buffered line when there is one,
        otherwise it is wrapped into new lines.
        """
        sbuf = self.stringbuffer.getvalue()
        if not sbuf:
            return
        elif self.linebuffer:
            self.linebuffer[-1] += sbuf
        else:
            self.linebuffer.write(sbuf)
        self.clear_stringbuffer()

    def clear_stringbuffer(self):
        """Discard pending character data."""
        self.stringbuffer.seek(0)
        self.stringbuffer.truncate()

    def data(self, text):
        """Append *text* to the pending character data."""
        self.stringbuffer.write(text)

    def pending(self):
        """Return a true value if any text is waiting to be written."""
        return self.stringbuffer.tell() or self.linebuffer

    def write(self, text=''):
        """Flush everything pending, then write *text*."""
        self.flush_stringbuffer()
        self.flush()
        # Encode like flush() does so that the output uses one encoding.
        self.writer.write(unescape(text, self.encoding))

    def writeline(self, text=''):
        """Like :meth:`write`, followed by a newline."""
        self.write(text + '\n')

    def writestartblock(self, text=''):
        """Finish pending text, leave a blank line, write *text* on a line."""
        if self.pending():
            self.writeline()
        self.writeline()
        self.writeline(text)

    def writeendblock(self, text=''):
        """Write *text* on a line followed by a blank line."""
        self.writeline(text)
        self.writeline()

    def writeblock(self, text=''):
        """Write *text* as a block of its own with blank lines around it."""
        self.writestartblock(text)
        self.writeline()

    def handle_data(self, data):
        """Collect character data.

        Data inside ignored tags is dropped; inside ``<pre>`` it is kept
        verbatim; otherwise its line breaks are replaced by spaces.  While
        a link is open the data is also recorded as that link's text.
        """
        if self.ignoredata:
            return
        elif self.verbatim:
            self.data(data)
        else:
            if '#pending' in self.hrefs:
                self.hrefs[self.hrefs['#pending']] = data
            self.data(' '.join(data.splitlines()))

    def unknown_starttag(self, tag, attrs):
        """Handle start tags that have no dedicated ``start_*`` method."""
        if tag in IGNORETAGS:
            self.ignoredata = True
        elif len(tag) == 2 and tag[0] == 'h':
            # Headings; any other two-letter tag starting with 'h' (such
            # as 'hr') also lands here and simply starts a new block.
            self.writestartblock()
        elif tag == 'br':
            if self.verbatim:
                self.data('\n')
            elif not self.inblock:
                self.writeline()
            else:
                self.data(' ')
        elif not self.verbatim:
            self.data(' ')

    def unknown_endtag(self, tag):
        """Handle end tags that have no dedicated ``end_*`` method.

        For ``h1`` .. ``h6`` the heading text is underlined with the
        character for its level; other tags become a single space outside
        ``<pre>``.
        """
        self.ignoredata = False
        level = _HEADING_LEVELS.get(tag)
        if level is not None:
            self.flush_stringbuffer()
            if self.linebuffer:
                linebuf = self.linebuffer
                linebuf[-1] = linebuf[-1].strip()
                char = UNDERLINES[level - 1]
                linebuf.write(char * len(linebuf[-1]))
                self.writeline()
        elif not self.verbatim:
            self.data(' ')

    def start_a(self, attrs):
        """Open a hyperlink reference unless the href is missing or local."""
        href = dict(attrs).get('href', None)
        if not href or href.startswith('#'):
            return
        elif self.relroot and self.relpath and 'mailto:' not in href:
            if href.startswith('/'):
                href = self.relroot + href
            elif '://' not in href:
                href = self.relpath + href
        self.data('`')
        self.hrefs['#pending'] = href

    def end_a(self):
        """Close the hyperlink reference opened by :meth:`start_a`."""
        if '#pending' in self.hrefs:
            self.data('`_')
            del self.hrefs['#pending']

    def start_pre(self, attrs):
        """Begin a literal block, closing the current list item first."""
        if self.lists:
            self.end_li()
            self.writeline()
        self.verbatim = True
        self.writeblock(CODEBLOCK)

    def end_pre(self):
        """Write the collected literal text indented by four spaces."""
        sbuf = self.stringbuffer.getvalue()
        if sbuf:
            self.linebuffer.rawwrite(sbuf)
            self.linebuffer.indent(4)
        self.clear_stringbuffer()
        self.writeendblock()
        self.verbatim = False

    def _start_list(self, marker):
        """Open a list whose items are introduced by *marker*."""
        if self.lists:
            # nested list: finish the enclosing item's text first
            self.end_li()
        self.writeline()
        self.lists.append(marker)
        self.inblock += 1

    def _end_list(self):
        """Close the innermost list."""
        self.end_li()
        self.lists.pop()
        self.inblock -= 1
        if self.inblock:
            self.writeline()
        else:
            self.writeendblock()

    def start_ul(self, attrs):
        self._start_list('+ ')

    def end_ul(self):
        self._end_list()

    def start_ol(self, attrs):
        self._start_list('#. ')

    def end_ol(self):
        self._end_list()

    def start_p(self, attrs):
        """Start a paragraph on a new line (not inside a list or <dl>)."""
        if self.verbatim or not self.inblock:
            self.writeline()

    def end_p(self):
        """End a paragraph; inside a list or <dl> only ``<pre>`` text is flushed."""
        if self.inblock:
            if self.verbatim:
                self.writeline()
            return
        self.linebuffer.lstrip()
        self.writeline()

    def _list_marker(self):
        """Return the marker of the innermost list.

        Falls back to a bullet for an ``<li>`` outside any list instead
        of failing on the empty ``lists`` stack.
        """
        if self.lists:
            return self.lists[-1]
        return _DEFAULT_MARKER

    def start_li(self, attrs):
        """Start a list item with the marker of the innermost list."""
        self.writeline()
        self.data(self._list_marker())

    def end_li(self):
        """Write the current list item, aligning continuation lines."""
        self.flush_stringbuffer()
        linebuf = self.linebuffer
        if linebuf and linebuf[0] and linebuf[0].lstrip()[:2] in ['+ ', '#.']:
            # first line carries the marker: indent only the rest
            start = 1
        else:
            # the start of the <li> has already been written, perhaps
            # because there was a <pre> block
            start = 0
        self.linebuffer.indent(len(self._list_marker()), start=start)
        self.write()

    def start_dl(self, attrs):
        self.writeline()
        self.inblock += 1
        self.nobreak = True

    def end_dl(self):
        self.nobreak = False
        self.writeline()
        self.inblock -= 1

    def start_dt(self, attrs):
        # terms are written as ":term:"
        self.data(':')

    def end_dt(self):
        self.data(':')

    def start_dd(self, attrs):
        self.data(' ')

    def end_dd(self):
        """Write the definition; lines after the first get two spaces."""
        self.flush_stringbuffer()
        self.linebuffer.indent(2, start=1)
        self.writeline()

    def start_em(self, attrs):
        self.data(' *')

    def end_em(self):
        self.data('*')

    def start_b(self, attrs):
        self.data(' **')

    def end_b(self):
        self.data('**')

    def start_code(self, attrs):
        self.data(' `')

    def end_code(self):
        self.data('`')

    def start_span(self, attrs):
        pass

    def end_span(self):
        pass

    def start_body(self, attrs):
        pass

    def end_body(self):
        """Write the hyperlink targets collected while parsing the body."""
        self.end_p()
        for href, link in self.hrefs.items():
            # skip the '#pending' bookkeeping entry
            if href[0] != '#':
                self.writeline('.. _%s: %s' % (link, href))
        self.end_p()
    409 | 
    
    
    --------------------------------------------------------------------------------
    /pkg:
    --------------------------------------------------------------------------------
     #!/bin/sh
     #
     # Build the html2rest source distributions.  The release is also
     # registered and uploaded unless the __version__ line in html2rest.py
     # ends in "dev".

     # Work from the directory holding this script so that html2rest.py and
     # setup.py are found whatever the caller's working directory is.
     cd "$(dirname "$0")" || exit 1

     grep "__version__.*dev['\"]$" html2rest.py >/dev/null 2>&1
     status=$?

     if [ "$status" -eq 0 ]; then
         # dev
         python setup.py sdist --formats=gztar,zip
     elif [ "$status" -eq 1 ]; then
         # not dev
         python setup.py sdist --formats=gztar,zip register upload
     else
         # grep itself failed (e.g. html2rest.py unreadable): do not
         # mistake that for a release version and publish.
         echo "pkg: cannot read the version from html2rest.py" >&2
         exit "$status"
     fi
    13 | 
    
    
    --------------------------------------------------------------------------------
    /setup.py:
    --------------------------------------------------------------------------------
     # -*- coding: utf-8 -*-
     """Distutils setup script for html2rest.

     The version number is read from the ``__version__`` line of
     ``html2rest.py`` rather than by importing the module.
     """
     import os
     from os.path import join as pathjoin, exists as pathexists, dirname, basename, abspath
     from distutils.core import setup

     import re

     # Directory containing this script; the version file and the command
     # line script are both located relative to it.
     srcdir = dirname(abspath(__file__))

     version_rx = r"^__version__ = '(.*)'$"
     version_pattern = re.compile(version_rx)

     fd = open(pathjoin(srcdir, 'html2rest.py'))
     try:
         for line in fd:
             m = version_pattern.match(line)
             if m:
                 break
         else:
             # the loop finished without finding a matching line
             raise Exception("couldn't find __version__")
     finally:
         fd.close()

     __version__ = m.group(1)

     # Single-argument call form: valid as a statement on Python 2 and as a
     # function call on Python 3.
     print("running setup for html2rest version %s" % __version__)

     setup(
         name="html2rest",
         version=__version__,
         description="Convert HTML to restructuredText",
         author="Gerard Flanagan",
         author_email="grflanagan@gmail.com",
         classifiers=["Development Status :: 4 - Beta",
                      "Intended Audience :: Developers",
                      "License :: OSI Approved :: BSD License",
                      "Programming Language :: Python",
                      "Topic :: Software Development :: Libraries",
                      "Topic :: Software Development :: Libraries :: Python Modules",
                      ],
         url="https://github.com/podados/python-html2rest",
         license="BSD",
         download_url="http://pypi.python.org/packages/source/h/html2rest/html2rest-%s.tar.gz" % __version__,
         py_modules=['html2rest'],
         scripts=[
             pathjoin(srcdir, 'bin', 'html2rest'),
         ],
     )
    
    
    --------------------------------------------------------------------------------