├── MANIFEST.in ├── gendoc.py ├── PKG-INFO ├── README.path ├── tidy ├── error.py ├── pvt_ctypes │ └── README.ctypes ├── __init__.py ├── README.tidydll ├── test_tidy.py └── lib.py ├── README.txt ├── LICENSE ├── setup.py ├── INSTALL.txt └── path.py /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include INSTALL.txt 2 | include LICENSE 3 | include *.py 4 | include README.* 5 | include MANIFEST.in 6 | include tidy/README.* 7 | include tidy/pvt_ctypes/README.* 8 | -------------------------------------------------------------------------------- /gendoc.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from epydoc.cli import cli 3 | 4 | def run(argv=sys.argv): 5 | argv_old=sys.argv 6 | sys.argv=argv 7 | cli() 8 | sys.argv=argv_old 9 | 10 | if __name__=='__main__': 11 | default='epydoc -o apidoc tidy/error.py tidy/lib.py tidy/__init__.py'.split() 12 | run(default) 13 | -------------------------------------------------------------------------------- /PKG-INFO: -------------------------------------------------------------------------------- 1 | Metadata-Version: 1.0 2 | Name: uTidylib 3 | Version: 0.2 4 | Summary: Wrapper for HTML Tidy at http://tidy.sourceforge.net 5 | Home-page: http://utidylib.sf.net 6 | Author: Cory Dodt 7 | Author-email: corydodt@twistedmatrix.com 8 | License: UNKNOWN 9 | Description: A wrapper for the relocatable version of HTML Tidy (see 10 | http://tidy.sourceforge.net for details). This allows you to 11 | tidy HTML files through a Pythonic interface. 12 | Platform: UNKNOWN 13 | -------------------------------------------------------------------------------- /README.path: -------------------------------------------------------------------------------- 1 | **This applies to the file path.py, not uTidyLib. 
Please see the text 2 | file LICENSE for information about uTidyLib.** 3 | 4 | License: You may use path.py for whatever you wish, at your own 5 | risk. (For example, you may modify, relicense, and redistribute it.) 6 | It is provided without any guarantee or warranty of any kind, not even 7 | for merchantability or fitness for any purpose. 8 | 9 | If you do make changes to path.py, please consider sending them along 10 | to me at jason@jorendorff.com. 11 | -------------------------------------------------------------------------------- /tidy/error.py: -------------------------------------------------------------------------------- 1 | __all__ = ('TidyLibError', 'InvalidOptionError', 'OptionArgError', 2 | ) 3 | 4 | class TidyLibError(Exception): 5 | def __init__(self, arg): 6 | self.arg=arg 7 | 8 | class InvalidOptionError(TidyLibError): 9 | def __str__(self): 10 | return "%s was not a valid Tidy option." % (self.arg) 11 | __repr__=__str__ 12 | 13 | class OptionArgError(TidyLibError): 14 | def __init__(self, arg): 15 | self.arg=arg 16 | def __str__(self): 17 | return self.arg -------------------------------------------------------------------------------- /README.txt: -------------------------------------------------------------------------------- 1 | This is uTidylib, the Python wrapper for the HTML cleaning 2 | library named TidyLib: http://tidy.sf.net . Python 2.3 or later 3 | is required to use uTidylib. Repeat, Python 2.3 or later is 4 | *required* to use uTidylib. 5 | 6 | Once installed, there are two ways to get help. The simplest is: 7 | 8 | $ python 9 | >>> import tidy 10 | >>> help(tidy) 11 | . . . 12 | 13 | Then, of course, there's the epydoc-generated API documentation, which 14 | is available at site-packages/tidy/apidoc/index.html . 
15 | 16 | __________________ 17 | 18 | 10 Second Tutorial 19 | __________________ 20 | 21 | >>> import tidy 22 | >>> options = dict(output_xhtml=1, add_xml_decl=1, indent=1, tidy_mark=0) 23 | >>> print tidy.parseString('Hello Tidy!', **options) 24 | 25 | 27 | 28 |
29 |\N{LATIN SMALL LETTER E WITH ACUTE} 12 | 13 | 14 | 15 | '''.encode('utf8') 16 | file('foo.htm', 'w').write(foo) 17 | self.input1 = "" 18 | self.input2 = "\n" + "
asdkfjhasldkfjhsldjas\n" * 100
def defaultDocs(self):
    """Return the four documents shared by the parsing tests.

    In order: self.input1 and self.input2 parsed from strings, then
    "foo.htm" and "bar.htm" parsed by file name.
    """
    parsed = [tidy.parseString(text) for text in (self.input1, self.input2)]
    # "bar.htm" doesn't exist
    parsed.extend([tidy.parse(name) for name in ("foo.htm", "bar.htm")])
    return tuple(parsed)
def test_badOptions(self):
    # Each of these keyword sets must be rejected with a TidyLibError.
    rejected = ({'foo': 1}, {'indent': '---'}, {'indent_spaces': None})
    for bad in rejected:
        try:
            tidy.parseString(self.input2, **bad)
        except tidy.TidyLibError:
            continue
        self.fail("Invalid option %s should have raised an error" %
                  repr(bad))
def test_encodings(self):
    # foo.htm is re-encoded to pure ASCII with XML character references and
    # tidied twice; the latin1 output must contain '\xe9' and the utf8
    # output '\xc3\xa9'.
    source = open('foo.htm')
    try:
        # read inside try/finally so the handle is closed even on failure
        raw = source.read()
    finally:
        source.close()
    foo = raw.decode('utf8').encode('ascii', 'xmlcharrefreplace')
    doc1u = tidy.parseString(foo, input_encoding='ascii',
                             output_encoding='latin1')
    self.failUnless(str(doc1u).find('\xe9')>=0)
    doc2u = tidy.parseString(foo, input_encoding='ascii',
                             output_encoding='utf8')
    self.failUnless(str(doc2u).find('\xc3\xa9')>=0)
def test_errors(self):
    first, second, third, unused = self.defaultDocs()
    # Reading and stringifying the error list must work for the first three
    # documents.
    for doc in (first, second, third):
        str(doc.errors)
    self.assertEquals(first.errors[0].line, 1)
def test_options(self):
    # NOTE(review): the CDATA pattern, the "<Html>" input, the '<?xml'
    # prefix and the commented-out failIf line restore text that appears
    # to have lost its angle-bracket markup -- confirm against the
    # upstream test suite.
    options = dict(add_xml_decl=1, show_errors=1, newline='CR',
                   output_xhtml=1)
    doc1 = tidy.parseString(self.input1, **options)
    found = re.search(r'//<![[]CDATA[[]\W+1>2\W+//]]>', str(doc1),
                      re.MULTILINE)
    self.failUnless(found)
    doc2 = tidy.parseString("<Html>", **options)
    self.failUnless(str(doc2).startswith('<?xml'))
##    self.failIf(len(doc2.errors)>1) # FIXME - tidylib doesn't
##                                    # support this?
    # newline='CR' was requested, so no '\n' may appear in the output
    self.failUnless(str(doc2).find('\n')<0)
    doc3 = tidy.parse('foo.htm', char_encoding='utf8',
                      alt_text='foo')
    self.failUnless(str(doc3).find('alt="foo"')>=0)
    self.failUnless(str(doc3).find('\xc3\xa9')>=0)
def test_parse(self):
    doc1, doc2, doc3, doc4 = self.defaultDocs()
    # Searching for '' succeeds on every string, so that check could never
    # fail; look for a closing html tag instead.
    # NOTE(review): '</html>' restores a literal that appears to have lost
    # its markup -- confirm against the upstream test suite.
    for doc in (doc1, doc2, doc3):
        self.failUnless(str(doc).find('</html>') >= 0)
70 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | from path import path
2 | from distutils.core import setup
3 | from distutils.command.install import install
4 | from distutils.command.install_data import install_data
5 | from distutils.command.bdist_wininst import bdist_wininst
6 | from distutils import sysconfig
7 |
# Pack the doc in as data files: one (destination, files) entry for the
# apidoc directory and one for every directory below it, each destination
# being the source directory prefixed with the package directory.
apidoc = path('apidoc')
pkgdir = path('tidy')
data_files = []
dfa = data_files.append
if apidoc.isdir():
    for docdir in [apidoc] + list(apidoc.walkdirs()):
        dfa((str(pkgdir/docdir), [str(f) for f in docdir.files()]))
16 |
17 |
class bdist_wininst_utidylib(bdist_wininst):
    """bdist_wininst variant that also packs the bundled binary files.

    Before the stock option handling runs, the tidy DLL with its README
    and the private ctypes files are appended to the distribution's data
    files.
    """

    def finalize_options(self):
        private_ctypes = pkgdir/'pvt_ctypes'
        bundled = [
            (pkgdir, ['cygtidy-0-99-0.dll', 'README.tidydll']),
            (private_ctypes, ['ctypes.zip', '_ctypes.pyd', 'README.ctypes']),
        ]
        extra = self.distribution.data_files
        for folder, names in bundled:
            extra.append((str(folder), [str(folder/name) for name in names]))

        # TODO - make it impossible to install on python2.2
        bdist_wininst.finalize_options(self)
32 |
# Make sure data files are installed in the package directory during the
# binary build phase - this is evil.
class install_data_utidylib(install_data):
    """install_data whose install_dir is taken from install's install_lib."""

    def finalize_options (self):
        # (option of the 'install' command, option of this command)
        inherited = ('install_lib', 'install_dir')
        self.set_undefined_options('install', inherited)
        install_data.finalize_options(self)
40 |
class install_utidylib(install):
    """install command that prints a reminder about external dependencies."""

    def run(self):
        install.run(self)
        reminder = (
            "*** This library requires that you have two libraries ***",
            "*** installed: ctypes and libtidy. ***",
            "*** Please make sure they are installed correctly ***",
            "*** before reporting a bug. ***",
            "*** See: ***",
            "*** http://starship.python.net/crew/theller/ctypes/ ***",
            "*** and http://tidy.sourceforge.net ***",
            "*** (or consult your vendor documentation for binary ***",
            "*** packages.) ***",
        )
        for text in reminder:
            print(text)
53 |
54 |
55 |
# Keyword arguments handed to setup() below.
setup_data = {
    'packages': ['tidy'],
    'data_files': data_files,
    # project-specific replacements for the stock distutils commands
    'cmdclass': {
        'install_data': install_data_utidylib,
        'bdist_wininst': bdist_wininst_utidylib,
        'install': install_utidylib,
    },
    'name': 'uTidylib',
    'version': '0.2',
    'author': 'Cory Dodt',
    'author_email': 'corydodt@twistedmatrix.com',
    'url': 'http://utidylib.sf.net',
    'description': 'Wrapper for HTML Tidy at http://tidy.sourceforge.net',
    'long_description': (
        'A wrapper for the relocatable version of HTML Tidy (see\n'
        'http://tidy.sourceforge.net for details). This allows you to\n'
        'tidy HTML files through a Pythonic interface.'
    ),
}

if __name__ == '__main__':
    setup(**setup_data)
77 |
--------------------------------------------------------------------------------
/INSTALL.txt:
--------------------------------------------------------------------------------
1 | If you're reading this, you are probably using a platform that
2 | doesn't have binaries available. Check anyway:
3 |
4 | http://sourceforge.net/project/showfiles.php?group_id=84459
5 |
6 | You may also want to consult this document if you get the message:
7 | "Couldn't find libtidy, please make sure it is installed."
8 |
9 | ==================================================================
10 | On Linux (instructions for other flavors of Unix mostly the same):
11 | ___________________
12 | 1. Install libtidy:
13 |
14 | TidyLib can be obtained from http://tidy.sourceforge.net/src/tidy_src.tgz
15 |
16 | (1a) Compile
17 |
18 | $ tar xvfz tidy_src.tgz
19 | $ cd tidy
20 | $ sh build/gnuauto/setup.sh
21 | $ ./configure # may want to specify --prefix=/usr here, see below
22 | $ make
23 |
24 |
25 | (1b) Install
26 | (become root)
27 | # make install
28 |
29 | This will place libtidy in /usr/local/lib. If you use --prefix=/usr in
30 | the configure line flagged above, your library will go to /usr/lib
31 | instead. The directory you install the library into MUST be
32 | configured with ldconfig, so if you installed into /usr/local/lib and
33 | it's mysteriously not working:
34 |
35 | # man ldconfig
36 | # man ld.so.conf
37 |
38 | Other Unices may have some variant of ldconfig, or they may use an
39 | environment variable such as LIBPATH, LD_LIBRARY_PATH, etc.
40 |
41 | __________________
42 | 2. Install ctypes:
43 |
44 | Ctypes is available from:
45 | http://sourceforge.net/project/showfiles.php?group_id=71702
46 |
47 | _________________________________
48 | 3. Install uTidylib Python files:
49 |
50 | (as root)
51 | # cd uTidylib
52 | # python setup.py install
53 |
54 |
55 |
56 | ==================================================================
57 | On Windows:
58 | __________________
59 | 1. Install libtidy
60 |
61 | TidyLib can be obtained from http://tidy.sourceforge.net/src/tidy_src.tgz
62 |
63 | libtidy provides 2 ways to compile on Windows. The first way is to
64 | use the project and makefiles in uTidylib/libtidy/build/msvc. This
65 | way is not recommended as it requires you to purchase MS Visual C++.
66 |
67 | 1a) Install Cygwin
68 | The second, recommended way is to install Cygwin, with at least the
69 | following packages:
70 | make, automake, libtool, gcc, and gcc-mingw
71 | It is recommended that you do _not_ install Cygwin Python; instead use
72 | the Windows installer at http://python.org/download/ .
73 |
74 | 1b) Compile
75 | We will compile with the mingw compiler, which produces binaries that
76 | do not depend on the Cygwin DLLs.
77 | $ tar xvfz tidy_src.tgz
78 | $ cd tidy
79 | $ sh build/gnuauto/setup.sh
80 | $ CFLAGS=-mno-cygwin ./configure
81 | $ make
82 |
83 | 1c) Copy DLL to a directory in the PATH:
84 |
85 | $ cp src/.libs/cygtidy-0-99-0.dll $SYSTEMROOT
86 |
87 | __________________
88 | 2. Install ctypes:
89 |
90 | Ctypes is available from:
91 | http://sourceforge.net/project/showfiles.php?group_id=71702
92 |
93 | _________________________________
94 | 3. Install uTidylib Python files:
95 |
96 | $ cd uTidylib
97 | $ python setup.py install
98 |
99 |
100 | ==================================================================
101 | Running tests (after installing):
102 | _________________________________
103 |
104 | Running tests requires that you have installed Twisted
105 | (http://twistedmatrix.com), as uTidyLib uses the trial framework for
106 | testing.
107 |
108 | $ python -c "from twisted.scripts import trial; trial.run()" -p tidy
109 |
110 | This should work on either Windows or Unix.
111 |
112 |
113 | ==================================================================
114 | The Doc:
115 | ________
116 |
117 | To build the doc, just run:
118 |
119 | $ python gendoc.py
120 |
121 | (This requires that you have epydoc installed.)
122 |
123 | The API documentation will be built in the ``apidoc'' directory.
--------------------------------------------------------------------------------
/tidy/lib.py:
--------------------------------------------------------------------------------
1 | from __future__ import generators
2 |
3 | import sys
4 | import os.path
5 | from itertools import count
6 | packagedir = os.path.dirname(__file__)
7 |
8 | # look for ctypes in the system path, then try looking for a private ctypes
9 | # distribution
10 | try:
11 | import ctypes
12 | except ImportError:
13 | private_ctypes = os.path.join(packagedir, 'pvt_ctypes')
14 | sys.path.insert(0, private_ctypes)
15 | sys.path.insert(0, os.path.join(private_ctypes, 'ctypes.zip'))
16 | import ctypes
17 | from cStringIO import StringIO
18 | import weakref
19 | from tidy.error import *
20 |
# Search the path for libtidy using the known names; the package directory
# is tried too, by putting it at the front of PATH.
thelib = None
# PATH may be absent from the environment; indexing os.environ directly
# would then raise KeyError before any library name had been tried.
_oldpath = os.environ.get('PATH')
if _oldpath:
    os.environ['PATH'] = "%s%s%s" % (packagedir, os.pathsep, _oldpath)
else:
    os.environ['PATH'] = packagedir
del _oldpath
for libname in ('cygtidy-0-99-0', 'libtidy', 'libtidy.so',
                'libtidy-0.99.so.0', 'tidylib'):
    try:
        thelib = getattr(ctypes.cdll, libname)
    except OSError:
        # not available under this name, try the next one
        continue
    break
if thelib is None:
    raise OSError("Couldn't find libtidy, please make sure it is installed.")
34 |
class Loader:
    """I am a trivial wrapper that eliminates the need for tidy.tidyFoo,
    so you can just access tidy.Foo
    """

    def __init__(self):
        self.lib = thelib

    def __getattr__(self, name):
        """Return the library's "tidy" + name attribute, or failing that
        its attribute called plain name.
        """
        try:
            found = getattr(self.lib, "tidy" + name)
        # current ctypes uses ValueError, future will use AttributeError
        except (ValueError, AttributeError):
            found = getattr(self.lib, name)
        return found
47 |
# Shared accessor for the libtidy functions.
_tidy = Loader()


# define a callback to pass to Tidylib
def _putByte(handle, c):
    """Lookup sink by handle and call its putByte method; always returns 0."""
    target = sinkfactory[handle]
    target.putByte(c)
    return 0


# Callback type: returns c_int, takes (c_int, c_char).
PUTBYTEFUNC = ctypes.CFUNCTYPE(ctypes.c_int, ctypes.c_int, ctypes.c_char)
putByte = PUTBYTEFUNC(_putByte)
58 |
class _OutputSink(ctypes.Structure):
    """C structure pairing an integer sinkData value with a putByte callback."""
    _fields_ = [
        ("sinkData", ctypes.c_int),
        ("putByte", PUTBYTEFUNC),
    ]
63 |
class _Sink:
    """In-memory collector fed one piece at a time through putByte."""

    def __init__(self):
        self._data = StringIO()
        sinkstruct = _OutputSink()
        sinkstruct.putByte = putByte
        self.struct = sinkstruct

    def putByte(self, c):
        """Append c to the collected output."""
        self._data.write(c)

    def __str__(self):
        """Return everything written so far."""
        collected = self._data
        return collected.getvalue()
73 |
class ReportItem:
    """One line of TidyLib's error report, split into its parts.

    Attributes set by the constructor:
      - err: the original, unparsed line
      - severity: first character of the severity word (e.g. W or E)
      - line, col: integer position, or None when the line carries none
      - message: the text following the severity word ('' when absent)
    """

    # severity letter -> word used by __str__
    _SEVERITIES = {'W': 'Warning', 'E': 'Error', 'C': 'Config'}

    def __init__(self, err):
        """Parse err, which is either
        'line N column M - Severity: message' or 'Severity: message'.

        A line that merely starts with 'line' but does not follow the
        first layout is parsed like the second one instead of raising.
        """
        self.err = err
        if not (err.startswith('line') and self._parsePositioned(err)):
            self._parsePlain(err)
        # TODO - parse emacs mode

    def _parsePositioned(self, err):
        """Fill in the attributes from a 'line N column M - ...' report and
        return True, or return False (setting nothing) if err is malformed.
        """
        tokens = err.split(' ', 6)
        try:
            line = int(tokens[1])
            col = int(tokens[3])
            severity = tokens[5][0]  # W or E
            message = tokens[6]
        except (IndexError, ValueError):
            return False
        self.severity = severity
        self.line = line
        self.col = col
        self.message = message
        return True

    def _parsePlain(self, err):
        """Fill in the attributes from a report without a position."""
        tokens = err.split(' ', 1)
        # slicing keeps an empty or one-word line from raising IndexError
        self.severity = tokens[0][:1]
        self.message = ''.join(tokens[1:])
        self.line = None
        self.col = None

    def __str__(self):
        """Return the normalised report line, or the original text when
        the severity letter is not a known one.
        """
        try:
            label = self._SEVERITIES[self.severity]
        except KeyError:
            return self.err
        # compare with None so that a position of 0 is still reported
        if self.line is not None:
            return "line %d col %d - %s: %s" % (self.line, self.col,
                                                label, self.message)
        return "%s: %s" % (label, self.message)

    def __repr__(self):
        return "%s('%s')" % (self.__class__.__name__,
                             str(self).replace("'", "\\'"))
106 |
class FactoryDict(dict):
    """I am a dict with a create method and no __setitem__.  This allows
    me to control my own keys.
    """

    def create(self):
        """Subclasses should implement me to generate a new item"""

    def _setitem(self, name, value):
        """Store value under name, bypassing the blocked __setitem__."""
        dict.__setitem__(self, name, value)

    def __setitem__(self, name, value):
        """Always refuse: items are only added through create().

        @raise TypeError: on every call
        """
        # call form of raise instead of the old "raise E, msg" statement
        raise TypeError("Use create() to get a new object")
117 |
118 |
class SinkFactory(FactoryDict):
    """Mapping for lookup of sinks by handle"""

    def __init__(self):
        FactoryDict.__init__(self)
        # handle that the next created sink will receive
        self.lastsink = 0

    def create(self):
        """Create a sink, register it under a fresh integer handle, and
        return it.  The handle is also stored in the sink's struct as
        sinkData.
        """
        handle = self.lastsink
        sink = _Sink()
        sink.struct.sinkData = handle
        FactoryDict._setitem(self, handle, sink)
        self.lastsink = handle + 1
        return sink


sinkfactory = SinkFactory()
132 |
class _Document(object):
    """A TidyLib document handle together with its own error sink."""

    def __init__(self):
        self.cdoc = _tidy.Create()
        self.errsink = sinkfactory.create()
        sinkref = ctypes.byref(self.errsink.struct)
        _tidy.SetErrorSink(self.cdoc, sinkref)

    def write(self, stream):
        """Write the string form of this document to stream."""
        stream.write(str(self))

    def get_errors(self):
        """Return a list with one ReportItem per non-blank line of the
        error sink.
        """
        report = str(self.errsink)
        stripped = [raw.strip(' \n\r') for raw in report.split('\n')]
        return [ReportItem(text) for text in stripped if text]
    errors = property(get_errors)

    def __str__(self):
        """Return the document text produced by the library's SaveString."""
        size = ctypes.c_int(8192)
        buf = ctypes.c_buffer(size.value)
        status = _tidy.SaveString(self.cdoc, buf, ctypes.byref(size))
        if status == -12:  # buffer too small
            # retry once with a buffer of the size now held in size
            buf = ctypes.c_buffer(size.value)
            _tidy.SaveString(self.cdoc, buf, ctypes.byref(size))
        return buf.value
155 |
# Leading text of a TidyLib message -> exception class raised for it.
errors = {'missing or malformed argument for option: ': OptionArgError,
          'unknown option: ': InvalidOptionError,
          }


class DocumentFactory(FactoryDict):
    """Creates document objects and releases their TidyLib handles.

    Every live document is tracked as weakref -> C document handle.  When
    the Python object goes away the handle is released and the entry is
    dropped.
    """

    def _setOptions(self, doc, **options):
        """Apply every keyword option to doc.

        Underscores in option names are replaced by dashes, and a value
        of None is passed as ''.
        @raise OptionArgError: the last reported message starts with
            'missing or malformed argument for option: '
        @raise InvalidOptionError: the last reported message starts with
            'unknown option: '
        """
        for name, value in options.items():
            # this will flush out most argument type errors...
            if value is None:
                value = ''
            _tidy.OptParseValue(doc.cdoc,
                                name.replace('_', '-'),
                                str(value))
            # doc.errors is rebuilt on every access, so read it only once
            reported = doc.errors
            if reported:
                lastmsg = reported[-1].message
                for prefix, exctype in errors.items():
                    if lastmsg.startswith(prefix):
                        raise exctype(lastmsg)

    def load(self, doc, arg, loader):
        """Feed arg to doc through loader, then run CleanAndRepair."""
        loader(doc.cdoc, arg)
        _tidy.CleanAndRepair(doc.cdoc)

    def loadFile(self, doc, filename):
        """Load the file called filename into doc."""
        self.load(doc, filename, _tidy.ParseFile)

    def loadString(self, doc, st):
        """Load the string st into doc."""
        self.load(doc, st, _tidy.ParseString)

    def _create(self, *args, **kwargs):
        """Return a new, registered document with the options in kwargs
        applied.  Positional arguments are ignored.
        """
        doc = _Document()
        # Register before applying the options: if an option is rejected
        # and _setOptions raises, the C document is still released once
        # the abandoned Python object is collected.
        ref = weakref.ref(doc, self.releaseDoc)
        FactoryDict._setitem(self, ref, doc.cdoc)
        self._setOptions(doc, **kwargs)
        return doc

    def parse(self, filename, *args, **kwargs):
        """Open and process filename as an HTML file, returning a
        processed document object.
        @param kwargs: named options to pass to TidyLib for processing
        the input file.
        @param filename: the name of a file to process
        @return: a document object
        """
        doc = self._create(**kwargs)
        self.loadFile(doc, filename)
        return doc

    def parseString(self, st, *args, **kwargs):
        """Use st as an HTML file, and process it, returning a
        document object.
        @param kwargs: named options to pass to TidyLib for processing
        the input file.
        @param st: the string to parse
        @return: a document object
        """
        doc = self._create(**kwargs)
        self.loadString(doc, st)
        return doc

    def releaseDoc(self, ref):
        """Weakref callback: release the C document registered under ref.

        The entry is removed as well, so finished documents no longer
        accumulate in the mapping.
        """
        cdoc = dict.pop(self, ref)
        _tidy.Release(cdoc)


docfactory = DocumentFactory()
parse = docfactory.parse
parseString = docfactory.parseString
216 |
--------------------------------------------------------------------------------
/path.py:
--------------------------------------------------------------------------------
1 | """ path.py - An object representing a path to a file or directory.
2 |
3 | Example:
4 |
5 | from path import path
6 | d = path('/home/guido/bin')
7 | for f in d.files('*.py'):
8 | f.chmod(0755)
9 |
10 | This module requires Python 2.2 or later.
11 |
12 |
13 | URL: http://www.jorendorff.com/articles/python/path
14 | Author: Jason Orendorff