├── MANIFEST.in ├── gendoc.py ├── PKG-INFO ├── README.path ├── tidy ├── error.py ├── pvt_ctypes │ └── README.ctypes ├── __init__.py ├── README.tidydll ├── test_tidy.py └── lib.py ├── README.txt ├── LICENSE ├── setup.py ├── INSTALL.txt └── path.py /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include INSTALL.txt 2 | include LICENSE 3 | include *.py 4 | include README.* 5 | include MANIFEST.in 6 | include tidy/README.* 7 | include tidy/pvt_ctypes/README.* 8 | -------------------------------------------------------------------------------- /gendoc.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from epydoc.cli import cli 3 | 4 | def run(argv=sys.argv): 5 | argv_old=sys.argv 6 | sys.argv=argv 7 | cli() 8 | sys.argv=argv_old 9 | 10 | if __name__=='__main__': 11 | default='epydoc -o apidoc tidy/error.py tidy/lib.py tidy/__init__.py'.split() 12 | run(default) 13 | -------------------------------------------------------------------------------- /PKG-INFO: -------------------------------------------------------------------------------- 1 | Metadata-Version: 1.0 2 | Name: uTidylib 3 | Version: 0.2 4 | Summary: Wrapper for HTML Tidy at http://tidy.sourceforge.net 5 | Home-page: http://utidylib.sf.net 6 | Author: Cory Dodt 7 | Author-email: corydodt@twistedmatrix.com 8 | License: UNKNOWN 9 | Description: A wrapper for the relocatable version of HTML Tidy (see 10 | http://tidy.sourceforge.net for details). This allows you to 11 | tidy HTML files through a Pythonic interface. 12 | Platform: UNKNOWN 13 | -------------------------------------------------------------------------------- /README.path: -------------------------------------------------------------------------------- 1 | **This applies to the file path.py, not uTidyLib. Please see the text 2 | file LICENSE for information about uTidyLib.** 3 | 4 | License: You may use path.py for whatever you wish, at your own 5 | risk. (For example, you may modify, relicense, and redistribute it.) 6 | It is provided without any guarantee or warranty of any kind, not even 7 | for merchantability or fitness for any purpose. 8 | 9 | If you do make changes to path.py, please consider sending them along 10 | to me at jason@jorendorff.com. 11 | -------------------------------------------------------------------------------- /tidy/error.py: -------------------------------------------------------------------------------- 1 | __all__ = ('TidyLibError', 'InvalidOptionError', 'OptionArgError', 2 | ) 3 | 4 | class TidyLibError(Exception): 5 | def __init__(self, arg): 6 | self.arg=arg 7 | 8 | class InvalidOptionError(TidyLibError): 9 | def __str__(self): 10 | return "%s was not a valid Tidy option." % (self.arg) 11 | __repr__=__str__ 12 | 13 | class OptionArgError(TidyLibError): 14 | def __init__(self, arg): 15 | self.arg=arg 16 | def __str__(self): 17 | return self.arg -------------------------------------------------------------------------------- /README.txt: -------------------------------------------------------------------------------- 1 | This is uTidylib, the Python wrapper for the HTML cleaning 2 | library named TidyLib: http://tidy.sf.net . Python 2.3 or later 3 | is required to use uTidylib. Repeat, Python 2.3 or later is 4 | *required* to use uTidylib. 5 | 6 | Once installed, there are two ways to get help. The simplest is: 7 | 8 | $ python 9 | >>> import tidy 10 | >>> help(tidy) 11 | . . . 12 | 13 | Then, of course, there's the epydoc-generated API documentation, which 14 | is available at site-packages/tidy/apidoc/index.html . 15 | 16 | __________________ 17 | 18 | 10 Second Tutorial 19 | __________________ 20 | 21 | >>> import tidy 22 | >>> options = dict(output_xhtml=1, add_xml_decl=1, indent=1, tidy_mark=0) 23 | >>> print tidy.parseString('Hello Tidy!', **options) 24 | 25 | 27 | 28 | 29 | 30 | 31 | 32 | Hello Tidy! 33 | 34 | 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining 6 | a copy of this software and associated documentation files (the 7 | "Software"), to deal in the Software without restriction, including 8 | without limitation the rights to use, copy, modify, merge, publish, 9 | distribute, sublicense, and/or sell copies of the Software, and to 10 | permit persons to whom the Software is furnished to do so, subject to 11 | the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be 14 | included in all copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 17 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 18 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 19 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 20 | LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 21 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 22 | WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 23 | -------------------------------------------------------------------------------- /tidy/pvt_ctypes/README.ctypes: -------------------------------------------------------------------------------- 1 | ** This notice applies only to the version of ctypes packaged with the Windows 2 | binary package. These files are found in the pvt_ctypes directory. 3 | See the file LICENSE for information about uTidyLib. ** 4 | 5 | Copyright (c) 2000, 2001, 2002, 2003 Thomas Heller 6 | 7 | Permission is hereby granted, free of charge, to any person obtaining 8 | a copy of this software and associated documentation files (the 9 | "Software"), to deal in the Software without restriction, including 10 | without limitation the rights to use, copy, modify, merge, publish, 11 | distribute, sublicense, and/or sell copies of the Software, and to 12 | permit persons to whom the Software is furnished to do so, subject to 13 | the following conditions: 14 | 15 | The above copyright notice and this permission notice shall be 16 | included in all copies or substantial portions of the Software. 17 | 18 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 19 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 20 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 21 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 22 | LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 23 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 24 | WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 25 | -------------------------------------------------------------------------------- /tidy/__init__.py: -------------------------------------------------------------------------------- 1 | """The Tidy wrapper. 2 | I am the main interface to TidyLib. This package supports processing HTML with 3 | Tidy, with all the options that the tidy command line supports. 4 | 5 | For more information on the tidy options, see the reference. These options can 6 | be given as keyword arguments to parse and parseString, by changing dashes (-) 7 | to underscores(_). 8 | 9 | For example: 10 | 11 | >>> import tidy 12 | >>> options = dict(output_xhtml=1, add_xml_decl=1, indent=1, tidy_mark=0) 13 | >>> print tidy.parseString('Hello Tidy!', **options) 14 | 15 | 17 | 18 | 19 | 20 | 21 | 22 | Hello Tidy! 23 | 24 | 25 | 26 | For options like newline and output_encoding, which must be set to one of a 27 | fixed number of choices, you can provide either the numeric or string version 28 | of the choice; so both tidy.parseString('foo', newline=2) and 29 | tidy.parseString('foo', newline='CR') do the same thing. 30 | 31 | There are no plans to support other features of TidyLib, such as document-tree 32 | traversal, since Python has several quality DOM implementations. (The author 33 | uses Twisted's implementation, twisted.web.microdom). 34 | """ 35 | 36 | try: 37 | dict(x=1) 38 | except TypeError: 39 | raise ImportError("Python 2.3 or later is required to import this library.") 40 | 41 | __all__ = ['error', 'lib'] 42 | 43 | from tidy.lib import parse, parseString 44 | from tidy.error import * 45 | 46 | -------------------------------------------------------------------------------- /tidy/README.tidydll: -------------------------------------------------------------------------------- 1 | ** This notice applies only to the tidy DLL distributed with the Windows binary 2 | package of uTidylib. See the file LICENSE for information about uTidyLib. ** 3 | 4 | HTML Tidy 5 | 6 | HTML parser and pretty printer 7 | 8 | Copyright (c) 1998-2003 World Wide Web Consortium 9 | (Massachusetts Institute of Technology, European Research 10 | Consortium for Informatics and Mathematics, Keio University). 11 | All Rights Reserved. 12 | 13 | This software and documentation is provided "as is," and 14 | the copyright holders and contributing author(s) make no 15 | representations or warranties, express or implied, including 16 | but not limited to, warranties of merchantability or fitness 17 | for any particular purpose or that the use of the software or 18 | documentation will not infringe any third party patents, 19 | copyrights, trademarks or other rights. 20 | 21 | The copyright holders and contributing author(s) will not be held 22 | liable for any direct, indirect, special or consequential damages 23 | arising out of any use of the software or documentation, even if 24 | advised of the possibility of such damage. 25 | 26 | Permission is hereby granted to use, copy, modify, and distribute 27 | this source code, or portions hereof, documentation and executables, 28 | for any purpose, without fee, subject to the following restrictions: 29 | 30 | 1. The origin of this source code must not be misrepresented. 31 | 2. Altered versions must be plainly marked as such and must 32 | not be misrepresented as being the original source. 33 | 3. This Copyright notice may not be removed or altered from any 34 | source or altered source distribution. 35 | 36 | The copyright holders and contributing author(s) specifically 37 | permit, without fee, and encourage the use of this source code 38 | as a component for supporting the Hypertext Markup Language in 39 | commercial products. If you use this source code in a product, 40 | acknowledgment is not required but would be appreciated. 41 | 42 | -------------------------------------------------------------------------------- /tidy/test_tidy.py: -------------------------------------------------------------------------------- 1 | import re 2 | from twisted.trial import unittest 3 | import tidy 4 | 5 | class TidyTestCase(unittest.TestCase): 6 | def __init__(self, *args, **kwargs): 7 | foo = u''' 8 |

woot

9 |
10 | 11 |

\N{LATIN SMALL LETTER E WITH ACUTE} 12 | 13 | 14 | 15 | '''.encode('utf8') 16 | file('foo.htm', 'w').write(foo) 17 | self.input1 = "" 18 | self.input2 = "\n" + "

asdkfjhasldkfjhsldjas\n" * 100 19 | def defaultDocs(self): 20 | doc1 = tidy.parseString(self.input1) 21 | doc2 = tidy.parseString(self.input2) 22 | doc3 = tidy.parse("foo.htm") 23 | doc4 = tidy.parse("bar.htm") # doesn't exist 24 | return (doc1, doc2, doc3, doc4) 25 | def test_badOptions(self): 26 | badopts = [{'foo': 1}, {'indent': '---'}, {'indent_spaces': None}] 27 | for dct in badopts: 28 | try: 29 | tidy.parseString(self.input2, **dct) 30 | except tidy.TidyLibError: 31 | pass 32 | else: 33 | self.fail("Invalid option %s should have raised an error" % 34 | repr(dct)) 35 | def test_encodings(self): 36 | foo = file('foo.htm').read().decode('utf8').encode('ascii', 37 | 'xmlcharrefreplace') 38 | doc1u = tidy.parseString(foo, input_encoding='ascii', 39 | output_encoding='latin1') 40 | self.failUnless(str(doc1u).find('\xe9')>=0) 41 | doc2u = tidy.parseString(foo, input_encoding='ascii', 42 | output_encoding='utf8') 43 | self.failUnless(str(doc2u).find('\xc3\xa9')>=0) 44 | def test_errors(self): 45 | doc1, doc2, doc3, doc4 = self.defaultDocs() 46 | for doc in [doc1, doc2, doc3]: 47 | str(getattr(doc, 'errors')) 48 | self.assertEquals(doc1.errors[0].line, 1) 49 | def test_options(self): 50 | options = dict(add_xml_decl=1, show_errors=1, newline='CR', 51 | output_xhtml=1) 52 | doc1 = tidy.parseString(self.input1, **options) 53 | found = re.search('//2\W+//]]>', str(doc1), 54 | re.MULTILINE) 55 | self.failUnless(found) 56 | doc2 = tidy.parseString("", **options) 57 | self.failUnless(str(doc2).startswith('1) # FIXME - tidylib doesn't 59 | ## # support this? 60 | self.failUnless(str(doc2).find('\n')<0) 61 | doc3 = tidy.parse('foo.htm', char_encoding='utf8', 62 | alt_text='foo') 63 | self.failUnless(str(doc3).find('alt="foo"')>=0) 64 | self.failUnless(str(doc3).find('\xc3\xa9')>=0) 65 | def test_parse(self): 66 | doc1, doc2, doc3, doc4 = self.defaultDocs() 67 | self.failUnless(str(doc1).find('') >=0) 68 | self.failUnless(str(doc2).find('') >= 0) 69 | self.failUnless(str(doc3).find('') >= 0) 70 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from path import path 2 | from distutils.core import setup 3 | from distutils.command.install import install 4 | from distutils.command.install_data import install_data 5 | from distutils.command.bdist_wininst import bdist_wininst 6 | from distutils import sysconfig 7 | 8 | # pack the doc in as data files 9 | apidoc=path('apidoc') 10 | data_files = [] ; dfa = data_files.append 11 | pkgdir = path('tidy') 12 | if apidoc.isdir(): 13 | dfa((str(pkgdir/apidoc), map(str, apidoc.files()))) 14 | for p in path('apidoc').walkdirs(): 15 | dfa((str(pkgdir/p), map(str, p.files()))) 16 | 17 | 18 | class bdist_wininst_utidylib(bdist_wininst): 19 | def finalize_options(self): 20 | dfa = self.distribution.data_files.append 21 | dfa((str(pkgdir), [str(pkgdir/'cygtidy-0-99-0.dll'), 22 | str(pkgdir/'README.tidydll')] 23 | )) 24 | private_ctypes = pkgdir/'pvt_ctypes' 25 | dfa((str(private_ctypes), [str(private_ctypes/'ctypes.zip'), 26 | str(private_ctypes/'_ctypes.pyd'), 27 | str(private_ctypes/'README.ctypes')] 28 | )) 29 | 30 | # TODO - make it impossible to install on python2.2 31 | bdist_wininst.finalize_options(self) 32 | 33 | # make sure data files are installed in tidylib package during binary 34 | # build phase - this is evil. 35 | class install_data_utidylib(install_data): 36 | def finalize_options (self): 37 | self.set_undefined_options('install', 38 | ('install_lib', 'install_dir')) 39 | install_data.finalize_options(self) 40 | 41 | class install_utidylib(install): 42 | def run(self): 43 | install.run(self) 44 | print "*** This library requires that you have two libraries ***" 45 | print "*** installed: ctypes and libtidy. ***" 46 | print "*** Please make sure they are installed correctly ***" 47 | print "*** before reporting a bug. ***" 48 | print "*** See: ***" 49 | print "*** http://starship.python.net/crew/theller/ctypes/ ***" 50 | print "*** and http://tidy.sourceforge.net ***" 51 | print "*** (or consult your vendor documentation for binary ***" 52 | print "*** packages.) ***" 53 | 54 | 55 | 56 | setup_data = dict(packages=['tidy', ], 57 | data_files=data_files, 58 | cmdclass=dict(install_data=install_data_utidylib, 59 | bdist_wininst=bdist_wininst_utidylib, 60 | install=install_utidylib, 61 | ), 62 | name='uTidylib', 63 | version='0.2', 64 | author='Cory Dodt', 65 | author_email='corydodt@twistedmatrix.com', 66 | url='http://utidylib.sf.net', 67 | description='Wrapper for HTML Tidy at ' 68 | 'http://tidy.sourceforge.net', 69 | long_description='''\ 70 | A wrapper for the relocatable version of HTML Tidy (see 71 | http://tidy.sourceforge.net for details). This allows you to 72 | tidy HTML files through a Pythonic interface.''' 73 | ) 74 | 75 | if __name__ == '__main__': 76 | setup(**setup_data) 77 | -------------------------------------------------------------------------------- /INSTALL.txt: -------------------------------------------------------------------------------- 1 | If you're reading this, you are probably using a platform that 2 | doesn't have binaries available. Check anyway: 3 | 4 | http://sourceforge.net/project/showfiles.php?group_id=84459 5 | 6 | You may also want to consult this document if you get the message: 7 | "Couldn't find libtidy, please make sure it is installed correctly." 8 | 9 | ================================================================== 10 | On Linux (instructions for other flavors of Unix mostly the same): 11 | ___________________ 12 | 1. Install libtidy: 13 | 14 | TidyLib can be obtained from http://tidy.sourceforge.net/src/tidy_src.tgz 15 | 16 | (1a) Compile 17 | 18 | $ tar xvfz tidy_src.tgz 19 | $ cd tidy 20 | $ sh build/gnuauto/setup.sh 21 | $ ./configure # may want to specify --prefix=/usr here, see below 22 | $ make 23 | 24 | 25 | (1b) Install 26 | (become root) 27 | # make install 28 | 29 | This will place libtidy in /usr/local/lib. If you use --prefix=/usr in 30 | the configure line flagged above, your library will go to /usr/lib 31 | instead. The directory you install the library into MUST be 32 | configured with ldconfig, so if you installed into /usr/local/lib and 33 | it's mysteriously not working: 34 | 35 | # man ldconfig 36 | # man ld.so.conf 37 | 38 | Other Unices may have some variant of ldconfig, or they may use an 39 | environment variable such as LIBPATH, LD_LIBRARY_PATH, etc. 40 | 41 | __________________ 42 | 2. Install ctypes: 43 | 44 | Ctypes is available from: 45 | http://sourceforge.net/project/showfiles.php?group_id=71702 46 | 47 | _________________________________ 48 | 3. Install uLibtidy python files: 49 | 50 | (as root) 51 | # cd uTidylib 52 | # python setup.py install 53 | 54 | 55 | 56 | ================================================================== 57 | On Windows: 58 | __________________ 59 | 1. Install libtidy 60 | 61 | TidyLib can be obtained from http://tidy.sourceforge.net/src/tidy_src.tgz 62 | 63 | libtidy provides 2 ways to compile on Windows. The first way is to 64 | use the project and makefiles in uTidylib/libtidy/build/msvc. This 65 | way is not recommended as it requires you to purchase MS Visual C++. 66 | 67 | 1a) Install Cygwin 68 | The second, recommended way is to install Cygwin, with at least the 69 | following packages: 70 | make, automake, libtool, gcc, and gcc-mingw 71 | It is recommended that you do _not_ install Cygwin Python; instead use 72 | the Windows installer at http://python.org/download/ . 73 | 74 | 1b) Compile 75 | We will compile with the mingw compiler, which produces binaries that 76 | do not depend on the Cygwin DLLs. 77 | $ tar xvfz tidy_src.tgz 78 | $ cd tidy 79 | $ sh build/gnuauto/setup.sh 80 | $ CFLAGS=-mno-cygwin ./configure 81 | $ make 82 | 83 | 1c) Copy DLL to a directory in the PATH: 84 | 85 | $ cp src/.libs/cygtidy-0-99-0.dll $SYSTEMROOT 86 | 87 | __________________ 88 | 2. Install ctypes: 89 | 90 | Ctypes is available from: 91 | http://sourceforge.net/project/showfiles.php?group_id=71702 92 | 93 | _________________________________ 94 | 3. Install uLibtidy python files: 95 | 96 | $ cd uTidylib 97 | $ python setup.py install 98 | 99 | 100 | ================================================================== 101 | Running tests (after installing): 102 | _________________________________ 103 | 104 | Running tests requires that you have installed Twisted 105 | (http://twistedmatrix.com), as uTidyLib uses the trial framework for 106 | testing. 107 | 108 | $ python -c "from twisted.scripts import trial; trial.run()" -p tidylib 109 | 110 | This should work on either Windows or Unix. 111 | 112 | 113 | ================================================================== 114 | The Doc: 115 | ________ 116 | 117 | To build the doc, just run: 118 | 119 | $ python gendoc.py 120 | 121 | (This requires that you have epydoc installed.) 122 | 123 | The API documentation will be built in the ``apidoc'' directory. -------------------------------------------------------------------------------- /tidy/lib.py: -------------------------------------------------------------------------------- 1 | from __future__ import generators 2 | 3 | import sys 4 | import os.path 5 | from itertools import count 6 | packagedir = os.path.dirname(__file__) 7 | 8 | # look for ctypes in the system path, then try looking for a private ctypes 9 | # distribution 10 | try: 11 | import ctypes 12 | except ImportError: 13 | private_ctypes = os.path.join(packagedir, 'pvt_ctypes') 14 | sys.path.insert(0, private_ctypes) 15 | sys.path.insert(0, os.path.join(private_ctypes, 'ctypes.zip')) 16 | import ctypes 17 | from cStringIO import StringIO 18 | import weakref 19 | from tidy.error import * 20 | 21 | # search the path for libtidy using the known names; try the package 22 | # directory too 23 | thelib=None 24 | os.environ['PATH'] = "%s%s%s" % (packagedir, os.pathsep, os.environ['PATH']) 25 | for libname in ('cygtidy-0-99-0', 'libtidy', 'libtidy.so', 26 | 'libtidy-0.99.so.0', 'tidylib'): 27 | try: 28 | thelib = getattr(ctypes.cdll, libname) 29 | break 30 | except OSError: 31 | pass 32 | if not thelib: 33 | raise OSError("Couldn't find libtidy, please make sure it is installed.") 34 | 35 | class Loader: 36 | """I am a trivial wrapper that eliminates the need for tidy.tidyFoo, 37 | so you can just access tidy.Foo 38 | """ 39 | def __init__(self): 40 | self.lib=thelib 41 | def __getattr__(self, name): 42 | try: 43 | return getattr(self.lib, "tidy%s" % name) 44 | # current ctypes uses ValueError, future will use AttributeError 45 | except (ValueError, AttributeError): 46 | return getattr(self.lib, name) 47 | 48 | _tidy=Loader() 49 | 50 | # define a callback to pass to Tidylib 51 | def _putByte(handle, c): 52 | """Lookup sink by handle and call its putByte method""" 53 | sinkfactory[handle].putByte(c) 54 | return 0 55 | 56 | PUTBYTEFUNC=ctypes.CFUNCTYPE(ctypes.c_int, ctypes.c_int, ctypes.c_char) 57 | putByte=PUTBYTEFUNC(_putByte) 58 | 59 | class _OutputSink(ctypes.Structure): 60 | _fields_=[("sinkData", ctypes.c_int), 61 | ("putByte", PUTBYTEFUNC), 62 | ] 63 | 64 | class _Sink: 65 | def __init__(self): 66 | self._data = StringIO() 67 | self.struct = _OutputSink() 68 | self.struct.putByte = putByte 69 | def putByte(self, c): 70 | self._data.write(c) 71 | def __str__(self): 72 | return self._data.getvalue() 73 | 74 | class ReportItem: 75 | def __init__(self, err): 76 | self.err = err 77 | if err.startswith('line'): 78 | tokens = err.split(' ',6) 79 | self.severity = tokens[5][0] # W or E 80 | self.line = int(tokens[1]) 81 | self.col = int(tokens[3]) 82 | self.message = tokens[6] 83 | else: 84 | tokens = err.split(' ',1) 85 | self.severity = tokens[0][0] 86 | self.message = tokens[1] 87 | self.line = None 88 | self.col = None 89 | # TODO - parse emacs mode 90 | def __str__(self): 91 | severities = dict(W='Warning', E='Error', C='Config') 92 | try: 93 | if self.line: 94 | return "line %d col %d - %s: %s" % (self.line, self.col, 95 | severities[self.severity], 96 | self.message) 97 | 98 | else: 99 | return "%s: %s" % (severities[self.severity], self.message) 100 | except KeyError: 101 | return self.err 102 | 103 | def __repr__(self): 104 | return "%s('%s')" % (self.__class__.__name__, 105 | str(self).replace("'", "\\'")) 106 | 107 | class FactoryDict(dict): 108 | """I am a dict with a create method and no __setitem__. This allows 109 | me to control my own keys. 110 | """ 111 | def create(self): 112 | """Subclasses should implement me to generate a new item""" 113 | def _setitem(self, name, value): 114 | dict.__setitem__(self, name, value) 115 | def __setitem__(self, name, value): 116 | raise TypeError, "Use create() to get a new object" 117 | 118 | 119 | class SinkFactory(FactoryDict): 120 | """Mapping for lookup of sinks by handle""" 121 | def __init__(self): 122 | FactoryDict.__init__(self) 123 | self.lastsink = 0 124 | def create(self): 125 | sink = _Sink() 126 | sink.struct.sinkData = self.lastsink 127 | FactoryDict._setitem(self, self.lastsink, sink) 128 | self.lastsink = self.lastsink+1 129 | return sink 130 | 131 | sinkfactory=SinkFactory() 132 | 133 | class _Document(object): 134 | def __init__(self): 135 | self.cdoc = _tidy.Create() 136 | self.errsink = sinkfactory.create() 137 | _tidy.SetErrorSink(self.cdoc, ctypes.byref(self.errsink.struct)) 138 | def write(self, stream): 139 | stream.write(str(self)) 140 | def get_errors(self): 141 | ret = [] 142 | for line in str(self.errsink).split('\n'): 143 | line = line.strip(' \n\r') 144 | if line: ret.append(ReportItem(line)) 145 | return ret 146 | errors=property(get_errors) 147 | def __str__(self): 148 | stlen = ctypes.c_int(8192) 149 | st = ctypes.c_buffer(stlen.value) 150 | rc = _tidy.SaveString(self.cdoc, st, ctypes.byref(stlen)) 151 | if rc==-12: # buffer too small 152 | st = ctypes.c_buffer(stlen.value) 153 | _tidy.SaveString(self.cdoc, st, ctypes.byref(stlen)) 154 | return st.value 155 | 156 | errors = {'missing or malformed argument for option: ': OptionArgError, 157 | 'unknown option: ': InvalidOptionError, 158 | } 159 | 160 | 161 | class DocumentFactory(FactoryDict): 162 | def _setOptions(self, doc, **options): 163 | for k in options.keys(): 164 | 165 | # this will flush out most argument type errors... 166 | if options[k] is None: options[k] = '' 167 | 168 | _tidy.OptParseValue(doc.cdoc, 169 | k.replace('_', '-'), 170 | str(options[k])) 171 | if doc.errors: 172 | match=filter(doc.errors[-1].message.startswith, errors.keys()) 173 | if match: 174 | raise errors[match[0]](doc.errors[-1].message) 175 | def load(self, doc, arg, loader): 176 | loader(doc.cdoc, arg) 177 | _tidy.CleanAndRepair(doc.cdoc) 178 | def loadFile(self, doc, filename): 179 | self.load(doc, filename, _tidy.ParseFile) 180 | def loadString(self, doc, st): 181 | self.load(doc, st, _tidy.ParseString) 182 | def _create(self, *args, **kwargs): 183 | doc = _Document() 184 | self._setOptions(doc, **kwargs) 185 | ref = weakref.ref(doc, self.releaseDoc) 186 | FactoryDict._setitem(self, ref, doc.cdoc) 187 | return doc 188 | def parse(self, filename, *args, **kwargs): 189 | """Open and process filename as an HTML file, returning a 190 | processed document object. 191 | @param kwargs: named options to pass to TidyLib for processing 192 | the input file. 193 | @param filename: the name of a file to process 194 | @return: a document object 195 | """ 196 | doc = self._create(**kwargs) 197 | self.loadFile(doc, filename) 198 | return doc 199 | def parseString(self, st, *args, **kwargs): 200 | """Use st as an HTML file, and process it, returning a 201 | document object. 202 | @param kwargs: named options to pass to TidyLib for processing 203 | the input file. 204 | @param st: the string to parse 205 | @return: a document object 206 | """ 207 | doc = self._create(**kwargs) 208 | self.loadString(doc, st) 209 | return doc 210 | def releaseDoc(self, ref): 211 | _tidy.Release(self[ref]) 212 | 213 | docfactory = DocumentFactory() 214 | parse = docfactory.parse 215 | parseString = docfactory.parseString 216 | -------------------------------------------------------------------------------- /path.py: -------------------------------------------------------------------------------- 1 | """ path.py - An object representing a path to a file or directory. 2 | 3 | Example: 4 | 5 | from path import path 6 | d = path('/home/guido/bin') 7 | for f in d.files('*.py'): 8 | f.chmod(0755) 9 | 10 | This module requires Python 2.2 or later. 11 | 12 | 13 | URL: http://www.jorendorff.com/articles/python/path 14 | Author: Jason Orendorff (and others - see the url!) 15 | Date: 23 Feb 2003 16 | """ 17 | 18 | 19 | # TODO 20 | # - Is __iter__ worth the trouble? It breaks the sequence 21 | # protocol and breaks compatibility with str/unicode. 22 | # - Perhaps support arguments to touch(). 23 | # - Note: __add__() technically has a bug, I think, where 24 | # it doesn't play nice with other types that implement 25 | # __radd__(). Test this. 26 | # - Better error message in listdir() when self isn't a 27 | # directory. (On Windows, the error message really sucks.) 28 | # - Make sure everything has a good docstring. 29 | 30 | from __future__ import generators 31 | 32 | import sys, os, fnmatch, glob, shutil, codecs 33 | 34 | __version__ = '1.2' 35 | __all__ = ['path'] 36 | 37 | # Pre-2.3 support. Are unicode filenames supported? 38 | _base = str 39 | try: 40 | if os.path.supports_unicode_filenames: 41 | _base = unicode 42 | except AttributeError: 43 | pass 44 | 45 | # Pre-2.3 workaround for basestring. 46 | try: 47 | basestring 48 | except NameError: 49 | basestring = (str, unicode) 50 | 51 | # Universal newline support 52 | _textmode = 'r' 53 | if hasattr(file, 'newlines'): 54 | _textmode = 'U' 55 | 56 | 57 | class path(_base): 58 | """ Represents a filesystem path. 59 | 60 | For documentation on individual methods, consult their 61 | counterparts in os.path. 62 | """ 63 | 64 | # --- Special Python methods. 65 | 66 | def __repr__(self): 67 | return 'path(%s)' % _base.__repr__(self) 68 | 69 | def __iter__(self): 70 | return iter(self.listdir()) 71 | 72 | # Adding a path and a string yields a path. 73 | def __add__(self, more): 74 | return path(_base(self) + more) 75 | 76 | def __radd__(self, other): 77 | return path(other + _base(self)) 78 | 79 | # The / operator joins paths. 80 | def __div__(self, rel): 81 | """ fp.__div__(rel) == fp / rel == fp.joinpath(rel) 82 | 83 | Join two path components, adding a separator character if 84 | needed. 85 | """ 86 | return path(os.path.join(self, rel)) 87 | 88 | # Make the / operator work even when true division is enabled. 89 | __truediv__ = __div__ 90 | 91 | def getcwd(): 92 | """ Return the current working directory as a path object. """ 93 | return path(os.getcwd()) 94 | getcwd = staticmethod(getcwd) 95 | 96 | 97 | # --- Operations on path strings. 98 | 99 | def abspath(self): return path(os.path.abspath(self)) 100 | def normcase(self): return path(os.path.normcase(self)) 101 | def normpath(self): return path(os.path.normpath(self)) 102 | def realpath(self): return path(os.path.realpath(self)) 103 | def expanduser(self): return path(os.path.expanduser(self)) 104 | def expandvars(self): return path(os.path.expandvars(self)) 105 | def dirname(self): return path(os.path.dirname(self)) 106 | basename = os.path.basename 107 | 108 | def expand(self): 109 | """ Clean up a filename by calling expandvars(), 110 | expanduser(), and normpath() on it. 111 | 112 | This is commonly everything needed to clean up a filename 113 | read from a configuration file, for example. 114 | """ 115 | return self.expandvars().expanduser().normpath() 116 | 117 | 118 | def _get_ext(self): 119 | f, ext = os.path.splitext(_base(self)) 120 | return ext 121 | 122 | def _get_drive(self): 123 | drive, r = os.path.splitdrive(self) 124 | return path(drive) 125 | 126 | parent = property(dirname) 127 | name = property(basename) 128 | ext = property( 129 | _get_ext, None, None, 130 | """ The file extension, for example '.py'. """) 131 | drive = property( 132 | _get_drive, None, None, 133 | """ The drive specifier, for example 'C:'. 134 | This is always empty on systems that don't use drive specifiers. """) 135 | 136 | def splitpath(self): 137 | """ p.splitpath() -> Return (p.parent, p.name). """ 138 | parent, child = os.path.split(self) 139 | return path(parent), child 140 | 141 | def splitdrive(self): 142 | drive, rel = os.path.splitdrive(self) 143 | return path(drive), rel 144 | 145 | def splitext(self): 146 | # Cast to plain string using _base because Python 2.2 147 | # implementations of os.path.splitext use "for c in path:..." 148 | # which means something different when applied to a path 149 | # object. 150 | filename, ext = os.path.splitext(_base(self)) 151 | return path(filename), ext 152 | 153 | if hasattr(os.path, 'splitunc'): 154 | def splitunc(self): 155 | unc, rest = os.path.splitunc(self) 156 | return path(unc), rest 157 | 158 | def _get_uncshare(self): 159 | unc, r = os.path.splitunc(self) 160 | return path(unc) 161 | 162 | uncshare = property( 163 | _get_uncshare, None, None, 164 | """ The UNC mount point for this path. 165 | This is empty for paths on local drives. """) 166 | 167 | def joinpath(self, *args): 168 | """ Join two or more path components, adding a separator 169 | character (os.sep) if needed. Returns a new path 170 | object. 171 | """ 172 | return path(os.path.join(self, *args)) 173 | 174 | def splitall(self): 175 | """ Return a list of the path components in this path. 176 | 177 | The first item in the list will be a path. Its value will be 178 | either os.curdir, os.pardir, empty, or the root directory of 179 | this path (for example, '/' or 'C:\\'). The other items in 180 | the list will be strings. 181 | 182 | path.path.joinpath(*result) will yield the original path. 183 | """ 184 | parts = [] 185 | loc = self 186 | while loc != os.curdir and loc != os.pardir: 187 | prev = loc 188 | loc, child = prev.splitpath() 189 | if loc == prev: 190 | break 191 | parts.append(child) 192 | parts.append(loc) 193 | parts.reverse() 194 | return parts 195 | 196 | def relpath(self): 197 | """ Return this path as a relative path, 198 | based from the current working directory. 199 | """ 200 | cwd = path(os.getcwd()) 201 | return cwd.relpathto(self) 202 | 203 | def relpathto(self, dest): 204 | """ Return a relative path from self to dest. 205 | 206 | If there is no relative path from self to dest, for example if 207 | they reside on different drives in Windows, then this returns 208 | dest.abspath(). 209 | """ 210 | origin = self.abspath() 211 | dest = path(dest).abspath() 212 | 213 | orig_list = origin.normcase().splitall() 214 | # Don't normcase dest! We want to preserve the case. 215 | dest_list = dest.splitall() 216 | 217 | if orig_list[0] != os.path.normcase(dest_list[0]): 218 | # Can't get here from there. 219 | return dest 220 | 221 | # Find the location where the two paths start to differ. 222 | i = 0 223 | for start_seg, dest_seg in zip(orig_list, dest_list): 224 | if start_seg != os.path.normcase(dest_seg): 225 | break 226 | i += 1 227 | 228 | # Now i is the point where the two paths diverge. 229 | # Need a certain number of "os.pardir"s to work up 230 | # from the origin to the point of divergence. 231 | segments = [os.pardir] * (len(orig_list) - i) 232 | # Need to add the diverging part of dest_list. 233 | segments += dest_list[i:] 234 | if len(segments) == 0: 235 | # If they happen to be identical, use os.curdir. 236 | return path(os.curdir) 237 | else: 238 | return path(os.path.join(*segments)) 239 | 240 | 241 | # --- Listing, searching, walking, and matching 242 | 243 | def listdir(self, pattern=None): 244 | """ D.listdir() -> List of items in this directory. 245 | 246 | Use D.files() or D.dirs() instead if you want a listing 247 | of just files or just subdirectories. 248 | 249 | The elements of the list are path objects. 250 | 251 | With the optional 'pattern' argument, this only lists 252 | items whose names match the given pattern. 253 | """ 254 | names = os.listdir(self) 255 | if pattern is not None: 256 | names = fnmatch.filter(names, pattern) 257 | return [self / child for child in names] 258 | 259 | def dirs(self, pattern=None): 260 | """ D.dirs() -> List of this directory's subdirectories. 261 | 262 | The elements of the list are path objects. 263 | This does not walk recursively into subdirectories 264 | (but see path.walkdirs). 265 | 266 | With the optional 'pattern' argument, this only lists 267 | directories whose names match the given pattern. For 268 | example, d.dirs('build-*'). 269 | """ 270 | return [p for p in self.listdir(pattern) if p.isdir()] 271 | 272 | def files(self, pattern=None): 273 | """ D.files() -> List of the files in this directory. 274 | 275 | The elements of the list are path objects. 276 | This does not walk into subdirectories (see path.walkfiles). 277 | 278 | With the optional 'pattern' argument, this only lists files 279 | whose names match the given pattern. For example, 280 | d.files('*.pyc'). 281 | """ 282 | 283 | return [p for p in self.listdir(pattern) if p.isfile()] 284 | 285 | def walk(self, pattern=None): 286 | """ D.walk() -> iterator over files and subdirs, recursively. 287 | 288 | The iterator yields path objects naming each child item of 289 | this directory and its descendants. This requires that 290 | D.isdir(). 291 | 292 | This performs a depth-first traversal of the directory tree. 293 | Each directory is returned just before all its children. 294 | """ 295 | for child in self: 296 | if pattern is None or child.fnmatch(pattern): 297 | yield child 298 | if child.isdir(): 299 | for item in child.walk(pattern): 300 | yield item 301 | 302 | def walkdirs(self, pattern=None): 303 | """ D.walkdirs() -> iterator over subdirs, recursively. 304 | 305 | With the optional 'pattern' argument, this yields only 306 | directories whose names match the given pattern. For 307 | example, mydir.walkdirs('*test') yields only directories 308 | with names ending in 'test'. 309 | """ 310 | for child in self: 311 | if child.isdir(): 312 | if pattern is None or child.fnmatch(pattern): 313 | yield child 314 | for subsubdir in child.walkdirs(pattern): 315 | yield subsubdir 316 | 317 | def walkfiles(self, pattern=None): 318 | """ D.walkfiles() -> iterator over files in D, recursively. 319 | 320 | The optional argument, pattern, limits the results to files 321 | with names that match the pattern. For example, 322 | mydir.walkfiles('*.tmp') yields only files with the .tmp 323 | extension. 324 | """ 325 | for child in self: 326 | if child.isfile(): 327 | if pattern is None or child.fnmatch(pattern): 328 | yield child 329 | elif child.isdir(): 330 | for f in child.walkfiles(pattern): 331 | yield f 332 | 333 | def fnmatch(self, pattern): 334 | """ Return True if self.name matches the given pattern. 335 | 336 | pattern - A filename pattern with wildcards, 337 | for example '*.py'. 338 | """ 339 | return fnmatch.fnmatch(self.name, pattern) 340 | 341 | def glob(self, pattern): 342 | """ Return a list of path objects that match the pattern. 343 | 344 | pattern - a path relative to this directory, with wildcards. 345 | 346 | For example, path('/users').glob('*/bin/*') returns a list 347 | of all the files users have in their bin directories. 348 | """ 349 | return map(path, glob.glob(_base(self / pattern))) 350 | 351 | 352 | # --- Reading an entire file at once. 353 | 354 | def bytes(self): 355 | """ Open this file, read all bytes, return them as a string. """ 356 | f = file(self, 'rb') 357 | try: 358 | return f.read() 359 | finally: 360 | f.close() 361 | 362 | def text(self, encoding=None, errors='strict'): 363 | """ Open this file, read it in, return the content as a string. 364 | 365 | This uses 'U' mode in Python 2.3 and later, so '\r\n' and '\r' 366 | are automatically translated to '\n'. 367 | 368 | Optional arguments: 369 | 370 | encoding - The Unicode encoding (or character set) of 371 | the file. If present, the content of the file is 372 | decoded and returned as a unicode object; otherwise 373 | it is returned as an 8-bit str. 374 | errors - How to handle Unicode errors; see help(str.decode) 375 | for the options. Default is 'strict'. 376 | """ 377 | if encoding is None: 378 | # 8-bit 379 | f = file(self, _textmode) 380 | try: 381 | return f.read() 382 | finally: 383 | f.close() 384 | else: 385 | # Unicode 386 | f = codecs.open(self, 'r', encoding, errors) 387 | # (Note - Can't use 'U' mode here, since codecs.open 388 | # doesn't support 'U' mode, even in Python 2.3.) 389 | try: 390 | t = f.read() 391 | finally: 392 | f.close() 393 | return t.replace(u'\r\n', u'\n').replace(u'\r', u'\n') 394 | 395 | def lines(self, encoding=None, errors='strict', retain=True): 396 | """ Open this file, read all lines, return them in a list. 397 | 398 | Optional arguments: 399 | encoding - The Unicode encoding (or character set) of 400 | the file. The default is None, meaning the content 401 | of the file is read as 8-bit characters and returned 402 | as a list of (non-Unicode) str objects. 403 | errors - How to handle Unicode errors; see help(str.decode) 404 | for the options. Default is 'strict' 405 | retain - If true, retain newline characters; but all newline 406 | character combinations ('\r', '\n', '\r\n') are 407 | translated to '\n'. If false, newline characters are 408 | stripped off. Default is True. 409 | 410 | This uses 'U' mode in Python 2.3 and later. 411 | """ 412 | if encoding is None and retain: 413 | f = file(self, _textmode) 414 | try: 415 | return f.readlines() 416 | finally: 417 | f.close() 418 | else: 419 | return self.text(encoding, errors).splitlines(retain) 420 | 421 | 422 | # --- Methods for querying the filesystem. 423 | 424 | exists = os.path.exists 425 | isabs = os.path.isabs 426 | isdir = os.path.isdir 427 | isfile = os.path.isfile 428 | islink = os.path.islink 429 | ismount = os.path.ismount 430 | 431 | if hasattr(os.path, 'samefile'): 432 | samefile = os.path.samefile 433 | 434 | getatime = os.path.getatime 435 | atime = property( 436 | getatime, None, None, 437 | """ Last access time of the file. """) 438 | 439 | getmtime = os.path.getmtime 440 | mtime = property( 441 | getmtime, None, None, 442 | """ Last-modified time of the file. """) 443 | 444 | if hasattr(os.path, 'getctime'): 445 | getctime = os.path.getctime 446 | ctime = property( 447 | getctime, None, None, 448 | """ Creation time of the file. """) 449 | 450 | getsize = os.path.getsize 451 | size = property( 452 | getsize, None, None, 453 | """ Size of the file, in bytes. """) 454 | 455 | if hasattr(os, 'access'): 456 | def access(self, mode): 457 | """ Return true if current user has access to this path. 458 | 459 | mode - One of the constants os.F_OK, os.R_OK, os.W_OK, os.X_OK 460 | """ 461 | return os.access(self, mode) 462 | 463 | def stat(self): 464 | """ Perform a stat() system call on this path. """ 465 | return os.stat(self) 466 | 467 | def lstat(self): 468 | """ Like path.stat(), but do not follow symbolic links. """ 469 | return os.lstat(self) 470 | 471 | if hasattr(os, 'statvfs'): 472 | def statvfs(self): 473 | """ Perform a statvfs() system call on this path. """ 474 | return os.statvfs(self) 475 | 476 | if hasattr(os, 'pathconf'): 477 | def pathconf(self, name): 478 | return os.pathconf(self, name) 479 | 480 | 481 | # --- Modifying operations on files and directories 482 | 483 | def utime(self, times): 484 | """ Set the access and modified times of this file. """ 485 | os.utime(self, times) 486 | 487 | def chmod(self, mode): 488 | os.chmod(self, mode) 489 | 490 | if hasattr(os, 'chown'): 491 | def chown(self, uid, gid): 492 | os.chown(self, uid, gid) 493 | 494 | def rename(self, new): 495 | os.rename(self, new) 496 | 497 | def renames(self, new): 498 | os.renames(self, new) 499 | 500 | 501 | # --- Create/delete operations on directories 502 | 503 | def mkdir(self, mode=0777): 504 | os.mkdir(self, mode) 505 | 506 | def makedirs(self, mode=0777): 507 | os.makedirs(self, mode) 508 | 509 | def rmdir(self): 510 | os.rmdir(self) 511 | 512 | def removedirs(self): 513 | os.removedirs(self) 514 | 515 | 516 | # --- Modifying operations on files 517 | 518 | def touch(self): 519 | """ Set the access/modified times of this file to the current time. 520 | Create the file if it does not exist. 521 | """ 522 | fd = os.open(self, os.O_WRONLY | os.O_CREAT, 0666) 523 | os.close(fd) 524 | os.utime(self, None) 525 | 526 | def remove(self): 527 | os.remove(self) 528 | 529 | def unlink(self): 530 | os.unlink(self) 531 | 532 | 533 | # --- Links 534 | 535 | if hasattr(os, 'link'): 536 | def link(self, newpath): 537 | """ Create a hard link at 'newpath', pointing to this file. """ 538 | os.link(self, newpath) 539 | 540 | if hasattr(os, 'symlink'): 541 | def symlink(self, newlink): 542 | """ Create a symbolic link at 'newlink', pointing here. """ 543 | os.symlink(self, newlink) 544 | 545 | if hasattr(os, 'readlink'): 546 | def readlink(self): 547 | """ Return the path to which this symbolic link points. 548 | 549 | The result may be an absolute or a relative path. 550 | """ 551 | return path(os.readlink(self)) 552 | 553 | def readlinkabs(self): 554 | """ Return the path to which this symbolic link points. 555 | 556 | The result is always an absolute path. 557 | """ 558 | p = self.readlink() 559 | if p.isabs(): 560 | return p 561 | else: 562 | return (self.parent / p).abspath() 563 | 564 | 565 | # --- High-level functions from shutil 566 | 567 | copyfile = shutil.copyfile 568 | copymode = shutil.copymode 569 | copystat = shutil.copystat 570 | copy = shutil.copy 571 | copy2 = shutil.copy2 572 | copytree = shutil.copytree 573 | if hasattr(shutil, 'move'): 574 | move = shutil.move 575 | rmtree = shutil.rmtree 576 | 577 | 578 | # --- Special stuff from os 579 | 580 | if hasattr(os, 'chroot'): 581 | def chroot(self): 582 | os.chroot(self) 583 | 584 | if hasattr(os, 'startfile'): 585 | def startfile(self): 586 | os.startfile(self) 587 | 588 | --------------------------------------------------------------------------------