├── elementtidy-1.0-20050212 ├── elementtidy │ ├── __init__.py │ ├── __init__.pyc │ ├── TidyHTMLTreeBuilder.pyc │ └── TidyHTMLTreeBuilder.py ├── selftest.py ├── PKG-INFO ├── tidylib │ ├── src │ │ ├── iconvtc.h │ │ ├── charsets.h │ │ ├── win32tc.h │ │ ├── entities.h │ │ ├── forward.h │ │ ├── utf8.h │ │ ├── alloc.c │ │ ├── tmbstr.h │ │ ├── fileio.c │ │ ├── parser.h │ │ ├── clean.h │ │ ├── pprint.h │ │ ├── attrdict.h │ │ ├── buffio.c │ │ ├── tidy-int.h │ │ ├── attrask.c │ │ ├── streamio.h │ │ ├── tmbstr.c │ │ ├── config.h │ │ ├── attrget.c │ │ ├── message.h │ │ ├── istack.c │ │ ├── tagask.c │ │ ├── tags.h │ │ ├── access.h │ │ ├── entities.c │ │ ├── utf8.c │ │ └── lexer.h │ └── include │ │ ├── fileio.h │ │ ├── buffio.h │ │ └── platform.h ├── setup.py ├── _elementtidy.c └── README ├── TODO ├── README └── juniperncprompt.py /elementtidy-1.0-20050212/elementtidy/__init__.py: -------------------------------------------------------------------------------- 1 | # $Id: __init__.py 1764 2004-03-29 07:07:36Z fredrik $ 2 | # package marker 3 | -------------------------------------------------------------------------------- /elementtidy-1.0-20050212/selftest.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/crimsonknave/juniperncprompt/HEAD/elementtidy-1.0-20050212/selftest.py -------------------------------------------------------------------------------- /elementtidy-1.0-20050212/elementtidy/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/crimsonknave/juniperncprompt/HEAD/elementtidy-1.0-20050212/elementtidy/__init__.pyc -------------------------------------------------------------------------------- /elementtidy-1.0-20050212/elementtidy/TidyHTMLTreeBuilder.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/crimsonknave/juniperncprompt/HEAD/elementtidy-1.0-20050212/elementtidy/TidyHTMLTreeBuilder.pyc -------------------------------------------------------------------------------- /TODO: -------------------------------------------------------------------------------- 1 | In no particular order 2 | 3 | Logging? 4 | Update README with troubleshooting? (ncLinux*.jar, resolv.conf not moving /tmp partition) 5 | Allow the password fields to have printed names 6 | -------------------------------------------------------------------------------- /elementtidy-1.0-20050212/PKG-INFO: -------------------------------------------------------------------------------- 1 | Metadata-Version: 1.0 2 | Name: elementtidy 3 | Version: 1.0-20050212 4 | Summary: ElementTidy - a tidylib interface for ElementTree 5 | Home-page: http://effbot.org/zone/element-tidylib.htm 6 | Author: Fredrik Lundh 7 | Author-email: fredrik@pythonware.com 8 | License: UNKNOWN 9 | Description: UNKNOWN 10 | Platform: UNKNOWN 11 | -------------------------------------------------------------------------------- /elementtidy-1.0-20050212/tidylib/src/iconvtc.h: -------------------------------------------------------------------------------- 1 | #ifndef __ICONVTC_H__ 2 | #define __ICONVTC_H__ 3 | #ifdef TIDY_ICONV_SUPPORT 4 | 5 | /* iconvtc.h -- Interface to iconv transcoding routines 6 | 7 | (c) 1998-2003 (W3C) MIT, ERCIM, Keio University 8 | See tidy.h for the copyright notice. 9 | 10 | $Id$ 11 | */ 12 | 13 | 14 | #endif /* TIDY_ICONV_SUPPORT */ 15 | #endif /* __ICONVTC_H__ */ 16 | -------------------------------------------------------------------------------- /elementtidy-1.0-20050212/tidylib/src/charsets.h: -------------------------------------------------------------------------------- 1 | /* charsets.h -- character set information and mappings 2 | 3 | (c) 1998-2003 (W3C) MIT, ERCIM, Keio University 4 | See tidy.h for the copyright notice. 
5 | 6 | $Id$ 7 | */ 8 | 9 | uint GetEncodingIdFromName(ctmbstr name); 10 | uint GetEncodingIdFromCodePage(uint cp); 11 | uint GetEncodingCodePageFromName(ctmbstr name); 12 | uint GetEncodingCodePageFromId(uint id); 13 | ctmbstr GetEncodingNameFromId(uint id); 14 | ctmbstr GetEncodingNameFromCodePage(uint cp); 15 | -------------------------------------------------------------------------------- /elementtidy-1.0-20050212/tidylib/src/win32tc.h: -------------------------------------------------------------------------------- 1 | #ifndef __WIN32TC_H__ 2 | #define __WIN32TC_H__ 3 | #ifdef TIDY_WIN32_MLANG_SUPPORT 4 | 5 | /* win32tc.h -- Interface to Win32 transcoding routines 6 | 7 | (c) 1998-2003 (W3C) MIT, ERCIM, Keio University 8 | See tidy.h for the copyright notice. 9 | 10 | $Id$ 11 | */ 12 | 13 | uint Win32MLangGetCPFromName(ctmbstr encoding); 14 | Bool Win32MLangInitInputTranscoder(StreamIn * in, uint wincp); 15 | void Win32MLangUninitInputTranscoder(StreamIn * in); 16 | int Win32MLangGetChar(byte firstByte, StreamIn * in, uint * bytesRead); 17 | 18 | #endif /* TIDY_WIN32_MLANG_SUPPORT */ 19 | #endif /* __WIN32TC_H__ */ 20 | -------------------------------------------------------------------------------- /elementtidy-1.0-20050212/tidylib/src/entities.h: -------------------------------------------------------------------------------- 1 | #ifndef __ENTITIES_H__ 2 | #define __ENTITIES_H__ 3 | 4 | /* entities.h -- recognize character entities 5 | 6 | (c) 1998-2003 (W3C) MIT, ERCIM, Keio University 7 | See tidy.h for the copyright notice. 
8 | 9 | CVS Info : 10 | 11 | $LastChangedBy$ 12 | $LastChangedDate$ 13 | $LastChangedRevision$ 14 | 15 | */ 16 | 17 | #include "forward.h" 18 | 19 | /* entity starting with "&" returns zero on error */ 20 | uint EntityCode( ctmbstr name, uint versions ); 21 | ctmbstr EntityName( uint charCode, uint versions ); 22 | Bool EntityInfo( ctmbstr name, Bool isXml, uint* code, uint* versions ); 23 | 24 | #endif /* __ENTITIES_H__ */ 25 | -------------------------------------------------------------------------------- /elementtidy-1.0-20050212/tidylib/include/fileio.h: -------------------------------------------------------------------------------- 1 | #ifndef __FILEIO_H__ 2 | #define __FILEIO_H__ 3 | 4 | /** @file fileio.h - does standard C I/O 5 | 6 | Implementation of a FILE* based TidyInputSource and 7 | TidyOutputSink. 8 | 9 | (c) 1998-2003 (W3C) MIT, ERCIM, Keio University 10 | See tidy.h for the copyright notice. 11 | 12 | CVS Info: 13 | $LastChangedBy$ 14 | $LastChangedDate$ 15 | $LastChangedRevision$ 16 | */ 17 | 18 | #include "buffio.h" 19 | #ifdef __cplusplus 20 | extern "C" { 21 | #endif 22 | 23 | /** Allocate and initialize file input source */ 24 | void initFileSource( TidyInputSource* source, FILE* fp ); 25 | 26 | /** Free file input source */ 27 | void freeFileSource( TidyInputSource* source, Bool closeIt ); 28 | 29 | /** Initialize file output sink */ 30 | void initFileSink( TidyOutputSink* sink, FILE* fp ); 31 | 32 | /* Needed for internal declarations */ 33 | void filesink_putByte( ulong sinkData, byte bv ); 34 | 35 | #ifdef __cplusplus 36 | } 37 | #endif 38 | #endif /* __FILEIO_H__ */ 39 | -------------------------------------------------------------------------------- /elementtidy-1.0-20050212/tidylib/src/forward.h: -------------------------------------------------------------------------------- 1 | #ifndef __FORWARD_H__ 2 | #define __FORWARD_H__ 3 | 4 | /* forward.h -- Forward declarations for major Tidy structures 5 | 6 | (c) 1998-2003 (W3C) MIT, 
ERCIM, Keio University 7 | See tidy.h for the copyright notice. 8 | 9 | CVS Info : 10 | 11 | $LastChangedBy$ 12 | $LastChangedDate$ 13 | $LastChangedRevision$ 14 | 15 | Avoids many include file circular dependencies. 16 | 17 | Try to keep this file down to the minimum to avoid 18 | cross-talk between modules. 19 | 20 | Header files include this file. C files include tidy-int.h. 21 | 22 | */ 23 | 24 | #include "platform.h" 25 | #include "tidy.h" 26 | 27 | struct _StreamIn; 28 | typedef struct _StreamIn StreamIn; 29 | 30 | struct _StreamOut; 31 | typedef struct _StreamOut StreamOut; 32 | 33 | struct _TidyDocImpl; 34 | typedef struct _TidyDocImpl TidyDocImpl; 35 | 36 | 37 | struct _Dict; 38 | typedef struct _Dict Dict; 39 | 40 | struct _Attribute; 41 | typedef struct _Attribute Attribute; 42 | 43 | struct _AttVal; 44 | typedef struct _AttVal AttVal; 45 | 46 | struct _Node; 47 | typedef struct _Node Node; 48 | 49 | struct _IStack; 50 | typedef struct _IStack IStack; 51 | 52 | struct _Lexer; 53 | typedef struct _Lexer Lexer; 54 | 55 | 56 | 57 | #endif /* __FORWARD_H__ */ 58 | -------------------------------------------------------------------------------- /elementtidy-1.0-20050212/tidylib/src/utf8.h: -------------------------------------------------------------------------------- 1 | #ifndef __UTF8_H__ 2 | #define __UTF8_H__ 3 | 4 | /* utf8.h -- convert characters to/from UTF-8 5 | 6 | (c) 1998-2003 (W3C) MIT, ERCIM, Keio University 7 | See tidy.h for the copyright notice. 8 | 9 | CVS Info : 10 | 11 | $LastChangedBy$ 12 | $LastChangedDate$ 13 | $LastChangedRevision$ 14 | 15 | */ 16 | 17 | #include "platform.h" 18 | #include "buffio.h" 19 | 20 | /* UTF-8 encoding/decoding support 21 | ** Does not convert character "codepoints", i.e. to/from 10646. 
22 | */ 23 | 24 | int DecodeUTF8BytesToChar( uint* c, uint firstByte, tmbstr successorBytes, 25 | TidyInputSource* inp, int* count ); 26 | 27 | int EncodeCharToUTF8Bytes( uint c, tmbstr encodebuf, 28 | TidyOutputSink* outp, int* count ); 29 | 30 | 31 | uint GetUTF8( tmbstr str, uint *ch ); 32 | tmbstr PutUTF8( tmbstr buf, uint c ); 33 | 34 | #define UNICODE_BOM_BE 0xFEFF /* big-endian (default) UNICODE BOM */ 35 | #define UNICODE_BOM UNICODE_BOM_BE 36 | #define UNICODE_BOM_LE 0xFFFE /* little-endian UNICODE BOM */ 37 | #define UNICODE_BOM_UTF8 0xEFBBBF /* UTF-8 UNICODE BOM */ 38 | 39 | 40 | Bool IsValidUTF16FromUCS4( tchar ucs4 ); 41 | Bool IsHighSurrogate( tchar ch ); 42 | Bool IsLowSurrogate( tchar ch ); 43 | 44 | Bool IsCombinedChar( tchar ch ); 45 | Bool IsValidCombinedChar( tchar ch ); 46 | 47 | tchar CombineSurrogatePair( tchar high, tchar low ); 48 | Bool SplitSurrogatePair( tchar utf16, tchar* high, tchar* low ); 49 | 50 | 51 | 52 | #endif /* __UTF8_H__ */ 53 | -------------------------------------------------------------------------------- /elementtidy-1.0-20050212/setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # 3 | # Setup script for the elementtidy library 4 | # $Id: setup.py 2275 2005-02-03 18:20:56Z fredrik $ 5 | # 6 | # Usage: python setup.py install 7 | # 8 | 9 | from distutils.core import setup, Extension 10 | 11 | NAME = "elementtidy" 12 | VERSION = "1.0-20050212" 13 | 14 | TIDYFILES = [ 15 | "tidylib/src/access.c", 16 | "tidylib/src/alloc.c", 17 | "tidylib/src/attrask.c", 18 | "tidylib/src/attrdict.c", 19 | "tidylib/src/attrget.c", 20 | "tidylib/src/attrs.c", 21 | "tidylib/src/buffio.c", 22 | "tidylib/src/clean.c", 23 | "tidylib/src/config.c", 24 | "tidylib/src/entities.c", 25 | "tidylib/src/fileio.c", 26 | "tidylib/src/istack.c", 27 | "tidylib/src/lexer.c", 28 | "tidylib/src/localize.c", 29 | "tidylib/src/parser.c", 30 | "tidylib/src/pprint.c", 31 | 
"tidylib/src/streamio.c", 32 | "tidylib/src/tagask.c", 33 | "tidylib/src/tags.c", 34 | "tidylib/src/tidylib.c", 35 | "tidylib/src/tmbstr.c", 36 | "tidylib/src/utf8.c", 37 | ] 38 | 39 | setup( 40 | name=NAME, 41 | version=VERSION, 42 | author="Fredrik Lundh", 43 | author_email="fredrik@pythonware.com", 44 | description="ElementTidy - a tidylib interface for ElementTree", 45 | url="http://effbot.org/zone/element-tidylib.htm", 46 | packages=["elementtidy"], 47 | ext_modules = [ 48 | Extension( 49 | "_elementtidy", 50 | ["_elementtidy.c"] + TIDYFILES, 51 | define_macros=[("NDEBUG", None)], 52 | include_dirs=["tidylib/include"], 53 | ) 54 | ] 55 | ) 56 | -------------------------------------------------------------------------------- /elementtidy-1.0-20050212/tidylib/src/alloc.c: -------------------------------------------------------------------------------- 1 | /* alloc.c -- Default memory allocation routines. 2 | 3 | (c) 1998-2003 (W3C) MIT, ERCIM, Keio University 4 | See tidy.h for the copyright notice. 
5 | 6 | CVS Info : 7 | 8 | $LastChangedBy$ 9 | $LastChangedDate$ 10 | $LastChangedRevision$ 11 | 12 | */ 13 | 14 | #include "tidy.h" 15 | 16 | static TidyMalloc g_malloc = NULL; 17 | static TidyRealloc g_realloc = NULL; 18 | static TidyFree g_free = NULL; 19 | static TidyPanic g_panic = NULL; 20 | 21 | Bool tidySetMallocCall( TidyMalloc fmalloc ) 22 | { 23 | g_malloc = fmalloc; 24 | return yes; 25 | } 26 | Bool tidySetReallocCall( TidyRealloc frealloc ) 27 | { 28 | g_realloc = frealloc; 29 | return yes; 30 | } 31 | Bool tidySetFreeCall( TidyFree ffree ) 32 | { 33 | g_free = ffree; 34 | return yes; 35 | } 36 | Bool tidySetPanicCall( TidyPanic fpanic ) 37 | { 38 | g_panic = fpanic; 39 | return yes; 40 | } 41 | 42 | void FatalError( ctmbstr msg ) 43 | { 44 | if ( g_panic ) 45 | g_panic( msg ); 46 | else 47 | { 48 | /* 2 signifies a serious error */ 49 | fprintf( stderr, "Fatal error: %s\n", msg ); 50 | exit(2); 51 | } 52 | } 53 | 54 | void* MemAlloc( size_t size ) 55 | { 56 | void *p = ( g_malloc ? g_malloc(size) : malloc(size) ); 57 | if ( !p ) 58 | FatalError("Out of memory!"); 59 | return p; 60 | } 61 | 62 | void* MemRealloc( void* mem, size_t newsize ) 63 | { 64 | void *p; 65 | if ( mem == NULL ) 66 | return MemAlloc( newsize ); 67 | 68 | p = ( g_realloc ? 
g_realloc(mem, newsize) : realloc(mem, newsize) ); 69 | if (!p) 70 | FatalError("Out of memory!"); 71 | return p; 72 | } 73 | 74 | void MemFree( void* mem ) 75 | { 76 | if ( mem ) 77 | { 78 | if ( g_free ) 79 | g_free( mem ); 80 | else 81 | free( mem ); 82 | } 83 | } 84 | 85 | void ClearMemory( void *mem, size_t size ) 86 | { 87 | memset(mem, 0, size); 88 | } 89 | 90 | -------------------------------------------------------------------------------- /elementtidy-1.0-20050212/tidylib/src/tmbstr.h: -------------------------------------------------------------------------------- 1 | #ifndef __TMBSTR_H__ 2 | #define __TMBSTR_H__ 3 | 4 | /* tmbstr.h - Tidy string utility functions 5 | 6 | (c) 1998-2003 (W3C) MIT, ERCIM, Keio University 7 | See tidy.h for the copyright notice. 8 | 9 | CVS Info : 10 | 11 | $LastChangedBy$ 12 | $LastChangedDate$ 13 | $LastChangedRevision$ 14 | 15 | */ 16 | 17 | #include "platform.h" 18 | 19 | #ifdef __cplusplus 20 | extern "C" 21 | { 22 | #endif 23 | 24 | /* like strdup but using MemAlloc */ 25 | tmbstr tmbstrdup( ctmbstr str ); 26 | 27 | /* like strndup but using MemAlloc */ 28 | tmbstr tmbstrndup( ctmbstr str, uint len); 29 | 30 | /* exactly same as strncpy */ 31 | uint tmbstrncpy( tmbstr s1, ctmbstr s2, uint size ); 32 | 33 | uint tmbstrcpy( tmbstr s1, ctmbstr s2 ); 34 | 35 | uint tmbstrcat( tmbstr s1, ctmbstr s2 ); 36 | 37 | /* exactly same as strcmp */ 38 | int tmbstrcmp( ctmbstr s1, ctmbstr s2 ); 39 | 40 | /* returns byte count, not char count */ 41 | uint tmbstrlen( ctmbstr str ); 42 | 43 | /* 44 | MS C 4.2 doesn't include strcasecmp. 45 | Note that tolower and toupper won't 46 | work on chars > 127. 47 | 48 | Neither do Lexer.ToLower() or Lexer.ToUpper()! 49 | 50 | We get away with this because, except for XML tags, 51 | we are always comparing to ascii element and 52 | attribute names defined by HTML specs. 
53 | */ 54 | int tmbstrcasecmp( ctmbstr s1, ctmbstr s2 ); 55 | 56 | int tmbstrncmp( ctmbstr s1, ctmbstr s2, uint n ); 57 | 58 | int tmbstrncasecmp( ctmbstr s1, ctmbstr s2, uint n ); 59 | 60 | /* return offset of cc from beginning of s1, 61 | ** -1 if not found. 62 | */ 63 | int tmbstrnchr( ctmbstr s1, uint len1, tmbchar cc ); 64 | 65 | ctmbstr tmbsubstrn( ctmbstr s1, uint len1, ctmbstr s2 ); 66 | ctmbstr tmbsubstrncase( ctmbstr s1, uint len1, ctmbstr s2 ); 67 | ctmbstr tmbsubstr( ctmbstr s1, ctmbstr s2 ); 68 | 69 | /* transform string to lower case */ 70 | tmbstr tmbstrtolower( tmbstr s ); 71 | 72 | /* Transform ASCII chars in string to upper case */ 73 | tmbstr tmbstrtoupper(tmbstr s); 74 | 75 | Bool tmbsamefile( ctmbstr filename1, ctmbstr filename2 ); 76 | 77 | #ifdef __cplusplus 78 | } /* extern "C" */ 79 | #endif 80 | 81 | #endif /* __TMBSTR_H__ */ 82 | -------------------------------------------------------------------------------- /elementtidy-1.0-20050212/tidylib/src/fileio.c: -------------------------------------------------------------------------------- 1 | /* fileio.c -- does standard I/O 2 | 3 | (c) 1998-2003 (W3C) MIT, ERCIM, Keio University 4 | See tidy.h for the copyright notice. 5 | 6 | CVS Info : 7 | 8 | $LastChangedBy$ 9 | $LastChangedDate$ 10 | $LastChangedRevision$ 11 | 12 | Default implementations of Tidy input sources 13 | and output sinks based on standard C FILE*. 
14 | 15 | */ 16 | 17 | #include 18 | 19 | #include "fileio.h" 20 | #include "tidy.h" 21 | 22 | 23 | typedef struct _fp_input_source 24 | { 25 | FILE* fp; 26 | TidyBuffer unget; 27 | } FileSource; 28 | 29 | int filesrc_getByte( ulong sourceData ) 30 | { 31 | FileSource* fin = (FileSource*) sourceData; 32 | int bv; 33 | if ( fin->unget.size > 0 ) 34 | bv = tidyBufPopByte( &fin->unget ); 35 | else 36 | bv = fgetc( fin->fp ); 37 | return bv; 38 | } 39 | Bool filesrc_eof( ulong sourceData ) 40 | { 41 | FileSource* fin = (FileSource*) sourceData; 42 | Bool isEOF = ( fin->unget.size == 0 ); 43 | if ( isEOF ) 44 | isEOF = feof( fin->fp ); 45 | return isEOF; 46 | } 47 | void filesrc_ungetByte( ulong sourceData, byte bv ) 48 | { 49 | FileSource* fin = (FileSource*) sourceData; 50 | tidyBufPutByte( &fin->unget, bv ); 51 | } 52 | 53 | void initFileSource( TidyInputSource* inp, FILE* fp ) 54 | { 55 | FileSource* fin = NULL; 56 | 57 | inp->getByte = filesrc_getByte; 58 | inp->eof = filesrc_eof; 59 | inp->ungetByte = filesrc_ungetByte; 60 | 61 | fin = (FileSource*) MemAlloc( sizeof(FileSource) ); 62 | ClearMemory( fin, sizeof(FileSource) ); 63 | fin->fp = fp; 64 | inp->sourceData = (ulong) fin; 65 | } 66 | 67 | void freeFileSource( TidyInputSource* inp, Bool closeIt ) 68 | { 69 | FileSource* fin = (FileSource*) inp->sourceData; 70 | if ( closeIt && fin && fin->fp ) 71 | fclose( fin->fp ); 72 | tidyBufFree( &fin->unget ); 73 | MemFree( fin ); 74 | } 75 | 76 | void filesink_putByte( ulong sinkData, byte bv ) 77 | { 78 | FILE* fout = (FILE*) sinkData; 79 | fputc( bv, fout ); 80 | } 81 | 82 | void initFileSink( TidyOutputSink* outp, FILE* fp ) 83 | { 84 | outp->putByte = filesink_putByte; 85 | outp->sinkData = (ulong) fp; 86 | } 87 | 88 | -------------------------------------------------------------------------------- /elementtidy-1.0-20050212/tidylib/src/parser.h: -------------------------------------------------------------------------------- 1 | #ifndef __PARSER_H__ 2 | #define 
__PARSER_H__ 3 | 4 | /* parser.h -- HTML Parser 5 | 6 | (c) 1998-2003 (W3C) MIT, ERCIM, Keio University 7 | See tidy.h for the copyright notice. 8 | 9 | CVS Info : 10 | 11 | $LastChangedBy$ 12 | $LastChangedDate$ 13 | $LastChangedRevision$ 14 | 15 | */ 16 | 17 | #include "forward.h" 18 | 19 | Bool CheckNodeIntegrity(Node *node); 20 | 21 | /* 22 | used to determine how attributes 23 | without values should be printed 24 | this was introduced to deal with 25 | user defined tags e.g. Cold Fusion 26 | */ 27 | Bool IsNewNode(Node *node); 28 | 29 | void CoerceNode(TidyDocImpl* doc, Node *node, TidyTagId tid, Bool obsolete, Bool expected); 30 | 31 | /* extract a node and its children from a markup tree */ 32 | Node *RemoveNode(Node *node); 33 | 34 | /* remove node from markup tree and discard it */ 35 | Node *DiscardElement( TidyDocImpl* doc, Node *element); 36 | 37 | /* insert node into markup tree */ 38 | void InsertNodeAtStart(Node *element, Node *node); 39 | 40 | /* insert node into markup tree */ 41 | void InsertNodeAtEnd(Node *element, Node *node); 42 | 43 | /* insert node into markup tree before element */ 44 | void InsertNodeBeforeElement(Node *element, Node *node); 45 | 46 | /* insert node into markup tree after element */ 47 | void InsertNodeAfterElement(Node *element, Node *node); 48 | 49 | Node *TrimEmptyElement( TidyDocImpl* doc, Node *element ); 50 | 51 | 52 | 53 | /* assumes node is a text node */ 54 | Bool IsBlank(Lexer *lexer, Node *node); 55 | 56 | 57 | /* 58 | duplicate name attribute as an id 59 | and check if id and name match 60 | */ 61 | void FixId( TidyDocImpl* doc, Node *node ); 62 | 63 | void FixXmlLang(TidyDocImpl* doc, Node* node); 64 | 65 | /* acceptable content for pre elements */ 66 | Bool PreContent( TidyDocImpl* doc, Node *node ); 67 | 68 | Bool IsJavaScript(Node *node); 69 | Bool DescendantOf(Node *element, TidyTagId tid); 70 | 71 | /* 72 | HTML is the top level element 73 | */ 74 | void ParseDocument( TidyDocImpl* doc ); 75 | 76 | 77 | 
78 | /* 79 | XML documents 80 | */ 81 | Bool XMLPreserveWhiteSpace( TidyDocImpl* doc, Node *element ); 82 | 83 | void ParseXMLDocument( TidyDocImpl* doc ); 84 | 85 | #endif /* __PARSER_H__ */ 86 | -------------------------------------------------------------------------------- /README: -------------------------------------------------------------------------------- 1 | Install steps (below) are compiled from http://makefile.com/.plan/2009/10/juniper-vpn-64-bit-linux-an-unsolved-mystery/ 2 | 3 | This script logs in to a Juniper Network Connect VPN website using authentication provided by you and retrieves the DSID cookie, which can be passed to the ncui executable to create a VPN tunnel. This allows users on Linux to connect to the tunnel without using the web client and/or a 32-bit Java. 4 | 5 | I found it necessary to use this solution because my work required both a password and a SecurID token, and none of the tools provided by Juniper allow you to pass a second password. If you only need to use a single password, the accepted answer seems to be to use http://mad-scientist.us/juniper.html, but my script should work as well (I haven't tested it, but I see no reason it won't). 6 | 7 | What you need to get this to work: 8 | 1) xterm (the Java app won't install the files without xterm) 9 | 2) gcc multilib (I've only tested this on 64-bit) 10 | 3) 32-bit zlib 11 | 4) /tmp and /etc on the same partition (the program can't swap your 12 | /etc/resolv.conf otherwise... 
sad, I know) 13 | 5) python and python-devel (I believe 2.7 is needed; it might run on 2.6) 14 | 15 | To get the files we'll need, do the following: 16 | 1) Log in to your VPN website 17 | 2) If you aren't redirected to the Network Connect page, go there 18 | 3) A Java app should try to start; let it run (you will likely be asked twice) 19 | 4) Make sure that you have libncui.so and ncsvc in ~/.juniper_networks/network_connect/ 20 | 5) If you don't have both of those, make sure xterm is installed and that a Java prompt didn't get hidden under a window 21 | 22 | Set up the files: 23 | 1) cd ~/.juniper_networks/network_connect 24 | 2) gcc -m32 -Wl,-rpath,`pwd` -o ncui libncui.so 25 | 3) sudo chown root:root ncui 26 | 4) sudo chmod 4775 ncui 27 | 5) echo | openssl s_client -connect vpn.constantcontact.com:443 2>&1 | sed -ne '/-BEGIN CERTIFICATE-/,/-END CERTIFICATE-/p' | openssl x509 -outform der > ssl.crt (replace vpn.constantcontact.com with your own VPN host) 28 | 6) cd /juniperncprompt/elementtidy-1.0-20050212 29 | 7) sudo python setup.py install (make sure you use the correct version of python to set up and install if you have multiple versions on your system) 30 | 8) cd ../ 31 | 9) ./juniperncprompt.py vpn.website.com 32 | 10) ifconfig (typically in /sbin, for those who don't have sbin in their path) 33 | You should see a tun0; if so, everything worked and you are connected to the VPN 34 | 35 | The default values are set up for the VPN setup at my work; they can all be adjusted with a command line flag or by changing the source file if you want. 36 | 37 | If you have any questions or comments, please feel free to email me at crimsonknave@gmail.com 38 | 39 | This distribution includes the sources for elementtidy and tidylib. 
You can get the latest elementtidy from http://effbot.org/downloads/ and tidylib from http://tidy.sourceforge.net/ 40 | -------------------------------------------------------------------------------- /elementtidy-1.0-20050212/tidylib/src/clean.h: -------------------------------------------------------------------------------- 1 | #ifndef __CLEAN_H__ 2 | #define __CLEAN_H__ 3 | 4 | /* clean.h -- clean up misuse of presentation markup 5 | 6 | (c) 1998-2003 (W3C) MIT, ERCIM, Keio University 7 | See tidy.h for the copyright notice. 8 | 9 | CVS Info: 10 | $LastChangedBy$ 11 | $LastChangedDate$ 12 | $LastChangedRevision$ 13 | 14 | */ 15 | 16 | void RenameElem( Node* node, TidyTagId tid ); 17 | 18 | Node* CleanNode( TidyDocImpl* doc, Node* node ); 19 | 20 | void FreeStyles( TidyDocImpl* doc ); 21 | 22 | /* Add class="foo" to node 23 | */ 24 | void AddClass( TidyDocImpl* doc, Node* node, ctmbstr classname ); 25 | 26 | void CleanDocument( TidyDocImpl* doc ); 27 | 28 | /* simplifies ... ... etc. */ 29 | void NestedEmphasis( TidyDocImpl* doc, Node* node ); 30 | 31 | /* replace i by em and b by strong */ 32 | void EmFromI( TidyDocImpl* doc, Node* node ); 33 | 34 | /* 35 | Some people use dir or ul without an li 36 | to indent the content. The pattern to 37 | look for is a list with a single implicit 38 | li. This is recursively replaced by an 39 | implicit blockquote. 40 | */ 41 | void List2BQ( TidyDocImpl* doc, Node* node ); 42 | 43 | /* 44 | Replace implicit blockquote by div with an indent 45 | taking care to reduce nested blockquotes to a single 46 | div with the indent set to match the nesting depth 47 | */ 48 | void BQ2Div( TidyDocImpl* doc, Node* node ); 49 | 50 | 51 | Node *FindEnclosingCell( TidyDocImpl* doc, Node* node ); 52 | 53 | void DropSections( TidyDocImpl* doc, Node* node ); 54 | 55 | /* used to hunt for hidden preformatted sections */ 56 | Bool NoMargins(Node *node); 57 | 58 | /* does element have a single space as its content? 
*/ 59 | Bool IsSingleSpace(Lexer *lexer, Node *node); 60 | 61 | 62 | /* 63 | This is a major clean up to strip out all the extra stuff you get 64 | when you save as web page from Word 2000. It doesn't yet know what 65 | to do with VML tags, but these will appear as errors unless you 66 | declare them as new tags, such as o:p which needs to be declared 67 | as inline. 68 | */ 69 | void CleanWord2000( TidyDocImpl* doc, Node *node); 70 | 71 | Bool IsWord2000( TidyDocImpl* doc ); 72 | 73 | /* where appropriate move object elements from head to body */ 74 | void BumpObject( TidyDocImpl* doc, Node *html ); 75 | 76 | void FixBrakes( TidyDocImpl* pDoc, Node *pParent ); 77 | 78 | void VerifyHTTPEquiv( TidyDocImpl* pDoc, Node *pParent ); 79 | 80 | void DropComments(TidyDocImpl* doc, Node* node); 81 | void DropFontElements(TidyDocImpl* doc, Node* node, Node **pnode); 82 | void WbrToSpace(TidyDocImpl* doc, Node* node); 83 | void DowngradeTypography(TidyDocImpl* doc, Node* node); 84 | void ReplacePreformattedSpaces(TidyDocImpl* doc, Node* node); 85 | void NormalizeSpaces(Lexer *lexer, Node *node); 86 | void ConvertCDATANodes(TidyDocImpl* doc, Node* node); 87 | 88 | #endif /* __CLEAN_H__ */ 89 | -------------------------------------------------------------------------------- /elementtidy-1.0-20050212/tidylib/src/pprint.h: -------------------------------------------------------------------------------- 1 | #ifndef __PPRINT_H__ 2 | #define __PPRINT_H__ 3 | 4 | /* pprint.h -- pretty print parse tree 5 | 6 | (c) 1998-2003 (W3C) MIT, ERCIM, Keio University 7 | See tidy.h for the copyright notice. 8 | 9 | CVS Info: 10 | $LastChangedBy$ 11 | $LastChangedDate$ 12 | $LastChangedRevision$ 13 | 14 | */ 15 | 16 | #include "forward.h" 17 | 18 | /* 19 | Block-level and unknown elements are printed on 20 | new lines and their contents indented 2 spaces 21 | 22 | Inline elements are printed inline. 
23 | 24 | Inline content is wrapped on spaces (except in 25 | attribute values or preformatted text, after 26 | start tags and before end tags 27 | */ 28 | 29 | #define NORMAL 0 30 | #define PREFORMATTED 1 31 | #define COMMENT 2 32 | #define ATTRIBVALUE 4 33 | #define NOWRAP 8 34 | #define CDATA 16 35 | 36 | 37 | /* The pretty printer keeps at most two lines of text in the 38 | ** buffer before flushing output. We need to capture the 39 | ** indent state (indent level) at the _beginning_ of _each_ 40 | ** line, not the end of just the second line. 41 | ** 42 | ** We must also keep track "In Attribute" and "In String" 43 | ** states at the _end_ of each line, 44 | */ 45 | 46 | typedef struct _TidyIndent 47 | { 48 | int spaces; 49 | int attrValStart; 50 | int attrStringStart; 51 | } TidyIndent; 52 | 53 | typedef struct _TidyPrintImpl 54 | { 55 | uint *linebuf; 56 | uint lbufsize; 57 | uint linelen; 58 | uint wraphere; 59 | uint linecount; 60 | 61 | uint ixInd; 62 | TidyIndent indent[2]; /* Two lines worth of indent state */ 63 | 64 | } TidyPrintImpl; 65 | 66 | void PPrintDocument( TidyDocImpl* doc ); 67 | 68 | 69 | #if SUPPORT_ASIAN_ENCODINGS 70 | /* #431953 - start RJ Wraplen adjusted for smooth international ride */ 71 | uint CWrapLen( TidyDocImpl* doc, uint ind ); 72 | #endif 73 | 74 | void InitPrintBuf( TidyDocImpl* doc ); 75 | void FreePrintBuf( TidyDocImpl* doc ); 76 | 77 | void PFlushLine( TidyDocImpl* doc, uint indent ); 78 | void PCondFlushLine( TidyDocImpl* doc, uint indent ); 79 | 80 | void PPrintScriptStyle( TidyDocImpl* doc, uint mode, uint indent, Node* node ); 81 | 82 | /* print just the content of the body element. 83 | ** useful when you want to reuse material from 84 | ** other documents. 
85 | ** 86 | ** -- Sebastiano Vigna 87 | */ 88 | 89 | void PrintPreamble( TidyDocImpl* doc ); /* Between these 3, */ 90 | void PrintBody( TidyDocImpl* doc ); /* you can print an entire document */ 91 | void PrintPostamble( TidyDocImpl* doc ); /* or you can substitute another */ 92 | /* node as body using PPrintTree() */ 93 | 94 | void PPrintTree( TidyDocImpl* doc, uint mode, uint indent, Node *node ); 95 | 96 | void PPrintXMLTree( TidyDocImpl* doc, uint mode, uint indent, Node *node ); 97 | 98 | 99 | #endif /* __PPRINT_H__ */ 100 | -------------------------------------------------------------------------------- /elementtidy-1.0-20050212/tidylib/include/buffio.h: -------------------------------------------------------------------------------- 1 | #ifndef __BUFFIO_H__ 2 | #define __BUFFIO_H__ 3 | 4 | /** @file buffio.h - Treat buffer as an I/O stream. 5 | 6 | (c) 1998-2003 (W3C) MIT, ERCIM, Keio University 7 | See tidy.h for the copyright notice. 8 | 9 | CVS Info : 10 | 11 | $LastChangedBy$ 12 | $LastChangedDate$ 13 | $LastChangedRevision$ 14 | 15 | Requires buffer to automatically grow as bytes are added. 16 | Must keep track of current read and write points. 17 | 18 | */ 19 | 20 | #include "platform.h" 21 | #include "tidy.h" 22 | 23 | #ifdef __cplusplus 24 | extern "C" { 25 | #endif 26 | 27 | /** TidyBuffer - A chunk of memory */ 28 | TIDY_STRUCT 29 | struct _TidyBuffer 30 | { 31 | byte* bp; /**< Pointer to bytes */ 32 | uint size; /**< # bytes currently in use */ 33 | uint allocated; /**< # bytes allocated */ 34 | uint next; /**< Offset of current input position */ 35 | }; 36 | 37 | /** Zero out data structure */ 38 | TIDY_EXPORT void tidyBufInit( TidyBuffer* buf ); 39 | 40 | /** Free current buffer, allocate given amount, reset input pointer */ 41 | TIDY_EXPORT void tidyBufAlloc( TidyBuffer* buf, uint allocSize ); 42 | 43 | /** Expand buffer to given size. 44 | ** Chunk size is minimum growth. Pass 0 for default of 256 bytes. 
45 | */ 46 | TIDY_EXPORT void tidyBufCheckAlloc( TidyBuffer* buf, 47 | uint allocSize, uint chunkSize ); 48 | 49 | /** Free current contents and zero out */ 50 | TIDY_EXPORT void tidyBufFree( TidyBuffer* buf ); 51 | 52 | /** Set buffer bytes to 0 */ 53 | TIDY_EXPORT void tidyBufClear( TidyBuffer* buf ); 54 | 55 | /** Attach to existing buffer */ 56 | TIDY_EXPORT void tidyBufAttach( TidyBuffer* buf, void* bp, uint size ); 57 | 58 | /** Detach from buffer. Caller must free. */ 59 | TIDY_EXPORT void tidyBufDetach( TidyBuffer* buf ); 60 | 61 | 62 | /** Append bytes to buffer. Expand if necessary. */ 63 | TIDY_EXPORT void tidyBufAppend( TidyBuffer* buf, void* vp, uint size ); 64 | 65 | /** Append one byte to buffer. Expand if necessary. */ 66 | TIDY_EXPORT void tidyBufPutByte( TidyBuffer* buf, byte bv ); 67 | 68 | /** Get byte from end of buffer */ 69 | TIDY_EXPORT int tidyBufPopByte( TidyBuffer* buf ); 70 | 71 | 72 | /** Get byte from front of buffer. Increment input offset. */ 73 | TIDY_EXPORT int tidyBufGetByte( TidyBuffer* buf ); 74 | 75 | /** At end of buffer? */ 76 | TIDY_EXPORT Bool tidyBufEndOfInput( TidyBuffer* buf ); 77 | 78 | /** Put a byte back into the buffer. Decrement input offset. 
*/ 79 | TIDY_EXPORT void tidyBufUngetByte( TidyBuffer* buf, byte bv ); 80 | 81 | 82 | /************** 83 | TIDY 84 | **************/ 85 | 86 | /* Forward declarations 87 | */ 88 | 89 | /** Initialize a buffer input source */ 90 | TIDY_EXPORT void initInputBuffer( TidyInputSource* inp, TidyBuffer* buf ); 91 | 92 | /** Initialize a buffer output sink */ 93 | TIDY_EXPORT void initOutputBuffer( TidyOutputSink* outp, TidyBuffer* buf ); 94 | 95 | #ifdef __cplusplus 96 | } 97 | #endif 98 | #endif /* __BUFFIO_H__ */ 99 | -------------------------------------------------------------------------------- /elementtidy-1.0-20050212/elementtidy/TidyHTMLTreeBuilder.py: -------------------------------------------------------------------------------- 1 | # 2 | # ElementTree 3 | # $Id: TidyHTMLTreeBuilder.py 2276 2005-02-03 19:21:25Z fredrik $ 4 | # 5 | # tree builder based on the _elementtidy tidylib wrapper 6 | # 7 | # history: 8 | # 2003-07-06 fl created 9 | # 2003-09-17 fl capture stderr as well 10 | # 2005-02-03 fl added encoding support 11 | # 12 | # Copyright (c) 1999-2005 by Fredrik Lundh. All rights reserved. 
13 | # 14 | # fredrik@pythonware.com 15 | # http://www.pythonware.com 16 | # 17 | # -------------------------------------------------------------------- 18 | # The ElementTree toolkit is 19 | # 20 | # Copyright (c) 1999-2005 by Fredrik Lundh 21 | # 22 | # By obtaining, using, and/or copying this software and/or its 23 | # associated documentation, you agree that you have read, understood, 24 | # and will comply with the following terms and conditions: 25 | # 26 | # Permission to use, copy, modify, and distribute this software and 27 | # its associated documentation for any purpose and without fee is 28 | # hereby granted, provided that the above copyright notice appears in 29 | # all copies, and that both that copyright notice and this permission 30 | # notice appear in supporting documentation, and that the name of 31 | # Secret Labs AB or the author not be used in advertising or publicity 32 | # pertaining to distribution of the software without specific, written 33 | # prior permission. 34 | # 35 | # SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD 36 | # TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANT- 37 | # ABILITY AND FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR 38 | # BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY 39 | # DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, 40 | # WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS 41 | # ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE 42 | # OF THIS SOFTWARE. 
# --------------------------------------------------------------------

# note: route all elementtree access via ElementTree, so that external
# users can "patch in" another implementation if they want to (such as
# cElementTree)

# Support for python >= 2.5
try:
    from elementtree import ElementTree
except ImportError:
    from xml.etree import ElementTree

import _elementtidy
import string

##
# ElementTree builder for HTML source code. This builder converts an
# HTML document or fragment to an XHTML ElementTree, by running it
# through the _elementtidy processor.
#
# After close() has been called, the <b>errlog</b> attribute holds the
# second value returned by _elementtidy.fixup (the processor's error
# output); it is None until then.
#
# @kwparam encoding Optional source document encoding.
#
# @see elementtree.ElementTree

class TidyHTMLTreeBuilder:

    def __init__(self, encoding=None):
        # chunks handed to feed(); they are joined once, in close()
        self.__data = []
        if encoding:
            # Translate the name into the hyphen-free spelling that is
            # passed on to _elementtidy.fixup ("utf-8" -> "utf8").
            # iso-8859-1 is special-cased to "latin1"; the comparison
            # ignores case so that "ISO-8859-1" is mapped as well
            # instead of degrading to "ISO88591".
            if encoding.lower() == "iso-8859-1":
                encoding = "latin1"
            else:
                # str method instead of the deprecated string.replace()
                # function (same result, also available on Python 3)
                encoding = encoding.replace("-", "")
        self.__encoding = encoding
        self.errlog = None

    ##
    # Add data to parser buffers.
    #
    # @param text A chunk of HTML source. Chunks are only collected
    #     here; no processing happens before close() is called.

    def feed(self, text):
        self.__data.append(text)

    ##
    # Flush parser buffers, and return the root element.
    #
    # All collected data is passed to _elementtidy.fixup in one call
    # (together with the encoding, if one was given), the error output
    # is stored in <b>errlog</b>, and the cleaned-up markup is parsed
    # with ElementTree.XML.
    #
    # @return An Element instance.

    def close(self):
        # "".join() replaces the deprecated string.join() function
        args = ["".join(self.__data)]
        if self.__encoding:
            args.append(self.__encoding)
        stdout, stderr = _elementtidy.fixup(*args)
        self.errlog = stderr
        return ElementTree.XML(stdout)

##
# An alias for the TidyHTMLTreeBuilder class.

TreeBuilder = TidyHTMLTreeBuilder

##
# Parse an HTML document into an XHTML-style element tree.
#
# @param source A filename or file object containing HTML data.
107 | # @return An ElementTree instance 108 | 109 | def parse(source): 110 | return ElementTree.parse(source, TreeBuilder()) 111 | 112 | ## 113 | # Parse an HTML document into an XHTML-style element tree, and return 114 | # both the tree and the error log. 115 | # 116 | # @param source A filename or file object containing HTML data. 117 | # @return A 2-tuple containing an ElementTree instance and a string 118 | # with TidyLib's error log. 119 | 120 | def parse2(source): 121 | builder = TreeBuilder() 122 | tree = ElementTree.parse(source, builder) 123 | return tree, builder.errlog 124 | 125 | if __name__ == "__main__": 126 | import sys 127 | ElementTree.dump(parse(open(sys.argv[1]))) 128 | -------------------------------------------------------------------------------- /elementtidy-1.0-20050212/_elementtidy.c: -------------------------------------------------------------------------------- 1 | /* 2 | * ElementTree 3 | * $Id: _elementtidy.c 2276 2005-02-03 19:21:25Z fredrik $ 4 | * 5 | * TidyHTMLTreeBuilder driver for the ElementTree package, based 6 | * on tidylib (from http://tidy.sourceforge.net) 7 | * 8 | * Copyright (c) 2003-2005 by Fredrik Lundh. All rights reserved. 
9 | */ 10 | 11 | /* -------------------------------------------------------------------- 12 | Copyright (c) 2003-2005 by Fredrik Lundh 13 | 14 | By obtaining, using, and/or copying this software and/or its 15 | associated documentation, you agree that you have read, understood, 16 | and will comply with the following terms and conditions: 17 | 18 | Permission to use, copy, modify, and distribute this software and its 19 | associated documentation for any purpose and without fee is hereby 20 | granted, provided that the above copyright notice appears in all 21 | copies, and that both that copyright notice and this permission notice 22 | appear in supporting documentation, and that the name of Secret Labs 23 | AB or the author not be used in advertising or publicity pertaining to 24 | distribution of the software without specific, written prior 25 | permission. 26 | 27 | SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO 28 | THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND 29 | FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR 30 | ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 31 | WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 32 | ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT 33 | OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
34 | -------------------------------------------------------------------- */ 35 | 36 | #include "Python.h" 37 | 38 | /* TODO: instead of saving to string, generate tree events */ 39 | 40 | #include "tidy.h" 41 | #include "buffio.h" 42 | 43 | static PyObject* 44 | elementtidy_fixup(PyObject* self, PyObject* args) 45 | { 46 | int rc; 47 | TidyDoc doc; 48 | TidyBuffer out = {0}; 49 | TidyBuffer err = {0}; 50 | PyObject* pyout; 51 | PyObject* pyerr; 52 | 53 | char* text; 54 | char* encoding = NULL; 55 | if (!PyArg_ParseTuple(args, "s|s:fixup", &text, &encoding)) 56 | return NULL; 57 | 58 | doc = tidyCreate(); 59 | 60 | /* options for nice XHTML output */ 61 | if (encoding) 62 | /* if an encoding is given, use it for both input and output */ 63 | tidyOptSetValue(doc, TidyCharEncoding, encoding); 64 | else 65 | /* if no encoding is given, use default input and utf-8 output */ 66 | tidyOptSetValue(doc, TidyOutCharEncoding, "utf8"); 67 | tidyOptSetBool(doc, TidyForceOutput, yes); 68 | tidyOptSetInt(doc, TidyWrapLen, 0); 69 | tidyOptSetBool(doc, TidyQuiet, yes); 70 | tidyOptSetBool(doc, TidyXhtmlOut, yes); 71 | tidyOptSetBool(doc, TidyXmlDecl, yes); 72 | tidyOptSetInt(doc, TidyIndentContent, 0); 73 | tidyOptSetBool(doc, TidyNumEntities, yes); 74 | 75 | rc = tidySetErrorBuffer(doc, &err); 76 | if (rc < 0) { 77 | PyErr_SetString(PyExc_IOError, "tidySetErrorBuffer failed"); 78 | goto error; 79 | } 80 | 81 | rc = tidyParseString(doc, text); 82 | if (rc < 0) { 83 | PyErr_SetString(PyExc_IOError, "tidyParseString failed"); 84 | goto error; 85 | } 86 | 87 | rc = tidyCleanAndRepair(doc); 88 | if (rc < 0) { 89 | PyErr_SetString(PyExc_IOError, "tidyCleanAndRepair failed"); 90 | goto error; 91 | } 92 | 93 | rc = tidyRunDiagnostics(doc); 94 | if (rc < 0) { 95 | PyErr_SetString(PyExc_IOError, "tidyRunDiagnostics failed"); 96 | goto error; 97 | } 98 | 99 | rc = tidySaveBuffer(doc, &out); 100 | if (rc < 0) { 101 | PyErr_SetString(PyExc_IOError, "tidyRunDiagnostics failed"); 102 | goto 
error; 103 | } 104 | 105 | 106 | pyout = PyString_FromString(out.bp ? out.bp : ""); 107 | if (!pyout) 108 | goto error; 109 | pyerr = PyString_FromString(err.bp ? err.bp : ""); 110 | if (!pyerr) { 111 | Py_DECREF(pyout); 112 | goto error; 113 | } 114 | 115 | tidyBufFree(&out); 116 | tidyBufFree(&err); 117 | 118 | tidyRelease(doc); 119 | 120 | return Py_BuildValue("NN", pyout, pyerr); 121 | 122 | error: 123 | tidyBufFree(&out); 124 | tidyBufFree(&err); 125 | 126 | tidyRelease(doc); 127 | 128 | return NULL; 129 | } 130 | 131 | static PyMethodDef _functions[] = { 132 | {"fixup", elementtidy_fixup, 1}, 133 | {NULL, NULL} 134 | }; 135 | 136 | void 137 | #ifdef WIN32 138 | __declspec(dllexport) 139 | #endif 140 | init_elementtidy() 141 | { 142 | Py_InitModule("_elementtidy", _functions); 143 | } 144 | -------------------------------------------------------------------------------- /elementtidy-1.0-20050212/tidylib/src/attrdict.h: -------------------------------------------------------------------------------- 1 | #ifndef __ATTRDICT_H__ 2 | #define __ATTRDICT_H__ 3 | 4 | /* attrdict.h -- extended attribute information 5 | 6 | (c) 1998-2003 (W3C) MIT, ERCIM, Keio University 7 | See tidy.h for the copyright notice. 
8 | 9 | $Id$ 10 | */ 11 | 12 | #include "tidy.h" 13 | 14 | typedef struct _AttrVersion 15 | { 16 | uint attribute; 17 | uint versions; 18 | } AttrVersion; 19 | 20 | extern AttrVersion W3CAttrsFor_A[]; 21 | extern AttrVersion W3CAttrsFor_ABBR[]; 22 | extern AttrVersion W3CAttrsFor_ACRONYM[]; 23 | extern AttrVersion W3CAttrsFor_ADDRESS[]; 24 | extern AttrVersion W3CAttrsFor_APPLET[]; 25 | extern AttrVersion W3CAttrsFor_AREA[]; 26 | extern AttrVersion W3CAttrsFor_B[]; 27 | extern AttrVersion W3CAttrsFor_BASE[]; 28 | extern AttrVersion W3CAttrsFor_BASEFONT[]; 29 | extern AttrVersion W3CAttrsFor_BDO[]; 30 | extern AttrVersion W3CAttrsFor_BIG[]; 31 | extern AttrVersion W3CAttrsFor_BLOCKQUOTE[]; 32 | extern AttrVersion W3CAttrsFor_BODY[]; 33 | extern AttrVersion W3CAttrsFor_BR[]; 34 | extern AttrVersion W3CAttrsFor_BUTTON[]; 35 | extern AttrVersion W3CAttrsFor_CAPTION[]; 36 | extern AttrVersion W3CAttrsFor_CENTER[]; 37 | extern AttrVersion W3CAttrsFor_CITE[]; 38 | extern AttrVersion W3CAttrsFor_CODE[]; 39 | extern AttrVersion W3CAttrsFor_COL[]; 40 | extern AttrVersion W3CAttrsFor_COLGROUP[]; 41 | extern AttrVersion W3CAttrsFor_DD[]; 42 | extern AttrVersion W3CAttrsFor_DEL[]; 43 | extern AttrVersion W3CAttrsFor_DFN[]; 44 | extern AttrVersion W3CAttrsFor_DIR[]; 45 | extern AttrVersion W3CAttrsFor_DIV[]; 46 | extern AttrVersion W3CAttrsFor_DL[]; 47 | extern AttrVersion W3CAttrsFor_DT[]; 48 | extern AttrVersion W3CAttrsFor_EM[]; 49 | extern AttrVersion W3CAttrsFor_FIELDSET[]; 50 | extern AttrVersion W3CAttrsFor_FONT[]; 51 | extern AttrVersion W3CAttrsFor_FORM[]; 52 | extern AttrVersion W3CAttrsFor_FRAME[]; 53 | extern AttrVersion W3CAttrsFor_FRAMESET[]; 54 | extern AttrVersion W3CAttrsFor_H1[]; 55 | extern AttrVersion W3CAttrsFor_H2[]; 56 | extern AttrVersion W3CAttrsFor_H3[]; 57 | extern AttrVersion W3CAttrsFor_H4[]; 58 | extern AttrVersion W3CAttrsFor_H5[]; 59 | extern AttrVersion W3CAttrsFor_H6[]; 60 | extern AttrVersion W3CAttrsFor_HEAD[]; 61 | extern AttrVersion 
W3CAttrsFor_HR[]; 62 | extern AttrVersion W3CAttrsFor_HTML[]; 63 | extern AttrVersion W3CAttrsFor_I[]; 64 | extern AttrVersion W3CAttrsFor_IFRAME[]; 65 | extern AttrVersion W3CAttrsFor_IMG[]; 66 | extern AttrVersion W3CAttrsFor_INPUT[]; 67 | extern AttrVersion W3CAttrsFor_INS[]; 68 | extern AttrVersion W3CAttrsFor_ISINDEX[]; 69 | extern AttrVersion W3CAttrsFor_KBD[]; 70 | extern AttrVersion W3CAttrsFor_LABEL[]; 71 | extern AttrVersion W3CAttrsFor_LEGEND[]; 72 | extern AttrVersion W3CAttrsFor_LI[]; 73 | extern AttrVersion W3CAttrsFor_LINK[]; 74 | extern AttrVersion W3CAttrsFor_LISTING[]; 75 | extern AttrVersion W3CAttrsFor_MAP[]; 76 | extern AttrVersion W3CAttrsFor_MENU[]; 77 | extern AttrVersion W3CAttrsFor_META[]; 78 | extern AttrVersion W3CAttrsFor_NEXTID[]; 79 | extern AttrVersion W3CAttrsFor_NOFRAMES[]; 80 | extern AttrVersion W3CAttrsFor_NOSCRIPT[]; 81 | extern AttrVersion W3CAttrsFor_OBJECT[]; 82 | extern AttrVersion W3CAttrsFor_OL[]; 83 | extern AttrVersion W3CAttrsFor_OPTGROUP[]; 84 | extern AttrVersion W3CAttrsFor_OPTION[]; 85 | extern AttrVersion W3CAttrsFor_P[]; 86 | extern AttrVersion W3CAttrsFor_PARAM[]; 87 | extern AttrVersion W3CAttrsFor_PLAINTEXT[]; 88 | extern AttrVersion W3CAttrsFor_PRE[]; 89 | extern AttrVersion W3CAttrsFor_Q[]; 90 | extern AttrVersion W3CAttrsFor_RB[]; 91 | extern AttrVersion W3CAttrsFor_RBC[]; 92 | extern AttrVersion W3CAttrsFor_RP[]; 93 | extern AttrVersion W3CAttrsFor_RT[]; 94 | extern AttrVersion W3CAttrsFor_RTC[]; 95 | extern AttrVersion W3CAttrsFor_RUBY[]; 96 | extern AttrVersion W3CAttrsFor_S[]; 97 | extern AttrVersion W3CAttrsFor_SAMP[]; 98 | extern AttrVersion W3CAttrsFor_SCRIPT[]; 99 | extern AttrVersion W3CAttrsFor_SELECT[]; 100 | extern AttrVersion W3CAttrsFor_SMALL[]; 101 | extern AttrVersion W3CAttrsFor_SPAN[]; 102 | extern AttrVersion W3CAttrsFor_STRIKE[]; 103 | extern AttrVersion W3CAttrsFor_STRONG[]; 104 | extern AttrVersion W3CAttrsFor_STYLE[]; 105 | extern AttrVersion W3CAttrsFor_SUB[]; 106 | extern 
AttrVersion W3CAttrsFor_SUP[]; 107 | extern AttrVersion W3CAttrsFor_TABLE[]; 108 | extern AttrVersion W3CAttrsFor_TBODY[]; 109 | extern AttrVersion W3CAttrsFor_TD[]; 110 | extern AttrVersion W3CAttrsFor_TEXTAREA[]; 111 | extern AttrVersion W3CAttrsFor_TFOOT[]; 112 | extern AttrVersion W3CAttrsFor_TH[]; 113 | extern AttrVersion W3CAttrsFor_THEAD[]; 114 | extern AttrVersion W3CAttrsFor_TITLE[]; 115 | extern AttrVersion W3CAttrsFor_TR[]; 116 | extern AttrVersion W3CAttrsFor_TT[]; 117 | extern AttrVersion W3CAttrsFor_U[]; 118 | extern AttrVersion W3CAttrsFor_UL[]; 119 | extern AttrVersion W3CAttrsFor_VAR[]; 120 | extern AttrVersion W3CAttrsFor_XMP[]; 121 | 122 | #endif /* __ATTRDICT_H__ */ 123 | -------------------------------------------------------------------------------- /elementtidy-1.0-20050212/tidylib/src/buffio.c: -------------------------------------------------------------------------------- 1 | /* buffio.c -- Treat buffer as an I/O stream. 2 | 3 | (c) 1998-2003 (W3C) MIT, ERCIM, Keio University 4 | See tidy.h for the copyright notice. 5 | 6 | CVS Info : 7 | 8 | $LastChangedBy$ 9 | $LastChangedDate$ 10 | $LastChangedRevision$ 11 | 12 | Requires buffer to automatically grow as bytes are added. 13 | Must keep track of current read and write points. 
14 | 15 | */ 16 | 17 | #include "tidy.h" 18 | #include "buffio.h" 19 | 20 | 21 | /************** 22 | TIDY 23 | **************/ 24 | 25 | static int insrc_getByte( ulong appData ) 26 | { 27 | TidyBuffer* buf = (TidyBuffer*) appData; 28 | return tidyBufGetByte( buf ); 29 | } 30 | static Bool insrc_eof( ulong appData ) 31 | { 32 | TidyBuffer* buf = (TidyBuffer*) appData; 33 | return tidyBufEndOfInput( buf ); 34 | } 35 | static void insrc_ungetByte( ulong appData, byte bv ) 36 | { 37 | TidyBuffer* buf = (TidyBuffer*) appData; 38 | tidyBufUngetByte( buf, bv ); 39 | } 40 | 41 | void initInputBuffer( TidyInputSource* inp, TidyBuffer* buf ) 42 | { 43 | inp->getByte = insrc_getByte; 44 | inp->eof = insrc_eof; 45 | inp->ungetByte = insrc_ungetByte; 46 | inp->sourceData = (ulong) buf; 47 | } 48 | 49 | static void outsink_putByte( ulong appData, byte bv ) 50 | { 51 | TidyBuffer* buf = (TidyBuffer*) appData; 52 | tidyBufPutByte( buf, bv ); 53 | } 54 | 55 | void initOutputBuffer( TidyOutputSink* outp, TidyBuffer* buf ) 56 | { 57 | outp->putByte = outsink_putByte; 58 | outp->sinkData = (ulong) buf; 59 | } 60 | 61 | 62 | void tidyBufInit( TidyBuffer* buf ) 63 | { 64 | assert( buf != NULL ); 65 | ClearMemory( buf, sizeof(TidyBuffer) ); 66 | } 67 | 68 | void tidyBufAlloc( TidyBuffer* buf, uint allocSize ) 69 | { 70 | tidyBufInit( buf ); 71 | tidyBufCheckAlloc( buf, allocSize, 0 ); 72 | buf->next = 0; 73 | } 74 | void tidyBufFree( TidyBuffer* buf ) 75 | { 76 | assert( buf != NULL ); 77 | MemFree( buf->bp ); 78 | tidyBufInit( buf ); 79 | } 80 | 81 | void tidyBufClear( TidyBuffer* buf ) 82 | { 83 | assert( buf != NULL ); 84 | if ( buf->bp ) 85 | { 86 | ClearMemory( buf->bp, buf->allocated ); 87 | buf->size = 0; 88 | } 89 | buf->next = 0; 90 | } 91 | 92 | /* Avoid thrashing memory by doubling buffer size 93 | ** until larger than requested size. 
94 | */ 95 | void tidyBufCheckAlloc( TidyBuffer* buf, uint allocSize, uint chunkSize ) 96 | { 97 | assert( buf != NULL ); 98 | if ( 0 == chunkSize ) 99 | chunkSize = 256; 100 | if ( allocSize > buf->allocated ) 101 | { 102 | byte* bp; 103 | uint allocAmt = chunkSize; 104 | if ( buf->allocated > 0 ) 105 | allocAmt = buf->allocated; 106 | while ( allocAmt < allocSize ) 107 | allocAmt *= 2; 108 | 109 | bp = MemRealloc( buf->bp, allocAmt ); 110 | if ( bp != NULL ) 111 | { 112 | ClearMemory( bp + buf->allocated, allocAmt - buf->allocated ); 113 | buf->bp = bp; 114 | buf->allocated = allocAmt; 115 | } 116 | } 117 | } 118 | 119 | /* Attach buffer to a chunk O' memory w/out allocation */ 120 | void tidyBufAttach( TidyBuffer* buf, void* bp, uint size ) 121 | { 122 | assert( buf != NULL ); 123 | buf->bp = bp; 124 | buf->size = buf->allocated = size; 125 | buf->next = 0; 126 | } 127 | 128 | /* Clear pointer to memory w/out deallocation */ 129 | void tidyBufDetach( TidyBuffer* buf ) 130 | { 131 | tidyBufInit( buf ); 132 | } 133 | 134 | 135 | /************** 136 | OUTPUT 137 | **************/ 138 | 139 | void tidyBufAppend( TidyBuffer* buf, void* vp, uint size ) 140 | { 141 | assert( buf != NULL ); 142 | if ( vp != NULL && size > 0 ) 143 | { 144 | tidyBufCheckAlloc( buf, buf->size + size, 0 ); 145 | memcpy( buf->bp + buf->size, vp, size ); 146 | buf->size += size; 147 | } 148 | } 149 | 150 | void tidyBufPutByte( TidyBuffer* buf, byte bv ) 151 | { 152 | assert( buf != NULL ); 153 | tidyBufCheckAlloc( buf, buf->size + 1, 0 ); 154 | buf->bp[ buf->size++ ] = bv; 155 | } 156 | 157 | 158 | int tidyBufPopByte( TidyBuffer* buf ) 159 | { 160 | int bv = EOF; 161 | assert( buf != NULL ); 162 | if ( buf->size > 0 ) 163 | bv = buf->bp[ --buf->size ]; 164 | return bv; 165 | } 166 | 167 | /************** 168 | INPUT 169 | **************/ 170 | 171 | int tidyBufGetByte( TidyBuffer* buf ) 172 | { 173 | int bv = EOF; 174 | if ( ! 
tidyBufEndOfInput(buf) ) 175 | bv = buf->bp[ buf->next++ ]; 176 | return bv; 177 | } 178 | 179 | Bool tidyBufEndOfInput( TidyBuffer* buf ) 180 | { 181 | return ( buf->next >= buf->size ); 182 | } 183 | 184 | void tidyBufUngetByte( TidyBuffer* buf, byte bv ) 185 | { 186 | if ( buf->next > 0 ) 187 | { 188 | --buf->next; 189 | assert( bv == buf->bp[ buf->next ] ); 190 | } 191 | } 192 | 193 | -------------------------------------------------------------------------------- /elementtidy-1.0-20050212/tidylib/src/tidy-int.h: -------------------------------------------------------------------------------- 1 | #ifndef __TIDY_INT_H__ 2 | #define __TIDY_INT_H__ 3 | 4 | /* tidy-int.h -- internal library declarations 5 | 6 | (c) 1998-2003 (W3C) MIT, ERCIM, Keio University 7 | See tidy.h for the copyright notice. 8 | 9 | CVS Info : 10 | 11 | $LastChangedBy$ 12 | $LastChangedDate$ 13 | $LastChangedRevision$ 14 | 15 | */ 16 | 17 | #include "tidy.h" 18 | #include "config.h" 19 | #include "tags.h" 20 | #include "attrs.h" 21 | #include "lexer.h" 22 | #include "pprint.h" 23 | #include "access.h" 24 | 25 | #ifndef MAX 26 | #define MAX(a,b) (((a) > (b))?(a):(b)) 27 | #endif 28 | #ifndef MIN 29 | #define MIN(a,b) (((a) < (b))?(a):(b)) 30 | #endif 31 | 32 | struct _TidyDocImpl 33 | { 34 | /* The Document Tree (and backing store buffer) */ 35 | Node root; /* This MUST remain the first declared 36 | variable in this structure */ 37 | Lexer* lexer; 38 | 39 | /* Config + Markup Declarations */ 40 | TidyConfigImpl config; 41 | TidyTagImpl tags; 42 | TidyAttribImpl attribs; 43 | 44 | #if SUPPORT_ACCESSIBILITY_CHECKS 45 | /* Accessibility Checks state */ 46 | TidyAccessImpl access; 47 | #endif 48 | 49 | /* The Pretty Print buffer */ 50 | TidyPrintImpl pprint; 51 | 52 | /* I/O */ 53 | StreamIn* docIn; 54 | StreamOut* docOut; 55 | StreamOut* errout; 56 | TidyReportFilter mssgFilt; 57 | TidyOptCallback pOptCallback; 58 | 59 | /* Parse + Repair Results */ 60 | uint optionErrors; 61 | uint errors; 62 
| uint warnings; 63 | uint accessErrors; 64 | uint infoMessages; 65 | uint docErrors; 66 | int parseStatus; 67 | 68 | uint badAccess; /* for accessibility errors */ 69 | uint badLayout; /* for bad style errors */ 70 | uint badChars; /* for bad char encodings */ 71 | uint badForm; /* for badly placed form tags */ 72 | 73 | /* Miscellaneous */ 74 | ulong appData; 75 | uint nClassId; 76 | Bool inputHadBOM; 77 | 78 | #if PRESERVE_FILE_TIMES 79 | struct utimbuf filetimes; 80 | #endif 81 | Node* givenDoctype; 82 | }; 83 | 84 | 85 | /* Twizzle internal/external types */ 86 | #ifdef NEVER 87 | TidyDocImpl* tidyDocToImpl( TidyDoc tdoc ); 88 | TidyDoc tidyImplToDoc( TidyDocImpl* impl ); 89 | 90 | Node* tidyNodeToImpl( TidyNode tnod ); 91 | TidyNode tidyImplToNode( Node* node ); 92 | 93 | AttVal* tidyAttrToImpl( TidyAttr tattr ); 94 | TidyAttr tidyImplToAttr( AttVal* attval ); 95 | 96 | const TidyOptionImpl* tidyOptionToImpl( TidyOption topt ); 97 | TidyOption tidyImplToOption( const TidyOptionImpl* option ); 98 | #else 99 | 100 | #define tidyDocToImpl( tdoc ) ((TidyDocImpl*)(tdoc)) 101 | #define tidyImplToDoc( doc ) ((TidyDoc)(doc)) 102 | 103 | #define tidyNodeToImpl( tnod ) ((Node*)(tnod)) 104 | #define tidyImplToNode( node ) ((TidyNode)(node)) 105 | 106 | #define tidyAttrToImpl( tattr ) ((AttVal*)(tattr)) 107 | #define tidyImplToAttr( attval ) ((TidyAttr)(attval)) 108 | 109 | #define tidyOptionToImpl( topt ) ((const TidyOptionImpl*)(topt)) 110 | #define tidyImplToOption( option ) ((TidyOption)(option)) 111 | 112 | #endif 113 | 114 | /* Create/Destroy a Tidy "document" object */ 115 | TidyDocImpl* tidyDocCreate(void); 116 | void tidyDocRelease( TidyDocImpl* impl ); 117 | 118 | int tidyDocStatus( TidyDocImpl* impl ); 119 | 120 | /* Parse Markup */ 121 | int tidyDocParseFile( TidyDocImpl* impl, ctmbstr htmlfil ); 122 | int tidyDocParseStdin( TidyDocImpl* impl ); 123 | int tidyDocParseString( TidyDocImpl* impl, ctmbstr content ); 124 | int tidyDocParseBuffer( TidyDocImpl* 
impl, TidyBuffer* inbuf ); 125 | int tidyDocParseSource( TidyDocImpl* impl, TidyInputSource* docIn ); 126 | int tidyDocParseStream( TidyDocImpl* impl, StreamIn* in ); 127 | 128 | 129 | /* Execute post-parse diagnostics and cleanup. 130 | ** Note, the order is important. You will get different 131 | ** results from the diagnostics depending on if they are run 132 | ** pre-or-post repair. 133 | */ 134 | int tidyDocRunDiagnostics( TidyDocImpl* doc ); 135 | int tidyDocCleanAndRepair( TidyDocImpl* doc ); 136 | 137 | 138 | /* Save cleaned up file to file/buffer/sink */ 139 | int tidyDocSaveFile( TidyDocImpl* impl, ctmbstr htmlfil ); 140 | int tidyDocSaveStdout( TidyDocImpl* impl ); 141 | int tidyDocSaveString( TidyDocImpl* impl, tmbstr buffer, uint* buflen ); 142 | int tidyDocSaveBuffer( TidyDocImpl* impl, TidyBuffer* outbuf ); 143 | int tidyDocSaveSink( TidyDocImpl* impl, TidyOutputSink* docOut ); 144 | int tidyDocSaveStream( TidyDocImpl* impl, StreamOut* out ); 145 | 146 | #endif /* __TIDY_INT_H__ */ 147 | -------------------------------------------------------------------------------- /elementtidy-1.0-20050212/README: -------------------------------------------------------------------------------- 1 | $Id: README 2275 2005-02-03 18:20:56Z fredrik $ 2 | 3 | ====================== 4 | The elementtidy module 5 | ====================== 6 | 7 | This module provides an ElementTree builder based on the HTML TidyLib 8 | library. For more information, see: 9 | 10 | http://effbot.org/zone/element-tidylib.htm 11 | 12 | For more information on the ElementTree library, see: 13 | 14 | http://effbot.org/zone/element-index.htm 15 | 16 | This distribution includes a snapshot of the HTML Tidy sources. 
You 17 | can get the latest version from: 18 | 19 | http://tidy.sourceforge.net 20 | 21 | Enjoy /F 22 | 23 | fredrik@pythonware.com 24 | http://www.pythonware.com 25 | 26 | -------------------------------------------------------------------- 27 | Changes 28 | -------------------------------------------------------------------- 29 | 30 | (1.0 final released) 31 | 32 | - Improved error checking. The library may now raise IOError 33 | exceptions if the underlying Tidy library fails. 34 | 35 | (1.0 beta 1 released) 36 | 37 | - Use 'ForceOutput' flag to force output even for badly malformed 38 | HTML. 39 | 40 | - Added source encoding support (based on code by Kevin Dangoor). 41 | 42 | (1.0 alpha 3 released) 43 | 44 | - Fixed core dump when pages are broken beyond repair (reported by 45 | many, fix proposed by Brad Clements) 46 | 47 | (1.0 alpha 2 released) 48 | 49 | - Capture error output (available via the 'errlog' attribute on the 50 | parser instance). 51 | 52 | (1.0 alpha 1 released -- initial release) 53 | 54 | -------------------------------------------------------------------- 55 | Software License 56 | -------------------------------------------------------------------- 57 | 58 | The software components in this package are copyrighted, but can all 59 | be used freely in all sorts of applications. 60 | 61 | The _elementtidy binding, and associated Python code, is 62 | 63 | Copyright (c) 2003-2005 by Fredrik Lundh. All rights reserved.
64 | 65 | By obtaining, using, and/or copying this software and/or its 66 | associated documentation, you agree that you have read, understood, 67 | and will comply with the following terms and conditions: 68 | 69 | Permission to use, copy, modify, and distribute this software and its 70 | associated documentation for any purpose and without fee is hereby 71 | granted, provided that the above copyright notice appears in all 72 | copies, and that both that copyright notice and this permission notice 73 | appear in supporting documentation, and that the name of Secret Labs 74 | AB or the author not be used in advertising or publicity pertaining to 75 | distribution of the software without specific, written prior 76 | permission. 77 | 78 | SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO 79 | THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND 80 | FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR 81 | ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 82 | WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 83 | ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT 84 | OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 85 | 86 | -------------------------------------------------------------------- 87 | 88 | The TidyLib library is copyright (c) by 1998-2003 World Wide Web 89 | Consortium. Software license: 90 | 91 | HTML Tidy 92 | 93 | HTML parser and pretty printer 94 | 95 | Copyright (c) 1998-2003 World Wide Web Consortium 96 | (Massachusetts Institute of Technology, European Research 97 | Consortium for Informatics and Mathematics, Keio University). 98 | All Rights Reserved. 
99 | 100 | This software and documentation is provided "as is," and 101 | the copyright holders and contributing author(s) make no 102 | representations or warranties, express or implied, including 103 | but not limited to, warranties of merchantability or fitness 104 | for any particular purpose or that the use of the software or 105 | documentation will not infringe any third party patents, 106 | copyrights, trademarks or other rights. 107 | 108 | The copyright holders and contributing author(s) will not be held 109 | liable for any direct, indirect, special or consequential damages 110 | arising out of any use of the software or documentation, even if 111 | advised of the possibility of such damage. 112 | 113 | Permission is hereby granted to use, copy, modify, and distribute 114 | this source code, or portions hereof, documentation and executables, 115 | for any purpose, without fee, subject to the following restrictions: 116 | 117 | 1. The origin of this source code must not be misrepresented. 118 | 2. Altered versions must be plainly marked as such and must 119 | not be misrepresented as being the original source. 120 | 3. This Copyright notice may not be removed or altered from any 121 | source or altered source distribution. 122 | 123 | The copyright holders and contributing author(s) specifically 124 | permit, without fee, and encourage the use of this source code 125 | as a component for supporting the Hypertext Markup Language in 126 | commercial products. If you use this source code in a product, 127 | acknowledgment is not required but would be appreciated. 
128 | 129 | (from http://tidy.sourceforge.net/license.html) 130 | 131 | -------------------------------------------------------------------- 132 | -------------------------------------------------------------------------------- /elementtidy-1.0-20050212/tidylib/src/attrask.c: -------------------------------------------------------------------------------- 1 | /* attrask.c -- Interrogate attribute type 2 | 3 | (c) 1998-2003 (W3C) MIT, ERCIM, Keio University 4 | See tidy.h for the copyright notice. 5 | 6 | CVS Info: 7 | $LastChangedBy$ 8 | $LastChangedDate$ 9 | $LastChangedRevision$ 10 | 11 | */ 12 | 13 | #include "tidy-int.h" 14 | #include "tidy.h" 15 | #include "attrs.h" 16 | 17 | Bool tidyAttrIsHREF( TidyAttr tattr ) 18 | { 19 | return attrIsHREF( tidyAttrToImpl(tattr) ); 20 | } 21 | Bool tidyAttrIsSRC( TidyAttr tattr ) 22 | { 23 | return attrIsSRC( tidyAttrToImpl(tattr) ); 24 | } 25 | Bool tidyAttrIsID( TidyAttr tattr ) 26 | { 27 | return attrIsID( tidyAttrToImpl(tattr) ); 28 | } 29 | Bool tidyAttrIsNAME( TidyAttr tattr ) 30 | { 31 | return attrIsNAME( tidyAttrToImpl(tattr) ); 32 | } 33 | Bool tidyAttrIsSUMMARY( TidyAttr tattr ) 34 | { 35 | return attrIsSUMMARY( tidyAttrToImpl(tattr) ); 36 | } 37 | Bool tidyAttrIsALT( TidyAttr tattr ) 38 | { 39 | return attrIsALT( tidyAttrToImpl(tattr) ); 40 | } 41 | Bool tidyAttrIsLONGDESC( TidyAttr tattr ) 42 | { 43 | return attrIsLONGDESC( tidyAttrToImpl(tattr) ); 44 | } 45 | Bool tidyAttrIsUSEMAP( TidyAttr tattr ) 46 | { 47 | return attrIsUSEMAP( tidyAttrToImpl(tattr) ); 48 | } 49 | Bool tidyAttrIsISMAP( TidyAttr tattr ) 50 | { 51 | return attrIsISMAP( tidyAttrToImpl(tattr) ); 52 | } 53 | Bool tidyAttrIsLANGUAGE( TidyAttr tattr ) 54 | { 55 | return attrIsLANGUAGE( tidyAttrToImpl(tattr) ); 56 | } 57 | Bool tidyAttrIsTYPE( TidyAttr tattr ) 58 | { 59 | return attrIsTYPE( tidyAttrToImpl(tattr) ); 60 | } 61 | Bool tidyAttrIsVALUE( TidyAttr tattr ) 62 | { 63 | return attrIsVALUE( tidyAttrToImpl(tattr) ); 64 | } 65 | Bool 
tidyAttrIsCONTENT( TidyAttr tattr ) 66 | { 67 | return attrIsCONTENT( tidyAttrToImpl(tattr) ); 68 | } 69 | Bool tidyAttrIsTITLE( TidyAttr tattr ) 70 | { 71 | return attrIsTITLE( tidyAttrToImpl(tattr) ); 72 | } 73 | Bool tidyAttrIsXMLNS( TidyAttr tattr ) 74 | { 75 | return attrIsXMLNS( tidyAttrToImpl(tattr) ); 76 | } 77 | Bool tidyAttrIsDATAFLD( TidyAttr tattr ) 78 | { 79 | return attrIsDATAFLD( tidyAttrToImpl(tattr) ); 80 | } 81 | Bool tidyAttrIsWIDTH( TidyAttr tattr ) 82 | { 83 | return attrIsWIDTH( tidyAttrToImpl(tattr) ); 84 | } 85 | Bool tidyAttrIsHEIGHT( TidyAttr tattr ) 86 | { 87 | return attrIsHEIGHT( tidyAttrToImpl(tattr) ); 88 | } 89 | Bool tidyAttrIsFOR( TidyAttr tattr ) 90 | { 91 | return attrIsFOR( tidyAttrToImpl(tattr) ); 92 | } 93 | Bool tidyAttrIsSELECTED( TidyAttr tattr ) 94 | { 95 | return attrIsSELECTED( tidyAttrToImpl(tattr) ); 96 | } 97 | Bool tidyAttrIsCHECKED( TidyAttr tattr ) 98 | { 99 | return attrIsCHECKED( tidyAttrToImpl(tattr) ); 100 | } 101 | Bool tidyAttrIsLANG( TidyAttr tattr ) 102 | { 103 | return attrIsLANG( tidyAttrToImpl(tattr) ); 104 | } 105 | Bool tidyAttrIsTARGET( TidyAttr tattr ) 106 | { 107 | return attrIsTARGET( tidyAttrToImpl(tattr) ); 108 | } 109 | Bool tidyAttrIsHTTP_EQUIV( TidyAttr tattr ) 110 | { 111 | return attrIsHTTP_EQUIV( tidyAttrToImpl(tattr) ); 112 | } 113 | Bool tidyAttrIsREL( TidyAttr tattr ) 114 | { 115 | return attrIsREL( tidyAttrToImpl(tattr) ); 116 | } 117 | Bool tidyAttrIsEvent( TidyAttr tattr ) 118 | { 119 | return attrIsEvent( tidyAttrToImpl(tattr) ); 120 | } 121 | Bool tidyAttrIsOnMOUSEMOVE( TidyAttr tattr ) 122 | { 123 | return attrIsOnMOUSEMOVE( tidyAttrToImpl(tattr) ); 124 | } 125 | Bool tidyAttrIsOnMOUSEDOWN( TidyAttr tattr ) 126 | { 127 | return attrIsOnMOUSEDOWN( tidyAttrToImpl(tattr) ); 128 | } 129 | Bool tidyAttrIsOnMOUSEUP( TidyAttr tattr ) 130 | { 131 | return attrIsOnMOUSEUP( tidyAttrToImpl(tattr) ); 132 | } 133 | Bool tidyAttrIsOnCLICK( TidyAttr tattr ) 134 | { 135 | return attrIsOnCLICK( 
tidyAttrToImpl(tattr) ); 136 | } 137 | Bool tidyAttrIsOnMOUSEOVER( TidyAttr tattr ) 138 | { 139 | return attrIsOnMOUSEOVER( tidyAttrToImpl(tattr) ); 140 | } 141 | Bool tidyAttrIsOnMOUSEOUT( TidyAttr tattr ) 142 | { 143 | return attrIsOnMOUSEOUT( tidyAttrToImpl(tattr) ); 144 | } 145 | Bool tidyAttrIsOnKEYDOWN( TidyAttr tattr ) 146 | { 147 | return attrIsOnKEYDOWN( tidyAttrToImpl(tattr) ); 148 | } 149 | Bool tidyAttrIsOnKEYUP( TidyAttr tattr ) 150 | { 151 | return attrIsOnKEYUP( tidyAttrToImpl(tattr) ); 152 | } 153 | Bool tidyAttrIsOnKEYPRESS( TidyAttr tattr ) 154 | { 155 | return attrIsOnKEYPRESS( tidyAttrToImpl(tattr) ); 156 | } 157 | Bool tidyAttrIsOnFOCUS( TidyAttr tattr ) 158 | { 159 | return attrIsOnFOCUS( tidyAttrToImpl(tattr) ); 160 | } 161 | Bool tidyAttrIsOnBLUR( TidyAttr tattr ) 162 | { 163 | return attrIsOnBLUR( tidyAttrToImpl(tattr) ); 164 | } 165 | Bool tidyAttrIsBGCOLOR( TidyAttr tattr ) 166 | { 167 | return attrIsBGCOLOR( tidyAttrToImpl(tattr) ); 168 | } 169 | Bool tidyAttrIsLINK( TidyAttr tattr ) 170 | { 171 | return attrIsLINK( tidyAttrToImpl(tattr) ); 172 | } 173 | Bool tidyAttrIsALINK( TidyAttr tattr ) 174 | { 175 | return attrIsALINK( tidyAttrToImpl(tattr) ); 176 | } 177 | Bool tidyAttrIsVLINK( TidyAttr tattr ) 178 | { 179 | return attrIsVLINK( tidyAttrToImpl(tattr) ); 180 | } 181 | Bool tidyAttrIsTEXT( TidyAttr tattr ) 182 | { 183 | return attrIsTEXT( tidyAttrToImpl(tattr) ); 184 | } 185 | Bool tidyAttrIsSTYLE( TidyAttr tattr ) 186 | { 187 | return attrIsSTYLE( tidyAttrToImpl(tattr) ); 188 | } 189 | Bool tidyAttrIsABBR( TidyAttr tattr ) 190 | { 191 | return attrIsABBR( tidyAttrToImpl(tattr) ); 192 | } 193 | Bool tidyAttrIsCOLSPAN( TidyAttr tattr ) 194 | { 195 | return attrIsCOLSPAN( tidyAttrToImpl(tattr) ); 196 | } 197 | Bool tidyAttrIsROWSPAN( TidyAttr tattr ) 198 | { 199 | return attrIsROWSPAN( tidyAttrToImpl(tattr) ); 200 | } 201 | -------------------------------------------------------------------------------- 
/elementtidy-1.0-20050212/tidylib/src/streamio.h:
--------------------------------------------------------------------------------
#ifndef __STREAMIO_H__
#define __STREAMIO_H__

/* streamio.h -- handles character stream I/O

  (c) 1998-2003 (W3C) MIT, ERCIM, Keio University
  See tidy.h for the copyright notice.

  CVS Info :

    $LastChangedBy$
    $LastChangedDate$
    $LastChangedRevision$

  Wrapper around Tidy input source and output sink
  that calls appropriate interfaces, and applies
  necessary char encoding transformations: to/from
  ISO-10646 and/or UTF-8.

*/

#include "forward.h"
#include "buffio.h"
#include "fileio.h"

#ifdef __cplusplus
extern "C"
{
#endif
/* Tag stored in the `iotype` member of both stream structs below.
** Presumably mirrors the File / Buffer / User constructor families
** declared in this header -- confirm against streamio.c.
*/
typedef enum
{
  FileIO,
  BufferIO,
  UserIO
} IOType;

/************************
** Source
************************/

/* Number of entries in _StreamIn.charbuf. */
#define CHARBUF_SIZE 5

/* non-raw input is cleaned up*/
struct _StreamIn
{
    int state; /* FSM for ISO2022 */
    Bool pushed;
    uint charbuf[ CHARBUF_SIZE ];
    int bufpos;
    int tabs;
    int lastcol;
    int curcol;
    int curline;

    int encoding;   /* one of the character encoding ids defined under "Misc" */
    IOType iotype;
    TidyInputSource source;

#ifdef TIDY_WIN32_MLANG_SUPPORT
    ulong mlang;
#endif

    /* Pointer back to document for error reporting */
    TidyDocImpl* doc;
};

/* Input stream constructors: one per kind of source (FILE*, TidyBuffer,
** user-supplied TidyInputSource).  Each takes the owning document and an
** encoding id.
*/
StreamIn* FileInput( TidyDocImpl* doc, FILE* fp, int encoding );
StreamIn* BufferInput( TidyDocImpl* doc, TidyBuffer* content, int encoding );
StreamIn* UserInput( TidyDocImpl* doc, TidyInputSource* source, int encoding );

int ReadBOMEncoding(StreamIn *in);
uint ReadChar( StreamIn* in );
void UngetChar( uint c, StreamIn* in );
uint PopChar( StreamIn *in );
Bool IsEOF( StreamIn* in );


/************************
** Sink
************************/

struct _StreamOut
{
    int encoding;   /* one of the character encoding ids defined under "Misc" */
    int state; /* for ISO 2022 */
    uint nl;

#ifdef TIDY_WIN32_MLANG_SUPPORT
    ulong mlang;
#endif

    IOType iotype;
    TidyOutputSink sink;
};

/* Output stream constructors: one per kind of sink (FILE*, TidyBuffer,
** user-supplied TidyOutputSink).  Each takes an encoding id and a
** newline setting.
*/
StreamOut* FileOutput( FILE* fp, int encoding, uint newln );
StreamOut* BufferOutput( TidyBuffer* buf, int encoding, uint newln );
StreamOut* UserOutput( TidyOutputSink* sink, int encoding, uint newln );

StreamOut* StdErrOutput(void);
StreamOut* StdOutOutput(void);
void ReleaseStreamOut( StreamOut* out );

void WriteChar( uint c, StreamOut* out );
void outBOM( StreamOut *out );

ctmbstr GetEncodingNameFromTidyId(uint id);

/************************
** Misc
************************/

/* character encodings
*/
#define RAW 0
#define ASCII 1
#define LATIN0 2
#define LATIN1 3
#define UTF8 4
#define ISO2022 5
#define MACROMAN 6
#define WIN1252 7
#define IBM858 8

/* The UTF-16 ids exist only when SUPPORT_UTF16_ENCODINGS is non-zero. */
#if SUPPORT_UTF16_ENCODINGS
#define UTF16LE 9
#define UTF16BE 10
#define UTF16 11
#endif

/* Note that Big5 and SHIFTJIS are not converted to ISO 10646 codepoints
** (i.e., to Unicode) before being recoded into UTF-8. This may be
** confusing: usually UTF-8 implies ISO10646 codepoints.
**
** Their numeric ids depend on the build: 12/13 when the UTF-16 ids
** above are defined, otherwise 9/10.  Always compare against the
** symbolic names, never the raw numbers.
*/
#if SUPPORT_ASIAN_ENCODINGS
#if SUPPORT_UTF16_ENCODINGS
#define BIG5 12
#define SHIFTJIS 13
#else
#define BIG5 9
#define SHIFTJIS 10
#endif
#endif

#ifdef TIDY_WIN32_MLANG_SUPPORT
/* hack: windows code page numbers start at 37 */
#define WIN32MLANG 36
#endif

/* states for ISO 2022

 A document in ISO-2022 based encoding uses some ESC sequences called
 "designator" to switch character sets. The designators defined and
 used in ISO-2022-JP are:

 "ESC" + "(" + ? for ISO646 variants

 "ESC" + "$" + ? and
 "ESC" + "$" + "(" + ? for multibyte character sets
*/
#define FSM_ASCII 0
#define FSM_ESC 1
#define FSM_ESCD 2
#define FSM_ESCDP 3
#define FSM_ESCP 4
#define FSM_NONASCII 5


/* char encoding used when replacing illegal SGML chars,
** regardless of specified encoding. Set at compile time
** to either Windows or Mac.
*/
extern const int ReplacementCharEncoding;

/* Function for conversion from Windows-1252 to Unicode */
uint DecodeWin1252(uint c);

/* Function to convert from MacRoman to Unicode */
uint DecodeMacRoman(uint c);

/* Function for conversion from OS/2-850 to Unicode */
/* NOTE(review): the encoding id above is named IBM858 while this decoder
** is named for code page 850 -- confirm the naming is intentional.
*/
uint DecodeIbm850(uint c);

/* Function for conversion from Latin0 to Unicode */
uint DecodeLatin0(uint c);

/* Function to convert from Symbol Font chars to Unicode */
uint DecodeSymbolFont(uint c);
#ifdef __cplusplus
}
#endif


/* Use numeric constants as opposed to escape chars (\r, \n)
** to avoid conflict with Mac compilers that may re-define these.
*/
#define CR 0xD
#define LF 0xA

/* Platform default for the newline option: CR on classic Mac OS,
** CRLF on Win32 and OS/2, LF everywhere else.
*/
#if defined(MAC_OS_CLASSIC)
#define DEFAULT_NL_CONFIG TidyCR
#elif defined(_WIN32) || defined(OS2_OS)
#define DEFAULT_NL_CONFIG TidyCRLF
#else
#define DEFAULT_NL_CONFIG TidyLF
#endif


#endif /* __STREAMIO_H__ */
--------------------------------------------------------------------------------
/elementtidy-1.0-20050212/tidylib/src/tmbstr.c:
--------------------------------------------------------------------------------
/* tmbstr.c -- Tidy string utility functions

  (c) 1998-2003 (W3C) MIT, ERCIM, Keio University
  See tidy.h for the copyright notice.
5 | 6 | CVS Info : 7 | 8 | $LastChangedBy$ 9 | $LastChangedDate$ 10 | $LastChangedRevision$ 11 | 12 | */ 13 | 14 | #include "tmbstr.h" 15 | #include "lexer.h" 16 | 17 | /* like strdup but using MemAlloc */ 18 | tmbstr tmbstrdup( ctmbstr str ) 19 | { 20 | tmbstr s = NULL; 21 | if ( str ) 22 | { 23 | uint len = tmbstrlen( str ); 24 | tmbstr cp = s = (tmbstr) MemAlloc( 1+len ); 25 | while ( *cp++ = *str++ ) 26 | /**/; 27 | } 28 | return s; 29 | } 30 | 31 | /* like strndup but using MemAlloc */ 32 | tmbstr tmbstrndup( ctmbstr str, uint len ) 33 | { 34 | tmbstr s = NULL; 35 | if ( str && len > 0 ) 36 | { 37 | tmbstr cp = s = (tmbstr) MemAlloc( 1+len ); 38 | while ( len-- > 0 && (*cp++ = *str++) ) 39 | /**/; 40 | *cp = 0; 41 | } 42 | return s; 43 | } 44 | 45 | /* exactly same as strncpy */ 46 | uint tmbstrncpy( tmbstr s1, ctmbstr s2, uint size ) 47 | { 48 | if ( s1 != NULL && s2 != NULL ) 49 | { 50 | tmbstr cp = s1; 51 | while ( *s2 && --size ) /* Predecrement: reserve byte */ 52 | *cp++ = *s2++; /* for NULL terminator. */ 53 | *cp = 0; 54 | } 55 | return size; 56 | } 57 | 58 | /* Allows expressions like: cp += tmbstrcpy( cp, "joebob" ); 59 | */ 60 | uint tmbstrcpy( tmbstr s1, ctmbstr s2 ) 61 | { 62 | uint ncpy = 0; 63 | while ( *s1++ = *s2++ ) 64 | ++ncpy; 65 | return ncpy; 66 | } 67 | 68 | /* Allows expressions like: cp += tmbstrcat( cp, "joebob" ); 69 | */ 70 | uint tmbstrcat( tmbstr s1, ctmbstr s2 ) 71 | { 72 | uint ncpy = 0; 73 | while ( *s1 ) 74 | ++s1; 75 | 76 | while ( *s1++ = *s2++ ) 77 | ++ncpy; 78 | return ncpy; 79 | } 80 | 81 | /* exactly same as strcmp */ 82 | int tmbstrcmp( ctmbstr s1, ctmbstr s2 ) 83 | { 84 | int c; 85 | while ((c = *s1) == *s2) 86 | { 87 | if (c == '\0') 88 | return 0; 89 | 90 | ++s1; 91 | ++s2; 92 | } 93 | 94 | return (*s1 > *s2 ? 
1 : -1); 95 | } 96 | 97 | /* returns byte count, not char count */ 98 | uint tmbstrlen( ctmbstr str ) 99 | { 100 | uint len = 0; 101 | if ( str ) 102 | { 103 | while ( *str++ ) 104 | ++len; 105 | } 106 | return len; 107 | } 108 | 109 | /* 110 | MS C 4.2 doesn't include strcasecmp. 111 | Note that tolower and toupper won't 112 | work on chars > 127. 113 | 114 | Neither does ToLower()! 115 | */ 116 | int tmbstrcasecmp( ctmbstr s1, ctmbstr s2 ) 117 | { 118 | uint c; 119 | 120 | while (c = (uint)(*s1), ToLower(c) == ToLower((uint)(*s2))) 121 | { 122 | if (c == '\0') 123 | return 0; 124 | 125 | ++s1; 126 | ++s2; 127 | } 128 | 129 | return (*s1 > *s2 ? 1 : -1); 130 | } 131 | 132 | int tmbstrncmp( ctmbstr s1, ctmbstr s2, uint n ) 133 | { 134 | uint c; 135 | 136 | while ((c = (byte)*s1) == (byte)*s2) 137 | { 138 | if (c == '\0') 139 | return 0; 140 | 141 | if (n == 0) 142 | return 0; 143 | 144 | ++s1; 145 | ++s2; 146 | --n; 147 | } 148 | 149 | if (n == 0) 150 | return 0; 151 | 152 | return (*s1 > *s2 ? 1 : -1); 153 | } 154 | 155 | int tmbstrncasecmp( ctmbstr s1, ctmbstr s2, uint n ) 156 | { 157 | uint c; 158 | 159 | while ( (c = tolower(*s1)) == (uint) tolower(*s2) ) 160 | { 161 | if (c == '\0') 162 | return 0; 163 | 164 | if (n == 0) 165 | return 0; 166 | 167 | ++s1; 168 | ++s2; 169 | --n; 170 | } 171 | 172 | if (n == 0) 173 | return 0; 174 | 175 | return (*s1 > *s2 ? 1 : -1); 176 | } 177 | 178 | /* return offset of cc from beginning of s1, 179 | ** -1 if not found. 
180 | */ 181 | int tmbstrnchr( ctmbstr s1, uint maxlen, tmbchar cc ) 182 | { 183 | int i; 184 | ctmbstr cp = s1; 185 | 186 | for ( i = 0; (uint)i < maxlen; ++i, ++cp ) 187 | { 188 | if ( *cp == cc ) 189 | return i; 190 | } 191 | 192 | return -1; 193 | } 194 | 195 | ctmbstr tmbsubstrn( ctmbstr s1, uint len1, ctmbstr s2 ) 196 | { 197 | uint len2 = tmbstrlen(s2); 198 | int ix, diff = len1 - len2; 199 | 200 | for ( ix = 0; ix <= diff; ++ix ) 201 | { 202 | if ( tmbstrncmp(s1+ix, s2, len2) == 0 ) 203 | return (ctmbstr) s1+ix; 204 | } 205 | return NULL; 206 | } 207 | 208 | ctmbstr tmbsubstrncase( ctmbstr s1, uint len1, ctmbstr s2 ) 209 | { 210 | uint len2 = tmbstrlen(s2); 211 | int ix, diff = len1 - len2; 212 | 213 | for ( ix = 0; ix <= diff; ++ix ) 214 | { 215 | if ( tmbstrncasecmp(s1+ix, s2, len2) == 0 ) 216 | return (ctmbstr) s1+ix; 217 | } 218 | return NULL; 219 | } 220 | 221 | ctmbstr tmbsubstr( ctmbstr s1, ctmbstr s2 ) 222 | { 223 | uint len1 = tmbstrlen(s1), len2 = tmbstrlen(s2); 224 | int ix, diff = len1 - len2; 225 | 226 | for ( ix = 0; ix <= diff; ++ix ) 227 | { 228 | if ( tmbstrncasecmp(s1+ix, s2, len2) == 0 ) 229 | return (ctmbstr) s1+ix; 230 | } 231 | return NULL; 232 | } 233 | 234 | /* Transform ASCII chars in string to lower case */ 235 | tmbstr tmbstrtolower( tmbstr s ) 236 | { 237 | tmbstr cp; 238 | for ( cp=s; *cp; ++cp ) 239 | *cp = (tmbchar) ToLower( *cp ); 240 | return s; 241 | } 242 | 243 | /* Transform ASCII chars in string to upper case */ 244 | tmbstr tmbstrtoupper(tmbstr s) 245 | { 246 | tmbstr cp; 247 | 248 | for (cp = s; *cp; ++cp) 249 | *cp = (tmbchar)ToUpper(*cp); 250 | 251 | return s; 252 | } 253 | 254 | Bool tmbsamefile( ctmbstr filename1, ctmbstr filename2 ) 255 | { 256 | #if FILENAMES_CASE_SENSITIVE 257 | return ( tmbstrcmp( filename1, filename2 ) == 0 ); 258 | #else 259 | return ( tmbstrcasecmp( filename1, filename2 ) == 0 ); 260 | #endif 261 | } 262 | -------------------------------------------------------------------------------- 
/elementtidy-1.0-20050212/tidylib/src/config.h:
--------------------------------------------------------------------------------
#ifndef __CONFIG_H__
#define __CONFIG_H__

/* config.h -- read config file and manage config properties

  (c) 1998-2003 (W3C) MIT, ERCIM, Keio University
  See tidy.h for the copyright notice.

  CVS Info :

    $LastChangedBy$
    $LastChangedDate$
    $LastChangedRevision$

  config files associate a property name with a value.

  // comments can start at the beginning of a line
  # comments can start at the beginning of a line
  name: short values fit onto one line
  name: a really long value that
   continues on the next line

  property names are case insensitive and should be less than
  60 characters in length and must start at the beginning of
  the line, as whitespace at the start of a line signifies a
  line continuation.

*/

#include "forward.h"
#include "tidy.h"
#include "streamio.h"

struct _tidy_option;
typedef struct _tidy_option TidyOptionImpl;

/* Signature shared by every option value parser declared at the end of
** this header: it receives the document and the option being parsed.
*/
typedef Bool (ParseProperty)( TidyDocImpl* doc, const TidyOptionImpl* opt );

/* Static description of one configuration option. */
struct _tidy_option
{
    TidyOptionId id;
    TidyConfigCategory category; /* put 'em in groups */
    ctmbstr name; /* property name */
    TidyOptionType type; /* string, int or bool */
    ulong dflt; /* factory default */
    ParseProperty* parser; /* parsing method, read-only if NULL */
    const ctmbstr* pickList; /* pick list */
};


/* Per-document configuration state.  Both arrays are indexed by option
** id (see the cfg() macro below).
*/
typedef struct _tidy_config
{
    ulong value[ N_TIDY_OPTIONS + 1 ]; /* current config values */
    ulong snapshot[ N_TIDY_OPTIONS + 1 ]; /* Snapshot of values to be restored later */

    /* track what tags user has defined to eliminate unnecessary searches */
    uint defined_tags;

    uint c; /* current char in input stream */
    StreamIn* cfgIn; /* current input source */

} TidyConfigImpl;


/* Option lookup, by name or by id. */
const TidyOptionImpl* lookupOption( ctmbstr optnam );
const TidyOptionImpl* getOption( TidyOptionId optId );

/* Iteration over all options of a document. */
TidyIterator getOptionList( TidyDocImpl* doc );
const TidyOptionImpl* getNextOption( TidyDocImpl* doc, TidyIterator* iter );

/* Iteration over the pick list of one option. */
TidyIterator getOptionPickList( const TidyOptionImpl* option );
ctmbstr getNextOptionPick( const TidyOptionImpl* option, TidyIterator* iter );

void InitConfig( TidyDocImpl* doc );
void FreeConfig( TidyDocImpl* doc );

/* Typed setters for a single option. */
Bool SetOptionValue( TidyDocImpl* doc, TidyOptionId optId, ctmbstr val );
Bool SetOptionInt( TidyDocImpl* doc, TidyOptionId optId, ulong val );
Bool SetOptionBool( TidyDocImpl* doc, TidyOptionId optId, Bool val );

Bool ResetOptionToDefault( TidyDocImpl* doc, TidyOptionId optId );
void ResetConfigToDefault( TidyDocImpl* doc );
void TakeConfigSnapshot( TidyDocImpl* doc );
void ResetConfigToSnapshot( TidyDocImpl* doc );

void CopyConfig( TidyDocImpl* docTo, TidyDocImpl* docFrom );


#ifdef SUPPORT_GETPWNAM
/*
 Tod Lewis contributed this code for expanding
 ~/foo or ~your/foo according to $HOME and your
 user name. This will only work on Unix systems.
*/
ctmbstr ExpandTilde(ctmbstr filename);
#endif /* SUPPORT_GETPWNAM */

int ParseConfigFile( TidyDocImpl* doc, ctmbstr cfgfil );
int ParseConfigFileEnc( TidyDocImpl* doc,
                        ctmbstr cfgfil, ctmbstr charenc );

int SaveConfigFile( TidyDocImpl* doc, ctmbstr cfgfil );
int SaveConfigSink( TidyDocImpl* doc, TidyOutputSink* sink );

/* returns false if unknown option, missing parameter, or
   option doesn't use parameter
*/
Bool ParseConfigOption( TidyDocImpl* doc, ctmbstr optnam, ctmbstr optVal );
Bool ParseConfigValue( TidyDocImpl* doc, TidyOptionId optId, ctmbstr optVal );

/* ensure that char encodings are self consistent */
Bool AdjustCharEncoding( TidyDocImpl* doc, int encoding );

/* ensure that config is self consistent */
void AdjustConfig( TidyDocImpl* doc );

Bool ConfigDiffThanDefault( TidyDocImpl* doc );
Bool ConfigDiffThanSnapshot( TidyDocImpl* doc );

/* Conversion between an encoding name and its integer id. */
int CharEncodingId( ctmbstr charenc );
ctmbstr CharEncodingName( int encoding );

void SetEmacsFilename( TidyDocImpl* doc, ctmbstr filename );


#ifdef _DEBUG

/* Debug lookup functions will be type-safe and assert option type match */
ulong _cfgGet( TidyDocImpl* doc, TidyOptionId optId );
Bool _cfgGetBool( TidyDocImpl* doc, TidyOptionId optId );
ctmbstr _cfgGetString( TidyDocImpl* doc, TidyOptionId optId );

#define cfg(doc, id) _cfgGet( (doc), (id) )
#define cfgBool(doc, id) _cfgGetBool( (doc), (id) )
#define cfgStr(doc, id) _cfgGetString( (doc), (id) )

#else

/* Release build macros for speed */
/* These index config.value[] directly with no checking; cfgBool and
** cfgStr simply cast the stored ulong to Bool / ctmbstr.
*/
#define cfg(doc, id) ((doc)->config.value[ (id) ])
#define cfgBool(doc, id) ((Bool) cfg(doc, id))
#define cfgStr(doc, id) ((ctmbstr) cfg(doc, id))

#endif /* _DEBUG */



/* parser for integer values */
ParseProperty ParseInt;

/* parser for 't'/'f', 'true'/'false', 'y'/'n', 'yes'/'no' or '1'/'0' */
ParseProperty ParseBool;

/* a string excluding whitespace */
ParseProperty ParseName;

/* a CSS1 selector - CSS class naming for -clean option */
ParseProperty ParseCSS1Selector;

/* a string including whitespace */
ParseProperty ParseString;

/* a space or comma separated list of tag names */
ParseProperty ParseTagNames;

/* RAW, ASCII, LATIN0, LATIN1, UTF8, ISO2022, MACROMAN,
   WIN1252, IBM858, UTF16LE, UTF16BE, UTF16, BIG5, SHIFTJIS
*/
ParseProperty ParseCharEnc;
ParseProperty ParseNewline;

/* specific to the indent option - Bool and 'auto' */
ParseProperty ParseIndent;

/* omit | auto | strict | loose | <fpi> */
ParseProperty ParseDocType;

/* keep-first or keep-last? */
ParseProperty ParseRepeatAttr;

/* specific to the output-bom option - Bool and 'auto' */
ParseProperty ParseBOM;

#endif /* __CONFIG_H__ */
--------------------------------------------------------------------------------
/elementtidy-1.0-20050212/tidylib/src/attrget.c:
--------------------------------------------------------------------------------
/* attrget.c -- Locate attribute value by type

  (c) 1998-2003 (W3C) MIT, ERCIM, Keio University
  See tidy.h for the copyright notice.
5 | 6 | CVS Info: 7 | $LastChangedBy$ 8 | $LastChangedDate$ 9 | $LastChangedRevision$ 10 | 11 | */ 12 | 13 | #include "tidy-int.h" 14 | #include "tags.h" 15 | #include "attrs.h" 16 | #include "tidy.h" 17 | 18 | TidyAttr tidyAttrGetHREF( TidyNode tnod ) 19 | { 20 | return tidyImplToAttr( attrGetHREF( tidyNodeToImpl(tnod) ) ); 21 | } 22 | TidyAttr tidyAttrGetSRC( TidyNode tnod ) 23 | { 24 | return tidyImplToAttr( attrGetSRC( tidyNodeToImpl(tnod) ) ); 25 | } 26 | TidyAttr tidyAttrGetID( TidyNode tnod ) 27 | { 28 | return tidyImplToAttr( attrGetID( tidyNodeToImpl(tnod) ) ); 29 | } 30 | TidyAttr tidyAttrGetNAME( TidyNode tnod ) 31 | { 32 | return tidyImplToAttr( attrGetNAME( tidyNodeToImpl(tnod) ) ); 33 | } 34 | TidyAttr tidyAttrGetSUMMARY( TidyNode tnod ) 35 | { 36 | return tidyImplToAttr( attrGetSUMMARY( tidyNodeToImpl(tnod) ) ); 37 | } 38 | TidyAttr tidyAttrGetALT( TidyNode tnod ) 39 | { 40 | return tidyImplToAttr( attrGetALT( tidyNodeToImpl(tnod) ) ); 41 | } 42 | TidyAttr tidyAttrGetLONGDESC( TidyNode tnod ) 43 | { 44 | return tidyImplToAttr( attrGetLONGDESC( tidyNodeToImpl(tnod) ) ); 45 | } 46 | TidyAttr tidyAttrGetUSEMAP( TidyNode tnod ) 47 | { 48 | return tidyImplToAttr( attrGetUSEMAP( tidyNodeToImpl(tnod) ) ); 49 | } 50 | TidyAttr tidyAttrGetISMAP( TidyNode tnod ) 51 | { 52 | return tidyImplToAttr( attrGetISMAP( tidyNodeToImpl(tnod) ) ); 53 | } 54 | TidyAttr tidyAttrGetLANGUAGE( TidyNode tnod ) 55 | { 56 | return tidyImplToAttr( attrGetLANGUAGE( tidyNodeToImpl(tnod) ) ); 57 | } 58 | TidyAttr tidyAttrGetTYPE( TidyNode tnod ) 59 | { 60 | return tidyImplToAttr( attrGetTYPE( tidyNodeToImpl(tnod) ) ); 61 | } 62 | TidyAttr tidyAttrGetVALUE( TidyNode tnod ) 63 | { 64 | return tidyImplToAttr( attrGetVALUE( tidyNodeToImpl(tnod) ) ); 65 | } 66 | TidyAttr tidyAttrGetCONTENT( TidyNode tnod ) 67 | { 68 | return tidyImplToAttr( attrGetCONTENT( tidyNodeToImpl(tnod) ) ); 69 | } 70 | TidyAttr tidyAttrGetTITLE( TidyNode tnod ) 71 | { 72 | return tidyImplToAttr( attrGetTITLE( 
tidyNodeToImpl(tnod) ) ); 73 | } 74 | TidyAttr tidyAttrGetXMLNS( TidyNode tnod ) 75 | { 76 | return tidyImplToAttr( attrGetXMLNS( tidyNodeToImpl(tnod) ) ); 77 | } 78 | TidyAttr tidyAttrGetDATAFLD( TidyNode tnod ) 79 | { 80 | return tidyImplToAttr( attrGetDATAFLD( tidyNodeToImpl(tnod) ) ); 81 | } 82 | TidyAttr tidyAttrGetWIDTH( TidyNode tnod ) 83 | { 84 | return tidyImplToAttr( attrGetWIDTH( tidyNodeToImpl(tnod) ) ); 85 | } 86 | TidyAttr tidyAttrGetHEIGHT( TidyNode tnod ) 87 | { 88 | return tidyImplToAttr( attrGetHEIGHT( tidyNodeToImpl(tnod) ) ); 89 | } 90 | TidyAttr tidyAttrGetFOR( TidyNode tnod ) 91 | { 92 | return tidyImplToAttr( attrGetFOR( tidyNodeToImpl(tnod) ) ); 93 | } 94 | TidyAttr tidyAttrGetSELECTED( TidyNode tnod ) 95 | { 96 | return tidyImplToAttr( attrGetSELECTED( tidyNodeToImpl(tnod) ) ); 97 | } 98 | TidyAttr tidyAttrGetCHECKED( TidyNode tnod ) 99 | { 100 | return tidyImplToAttr( attrGetCHECKED( tidyNodeToImpl(tnod) ) ); 101 | } 102 | TidyAttr tidyAttrGetLANG( TidyNode tnod ) 103 | { 104 | return tidyImplToAttr( attrGetLANG( tidyNodeToImpl(tnod) ) ); 105 | } 106 | TidyAttr tidyAttrGetTARGET( TidyNode tnod ) 107 | { 108 | return tidyImplToAttr( attrGetTARGET( tidyNodeToImpl(tnod) ) ); 109 | } 110 | TidyAttr tidyAttrGetHTTP_EQUIV( TidyNode tnod ) 111 | { 112 | return tidyImplToAttr( attrGetHTTP_EQUIV( tidyNodeToImpl(tnod) ) ); 113 | } 114 | TidyAttr tidyAttrGetREL( TidyNode tnod ) 115 | { 116 | return tidyImplToAttr( attrGetREL( tidyNodeToImpl(tnod) ) ); 117 | } 118 | 119 | TidyAttr tidyAttrGetOnMOUSEMOVE( TidyNode tnod ) 120 | { 121 | return tidyImplToAttr( attrGetOnMOUSEMOVE( tidyNodeToImpl(tnod) ) ); 122 | } 123 | TidyAttr tidyAttrGetOnMOUSEDOWN( TidyNode tnod ) 124 | { 125 | return tidyImplToAttr( attrGetOnMOUSEDOWN( tidyNodeToImpl(tnod) ) ); 126 | } 127 | TidyAttr tidyAttrGetOnMOUSEUP( TidyNode tnod ) 128 | { 129 | return tidyImplToAttr( attrGetOnMOUSEUP( tidyNodeToImpl(tnod) ) ); 130 | } 131 | TidyAttr tidyAttrGetOnCLICK( TidyNode tnod ) 132 | { 
133 | return tidyImplToAttr( attrGetOnCLICK( tidyNodeToImpl(tnod) ) ); 134 | } 135 | TidyAttr tidyAttrGetOnMOUSEOVER( TidyNode tnod ) 136 | { 137 | return tidyImplToAttr( attrGetOnMOUSEOVER( tidyNodeToImpl(tnod) ) ); 138 | } 139 | TidyAttr tidyAttrGetOnMOUSEOUT( TidyNode tnod ) 140 | { 141 | return tidyImplToAttr( attrGetOnMOUSEOUT( tidyNodeToImpl(tnod) ) ); 142 | } 143 | TidyAttr tidyAttrGetOnKEYDOWN( TidyNode tnod ) 144 | { 145 | return tidyImplToAttr( attrGetOnKEYDOWN( tidyNodeToImpl(tnod) ) ); 146 | } 147 | TidyAttr tidyAttrGetOnKEYUP( TidyNode tnod ) 148 | { 149 | return tidyImplToAttr( attrGetOnKEYUP( tidyNodeToImpl(tnod) ) ); 150 | } 151 | TidyAttr tidyAttrGetOnKEYPRESS( TidyNode tnod ) 152 | { 153 | return tidyImplToAttr( attrGetOnKEYPRESS( tidyNodeToImpl(tnod) ) ); 154 | } 155 | TidyAttr tidyAttrGetOnFOCUS( TidyNode tnod ) 156 | { 157 | return tidyImplToAttr( attrGetOnFOCUS( tidyNodeToImpl(tnod) ) ); 158 | } 159 | TidyAttr tidyAttrGetOnBLUR( TidyNode tnod ) 160 | { 161 | return tidyImplToAttr( attrGetOnBLUR( tidyNodeToImpl(tnod) ) ); 162 | } 163 | TidyAttr tidyAttrGetBGCOLOR( TidyNode tnod ) 164 | { 165 | return tidyImplToAttr( attrGetBGCOLOR( tidyNodeToImpl(tnod) ) ); 166 | } 167 | TidyAttr tidyAttrGetLINK( TidyNode tnod ) 168 | { 169 | return tidyImplToAttr( attrGetLINK( tidyNodeToImpl(tnod) ) ); 170 | } 171 | TidyAttr tidyAttrGetALINK( TidyNode tnod ) 172 | { 173 | return tidyImplToAttr( attrGetALINK( tidyNodeToImpl(tnod) ) ); 174 | } 175 | TidyAttr tidyAttrGetVLINK( TidyNode tnod ) 176 | { 177 | return tidyImplToAttr( attrGetVLINK( tidyNodeToImpl(tnod) ) ); 178 | } 179 | 180 | TidyAttr tidyAttrGetTEXT( TidyNode tnod ) 181 | { 182 | return tidyImplToAttr( attrGetTEXT( tidyNodeToImpl(tnod) ) ); 183 | } 184 | TidyAttr tidyAttrGetSTYLE( TidyNode tnod ) 185 | { 186 | return tidyImplToAttr( attrGetSTYLE( tidyNodeToImpl(tnod) ) ); 187 | } 188 | TidyAttr tidyAttrGetABBR( TidyNode tnod ) 189 | { 190 | return tidyImplToAttr( attrGetABBR( tidyNodeToImpl(tnod) ) 
); 191 | } 192 | TidyAttr tidyAttrGetCOLSPAN( TidyNode tnod ) 193 | { 194 | return tidyImplToAttr( attrGetCOLSPAN( tidyNodeToImpl(tnod) ) ); 195 | } 196 | TidyAttr tidyAttrGetROWSPAN( TidyNode tnod ) 197 | { 198 | return tidyImplToAttr( attrGetROWSPAN( tidyNodeToImpl(tnod) ) ); 199 | } 200 | -------------------------------------------------------------------------------- /elementtidy-1.0-20050212/tidylib/src/message.h: -------------------------------------------------------------------------------- 1 | #ifndef __MESSAGE_H__ 2 | #define __MESSAGE_H__ 3 | 4 | /* message.h -- general message writing routines 5 | 6 | (c) 1998-2003 (W3C) MIT, ERCIM, Keio University 7 | See tidy.h for the copyright notice. 8 | 9 | CVS Info : 10 | 11 | $LastChangedBy$ 12 | $LastChangedDate$ 13 | $LastChangedRevision$ 14 | 15 | */ 16 | 17 | #include "forward.h" 18 | #include "tidy.h" /* For TidyReportLevel */ 19 | 20 | /* General message writing routines. 21 | ** Each message is a single warning, error, etc. 22 | ** 23 | ** This routine will keep track of counts and, 24 | ** if the caller has set a filter, it will be 25 | ** called. The new preferred way of handling 26 | ** Tidy diagnostics output is either a) define 27 | ** a new output sink or b) install a message 28 | ** filter routine. 29 | ** 30 | ** Keeps track of ShowWarnings, ShowErrors, etc. 31 | */ 32 | 33 | ctmbstr ReleaseDate(void); 34 | 35 | /* Reports error at current Lexer line/column. */ 36 | void message( TidyDocImpl* doc, TidyReportLevel level, ctmbstr msg, ... ); 37 | 38 | /* Reports error at node line/column. */ 39 | void messageNode( TidyDocImpl* doc, TidyReportLevel level, 40 | Node* node, ctmbstr msg, ... ); 41 | 42 | /* Reports error at given line/column. */ 43 | void messageLexer( TidyDocImpl* doc, TidyReportLevel level, 44 | ctmbstr msg, ... ); 45 | 46 | /* For general reporting. Emits nothing if --quiet yes */ 47 | void tidy_out( TidyDocImpl* doc, ctmbstr msg, ... 
); 48 | 49 | 50 | void ShowVersion( TidyDocImpl* doc ); 51 | void ReportUnknownOption( TidyDocImpl* doc, ctmbstr option ); 52 | void ReportBadArgument( TidyDocImpl* doc, ctmbstr option ); 53 | void NeedsAuthorIntervention( TidyDocImpl* doc ); 54 | 55 | void HelloMessage( TidyDocImpl* doc, ctmbstr date, ctmbstr filename ); 56 | void ReportMarkupVersion( TidyDocImpl* doc ); 57 | void ReportNumWarnings( TidyDocImpl* doc ); 58 | 59 | void GeneralInfo( TidyDocImpl* doc ); 60 | void UnknownOption( TidyDocImpl* doc, char c ); 61 | void UnknownFile( TidyDocImpl* doc, ctmbstr program, ctmbstr file ); 62 | void FileError( TidyDocImpl* doc, ctmbstr file, TidyReportLevel level ); 63 | 64 | void ErrorSummary( TidyDocImpl* doc ); 65 | 66 | void ReportEncodingWarning(TidyDocImpl* doc, uint code, uint encoding); 67 | void ReportEncodingError(TidyDocImpl* doc, uint code, uint c, Bool discarded); 68 | void ReportEntityError( TidyDocImpl* doc, uint code, ctmbstr entity, int c ); 69 | void ReportAttrError( TidyDocImpl* doc, Node* node, AttVal* av, uint code ); 70 | void ReportMissingAttr( TidyDocImpl* doc, Node* node, ctmbstr name ); 71 | 72 | void ReportNotice(TidyDocImpl* doc, Node *element, Node *node, uint code); 73 | void ReportWarning(TidyDocImpl* doc, Node *element, Node *node, uint code); 74 | void ReportError(TidyDocImpl* doc, Node* element, Node* node, uint code); 75 | void ReportFatal(TidyDocImpl* doc, Node* element, Node* node, uint code); 76 | 77 | /* error codes for entities/numeric character references */ 78 | 79 | #define MISSING_SEMICOLON 1 80 | #define MISSING_SEMICOLON_NCR 2 81 | #define UNKNOWN_ENTITY 3 82 | #define UNESCAPED_AMPERSAND 4 83 | #define APOS_UNDEFINED 5 84 | 85 | /* error codes for element messages */ 86 | 87 | #define MISSING_ENDTAG_FOR 6 88 | #define MISSING_ENDTAG_BEFORE 7 89 | #define DISCARDING_UNEXPECTED 8 90 | #define NESTED_EMPHASIS 9 91 | #define NON_MATCHING_ENDTAG 10 92 | #define TAG_NOT_ALLOWED_IN 11 93 | #define MISSING_STARTTAG 12 94 | 
#define UNEXPECTED_ENDTAG 13 95 | #define USING_BR_INPLACE_OF 14 96 | #define INSERTING_TAG 15 97 | #define SUSPECTED_MISSING_QUOTE 16 98 | #define MISSING_TITLE_ELEMENT 17 99 | #define DUPLICATE_FRAMESET 18 100 | #define CANT_BE_NESTED 19 101 | #define OBSOLETE_ELEMENT 20 102 | #define PROPRIETARY_ELEMENT 21 103 | #define UNKNOWN_ELEMENT 22 104 | #define TRIM_EMPTY_ELEMENT 23 105 | #define COERCE_TO_ENDTAG 24 106 | #define ILLEGAL_NESTING 25 107 | #define NOFRAMES_CONTENT 26 108 | #define CONTENT_AFTER_BODY 27 109 | #define INCONSISTENT_VERSION 28 110 | #define MALFORMED_COMMENT 29 111 | #define BAD_COMMENT_CHARS 30 112 | #define BAD_XML_COMMENT 31 113 | #define BAD_CDATA_CONTENT 32 114 | #define INCONSISTENT_NAMESPACE 33 115 | #define DOCTYPE_AFTER_TAGS 34 116 | #define MALFORMED_DOCTYPE 35 117 | #define UNEXPECTED_END_OF_FILE 36 118 | #define DTYPE_NOT_UPPER_CASE 37 119 | #define TOO_MANY_ELEMENTS 38 120 | #define UNESCAPED_ELEMENT 39 121 | #define NESTED_QUOTATION 40 122 | #define ELEMENT_NOT_EMPTY 41 123 | #define ENCODING_IO_CONFLICT 42 124 | #define MIXED_CONTENT_IN_BLOCK 43 125 | #define MISSING_DOCTYPE 44 126 | #define SPACE_PRECEDING_XMLDECL 45 127 | #define TOO_MANY_ELEMENTS_IN 46 128 | #define UNEXPECTED_ENDTAG_IN 47 129 | #define REPLACING_ELEMENT 83 130 | #define REPLACING_UNEX_ELEMENT 84 131 | #define COERCE_TO_ENDTAG_WARN 85 /* last */ 132 | 133 | /* error codes used for attribute messages */ 134 | 135 | #define UNKNOWN_ATTRIBUTE 48 136 | #define INSERTING_ATTRIBUTE 49 137 | #define MISSING_ATTR_VALUE 50 138 | #define BAD_ATTRIBUTE_VALUE 51 139 | #define UNEXPECTED_GT 52 140 | #define PROPRIETARY_ATTRIBUTE 53 141 | #define PROPRIETARY_ATTR_VALUE 54 142 | #define REPEATED_ATTRIBUTE 55 143 | #define MISSING_IMAGEMAP 56 144 | #define XML_ATTRIBUTE_VALUE 57 145 | #define UNEXPECTED_QUOTEMARK 58 146 | #define MISSING_QUOTEMARK 59 147 | #define ID_NAME_MISMATCH 60 148 | 149 | #define BACKSLASH_IN_URI 61 150 | #define FIXED_BACKSLASH 62 151 | #define 
ILLEGAL_URI_REFERENCE 63 152 | #define ESCAPED_ILLEGAL_URI 64 153 | 154 | #define NEWLINE_IN_URI 65 155 | #define ANCHOR_NOT_UNIQUE 66 156 | 157 | #define JOINING_ATTRIBUTE 68 158 | #define UNEXPECTED_EQUALSIGN 69 159 | #define ATTR_VALUE_NOT_LCASE 70 160 | #define XML_ID_SYNTAX 71 161 | 162 | #define INVALID_ATTRIBUTE 72 163 | 164 | #define BAD_ATTRIBUTE_VALUE_REPLACED 73 165 | 166 | #define INVALID_XML_ID 74 167 | #define UNEXPECTED_END_OF_FILE_ATTR 75 168 | 169 | 170 | /* character encoding errors */ 171 | 172 | #define VENDOR_SPECIFIC_CHARS 76 173 | #define INVALID_SGML_CHARS 77 174 | #define INVALID_UTF8 78 175 | #define INVALID_UTF16 79 176 | #define ENCODING_MISMATCH 80 177 | #define INVALID_URI 81 178 | #define INVALID_NCR 82 179 | 180 | /* accessibility flaws */ 181 | 182 | #define MISSING_IMAGE_ALT 1 183 | #define MISSING_LINK_ALT 2 184 | #define MISSING_SUMMARY 4 185 | #define MISSING_IMAGE_MAP 8 186 | #define USING_FRAMES 16 187 | #define USING_NOFRAMES 32 188 | 189 | /* presentation flaws */ 190 | 191 | #define USING_SPACER 1 192 | #define USING_LAYER 2 193 | #define USING_NOBR 4 194 | #define USING_FONT 8 195 | #define USING_BODY 16 196 | 197 | #define REPLACED_CHAR 0 198 | #define DISCARDED_CHAR 1 199 | 200 | /* badchar bit field */ 201 | 202 | #define BC_VENDOR_SPECIFIC_CHARS 1 203 | #define BC_INVALID_SGML_CHARS 2 204 | #define BC_INVALID_UTF8 4 205 | #define BC_INVALID_UTF16 8 206 | #define BC_ENCODING_MISMATCH 16 /* fatal error */ 207 | #define BC_INVALID_URI 32 208 | #define BC_INVALID_NCR 64 209 | 210 | #endif /* __MESSAGE_H__ */ 211 | -------------------------------------------------------------------------------- /elementtidy-1.0-20050212/tidylib/src/istack.c: -------------------------------------------------------------------------------- 1 | /* istack.c -- inline stack for compatibility with Mosaic 2 | 3 | (c) 1998-2003 (W3C) MIT, ERCIM, Keio University 4 | See tidy.h for the copyright notice. 
5 | 6 | CVS Info : 7 | 8 | $LastChangedBy$ 9 | $LastChangedDate$ 10 | $LastChangedRevision$ 11 | 12 | */ 13 | 14 | #include "tidy-int.h" 15 | #include "lexer.h" 16 | #include "attrs.h" 17 | #include "streamio.h" 18 | #include "tmbstr.h" 19 | 20 | extern Bool debug_flag; 21 | extern Node *debug_element; 22 | extern Lexer *debug_lexer; 23 | 24 | /* duplicate attributes */ 25 | AttVal *DupAttrs( TidyDocImpl* doc, AttVal *attrs) 26 | { 27 | AttVal *newattrs; 28 | 29 | if (attrs == NULL) 30 | return attrs; 31 | 32 | newattrs = NewAttribute(); 33 | *newattrs = *attrs; 34 | newattrs->next = DupAttrs( doc, attrs->next ); 35 | newattrs->attribute = tmbstrdup(attrs->attribute); 36 | newattrs->value = tmbstrdup(attrs->value); 37 | newattrs->dict = FindAttribute(doc, newattrs); 38 | return newattrs; 39 | } 40 | 41 | /* 42 | push a copy of an inline node onto stack 43 | but don't push if implicit or OBJECT or APPLET 44 | (implicit tags are ones generated from the istack) 45 | 46 | One issue arises with pushing inlines when 47 | the tag is already pushed. For instance: 48 | 49 |

text 50 |

more text 51 | 52 | Shouldn't be mapped to 53 | 54 |

text

55 |

more text 56 | */ 57 | void PushInline( TidyDocImpl* doc, Node *node) 58 | { 59 | Lexer* lexer = doc->lexer; 60 | IStack *istack; 61 | 62 | if (node->implicit) 63 | return; 64 | 65 | if (node->tag == NULL) 66 | return; 67 | 68 | if (!(node->tag->model & CM_INLINE)) 69 | return; 70 | 71 | if (node->tag->model & CM_OBJECT) 72 | return; 73 | 74 | if ( !nodeIsFONT(node) && IsPushed(doc, node) ) 75 | return; 76 | 77 | /* make sure there is enough space for the stack */ 78 | if (lexer->istacksize + 1 > lexer->istacklength) 79 | { 80 | if (lexer->istacklength == 0) 81 | lexer->istacklength = 6; /* this is perhaps excessive */ 82 | 83 | lexer->istacklength = lexer->istacklength * 2; 84 | lexer->istack = (IStack *)MemRealloc(lexer->istack, 85 | sizeof(IStack)*(lexer->istacklength)); 86 | } 87 | 88 | istack = &(lexer->istack[lexer->istacksize]); 89 | istack->tag = node->tag; 90 | 91 | istack->element = tmbstrdup(node->element); 92 | istack->attributes = DupAttrs( doc, node->attributes ); 93 | ++(lexer->istacksize); 94 | } 95 | 96 | /* pop inline stack */ 97 | void PopInline( TidyDocImpl* doc, Node *node ) 98 | { 99 | Lexer* lexer = doc->lexer; 100 | AttVal *av; 101 | IStack *istack; 102 | 103 | if (node) 104 | { 105 | if (node->tag == NULL) 106 | return; 107 | 108 | if (!(node->tag->model & CM_INLINE)) 109 | return; 110 | 111 | if (node->tag->model & CM_OBJECT) 112 | return; 113 | 114 | /* if node is then pop until we find an */ 115 | if ( nodeIsA(node) ) 116 | { 117 | while (lexer->istacksize > 0) 118 | { 119 | --(lexer->istacksize); 120 | istack = &(lexer->istack[lexer->istacksize]); 121 | 122 | while (istack->attributes) 123 | { 124 | av = istack->attributes; 125 | 126 | if (av->attribute) 127 | MemFree(av->attribute); 128 | if (av->value) 129 | MemFree(av->value); 130 | 131 | istack->attributes = av->next; 132 | MemFree(av); 133 | } 134 | 135 | if ( istack->tag->id == TidyTag_A ) 136 | { 137 | MemFree(istack->element); 138 | break; 139 | } 140 | 141 | 
MemFree(istack->element); 142 | } 143 | 144 | return; 145 | } 146 | } 147 | 148 | if (lexer->istacksize > 0) 149 | { 150 | --(lexer->istacksize); 151 | istack = &(lexer->istack[lexer->istacksize]); 152 | 153 | while (istack->attributes) 154 | { 155 | av = istack->attributes; 156 | 157 | if (av->attribute) 158 | MemFree(av->attribute); 159 | if (av->value) 160 | MemFree(av->value); 161 | 162 | istack->attributes = av->next; 163 | MemFree(av); 164 | } 165 | 166 | MemFree(istack->element); 167 | 168 | /* #427822 - fix by Randy Waki 7 Aug 00 */ 169 | if (lexer->insert >= lexer->istack + lexer->istacksize) 170 | lexer->insert = NULL; 171 | } 172 | } 173 | 174 | Bool IsPushed( TidyDocImpl* doc, Node *node) 175 | { 176 | Lexer* lexer = doc->lexer; 177 | int i; 178 | 179 | for (i = lexer->istacksize - 1; i >= 0; --i) 180 | { 181 | if (lexer->istack[i].tag == node->tag) 182 | return yes; 183 | } 184 | 185 | return no; 186 | } 187 | 188 | /* 189 | This has the effect of inserting "missing" inline 190 | elements around the contents of blocklevel elements 191 | such as P, TD, TH, DIV, PRE etc. This procedure is 192 | called at the start of ParseBlock. when the inline 193 | stack is not empty, as will be the case in: 194 | 195 |

italic heading

196 | 197 | which is then treated as equivalent to 198 | 199 |

italic heading

200 | 201 | This is implemented by setting the lexer into a mode 202 | where it gets tokens from the inline stack rather than 203 | from the input stream. 204 | */ 205 | int InlineDup( TidyDocImpl* doc, Node* node ) 206 | { 207 | Lexer* lexer = doc->lexer; 208 | int n; 209 | 210 | if ((n = lexer->istacksize - lexer->istackbase) > 0) 211 | { 212 | lexer->insert = &(lexer->istack[lexer->istackbase]); 213 | lexer->inode = node; 214 | } 215 | 216 | return n; 217 | } 218 | 219 | /* 220 | defer duplicates when entering a table or other 221 | element where the inlines shouldn't be duplicated 222 | */ 223 | void DeferDup( TidyDocImpl* doc ) 224 | { 225 | doc->lexer->insert = NULL; 226 | doc->lexer->inode = NULL; 227 | } 228 | 229 | Node *InsertedToken( TidyDocImpl* doc ) 230 | { 231 | Lexer* lexer = doc->lexer; 232 | Node *node; 233 | IStack *istack; 234 | uint n; 235 | 236 | /* this will only be NULL if inode != NULL */ 237 | if (lexer->insert == NULL) 238 | { 239 | node = lexer->inode; 240 | lexer->inode = NULL; 241 | return node; 242 | } 243 | 244 | /* 245 | 246 | is this is the "latest" node then update 247 | the position, otherwise use current values 248 | */ 249 | 250 | if (lexer->inode == NULL) 251 | { 252 | lexer->lines = doc->docIn->curline; 253 | lexer->columns = doc->docIn->curcol; 254 | } 255 | 256 | node = NewNode(lexer); 257 | node->type = StartTag; 258 | node->implicit = yes; 259 | node->start = lexer->txtstart; 260 | /* #431734 [JTidy bug #226261 (was 126261)] - fix by Gary Peskin 20 Dec 00 */ 261 | node->end = lexer->txtend; /* was : lexer->txtstart; */ 262 | istack = lexer->insert; 263 | 264 | #if 0 && defined(_DEBUG) 265 | if ( lexer->istacksize == 0 ) 266 | fprintf( stderr, "0-size istack!\n" ); 267 | #endif 268 | 269 | node->element = tmbstrdup(istack->element); 270 | node->tag = istack->tag; 271 | node->attributes = DupAttrs( doc, istack->attributes ); 272 | 273 | /* advance lexer to next item on the stack */ 274 | n = (uint)(lexer->insert - 
&(lexer->istack[0])); 275 | 276 | /* and recover state if we have reached the end */ 277 | if (++n < lexer->istacksize) 278 | lexer->insert = &(lexer->istack[n]); 279 | else 280 | lexer->insert = NULL; 281 | 282 | return node; 283 | } 284 | 285 | 286 | 287 | 288 | -------------------------------------------------------------------------------- /elementtidy-1.0-20050212/tidylib/src/tagask.c: -------------------------------------------------------------------------------- 1 | /* tagask.c -- Interrogate node type 2 | 3 | (c) 1998-2003 (W3C) MIT, ERCIM, Keio University 4 | See tidy.h for the copyright notice. 5 | 6 | CVS Info : 7 | 8 | $LastChangedBy$ 9 | $LastChangedDate$ 10 | $LastChangedRevision$ 11 | 12 | */ 13 | 14 | #include "tidy-int.h" 15 | #include "tags.h" 16 | #include "tidy.h" 17 | 18 | Bool tidyNodeIsText( TidyNode tnod ) 19 | { return nodeIsText( tidyNodeToImpl(tnod) ); 20 | } 21 | Bool tidyNodeCMIsBlock( TidyNode tnod ) 22 | { return nodeCMIsBlock( tidyNodeToImpl(tnod) ); 23 | } 24 | Bool tidyNodeCMIsInline( TidyNode tnod ) 25 | { return nodeCMIsInline( tidyNodeToImpl(tnod) ); 26 | } 27 | Bool tidyNodeCMIsEmpty( TidyNode tnod ) 28 | { return nodeCMIsEmpty( tidyNodeToImpl(tnod) ); 29 | } 30 | Bool tidyNodeIsHeader( TidyNode tnod ) 31 | { return nodeIsHeader( tidyNodeToImpl(tnod) ); 32 | } 33 | 34 | Bool tidyNodeIsHTML( TidyNode tnod ) 35 | { return nodeIsHTML( tidyNodeToImpl(tnod) ); 36 | } 37 | Bool tidyNodeIsHEAD( TidyNode tnod ) 38 | { return nodeIsHEAD( tidyNodeToImpl(tnod) ); 39 | } 40 | Bool tidyNodeIsTITLE( TidyNode tnod ) 41 | { return nodeIsTITLE( tidyNodeToImpl(tnod) ); 42 | } 43 | Bool tidyNodeIsBASE( TidyNode tnod ) 44 | { return nodeIsBASE( tidyNodeToImpl(tnod) ); 45 | } 46 | Bool tidyNodeIsMETA( TidyNode tnod ) 47 | { return nodeIsMETA( tidyNodeToImpl(tnod) ); 48 | } 49 | Bool tidyNodeIsBODY( TidyNode tnod ) 50 | { return nodeIsBODY( tidyNodeToImpl(tnod) ); 51 | } 52 | Bool tidyNodeIsFRAMESET( TidyNode tnod ) 53 | { return nodeIsFRAMESET( 
tidyNodeToImpl(tnod) ); 54 | } /* Every tidyNodeIs* function in this file follows one pattern: unwrap the opaque TidyNode handle with tidyNodeToImpl() and return the result of the like-named nodeIs* test. For the tag tests the nodeIs* names are macros from tags.h built on TagIsId(), which yields false for a NULL node or a node without a tag. */ 55 | Bool tidyNodeIsFRAME( TidyNode tnod ) 56 | { return nodeIsFRAME( tidyNodeToImpl(tnod) ); 57 | } 58 | Bool tidyNodeIsIFRAME( TidyNode tnod ) 59 | { return nodeIsIFRAME( tidyNodeToImpl(tnod) ); 60 | } 61 | Bool tidyNodeIsNOFRAMES( TidyNode tnod ) 62 | { return nodeIsNOFRAMES( tidyNodeToImpl(tnod) ); 63 | } 64 | Bool tidyNodeIsHR( TidyNode tnod ) 65 | { return nodeIsHR( tidyNodeToImpl(tnod) ); 66 | } 67 | Bool tidyNodeIsH1( TidyNode tnod ) 68 | { return nodeIsH1( tidyNodeToImpl(tnod) ); 69 | } 70 | Bool tidyNodeIsH2( TidyNode tnod ) 71 | { return nodeIsH2( tidyNodeToImpl(tnod) ); 72 | } 73 | Bool tidyNodeIsPRE( TidyNode tnod ) 74 | { return nodeIsPRE( tidyNodeToImpl(tnod) ); 75 | } 76 | Bool tidyNodeIsLISTING( TidyNode tnod ) 77 | { return nodeIsLISTING( tidyNodeToImpl(tnod) ); 78 | } 79 | Bool tidyNodeIsP( TidyNode tnod ) 80 | { return nodeIsP( tidyNodeToImpl(tnod) ); 81 | } 82 | Bool tidyNodeIsUL( TidyNode tnod ) 83 | { return nodeIsUL( tidyNodeToImpl(tnod) ); 84 | } 85 | Bool tidyNodeIsOL( TidyNode tnod ) 86 | { return nodeIsOL( tidyNodeToImpl(tnod) ); 87 | } 88 | Bool tidyNodeIsDL( TidyNode tnod ) 89 | { return nodeIsDL( tidyNodeToImpl(tnod) ); 90 | } 91 | Bool tidyNodeIsDIR( TidyNode tnod ) 92 | { return nodeIsDIR( tidyNodeToImpl(tnod) ); 93 | } 94 | Bool tidyNodeIsLI( TidyNode tnod ) 95 | { return nodeIsLI( tidyNodeToImpl(tnod) ); 96 | } 97 | Bool tidyNodeIsDT( TidyNode tnod ) 98 | { return nodeIsDT( tidyNodeToImpl(tnod) ); 99 | } 100 | Bool tidyNodeIsDD( TidyNode tnod ) 101 | { return nodeIsDD( tidyNodeToImpl(tnod) ); 102 | } 103 | Bool tidyNodeIsTABLE( TidyNode tnod ) 104 | { return nodeIsTABLE( tidyNodeToImpl(tnod) ); 105 | } 106 | Bool tidyNodeIsCAPTION( TidyNode tnod ) 107 | { return nodeIsCAPTION( tidyNodeToImpl(tnod) ); 108 | } 109 | Bool tidyNodeIsTD( TidyNode tnod ) 110 | { return nodeIsTD( tidyNodeToImpl(tnod) ); 111 | } 112 | Bool tidyNodeIsTH( TidyNode tnod ) 113 | { return nodeIsTH( tidyNodeToImpl(tnod) ); 114 | } 
115 | Bool tidyNodeIsTR( TidyNode tnod ) 116 | { return nodeIsTR( tidyNodeToImpl(tnod) ); 117 | } 118 | Bool tidyNodeIsCOL( TidyNode tnod ) 119 | { return nodeIsCOL( tidyNodeToImpl(tnod) ); 120 | } 121 | Bool tidyNodeIsCOLGROUP( TidyNode tnod ) 122 | { return nodeIsCOLGROUP( tidyNodeToImpl(tnod) ); 123 | } 124 | Bool tidyNodeIsBR( TidyNode tnod ) 125 | { return nodeIsBR( tidyNodeToImpl(tnod) ); 126 | } 127 | Bool tidyNodeIsA( TidyNode tnod ) 128 | { return nodeIsA( tidyNodeToImpl(tnod) ); 129 | } 130 | Bool tidyNodeIsLINK( TidyNode tnod ) 131 | { return nodeIsLINK( tidyNodeToImpl(tnod) ); 132 | } 133 | Bool tidyNodeIsB( TidyNode tnod ) 134 | { return nodeIsB( tidyNodeToImpl(tnod) ); 135 | } 136 | Bool tidyNodeIsI( TidyNode tnod ) 137 | { return nodeIsI( tidyNodeToImpl(tnod) ); 138 | } 139 | Bool tidyNodeIsSTRONG( TidyNode tnod ) 140 | { return nodeIsSTRONG( tidyNodeToImpl(tnod) ); 141 | } 142 | Bool tidyNodeIsEM( TidyNode tnod ) 143 | { return nodeIsEM( tidyNodeToImpl(tnod) ); 144 | } 145 | Bool tidyNodeIsBIG( TidyNode tnod ) 146 | { return nodeIsBIG( tidyNodeToImpl(tnod) ); 147 | } 148 | Bool tidyNodeIsSMALL( TidyNode tnod ) 149 | { return nodeIsSMALL( tidyNodeToImpl(tnod) ); 150 | } 151 | Bool tidyNodeIsPARAM( TidyNode tnod ) 152 | { return nodeIsPARAM( tidyNodeToImpl(tnod) ); 153 | } 154 | Bool tidyNodeIsOPTION( TidyNode tnod ) 155 | { return nodeIsOPTION( tidyNodeToImpl(tnod) ); 156 | } 157 | Bool tidyNodeIsOPTGROUP( TidyNode tnod ) 158 | { return nodeIsOPTGROUP( tidyNodeToImpl(tnod) ); 159 | } 160 | Bool tidyNodeIsIMG( TidyNode tnod ) 161 | { return nodeIsIMG( tidyNodeToImpl(tnod) ); 162 | } 163 | Bool tidyNodeIsMAP( TidyNode tnod ) 164 | { return nodeIsMAP( tidyNodeToImpl(tnod) ); 165 | } 166 | Bool tidyNodeIsAREA( TidyNode tnod ) 167 | { return nodeIsAREA( tidyNodeToImpl(tnod) ); 168 | } 169 | Bool tidyNodeIsNOBR( TidyNode tnod ) 170 | { return nodeIsNOBR( tidyNodeToImpl(tnod) ); 171 | } 172 | Bool tidyNodeIsWBR( TidyNode tnod ) 173 | { return nodeIsWBR( 
tidyNodeToImpl(tnod) ); 174 | } 175 | Bool tidyNodeIsFONT( TidyNode tnod ) 176 | { return nodeIsFONT( tidyNodeToImpl(tnod) ); 177 | } 178 | Bool tidyNodeIsLAYER( TidyNode tnod ) 179 | { return nodeIsLAYER( tidyNodeToImpl(tnod) ); 180 | } 181 | Bool tidyNodeIsSPACER( TidyNode tnod ) 182 | { return nodeIsSPACER( tidyNodeToImpl(tnod) ); 183 | } 184 | Bool tidyNodeIsCENTER( TidyNode tnod ) 185 | { return nodeIsCENTER( tidyNodeToImpl(tnod) ); 186 | } 187 | Bool tidyNodeIsSTYLE( TidyNode tnod ) 188 | { return nodeIsSTYLE( tidyNodeToImpl(tnod) ); 189 | } 190 | Bool tidyNodeIsSCRIPT( TidyNode tnod ) 191 | { return nodeIsSCRIPT( tidyNodeToImpl(tnod) ); 192 | } 193 | Bool tidyNodeIsNOSCRIPT( TidyNode tnod ) 194 | { return nodeIsNOSCRIPT( tidyNodeToImpl(tnod) ); 195 | } 196 | Bool tidyNodeIsFORM( TidyNode tnod ) 197 | { return nodeIsFORM( tidyNodeToImpl(tnod) ); 198 | } 199 | Bool tidyNodeIsTEXTAREA( TidyNode tnod ) 200 | { return nodeIsTEXTAREA( tidyNodeToImpl(tnod) ); 201 | } 202 | Bool tidyNodeIsBLOCKQUOTE( TidyNode tnod ) 203 | { return nodeIsBLOCKQUOTE( tidyNodeToImpl(tnod) ); 204 | } 205 | Bool tidyNodeIsAPPLET( TidyNode tnod ) 206 | { return nodeIsAPPLET( tidyNodeToImpl(tnod) ); 207 | } 208 | Bool tidyNodeIsOBJECT( TidyNode tnod ) 209 | { return nodeIsOBJECT( tidyNodeToImpl(tnod) ); 210 | } 211 | Bool tidyNodeIsDIV( TidyNode tnod ) 212 | { return nodeIsDIV( tidyNodeToImpl(tnod) ); 213 | } 214 | Bool tidyNodeIsSPAN( TidyNode tnod ) 215 | { return nodeIsSPAN( tidyNodeToImpl(tnod) ); 216 | } 217 | Bool tidyNodeIsINPUT( TidyNode tnod ) 218 | { return nodeIsINPUT( tidyNodeToImpl(tnod) ); 219 | } 220 | Bool tidyNodeIsQ( TidyNode tnod ) 221 | { return nodeIsQ( tidyNodeToImpl(tnod) ); 222 | } 223 | Bool tidyNodeIsLABEL( TidyNode tnod ) 224 | { return nodeIsLABEL( tidyNodeToImpl(tnod) ); 225 | } 226 | Bool tidyNodeIsH3( TidyNode tnod ) 227 | { return nodeIsH3( tidyNodeToImpl(tnod) ); 228 | } 229 | Bool tidyNodeIsH4( TidyNode tnod ) 230 | { return nodeIsH4( tidyNodeToImpl(tnod) 
); 231 | } 232 | Bool tidyNodeIsH5( TidyNode tnod ) 233 | { return nodeIsH5( tidyNodeToImpl(tnod) ); 234 | } 235 | Bool tidyNodeIsH6( TidyNode tnod ) 236 | { return nodeIsH6( tidyNodeToImpl(tnod) ); 237 | } 238 | Bool tidyNodeIsADDRESS( TidyNode tnod ) 239 | { return nodeIsADDRESS( tidyNodeToImpl(tnod) ); 240 | } 241 | Bool tidyNodeIsXMP( TidyNode tnod ) 242 | { return nodeIsXMP( tidyNodeToImpl(tnod) ); 243 | } 244 | Bool tidyNodeIsSELECT( TidyNode tnod ) 245 | { return nodeIsSELECT( tidyNodeToImpl(tnod) ); 246 | } 247 | Bool tidyNodeIsBLINK( TidyNode tnod ) 248 | { return nodeIsBLINK( tidyNodeToImpl(tnod) ); 249 | } 250 | Bool tidyNodeIsMARQUEE( TidyNode tnod ) 251 | { return nodeIsMARQUEE( tidyNodeToImpl(tnod) ); 252 | } 253 | Bool tidyNodeIsEMBED( TidyNode tnod ) 254 | { return nodeIsEMBED( tidyNodeToImpl(tnod) ); 255 | } 256 | Bool tidyNodeIsBASEFONT( TidyNode tnod ) 257 | { return nodeIsBASEFONT( tidyNodeToImpl(tnod) ); 258 | } 259 | Bool tidyNodeIsISINDEX( TidyNode tnod ) 260 | { return nodeIsISINDEX( tidyNodeToImpl(tnod) ); 261 | } 262 | Bool tidyNodeIsS( TidyNode tnod ) 263 | { return nodeIsS( tidyNodeToImpl(tnod) ); 264 | } 265 | Bool tidyNodeIsSTRIKE( TidyNode tnod ) 266 | { return nodeIsSTRIKE( tidyNodeToImpl(tnod) ); 267 | } 268 | Bool tidyNodeIsU( TidyNode tnod ) 269 | { return nodeIsU( tidyNodeToImpl(tnod) ); 270 | } 271 | Bool tidyNodeIsMENU( TidyNode tnod ) 272 | { return nodeIsMENU( tidyNodeToImpl(tnod) ); 273 | } 274 | 275 | -------------------------------------------------------------------------------- /elementtidy-1.0-20050212/tidylib/src/tags.h: -------------------------------------------------------------------------------- 1 | #ifndef __TAGS_H__ 2 | #define __TAGS_H__ 3 | 4 | /* tags.h -- recognize HTML tags 5 | 6 | (c) 1998-2003 (W3C) MIT, ERCIM, Keio University 7 | See tidy.h for the copyright notice. 
8 | 9 | CVS Info : 10 | 11 | $LastChangedBy$ 12 | $LastChangedDate$ 13 | $LastChangedRevision$ 14 | 15 | The HTML tags are stored as 8 bit ASCII strings. 16 | Use lookupw() to find a tag given a wide char string. 17 | 18 | */ 19 | 20 | #include "forward.h" 21 | #include "attrdict.h" 22 | /* Function type of the per-tag parser routines; the Parse* names declared further down all have this type. */ 23 | typedef void (Parser)( TidyDocImpl* doc, Node *node, uint mode ); /* Function type of the per-tag attribute checkers; the Check* names declared further down all have this type. */ 24 | typedef void (CheckAttribs)( TidyDocImpl* doc, Node *node ); 25 | 26 | /* 27 | Tag dictionary node 28 | */ 29 | 30 | /* types of tags that the user can define */ 31 | #define tagtype_empty 1 32 | #define tagtype_inline 2 33 | #define tagtype_block 4 34 | #define tagtype_pre 8 35 | /* One entry of the tag dictionary. */ 36 | struct _Dict 37 | { 38 | TidyTagId id; /* tag identifier; presumably the field TagId()/TagIsId() read through node->tag -- confirm */ 39 | tmbstr name; /* tag name; presumably the element name as written in markup -- confirm */ 40 | uint versions; /* NOTE(review): looks like a bit set of HTML versions the tag belongs to -- confirm */ 41 | AttrVersion* attrvers; /* NOTE(review): per-attribute version data, type from attrdict.h -- confirm */ 42 | uint model; /* content model bit flags; istack.c tests CM_INLINE and CM_OBJECT against a field of this name */ 43 | Parser* parser; /* parser routine for this tag */ 44 | CheckAttribs* chkattrs; /* attribute checking routine for this tag */ 45 | Dict* next; /* link to another Dict; presumably chains entries in a list -- confirm */ 46 | }; 47 | 48 | #ifdef ELEMENT_HASH_LOOKUP 49 | #define ELEMENT_HASH_SIZE 178 50 | #endif 51 | /* Tag lookup state; presumably one instance per document, since the tag functions below all take a TidyDocImpl* -- confirm. */ 52 | struct _TidyTagImpl 53 | { 54 | Dict* xml_tags; /* placeholder for all xml tags */ 55 | Dict* declared_tag_list; /* User declared tags */ 56 | #ifdef ELEMENT_HASH_LOOKUP 57 | Dict* hashtab[ELEMENT_HASH_SIZE]; /* table of ELEMENT_HASH_SIZE Dict pointers; only present when ELEMENT_HASH_LOOKUP is defined */ 58 | #endif 59 | }; 60 | 61 | typedef struct _TidyTagImpl TidyTagImpl; 62 | 63 | /* interface for finding tag by name */ 64 | const Dict* LookupTagDef( TidyTagId tid ); 65 | Bool FindTag( TidyDocImpl* doc, Node *node ); 66 | Parser* FindParser( TidyDocImpl* doc, Node *node ); 67 | void DefineTag( TidyDocImpl* doc, int tagType, ctmbstr name ); 68 | void FreeDeclaredTags( TidyDocImpl* doc, int tagType ); /* 0 to free all */ 69 | 70 | TidyIterator GetDeclaredTagList( TidyDocImpl* doc ); 71 | Dict* GetNextDeclaredDict( TidyDocImpl* doc, TidyIterator* iter ); 72 | ctmbstr GetNextDeclaredTag( TidyDocImpl* doc, int tagType, 73 | TidyIterator* iter ); 74 | 75 | void InitTags( TidyDocImpl* doc ); 76 | void FreeTags( TidyDocImpl* doc ); 77 | 78 | 79 | /* Parser methods for tags */ 80 | 81 | Parser ParseHTML; 82 | Parser ParseHead; 83 | Parser 
ParseTitle; 84 | Parser ParseScript; 85 | Parser ParseFrameSet; 86 | Parser ParseNoFrames; 87 | Parser ParseBody; 88 | Parser ParsePre; 89 | Parser ParseList; 90 | Parser ParseLI; 91 | Parser ParseDefList; 92 | Parser ParseBlock; 93 | Parser ParseInline; 94 | Parser ParseEmpty; 95 | Parser ParseTableTag; 96 | Parser ParseColGroup; 97 | Parser ParseRowGroup; 98 | Parser ParseRow; 99 | Parser ParseSelect; 100 | Parser ParseOptGroup; 101 | Parser ParseText; 102 | Parser ParseObject; 103 | Parser ParseMap; 104 | 105 | /* Attribute checking methods */ 106 | 107 | CheckAttribs CheckAttributes; 108 | CheckAttribs CheckIMG; 109 | CheckAttribs CheckLINK; 110 | CheckAttribs CheckAREA; 111 | CheckAttribs CheckTABLE; 112 | CheckAttribs CheckCaption; 113 | CheckAttribs CheckSCRIPT; 114 | CheckAttribs CheckSTYLE; 115 | CheckAttribs CheckHTML; 116 | CheckAttribs CheckFORM; 117 | CheckAttribs CheckMETA; 118 | 119 |
/*
** Tag identification helpers.
**
** TagId(node)        evaluates to the node's tag id, or to
**                    TidyTag_UNKNOWN (0 == TidyTag_UNKNOWN) when node
**                    is NULL or has no tag.
** TagIsId(node, tid) is true only when node is non-NULL, has a tag,
**                    and that tag's id equals tid.
**
** Both are macros: `node` appears more than once in the expansion, so
** do not pass an expression with side effects.  Every parameter is
** parenthesized in the expansion so that an argument containing
** operators (for example a conditional expression passed as tid)
** groups the way the caller wrote it.
*/
#define TagId(node)        ((node) && (node)->tag ? (node)->tag->id : TidyTag_UNKNOWN)
#define TagIsId(node, tid) ((node) && (node)->tag && (node)->tag->id == (tid))
122 | 123 | Bool nodeIsText( Node* node ); 124 | Bool nodeIsElement( Node* node ); 125 | 126 | Bool nodeHasText( TidyDocImpl* doc, Node* node ); 127 | 128 | /* Compare & result to operand. If equal, then all bits 129 | ** requested are set. 130 | */ 131 | Bool nodeMatchCM( Node* node, uint contentModel ); 132 | 133 | /* True if any of the bits requested are set. 
134 | */ 135 | Bool nodeHasCM( Node* node, uint contentModel ); 136 | 137 | Bool nodeCMIsBlock( Node* node ); 138 | Bool nodeCMIsInline( Node* node ); 139 | Bool nodeCMIsEmpty( Node* node ); 140 | 141 | 142 | Bool nodeIsHeader( Node* node ); /* H1, H2, ..., H6 */ 143 | uint nodeHeaderLevel( Node* node ); /* 1, 2, ..., 6 */ 144 | 145 | #define nodeIsHTML( node ) TagIsId( node, TidyTag_HTML ) 146 | #define nodeIsHEAD( node ) TagIsId( node, TidyTag_HEAD ) 147 | #define nodeIsTITLE( node ) TagIsId( node, TidyTag_TITLE ) 148 | #define nodeIsBASE( node ) TagIsId( node, TidyTag_BASE ) 149 | #define nodeIsMETA( node ) TagIsId( node, TidyTag_META ) 150 | #define nodeIsBODY( node ) TagIsId( node, TidyTag_BODY ) 151 | #define nodeIsFRAMESET( node ) TagIsId( node, TidyTag_FRAMESET ) 152 | #define nodeIsFRAME( node ) TagIsId( node, TidyTag_FRAME ) 153 | #define nodeIsIFRAME( node ) TagIsId( node, TidyTag_IFRAME ) 154 | #define nodeIsNOFRAMES( node ) TagIsId( node, TidyTag_NOFRAMES ) 155 | #define nodeIsHR( node ) TagIsId( node, TidyTag_HR ) 156 | #define nodeIsH1( node ) TagIsId( node, TidyTag_H1 ) 157 | #define nodeIsH2( node ) TagIsId( node, TidyTag_H2 ) 158 | #define nodeIsPRE( node ) TagIsId( node, TidyTag_PRE ) 159 | #define nodeIsLISTING( node ) TagIsId( node, TidyTag_LISTING ) 160 | #define nodeIsP( node ) TagIsId( node, TidyTag_P ) 161 | #define nodeIsUL( node ) TagIsId( node, TidyTag_UL ) 162 | #define nodeIsOL( node ) TagIsId( node, TidyTag_OL ) 163 | #define nodeIsDL( node ) TagIsId( node, TidyTag_DL ) 164 | #define nodeIsDIR( node ) TagIsId( node, TidyTag_DIR ) 165 | #define nodeIsLI( node ) TagIsId( node, TidyTag_LI ) 166 | #define nodeIsDT( node ) TagIsId( node, TidyTag_DT ) 167 | #define nodeIsDD( node ) TagIsId( node, TidyTag_DD ) 168 | #define nodeIsTABLE( node ) TagIsId( node, TidyTag_TABLE ) 169 | #define nodeIsCAPTION( node ) TagIsId( node, TidyTag_CAPTION ) 170 | #define nodeIsTD( node ) TagIsId( node, TidyTag_TD ) 171 | #define nodeIsTH( node ) TagIsId( 
node, TidyTag_TH ) 172 | #define nodeIsTR( node ) TagIsId( node, TidyTag_TR ) 173 | #define nodeIsCOL( node ) TagIsId( node, TidyTag_COL ) 174 | #define nodeIsCOLGROUP( node ) TagIsId( node, TidyTag_COLGROUP ) 175 | #define nodeIsBR( node ) TagIsId( node, TidyTag_BR ) 176 | #define nodeIsA( node ) TagIsId( node, TidyTag_A ) 177 | #define nodeIsLINK( node ) TagIsId( node, TidyTag_LINK ) 178 | #define nodeIsB( node ) TagIsId( node, TidyTag_B ) 179 | #define nodeIsI( node ) TagIsId( node, TidyTag_I ) 180 | #define nodeIsSTRONG( node ) TagIsId( node, TidyTag_STRONG ) 181 | #define nodeIsEM( node ) TagIsId( node, TidyTag_EM ) 182 | #define nodeIsBIG( node ) TagIsId( node, TidyTag_BIG ) 183 | #define nodeIsSMALL( node ) TagIsId( node, TidyTag_SMALL ) 184 | #define nodeIsPARAM( node ) TagIsId( node, TidyTag_PARAM ) 185 | #define nodeIsOPTION( node ) TagIsId( node, TidyTag_OPTION ) 186 | #define nodeIsOPTGROUP( node ) TagIsId( node, TidyTag_OPTGROUP ) 187 | #define nodeIsIMG( node ) TagIsId( node, TidyTag_IMG ) 188 | #define nodeIsMAP( node ) TagIsId( node, TidyTag_MAP ) 189 | #define nodeIsAREA( node ) TagIsId( node, TidyTag_AREA ) 190 | #define nodeIsNOBR( node ) TagIsId( node, TidyTag_NOBR ) 191 | #define nodeIsWBR( node ) TagIsId( node, TidyTag_WBR ) 192 | #define nodeIsFONT( node ) TagIsId( node, TidyTag_FONT ) 193 | #define nodeIsLAYER( node ) TagIsId( node, TidyTag_LAYER ) 194 | #define nodeIsSPACER( node ) TagIsId( node, TidyTag_SPACER ) 195 | #define nodeIsCENTER( node ) TagIsId( node, TidyTag_CENTER ) 196 | #define nodeIsSTYLE( node ) TagIsId( node, TidyTag_STYLE ) 197 | #define nodeIsSCRIPT( node ) TagIsId( node, TidyTag_SCRIPT ) 198 | #define nodeIsNOSCRIPT( node ) TagIsId( node, TidyTag_NOSCRIPT ) 199 | #define nodeIsFORM( node ) TagIsId( node, TidyTag_FORM ) 200 | #define nodeIsTEXTAREA( node ) TagIsId( node, TidyTag_TEXTAREA ) 201 | #define nodeIsBLOCKQUOTE( node ) TagIsId( node, TidyTag_BLOCKQUOTE ) 202 | #define nodeIsAPPLET( node ) TagIsId( node, 
TidyTag_APPLET ) 203 | #define nodeIsOBJECT( node ) TagIsId( node, TidyTag_OBJECT ) 204 | #define nodeIsDIV( node ) TagIsId( node, TidyTag_DIV ) 205 | #define nodeIsSPAN( node ) TagIsId( node, TidyTag_SPAN ) 206 | #define nodeIsINPUT( node ) TagIsId( node, TidyTag_INPUT ) 207 | #define nodeIsQ( node ) TagIsId( node, TidyTag_Q ) 208 | #define nodeIsLABEL( node ) TagIsId( node, TidyTag_LABEL ) 209 | #define nodeIsH3( node ) TagIsId( node, TidyTag_H3 ) 210 | #define nodeIsH4( node ) TagIsId( node, TidyTag_H4 ) 211 | #define nodeIsH5( node ) TagIsId( node, TidyTag_H5 ) 212 | #define nodeIsH6( node ) TagIsId( node, TidyTag_H6 ) 213 | #define nodeIsADDRESS( node ) TagIsId( node, TidyTag_ADDRESS ) 214 | #define nodeIsXMP( node ) TagIsId( node, TidyTag_XMP ) 215 | #define nodeIsSELECT( node ) TagIsId( node, TidyTag_SELECT ) 216 | #define nodeIsBLINK( node ) TagIsId( node, TidyTag_BLINK ) 217 | #define nodeIsMARQUEE( node ) TagIsId( node, TidyTag_MARQUEE ) 218 | #define nodeIsEMBED( node ) TagIsId( node, TidyTag_EMBED ) 219 | #define nodeIsBASEFONT( node ) TagIsId( node, TidyTag_BASEFONT ) 220 | #define nodeIsISINDEX( node ) TagIsId( node, TidyTag_ISINDEX ) 221 | #define nodeIsS( node ) TagIsId( node, TidyTag_S ) 222 | #define nodeIsSTRIKE( node ) TagIsId( node, TidyTag_STRIKE ) 223 | #define nodeIsU( node ) TagIsId( node, TidyTag_U ) 224 | #define nodeIsMENU( node ) TagIsId( node, TidyTag_MENU ) 225 | 226 | 227 | #endif /* __TAGS_H__ */ 228 | -------------------------------------------------------------------------------- /elementtidy-1.0-20050212/tidylib/src/access.h: -------------------------------------------------------------------------------- 1 | #ifndef __ACCESS_H__ 2 | #define __ACCESS_H__ 3 | 4 | /* access.h -- carry out accessibility checks 5 | 6 | Copyright University of Toronto 7 | Portions (c) 1998-2003 (W3C) MIT, ERCIM, Keio University 8 | See tidy.h for the copyright notice. 
9 | 10 | CVS Info : 11 | 12 | $LastChangedBy$ 13 | $LastChangedDate$ 14 | $LastChangedRevision$ 15 | 16 | */ 17 | 18 | /********************************************************************* 19 | * AccessibilityChecks 20 | * 21 | * Carries out processes for all accessibility checks. Traverses 22 | * through all the content within the tree and evaluates the tags for 23 | * accessibility. 24 | * 25 | * To perform the following checks, 'AccessibilityChecks' must be 26 | * called AFTER the tree structure has been formed. 27 | * 28 | * If, in the command prompt, there is no specification of which 29 | * accessibility priorities to check, no accessibility checks will be 30 | * performed. (ie. '1' for priority 1, '2' for priorities 1 and 2, 31 | * and '3') for priorities 1, 2 and 3.) 32 | * 33 | * Copyright University of Toronto 34 | * Programmed by: Mike Lam and Chris Ridpath 35 | * Modifications by : Terry Teague (TRT) 36 | * 37 | *********************************************************************/ 38 | 39 | 40 | #include "forward.h" 41 | 42 | #if SUPPORT_ACCESSIBILITY_CHECKS 43 | 44 | /* The accessibility checks to perform depending on user's desire. 45 | 46 | 1. priority 1 47 | 2. priority 1 & 2 48 | 3. 
priority 1, 2, & 3 49 | */ 50 | 51 | /* Determines if the client-side text link is found within the document 52 | typedef struct AreaLinks 53 | { 54 | struct AreaLinks* next; 55 | char* link; 56 | Bool HasBeenFound; 57 | } AreaLinks; 58 | */ 59 | 60 | #define TEXTBUF_SIZE 128 61 | 62 | struct _TidyAccessImpl; 63 | typedef struct _TidyAccessImpl TidyAccessImpl; 64 | 65 | struct _TidyAccessImpl 66 | { 67 | /* gets set from Tidy variable AccessibilityCheckLevel */ 68 | int PRIORITYCHK; 69 | 70 | /* Number of characters that are found within the concatenated text */ 71 | int counter; 72 | 73 | /* list of characters in the text nodes found within a container element */ 74 | tmbchar textNode[ TEXTBUF_SIZE ]; 75 | 76 | /* The list of characters found within one text node */ 77 | tmbchar text[ TEXTBUF_SIZE ]; 78 | 79 | /* Number of frame elements found within a frameset */ 80 | int numFrames; 81 | 82 | /* Number of 'longdesc' attributes found within a frameset */ 83 | int HasCheckedLongDesc; 84 | 85 | int CheckedHeaders; 86 | int ListElements; 87 | int OtherListElements; 88 | 89 | /* For 'USEMAP' identifier */ 90 | Bool HasUseMap; 91 | Bool HasName; 92 | Bool HasMap; 93 | 94 | /* For tracking nodes that are deleted from the original parse tree - TRT */ 95 | /* Node *access_tree; */ 96 | 97 | Bool HasTH; 98 | Bool HasValidFor; 99 | Bool HasValidId; 100 | Bool HasValidRowHeaders; 101 | Bool HasValidColumnHeaders; 102 | Bool HasInvalidRowHeader; 103 | Bool HasInvalidColumnHeader; 104 | int ForID; 105 | 106 | /* List containing map-links 107 | AreaLinks* links; 108 | AreaLinks* start; 109 | AreaLinks* current; 110 | */ 111 | 112 | }; 113 | 114 | 115 | /* 116 | Determines which error/warning message should be displayed, 117 | depending on the error code that was called. 
/*
    Determines which error/warning message should be displayed,
    depending on the error code that was called.

    No enumerator has an explicit value, so the codes are numbered
    0, 1, 2, ... in declaration order and LAST_ACCESS_ERR equals the
    number of real codes.  Inserting or reordering entries therefore
    renumbers every code after the change.

    The bracketed tag beside each enumerator is kept exactly as found;
    presumably it is the checkpoint number quoted in the matching
    message text - confirm against the message table.
*/
enum accessErrorCodes
{
    /* [1.1.1.1] */     IMG_MISSING_ALT,
    /* [1.1.1.2] */     IMG_ALT_SUSPICIOUS_FILENAME,
    /* [1.1.1.3] */     IMG_ALT_SUSPICIOUS_FILE_SIZE,
    /* [1.1.1.4] */     IMG_ALT_SUSPICIOUS_PLACEHOLDER,
    /* [1.1.1.10] */    IMG_ALT_SUSPICIOUS_TOO_LONG,
    /* [1.1.1.11] */    IMG_MISSING_ALT_BULLET,
    /* [1.1.1.12] */    IMG_MISSING_ALT_H_RULE,
    /* [1.1.2.1] */     IMG_MISSING_LONGDESC_DLINK,
    /* [1.1.2.2] */     IMG_MISSING_DLINK,
    /* [1.1.2.3] */     IMG_MISSING_LONGDESC,
    /* [1.1.2.5] */     LONGDESC_NOT_REQUIRED,
    /* [1.1.3.1] */     IMG_BUTTON_MISSING_ALT,
    /* [1.1.4.1] */     APPLET_MISSING_ALT,
    /* [1.1.5.1] */     OBJECT_MISSING_ALT,
    /* [1.1.6.1] */     AUDIO_MISSING_TEXT_WAV,
    /* [1.1.6.2] */     AUDIO_MISSING_TEXT_AU,
    /* [1.1.6.3] */     AUDIO_MISSING_TEXT_AIFF,
    /* [1.1.6.4] */     AUDIO_MISSING_TEXT_SND,
    /* [1.1.6.5] */     AUDIO_MISSING_TEXT_RA,
    /* [1.1.6.6] */     AUDIO_MISSING_TEXT_RM,
    /* [1.1.8.1] */     FRAME_MISSING_LONGDESC,
    /* [1.1.9.1] */     AREA_MISSING_ALT,
    /* [1.1.10.1] */    SCRIPT_MISSING_NOSCRIPT,
    /* [1.1.12.1] */    ASCII_REQUIRES_DESCRIPTION,
    /* [1.2.1.1] */     IMG_MAP_SERVER_REQUIRES_TEXT_LINKS,
    /* [1.4.1.1] */     MULTIMEDIA_REQUIRES_TEXT,
    /* [1.5.1.1] */     IMG_MAP_CLIENT_MISSING_TEXT_LINKS,
    /* [2.1.1.1] */     INFORMATION_NOT_CONVEYED_IMAGE,
    /* [2.1.1.2] */     INFORMATION_NOT_CONVEYED_APPLET,
    /* [2.1.1.3] */     INFORMATION_NOT_CONVEYED_OBJECT,
    /* [2.1.1.4] */     INFORMATION_NOT_CONVEYED_SCRIPT,
    /* [2.1.1.5] */     INFORMATION_NOT_CONVEYED_INPUT,
    /* [2.2.1.1] */     COLOR_CONTRAST_TEXT,
    /* [2.2.1.2] */     COLOR_CONTRAST_LINK,
    /* [2.2.1.3] */     COLOR_CONTRAST_ACTIVE_LINK,
    /* [2.2.1.4] */     COLOR_CONTRAST_VISITED_LINK,
    /* [3.2.1.1] */     DOCTYPE_MISSING,
    /* [3.3.1.1] */     STYLE_SHEET_CONTROL_PRESENTATION,
    /* [3.5.1.1] */     HEADERS_IMPROPERLY_NESTED,
    /* [3.5.2.1] */     POTENTIAL_HEADER_BOLD,
    /* [3.5.2.2] */     POTENTIAL_HEADER_ITALICS,
    /* [3.5.2.3] */     POTENTIAL_HEADER_UNDERLINE,
    /* [3.5.3.1] */     HEADER_USED_FORMAT_TEXT,
    /* [3.6.1.1] */     LIST_USAGE_INVALID_UL,
    /* [3.6.1.2] */     LIST_USAGE_INVALID_OL,
    /* [3.6.1.4] */     LIST_USAGE_INVALID_LI,
    /* [4.1.1.1] */     INDICATE_CHANGES_IN_LANGUAGE,
    /* [4.3.1.1] */     LANGUAGE_NOT_IDENTIFIED,
    /* NOTE(review): the tag on the next entry repeats [4.3.1.1];
       presumably it should read [4.3.1.2] - confirm against the
       message text before changing it. */
    /* [4.3.1.1] */     LANGUAGE_INVALID,
    /* [5.1.2.1] */     DATA_TABLE_MISSING_HEADERS,
    /* [5.1.2.2] */     DATA_TABLE_MISSING_HEADERS_COLUMN,
    /* [5.1.2.3] */     DATA_TABLE_MISSING_HEADERS_ROW,
    /* [5.2.1.1] */     DATA_TABLE_REQUIRE_MARKUP_COLUMN_HEADERS,
    /* [5.2.1.2] */     DATA_TABLE_REQUIRE_MARKUP_ROW_HEADERS,
    /* [5.3.1.1] */     LAYOUT_TABLES_LINEARIZE_PROPERLY,
    /* [5.4.1.1] */     LAYOUT_TABLE_INVALID_MARKUP,
    /* [5.5.1.1] */     TABLE_MISSING_SUMMARY,
    /* [5.5.1.2] */     TABLE_SUMMARY_INVALID_NULL,
    /* [5.5.1.3] */     TABLE_SUMMARY_INVALID_SPACES,
    /* [5.5.1.6] */     TABLE_SUMMARY_INVALID_PLACEHOLDER,
    /* [5.5.2.1] */     TABLE_MISSING_CAPTION,
    /* [5.6.1.1] */     TABLE_MAY_REQUIRE_HEADER_ABBR,
    /* [5.6.1.2] */     TABLE_MAY_REQUIRE_HEADER_ABBR_NULL,
    /* [5.6.1.3] */     TABLE_MAY_REQUIRE_HEADER_ABBR_SPACES,
    /* [6.1.1.1] */     STYLESHEETS_REQUIRE_TESTING_LINK,
    /* [6.1.1.2] */     STYLESHEETS_REQUIRE_TESTING_STYLE_ELEMENT,
    /* [6.1.1.3] */     STYLESHEETS_REQUIRE_TESTING_STYLE_ATTR,
    /* [6.2.1.1] */     FRAME_SRC_INVALID,
    /* [6.2.2.1] */     TEXT_EQUIVALENTS_REQUIRE_UPDATING_APPLET,
    /* [6.2.2.2] */     TEXT_EQUIVALENTS_REQUIRE_UPDATING_SCRIPT,
    /* [6.2.2.3] */     TEXT_EQUIVALENTS_REQUIRE_UPDATING_OBJECT,
    /* [6.3.1.1] */     PROGRAMMATIC_OBJECTS_REQUIRE_TESTING_SCRIPT,
    /* [6.3.1.2] */     PROGRAMMATIC_OBJECTS_REQUIRE_TESTING_OBJECT,
    /* [6.3.1.3] */     PROGRAMMATIC_OBJECTS_REQUIRE_TESTING_EMBED,
    /* [6.3.1.4] */     PROGRAMMATIC_OBJECTS_REQUIRE_TESTING_APPLET,
    /* [6.5.1.1] */     FRAME_MISSING_NOFRAMES,
    /* [6.5.1.2] */     NOFRAMES_INVALID_NO_VALUE,
    /* [6.5.1.3] */     NOFRAMES_INVALID_CONTENT,
    /* [6.5.1.4] */     NOFRAMES_INVALID_LINK,
    /* [7.1.1.1] */     REMOVE_FLICKER_SCRIPT,
    /* [7.1.1.2] */     REMOVE_FLICKER_OBJECT,
    /* [7.1.1.3] */     REMOVE_FLICKER_EMBED,
    /* [7.1.1.4] */     REMOVE_FLICKER_APPLET,
    /* [7.1.1.5] */     REMOVE_FLICKER_ANIMATED_GIF,
    /* [7.2.1.1] */     REMOVE_BLINK_MARQUEE,
    /* [7.4.1.1] */     REMOVE_AUTO_REFRESH,
    /* [7.5.1.1] */     REMOVE_AUTO_REDIRECT,
    /* [8.1.1.1] */     ENSURE_PROGRAMMATIC_OBJECTS_ACCESSIBLE_SCRIPT,
    /* [8.1.1.2] */     ENSURE_PROGRAMMATIC_OBJECTS_ACCESSIBLE_OBJECT,
    /* [8.1.1.3] */     ENSURE_PROGRAMMATIC_OBJECTS_ACCESSIBLE_APPLET,
    /* [8.1.1.4] */     ENSURE_PROGRAMMATIC_OBJECTS_ACCESSIBLE_EMBED,
    /* [9.1.1.1] */     IMAGE_MAP_SERVER_SIDE_REQUIRES_CONVERSION,
    /* [9.3.1.1] */     SCRIPT_NOT_KEYBOARD_ACCESSIBLE_ON_MOUSE_DOWN,
    /* [9.3.1.2] */     SCRIPT_NOT_KEYBOARD_ACCESSIBLE_ON_MOUSE_UP,
    /* [9.3.1.3] */     SCRIPT_NOT_KEYBOARD_ACCESSIBLE_ON_CLICK,
    /* [9.3.1.4] */     SCRIPT_NOT_KEYBOARD_ACCESSIBLE_ON_MOUSE_OVER,
    /* [9.3.1.5] */     SCRIPT_NOT_KEYBOARD_ACCESSIBLE_ON_MOUSE_OUT,
    /* [9.3.1.6] */     SCRIPT_NOT_KEYBOARD_ACCESSIBLE_ON_MOUSE_MOVE,
    /* [10.1.1.1] */    NEW_WINDOWS_REQUIRE_WARNING_NEW,
    /* [10.1.1.2] */    NEW_WINDOWS_REQUIRE_WARNING_BLANK,
    /* [10.2.1.1] */    LABEL_NEEDS_REPOSITIONING_BEFORE_INPUT,
    /* [10.2.1.2] */    LABEL_NEEDS_REPOSITIONING_AFTER_INPUT,
    /* [10.4.1.1] */    FORM_CONTROL_REQUIRES_DEFAULT_TEXT,
    /* [10.4.1.2] */    FORM_CONTROL_DEFAULT_TEXT_INVALID_NULL,
    /* [10.4.1.3] */    FORM_CONTROL_DEFAULT_TEXT_INVALID_SPACES,
    /* [11.2.1.1] */    REPLACE_DEPRECATED_HTML_APPLET,
    /* [11.2.1.2] */    REPLACE_DEPRECATED_HTML_BASEFONT,
    /* [11.2.1.3] */    REPLACE_DEPRECATED_HTML_CENTER,
    /* [11.2.1.4] */    REPLACE_DEPRECATED_HTML_DIR,
    /* [11.2.1.5] */    REPLACE_DEPRECATED_HTML_FONT,
    /* [11.2.1.6] */    REPLACE_DEPRECATED_HTML_ISINDEX,
    /* [11.2.1.7] */    REPLACE_DEPRECATED_HTML_MENU,
    /* [11.2.1.8] */    REPLACE_DEPRECATED_HTML_S,
    /* [11.2.1.9] */    REPLACE_DEPRECATED_HTML_STRIKE,
    /* [11.2.1.10] */   REPLACE_DEPRECATED_HTML_U,
    /* [12.1.1.1] */    FRAME_MISSING_TITLE,
    /* [12.1.1.2] */    FRAME_TITLE_INVALID_NULL,
    /* [12.1.1.3] */    FRAME_TITLE_INVALID_SPACES,
    /* [12.4.1.1] */    ASSOCIATE_LABELS_EXPLICITLY,
    /* [12.4.1.2] */    ASSOCIATE_LABELS_EXPLICITLY_FOR,
    /* [12.4.1.3] */    ASSOCIATE_LABELS_EXPLICITLY_ID,
    /* [13.1.1.1] */    LINK_TEXT_NOT_MEANINGFUL,
    /* [13.1.1.2] */    LINK_TEXT_MISSING,
    /* [13.1.1.3] */    LINK_TEXT_TOO_LONG,
    /* [13.1.1.4] */    LINK_TEXT_NOT_MEANINGFUL_CLICK_HERE,
    /* [13.1.1.5] */    LINK_TEXT_NOT_MEANINGFUL_MORE,
    /* [13.1.1.6] */    LINK_TEXT_NOT_MEANINGFUL_FOLLOW_THIS,
    /* [13.2.1.1] */    METADATA_MISSING,
    /* [13.2.1.2] */    METADATA_MISSING_LINK,
    /* [13.2.1.3] */    METADATA_MISSING_REDIRECT_AUTOREFRESH,
    /* [13.10.1.1] */   SKIPOVER_ASCII_ART,

    LAST_ACCESS_ERR    /* must be last */
};
263 | ************************************************************/ 264 | 265 | void AccessibilityChecks( TidyDocImpl* doc ); 266 | 267 | 268 | #endif /* SUPPORT_ACCESSIBILITY_CHECKS */ 269 | #endif /* __ACCESS_H__ */ 270 | -------------------------------------------------------------------------------- /juniperncprompt.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | # If this doesn't run for you, replace python2 with python 3 | 4 | ################### 5 | # Copyright 2011 Joseph Henrich (crimsonknave@gmail.com) 6 | # 7 | # This program is free software: you can redistribute it and/or modify 8 | # it under the terms of the GNU General Public License as published by 9 | # the Free Software Foundation, either version 3 of the License, or 10 | # (at your option) any later version. 11 | # 12 | # This program is distributed in the hope that it will be useful, 13 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 | # GNU General Public License for more details. 16 | # 17 | # You should have received a copy of the GNU General Public License 18 | # along with this program. If not, see 19 | # 20 | ###################### 21 | 22 | import sys 23 | import getpass 24 | import argparse 25 | import urllib2, urllib, cookielib 26 | import os 27 | import pexpect 28 | import time 29 | from elementtidy import TidyHTMLTreeBuilder 30 | 31 | 32 | 33 | # The defaults are what my set up expected, change them if you want 34 | parser = argparse.ArgumentParser(description="Set up the vpn tunnel.") 35 | parser.add_argument('hostname', help="The hostname of the vpn server") 36 | parser.add_argument('-u', '--username', default=None) 37 | parser.add_argument('--password-fields', help="What are the password fields required by your vpn site. 
def find_sessions(base, par=None):
    """Return the element holding the open-sessions listing, or None.

    The tree below ``base`` is searched depth-first for the header row: a
    node whose 2nd to 5th children have exactly the texts 'Login IP Address',
    'Login Time', 'Idle Time' and 'Browser'.  The *parent* of that row
    (passed down as ``par``) is what gets returned, so a caller starting at
    the form receives the enclosing table.
    """
    children = list(base)
    headers = [child.text for child in children]
    if headers[1:5] == ['Login IP Address', 'Login Time', 'Idle Time', 'Browser']:
        return par

    for child in children:
        answer = find_sessions(child, base)
        if answer is not None:
            return answer
    return None


def find_by_name(base, name):
    """Return the first element at or below ``base`` whose ``name``
    attribute equals ``name`` (depth-first, document order), or None."""
    try:
        if dict(base.items())['name'] == name:
            return base
    # Both exceptions must be caught as a tuple.  The old spelling
    # ``except KeyError, AttributeError`` caught only KeyError and bound the
    # exception object to the name AttributeError.
    except (KeyError, AttributeError):
        pass

    for child in list(base):
        answer = find_by_name(child, name)
        if answer is not None:
            return answer
    return None


def find_session_values(table):
    """Return one attribute dict per session row of ``table``.

    The first row is the header and is skipped.  For every other row the
    attributes of the first child of the first cell are returned
    (presumably the session's checkbox input - verify against the page).
    """
    return [dict(list(list(row)[0])[0].items()) for row in list(table)[1:]]


def display_session(table):
    """Print the sessions table: header row first, then numbered rows.

    Only the 2nd to 5th cell of every row are shown, each padded to 30
    characters.  Rows are numbered from 1, which is the numbering
    ``close_sessions`` expects back from the user.
    """
    for index, row in enumerate(list(table)):
        # A cell without text would otherwise be rendered as "None".
        cells = u"".join(u"{:<30}".format(cell.text or u"") for cell in list(row)[1:5])
        if index == 0:
            # Five blanks: same width as the "{:<3}: " prefix of data rows,
            # so the header lines up with the columns below it.
            print(u"     " + cells)
        else:
            print(u"{:<3}: ".format(index) + cells)


class JuniperNCPrompt(object):
    """Interactive log-in helper for a Juniper Network Connect VPN.

    Posts the credentials to the VPN web form, works through the follow-up
    pages the server may answer with (open sessions, post-auth notice, next
    token), extracts the DSID session cookie and hands it to ``ncui``.

    Relies on module-level names defined at the top of the file: ``parser``,
    ``urllib``, ``urllib2``, ``cookielib``, ``pexpect``, ``getpass`` and
    ``TidyHTMLTreeBuilder``.
    """

    def __init__(self):
        """Parse the command line, prompt for credentials, build the opener."""
        self.args = parser.parse_args()
        if not self.args.username:
            self.get_user()

        # Set before any request so later code can rely on the attributes.
        self.form = None
        self.latest_response = None

        self.passwords = self.get_passwords()
        self.data = self.configure_data(self.passwords)

        self.cj = cookielib.CookieJar()
        self.opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(self.cj))

    def _login_url(self):
        """Return the absolute URL of the login endpoint."""
        return "https://{}{}".format(self.args.hostname, self.args.login_path)

    def _session_or_retry(self):
        """Return the session id, re-parsing the saved page if there is none."""
        session = self.get_session()
        if session is not None:
            return session
        return self.parse_error(self.args.out_file)

    def parse_error(self, html):
        """Work out why no session cookie was issued and try to recover.

        ``html`` is the path of the saved response page.  The page's form is
        located (directly under body, or inside a blockquote) and dispatched
        on its ``name`` attribute.  Returns the session id when recovery
        succeeds, otherwise None after printing a hint.
        """
        tree = TidyHTMLTreeBuilder.parse(html)
        prefix = '{http://www.w3.org/1999/xhtml}'
        root = tree.getroot()

        self.form = root.find('{0}body/{0}form'.format(prefix))
        # Some of the returned html has a blockquote before the form, some don't
        if self.form is None:
            self.form = root.find('{0}body/{0}blockquote/{0}form'.format(prefix))
        if self.form is None:
            print("Unable to parse the html, please view it at {}".format(self.args.out_file))
            return None

        # .get() so a form without a name falls through to the generic
        # message instead of raising KeyError.
        form_name = self.form.get('name')
        if form_name == 'frmConfirmation':
            return self._handle_open_sessions()
        if form_name == 'frmLogin':
            return self._handle_login_form(prefix)
        if form_name == 'frmNextToken':
            return self._handle_next_token(prefix)

        # Unknown case, note where the file is so they can see what happened
        print("An unhandled case has come up. Please view the page at {}".format(self.args.out_file))
        return None

    def _handle_open_sessions(self):
        """Handle the 'existing sessions' confirmation form."""
        print("There are existing sessions open")
        submit = find_by_name(self.form, 'btnContinue')
        table = find_sessions(self.form)
        if submit is None or table is None:
            print("Unable to parse the html, please view it at {}".format(self.args.out_file))
            return None

        display_session(table)
        action = submit.get("value")
        if action == "Close Selected Sessions and Log in":
            self.close_sessions(table, True)
        elif action == "Log in (and optionally Close Selected Sessions)":
            self.close_sessions(table)
        return self._session_or_retry()

    def _handle_login_form(self, prefix):
        """Handle a returned login form.

        A hidden ``key`` input means this is the post-auth notice, which is
        confirmed with "Proceed"; without it the credentials were rejected
        and the user is asked again.
        """
        print("Found login form, looking post-auth message")
        key_field = None
        # getiterator() is kept on purpose: the older ElementTree releases
        # this script targets may not provide Element.iter().
        for field in self.form.getiterator('{}input'.format(prefix)):
            if field.get('name') == 'key':
                key_field = field
                break

        if key_field is not None:
            dat = {"key": key_field.get('value'), "sn-postauth-proceed": "Proceed"}
            # Keep the response: get_session() dumps latest_response when the
            # cookie is still missing, and the previous one is already read.
            self.latest_response = self.opener.open(self._login_url(), urllib.urlencode(dat))
        else:
            # Invalid user/pass try again
            print("Invalid user/pass, please try again")
            self.get_user()
            self.data = self.configure_data(self.get_passwords())
            self.log_in()
        return self._session_or_retry()

    def _handle_next_token(self, prefix):
        """Handle the 'next token' form: ask for the next SecurID code."""
        # NOTE(review): this path has a '/' between the namespace and the tag
        # name, so it presumably never matches, and the key it looks for is
        # not sent with the token either.  Left unchanged until the real form
        # layout is confirmed.
        temp = self.form.find('{}/input'.format(prefix))
        if temp is not None and temp.get('name') != 'key':
            print("Unable to find the key for the next token form, either we detected the form incorrectly or somthing went wrong.")
            return None

        # getpass is the module; the prompt function is getpass.getpass.
        password = getpass.getpass("Please enter the next securID token to appear on your fob")
        self.data = self.configure_data({"password": password})
        self.log_in()
        # Hand the session back; previously it was computed and dropped.
        return self.get_session()

    def close_sessions(self, table, required=False):
        """Ask which sessions to close and resubmit the confirmation form.

        ``table`` is the element returned by ``find_sessions``.  With
        ``required`` true at least one session must be chosen.  The answer is
        a space-delimited list of the row numbers printed by
        ``display_session``; anything that is not a valid row number causes
        the table to be shown again and the question to be repeated.
        """
        if required:
            reply = raw_input("Sessions maxed out, select at least one to close (space delimited)")
        else:
            reply = raw_input("Close any sessions you wish to, or log in with out closing sessions by typing 'n' (space delimited)")
            if reply.strip() == 'n':
                reply = ""

        sessions = find_session_values(table)
        try:
            to_close = [int(x) - 1 for x in reply.split()]
        except ValueError:
            to_close = None

        # Reject 0 and out-of-range numbers explicitly: index -1 would
        # silently select the last session, larger ones raise IndexError.
        invalid = (to_close is None
                   or any(not 0 <= index < len(sessions) for index in to_close)
                   or (required and not to_close))
        if invalid:
            display_session(table)
            self.close_sessions(table, required)
            return

        button = dict(find_by_name(self.form, "btnContinue").items())
        form_data_str = dict(find_by_name(self.form, "FormDataStr").items())

        sessions_to_close = [(sessions[index]['name'], sessions[index]['value']) for index in to_close]
        print("Closing {} which turns out to be {}".format(to_close, sessions_to_close))

        # We want the FormDataStr to be the last parameter
        base_data = [(button['name'], button['value'])]
        base_data.extend(sessions_to_close)
        base_data.append((form_data_str['name'], form_data_str['value']))
        self.data = urllib.urlencode(base_data)
        self.log_in()

    def log_out(self):
        """Call the logout URL; failures are reported but never raised."""
        print("Logging out now")
        try:
            self.latest_response = self.opener.open("https://{}{}".format(self.args.hostname, self.args.logout_path))
            code = self.latest_response.getcode()
            if code != 200:
                print("Got a non 200 back ({}), there may be a session still around.".format(code))
        except Exception:
            # Deliberately best-effort: this runs during cleanup and must
            # not mask the error that brought us here.
            print("We tried to log out, but were unable to, there may be a lingering session...")

    def get_passwords(self):
        """Prompt (without echo) for every configured password field.

        Returns a dict mapping form field name to the entered value.
        """
        passwords = {}
        for pass_name in self.args.password_fields.split(','):
            passwords[pass_name] = getpass.getpass("'" + pass_name + "':")
        return passwords

    def get_user(self):
        """Prompt for the username and store it on ``self.args``."""
        self.args.username = raw_input("Please enter your username: ")

    def log_in(self):
        """POST ``self.data`` to the login URL and keep the response."""
        self.latest_response = self.opener.open(self._login_url(), self.data)

    def get_session(self):
        """Return the value of the DSID cookie issued for the VPN host.

        When the cookie is missing, the last response body is written to
        ``--out-file`` and ``parse_error`` gets a chance to recover.  If that
        also fails the process exits with status 1.
        """
        # Public CookieJar iteration instead of the private _cookies dict;
        # the same domain / path / name triple is matched.
        for cookie in self.cj:
            if (cookie.name == 'DSID' and cookie.path == '/'
                    and cookie.domain == self.args.hostname):
                return cookie.value

        # 'wb' because the body is raw bytes; 'with' closes the file even
        # when reading the response fails.
        with open(self.args.out_file, 'wb') as out_file:
            out_file.write(self.latest_response.read())
        print("Something went wrong, html of last page is written to {}".format(self.args.out_file))

        session = self.parse_error(self.args.out_file)
        if session:
            return session

        print("Couldn't find any cookies for {}, code was {}, body written to {}".format(self.args.hostname, self.latest_response.getcode(), self.args.out_file))
        sys.exit(1)

    def run_ncui(self, session):
        """Start ``ncui`` with the session cookie and block until it exits."""
        ncui = "{}/ncui".format(self.args.nc_path)
        # Arguments go in as a list so paths containing spaces survive.
        ncui_args = ["-h", self.args.hostname,
                     "-c", "DSID={}".format(session),
                     "-f", self.args.cert]
        print("Got the session ({}) creating the tunnel now, use Ctrl+C when you are done.".format(session))
        child = pexpect.spawn(ncui, ncui_args)
        child.expect('Password:')
        child.sendline("")
        while child.isalive():
            # We don't expect the child to die, but we certainly should exit if it does
            time.sleep(1)

    def configure_data(self, passwords):
        """Return the url-encoded login form body.

        Built fresh on every call (rather than once in ``__init__``) so that
        password fields from an earlier attempt can never leak into a later
        one, e.g. when this file is imported as a module.
        """
        params = {"username": self.args.username, "realm": self.args.realm, "btnSubmit": "Sign In"}
        params.update(passwords)
        return urllib.urlencode(params)


if __name__ == "__main__":
    # Bound up front so the handlers can tell whether there is anything to
    # log out of when the constructor itself fails or is interrupted.
    attempt = None
    try:
        attempt = JuniperNCPrompt()

        attempt.log_in()
        session = attempt.get_session()
        attempt.run_ncui(session)

        print("Done!")
        attempt.log_out()
    except KeyboardInterrupt:
        if attempt is not None:
            attempt.log_out()
    except Exception as e:
        print("Uh-oh, we got an exception: {} cleaning up now".format(e))
        print(sys.exc_info()[0])
        if attempt is not None:
            attempt.log_out()
15 | 16 | */ 17 | 18 | #include 19 | #include "entities.h" 20 | #include "tidy-int.h" 21 | #include "tmbstr.h" 22 | 23 | struct _entity; 24 | typedef struct _entity entity; 25 | 26 | struct _entity 27 | { 28 | ctmbstr name; 29 | uint versions; 30 | uint code; 31 | }; 32 | 33 | 34 | static const entity entities[] = 35 | { 36 | /* 37 | ** Markup pre-defined character entities 38 | */ 39 | { "quot", VERS_ALL|VERS_XML, 34 }, 40 | { "amp", VERS_ALL|VERS_XML, 38 }, 41 | { "apos", VERS_FROM40|VERS_XML, 39 }, 42 | { "lt", VERS_ALL|VERS_XML, 60 }, 43 | { "gt", VERS_ALL|VERS_XML, 62 }, 44 | 45 | /* 46 | ** Latin-1 character entities 47 | */ 48 | { "nbsp", VERS_ALL, 160 }, 49 | { "iexcl", VERS_ALL, 161 }, 50 | { "cent", VERS_ALL, 162 }, 51 | { "pound", VERS_ALL, 163 }, 52 | { "curren", VERS_ALL, 164 }, 53 | { "yen", VERS_ALL, 165 }, 54 | { "brvbar", VERS_ALL, 166 }, 55 | { "sect", VERS_ALL, 167 }, 56 | { "uml", VERS_ALL, 168 }, 57 | { "copy", VERS_ALL, 169 }, 58 | { "ordf", VERS_ALL, 170 }, 59 | { "laquo", VERS_ALL, 171 }, 60 | { "not", VERS_ALL, 172 }, 61 | { "shy", VERS_ALL, 173 }, 62 | { "reg", VERS_ALL, 174 }, 63 | { "macr", VERS_ALL, 175 }, 64 | { "deg", VERS_ALL, 176 }, 65 | { "plusmn", VERS_ALL, 177 }, 66 | { "sup2", VERS_ALL, 178 }, 67 | { "sup3", VERS_ALL, 179 }, 68 | { "acute", VERS_ALL, 180 }, 69 | { "micro", VERS_ALL, 181 }, 70 | { "para", VERS_ALL, 182 }, 71 | { "middot", VERS_ALL, 183 }, 72 | { "cedil", VERS_ALL, 184 }, 73 | { "sup1", VERS_ALL, 185 }, 74 | { "ordm", VERS_ALL, 186 }, 75 | { "raquo", VERS_ALL, 187 }, 76 | { "frac14", VERS_ALL, 188 }, 77 | { "frac12", VERS_ALL, 189 }, 78 | { "frac34", VERS_ALL, 190 }, 79 | { "iquest", VERS_ALL, 191 }, 80 | { "Agrave", VERS_ALL, 192 }, 81 | { "Aacute", VERS_ALL, 193 }, 82 | { "Acirc", VERS_ALL, 194 }, 83 | { "Atilde", VERS_ALL, 195 }, 84 | { "Auml", VERS_ALL, 196 }, 85 | { "Aring", VERS_ALL, 197 }, 86 | { "AElig", VERS_ALL, 198 }, 87 | { "Ccedil", VERS_ALL, 199 }, 88 | { "Egrave", VERS_ALL, 200 }, 89 | { "Eacute", 
VERS_ALL, 201 }, 90 | { "Ecirc", VERS_ALL, 202 }, 91 | { "Euml", VERS_ALL, 203 }, 92 | { "Igrave", VERS_ALL, 204 }, 93 | { "Iacute", VERS_ALL, 205 }, 94 | { "Icirc", VERS_ALL, 206 }, 95 | { "Iuml", VERS_ALL, 207 }, 96 | { "ETH", VERS_ALL, 208 }, 97 | { "Ntilde", VERS_ALL, 209 }, 98 | { "Ograve", VERS_ALL, 210 }, 99 | { "Oacute", VERS_ALL, 211 }, 100 | { "Ocirc", VERS_ALL, 212 }, 101 | { "Otilde", VERS_ALL, 213 }, 102 | { "Ouml", VERS_ALL, 214 }, 103 | { "times", VERS_ALL, 215 }, 104 | { "Oslash", VERS_ALL, 216 }, 105 | { "Ugrave", VERS_ALL, 217 }, 106 | { "Uacute", VERS_ALL, 218 }, 107 | { "Ucirc", VERS_ALL, 219 }, 108 | { "Uuml", VERS_ALL, 220 }, 109 | { "Yacute", VERS_ALL, 221 }, 110 | { "THORN", VERS_ALL, 222 }, 111 | { "szlig", VERS_ALL, 223 }, 112 | { "agrave", VERS_ALL, 224 }, 113 | { "aacute", VERS_ALL, 225 }, 114 | { "acirc", VERS_ALL, 226 }, 115 | { "atilde", VERS_ALL, 227 }, 116 | { "auml", VERS_ALL, 228 }, 117 | { "aring", VERS_ALL, 229 }, 118 | { "aelig", VERS_ALL, 230 }, 119 | { "ccedil", VERS_ALL, 231 }, 120 | { "egrave", VERS_ALL, 232 }, 121 | { "eacute", VERS_ALL, 233 }, 122 | { "ecirc", VERS_ALL, 234 }, 123 | { "euml", VERS_ALL, 235 }, 124 | { "igrave", VERS_ALL, 236 }, 125 | { "iacute", VERS_ALL, 237 }, 126 | { "icirc", VERS_ALL, 238 }, 127 | { "iuml", VERS_ALL, 239 }, 128 | { "eth", VERS_ALL, 240 }, 129 | { "ntilde", VERS_ALL, 241 }, 130 | { "ograve", VERS_ALL, 242 }, 131 | { "oacute", VERS_ALL, 243 }, 132 | { "ocirc", VERS_ALL, 244 }, 133 | { "otilde", VERS_ALL, 245 }, 134 | { "ouml", VERS_ALL, 246 }, 135 | { "divide", VERS_ALL, 247 }, 136 | { "oslash", VERS_ALL, 248 }, 137 | { "ugrave", VERS_ALL, 249 }, 138 | { "uacute", VERS_ALL, 250 }, 139 | { "ucirc", VERS_ALL, 251 }, 140 | { "uuml", VERS_ALL, 252 }, 141 | { "yacute", VERS_ALL, 253 }, 142 | { "thorn", VERS_ALL, 254 }, 143 | { "yuml", VERS_ALL, 255 }, 144 | 145 | /* 146 | ** Extended Entities defined in HTML 4: Symbols 147 | */ 148 | { "fnof", VERS_FROM40, 402 }, 149 | { "Alpha", VERS_FROM40, 
913 }, 150 | { "Beta", VERS_FROM40, 914 }, 151 | { "Gamma", VERS_FROM40, 915 }, 152 | { "Delta", VERS_FROM40, 916 }, 153 | { "Epsilon", VERS_FROM40, 917 }, 154 | { "Zeta", VERS_FROM40, 918 }, 155 | { "Eta", VERS_FROM40, 919 }, 156 | { "Theta", VERS_FROM40, 920 }, 157 | { "Iota", VERS_FROM40, 921 }, 158 | { "Kappa", VERS_FROM40, 922 }, 159 | { "Lambda", VERS_FROM40, 923 }, 160 | { "Mu", VERS_FROM40, 924 }, 161 | { "Nu", VERS_FROM40, 925 }, 162 | { "Xi", VERS_FROM40, 926 }, 163 | { "Omicron", VERS_FROM40, 927 }, 164 | { "Pi", VERS_FROM40, 928 }, 165 | { "Rho", VERS_FROM40, 929 }, 166 | { "Sigma", VERS_FROM40, 931 }, 167 | { "Tau", VERS_FROM40, 932 }, 168 | { "Upsilon", VERS_FROM40, 933 }, 169 | { "Phi", VERS_FROM40, 934 }, 170 | { "Chi", VERS_FROM40, 935 }, 171 | { "Psi", VERS_FROM40, 936 }, 172 | { "Omega", VERS_FROM40, 937 }, 173 | { "alpha", VERS_FROM40, 945 }, 174 | { "beta", VERS_FROM40, 946 }, 175 | { "gamma", VERS_FROM40, 947 }, 176 | { "delta", VERS_FROM40, 948 }, 177 | { "epsilon", VERS_FROM40, 949 }, 178 | { "zeta", VERS_FROM40, 950 }, 179 | { "eta", VERS_FROM40, 951 }, 180 | { "theta", VERS_FROM40, 952 }, 181 | { "iota", VERS_FROM40, 953 }, 182 | { "kappa", VERS_FROM40, 954 }, 183 | { "lambda", VERS_FROM40, 955 }, 184 | { "mu", VERS_FROM40, 956 }, 185 | { "nu", VERS_FROM40, 957 }, 186 | { "xi", VERS_FROM40, 958 }, 187 | { "omicron", VERS_FROM40, 959 }, 188 | { "pi", VERS_FROM40, 960 }, 189 | { "rho", VERS_FROM40, 961 }, 190 | { "sigmaf", VERS_FROM40, 962 }, 191 | { "sigma", VERS_FROM40, 963 }, 192 | { "tau", VERS_FROM40, 964 }, 193 | { "upsilon", VERS_FROM40, 965 }, 194 | { "phi", VERS_FROM40, 966 }, 195 | { "chi", VERS_FROM40, 967 }, 196 | { "psi", VERS_FROM40, 968 }, 197 | { "omega", VERS_FROM40, 969 }, 198 | { "thetasym", VERS_FROM40, 977 }, 199 | { "upsih", VERS_FROM40, 978 }, 200 | { "piv", VERS_FROM40, 982 }, 201 | { "bull", VERS_FROM40, 8226 }, 202 | { "hellip", VERS_FROM40, 8230 }, 203 | { "prime", VERS_FROM40, 8242 }, 204 | { "Prime", VERS_FROM40, 
8243 }, 205 | { "oline", VERS_FROM40, 8254 }, 206 | { "frasl", VERS_FROM40, 8260 }, 207 | { "weierp", VERS_FROM40, 8472 }, 208 | { "image", VERS_FROM40, 8465 }, 209 | { "real", VERS_FROM40, 8476 }, 210 | { "trade", VERS_FROM40, 8482 }, 211 | { "alefsym", VERS_FROM40, 8501 }, 212 | { "larr", VERS_FROM40, 8592 }, 213 | { "uarr", VERS_FROM40, 8593 }, 214 | { "rarr", VERS_FROM40, 8594 }, 215 | { "darr", VERS_FROM40, 8595 }, 216 | { "harr", VERS_FROM40, 8596 }, 217 | { "crarr", VERS_FROM40, 8629 }, 218 | { "lArr", VERS_FROM40, 8656 }, 219 | { "uArr", VERS_FROM40, 8657 }, 220 | { "rArr", VERS_FROM40, 8658 }, 221 | { "dArr", VERS_FROM40, 8659 }, 222 | { "hArr", VERS_FROM40, 8660 }, 223 | { "forall", VERS_FROM40, 8704 }, 224 | { "part", VERS_FROM40, 8706 }, 225 | { "exist", VERS_FROM40, 8707 }, 226 | { "empty", VERS_FROM40, 8709 }, 227 | { "nabla", VERS_FROM40, 8711 }, 228 | { "isin", VERS_FROM40, 8712 }, 229 | { "notin", VERS_FROM40, 8713 }, 230 | { "ni", VERS_FROM40, 8715 }, 231 | { "prod", VERS_FROM40, 8719 }, 232 | { "sum", VERS_FROM40, 8721 }, 233 | { "minus", VERS_FROM40, 8722 }, 234 | { "lowast", VERS_FROM40, 8727 }, 235 | { "radic", VERS_FROM40, 8730 }, 236 | { "prop", VERS_FROM40, 8733 }, 237 | { "infin", VERS_FROM40, 8734 }, 238 | { "ang", VERS_FROM40, 8736 }, 239 | { "and", VERS_FROM40, 8743 }, 240 | { "or", VERS_FROM40, 8744 }, 241 | { "cap", VERS_FROM40, 8745 }, 242 | { "cup", VERS_FROM40, 8746 }, 243 | { "int", VERS_FROM40, 8747 }, 244 | { "there4", VERS_FROM40, 8756 }, 245 | { "sim", VERS_FROM40, 8764 }, 246 | { "cong", VERS_FROM40, 8773 }, 247 | { "asymp", VERS_FROM40, 8776 }, 248 | { "ne", VERS_FROM40, 8800 }, 249 | { "equiv", VERS_FROM40, 8801 }, 250 | { "le", VERS_FROM40, 8804 }, 251 | { "ge", VERS_FROM40, 8805 }, 252 | { "sub", VERS_FROM40, 8834 }, 253 | { "sup", VERS_FROM40, 8835 }, 254 | { "nsub", VERS_FROM40, 8836 }, 255 | { "sube", VERS_FROM40, 8838 }, 256 | { "supe", VERS_FROM40, 8839 }, 257 | { "oplus", VERS_FROM40, 8853 }, 258 | { "otimes", 
VERS_FROM40, 8855 }, 259 | { "perp", VERS_FROM40, 8869 }, 260 | { "sdot", VERS_FROM40, 8901 }, 261 | { "lceil", VERS_FROM40, 8968 }, 262 | { "rceil", VERS_FROM40, 8969 }, 263 | { "lfloor", VERS_FROM40, 8970 }, 264 | { "rfloor", VERS_FROM40, 8971 }, 265 | { "lang", VERS_FROM40, 9001 }, 266 | { "rang", VERS_FROM40, 9002 }, 267 | { "loz", VERS_FROM40, 9674 }, 268 | { "spades", VERS_FROM40, 9824 }, 269 | { "clubs", VERS_FROM40, 9827 }, 270 | { "hearts", VERS_FROM40, 9829 }, 271 | { "diams", VERS_FROM40, 9830 }, 272 | 273 | /* 274 | ** Extended Entities defined in HTML 4: Special (less Markup at top) 275 | */ 276 | { "OElig", VERS_FROM40, 338 }, 277 | { "oelig", VERS_FROM40, 339 }, 278 | { "Scaron", VERS_FROM40, 352 }, 279 | { "scaron", VERS_FROM40, 353 }, 280 | { "Yuml", VERS_FROM40, 376 }, 281 | { "circ", VERS_FROM40, 710 }, 282 | { "tilde", VERS_FROM40, 732 }, 283 | { "ensp", VERS_FROM40, 8194 }, 284 | { "emsp", VERS_FROM40, 8195 }, 285 | { "thinsp", VERS_FROM40, 8201 }, 286 | { "zwnj", VERS_FROM40, 8204 }, 287 | { "zwj", VERS_FROM40, 8205 }, 288 | { "lrm", VERS_FROM40, 8206 }, 289 | { "rlm", VERS_FROM40, 8207 }, 290 | { "ndash", VERS_FROM40, 8211 }, 291 | { "mdash", VERS_FROM40, 8212 }, 292 | { "lsquo", VERS_FROM40, 8216 }, 293 | { "rsquo", VERS_FROM40, 8217 }, 294 | { "sbquo", VERS_FROM40, 8218 }, 295 | { "ldquo", VERS_FROM40, 8220 }, 296 | { "rdquo", VERS_FROM40, 8221 }, 297 | { "bdquo", VERS_FROM40, 8222 }, 298 | { "dagger", VERS_FROM40, 8224 }, 299 | { "Dagger", VERS_FROM40, 8225 }, 300 | { "permil", VERS_FROM40, 8240 }, 301 | { "lsaquo", VERS_FROM40, 8249 }, 302 | { "rsaquo", VERS_FROM40, 8250 }, 303 | { "euro", VERS_FROM40, 8364 }, 304 | { NULL, 0, 0 } 305 | }; 306 | 307 | 308 | /* Pure static implementation. Trades off lookup speed 309 | ** for faster setup time (well, none actually). 310 | ** Optimization of comparing 1st character buys enough 311 | ** speed that hash doesn't improve things without > 500 312 | ** items in list. 
313 | */ 314 | static const entity* lookup( ctmbstr s ) 315 | { 316 | tmbchar ch = (tmbchar)( s ? *s : 0 ); 317 | const entity *np; 318 | for ( np = entities; ch && np && np->name; ++np ) 319 | if ( ch == *np->name && tmbstrcmp(s, np->name) == 0 ) 320 | return np; 321 | return NULL; 322 | } 323 | 324 | /* entity starting with "&" returns zero on error */ 325 | uint EntityCode( ctmbstr name, uint versions ) 326 | { 327 | const entity* np; 328 | assert( name && name[0] == '&' ); 329 | 330 | /* numeric entitity: name = "&#" followed by number */ 331 | if ( name[1] == '#' ) 332 | { 333 | uint c = 0; /* zero on missing/bad number */ 334 | Bool isXml = ( (versions & VERS_XML) == VERS_XML ); 335 | 336 | /* 'x' prefix denotes hexadecimal number format */ 337 | if ( name[2] == 'x' || (!isXml && name[2] == 'X') ) 338 | sscanf( name+3, "%x", &c ); 339 | else 340 | sscanf( name+2, "%d", &c ); 341 | 342 | return (uint) c; 343 | } 344 | 345 | /* Named entity: name ="&" followed by a name */ 346 | if ( np = lookup(name+1) ) 347 | { 348 | /* Only recognize entity name if version supports it. 
*/ 349 | if ( np->versions & versions ) 350 | return np->code; 351 | } 352 | 353 | return 0; /* zero signifies unknown entity name */ 354 | } 355 | 356 | Bool EntityInfo( ctmbstr name, Bool isXml, uint* code, uint* versions ) 357 | { 358 | const entity* np; 359 | assert( name && name[0] == '&' ); 360 | assert( code != NULL ); 361 | assert( versions != NULL ); 362 | 363 | /* numeric entitity: name = "&#" followed by number */ 364 | if ( name[1] == '#' ) 365 | { 366 | uint c = 0; /* zero on missing/bad number */ 367 | 368 | /* 'x' prefix denotes hexadecimal number format */ 369 | if ( name[2] == 'x' || (!isXml && name[2] == 'X') ) 370 | sscanf( name+3, "%x", &c ); 371 | else 372 | sscanf( name+2, "%d", &c ); 373 | 374 | *code = c; 375 | *versions = VERS_ALL; 376 | return yes; 377 | } 378 | 379 | /* Named entity: name ="&" followed by a name */ 380 | if ( np = lookup(name+1) ) 381 | { 382 | *code = np->code; 383 | *versions = np->versions; 384 | return yes; 385 | } 386 | 387 | *code = 0; 388 | *versions = ( isXml ? VERS_XML : VERS_PROPRIETARY ); 389 | return no; 390 | } 391 | 392 | 393 | ctmbstr EntityName( uint ch, uint versions ) 394 | { 395 | ctmbstr entnam = NULL; 396 | const entity *ep; 397 | 398 | for ( ep = entities; ep->name != NULL; ++ep ) 399 | { 400 | if ( ep->code == ch ) 401 | { 402 | if ( ep->versions & versions ) 403 | entnam = ep->name; 404 | break; /* Found code. Stop search. */ 405 | } 406 | } 407 | return entnam; 408 | } 409 | -------------------------------------------------------------------------------- /elementtidy-1.0-20050212/tidylib/include/platform.h: -------------------------------------------------------------------------------- 1 | #ifndef __PLATFORM_H__ 2 | #define __PLATFORM_H__ 3 | 4 | /* platform.h -- Platform specifics 5 | 6 | (c) 1998-2003 (W3C) MIT, ERCIM, Keio University 7 | See tidy.h for the copyright notice. 
8 | 9 | CVS Info : 10 | 11 | $LastChangedBy$ 12 | $LastChangedDate$ 13 | $LastChangedRevision$ 14 | 15 | */ 16 | 17 | #ifdef __cplusplus 18 | extern "C" { 19 | #endif 20 | 21 | /* 22 | Uncomment and edit one of the following #defines if you 23 | want to specify the config file at compile-time. 24 | */ 25 | 26 | /* #define CONFIG_FILE "/etc/tidy_config.txt" */ /* original */ 27 | /* #define CONFIG_FILE "/etc/tidyrc" */ 28 | /* #define CONFIG_FILE "/etc/tidy.conf" */ 29 | 30 | /* 31 | Uncomment the following #define if you are on a system 32 | supporting the HOME environment variable. 33 | It enables tidy to find config files named ~/.tidyrc if 34 | the HTML_TIDY environment variable is not set. 35 | */ 36 | /* #define USER_CONFIG_FILE "~/.tidyrc" */ 37 | 38 | /* 39 | Uncomment the following #define if your 40 | system supports the call getpwnam(). 41 | E.g. Unix and Linux. 42 | 43 | It enables tidy to find files named 44 | ~your/foo for use in the HTML_TIDY environment 45 | variable or CONFIG_FILE or USER_CONFIGFILE or 46 | on the command line: -config ~joebob/tidy.cfg 47 | 48 | Contributed by Todd Lewis. 
49 | */ 50 | 51 | /* #define SUPPORT_GETPWNAM */ 52 | 53 | 54 | /* Enable/disable support for Big5 and Shift_JIS character encodings */ 55 | #ifndef SUPPORT_ASIAN_ENCODINGS 56 | #define SUPPORT_ASIAN_ENCODINGS 1 57 | #endif 58 | 59 | /* Enable/disable support for UTF-16 character encodings */ 60 | #ifndef SUPPORT_UTF16_ENCODINGS 61 | #define SUPPORT_UTF16_ENCODINGS 1 62 | #endif 63 | 64 | /* Enable/disable support for additional accessibility checks */ 65 | #ifndef SUPPORT_ACCESSIBILITY_CHECKS 66 | #define SUPPORT_ACCESSIBILITY_CHECKS 1 67 | #endif 68 | 69 | 70 | /* Convenience defines for Mac platforms */ 71 | 72 | #if defined(macintosh) 73 | /* Mac OS 6.x/7.x/8.x/9.x, with or without CarbonLib - MPW or Metrowerks 68K/PPC compilers */ 74 | #define MAC_OS_CLASSIC 75 | #ifndef PLATFORM_NAME 76 | #define PLATFORM_NAME "Mac OS" 77 | #endif 78 | 79 | /* needed for access() */ 80 | #if !defined(_POSIX) && !defined(NO_ACCESS_SUPPORT) 81 | #define NO_ACCESS_SUPPORT 82 | #endif 83 | 84 | #ifdef SUPPORT_GETPWNAM 85 | #undef SUPPORT_GETPWNAM 86 | #endif 87 | 88 | #elif defined(__APPLE__) && defined(__MACH__) 89 | /* Mac OS X (client) 10.x (or server 1.x/10.x) - gcc or Metrowerks MachO compilers */ 90 | #define MAC_OS_X 91 | #ifndef PLATFORM_NAME 92 | #define PLATFORM_NAME "Mac OS X" 93 | #endif 94 | #endif 95 | 96 | #if defined(MAC_OS_CLASSIC) || defined(MAC_OS_X) 97 | /* Any OS on Mac platform */ 98 | #define MAC_OS 99 | #define FILENAMES_CASE_SENSITIVE 0 100 | #define strcasecmp strcmp 101 | #ifndef DFLT_REPL_CHARENC 102 | #define DFLT_REPL_CHARENC MACROMAN 103 | #endif 104 | #endif 105 | 106 | /* Convenience defines for BSD like platforms */ 107 | 108 | #if defined(__FreeBSD__) 109 | #define BSD_BASED_OS 110 | #ifndef PLATFORM_NAME 111 | #define PLATFORM_NAME "FreeBSD" 112 | #endif 113 | 114 | #elif defined(__NetBSD__) 115 | #define BSD_BASED_OS 116 | #ifndef PLATFORM_NAME 117 | #define PLATFORM_NAME "NetBSD" 118 | #endif 119 | 120 | #elif defined(__OpenBSD__) 121 | 
#define BSD_BASED_OS 122 | #ifndef PLATFORM_NAME 123 | #define PLATFORM_NAME "OpenBSD" 124 | #endif 125 | 126 | #elif defined(__MINT__) 127 | #define BSD_BASED_OS 128 | #ifndef PLATFORM_NAME 129 | #define PLATFORM_NAME "FreeMiNT" 130 | #endif 131 | 132 | #elif defined(__bsdi__) 133 | #define BSD_BASED_OS 134 | #ifndef PLATFORM_NAME 135 | #define PLATFORM_NAME "BSD/OS" 136 | #endif 137 | 138 | #endif 139 | 140 | /* Convenience defines for Windows platforms */ 141 | 142 | #if defined(WINDOWS) || defined(_WIN32) 143 | 144 | #define WINDOWS_OS 145 | #ifndef PLATFORM_NAME 146 | #define PLATFORM_NAME "Windows" 147 | #endif 148 | 149 | #if defined(__MWERKS__) || defined(__MSL__) 150 | /* not available with Metrowerks Standard Library */ 151 | 152 | #ifdef SUPPORT_GETPWNAM 153 | #undef SUPPORT_GETPWNAM 154 | #endif 155 | 156 | /* needed for setmode() */ 157 | #if !defined(NO_SETMODE_SUPPORT) 158 | #define NO_SETMODE_SUPPORT 159 | #endif 160 | 161 | #define strcasecmp _stricmp 162 | 163 | #endif 164 | 165 | #define FILENAMES_CASE_SENSITIVE 0 166 | 167 | #endif 168 | 169 | /* Convenience defines for Linux platforms */ 170 | 171 | #if defined(linux) && defined(__alpha__) 172 | /* Linux on Alpha - gcc compiler */ 173 | #define LINUX_OS 174 | #ifndef PLATFORM_NAME 175 | #define PLATFORM_NAME "Linux/Alpha" 176 | #endif 177 | 178 | #elif defined(linux) && defined(__sparc__) 179 | /* Linux on Sparc - gcc compiler */ 180 | #define LINUX_OS 181 | #ifndef PLATFORM_NAME 182 | #define PLATFORM_NAME "Linux/Sparc" 183 | #endif 184 | 185 | #elif defined(linux) && (defined(__i386__) || defined(__i486__) || defined(__i586__) || defined(__i686__)) 186 | /* Linux on x86 - gcc compiler */ 187 | #define LINUX_OS 188 | #ifndef PLATFORM_NAME 189 | #define PLATFORM_NAME "Linux/x86" 190 | #endif 191 | 192 | #elif defined(linux) && defined(__powerpc__) 193 | /* Linux on PPC - gcc compiler */ 194 | #define LINUX_OS 195 | 196 | #if defined(__linux__) && defined(__powerpc__) 197 | 198 | /* #if 
#system(linux) */ 199 | /* MkLinux on PPC - gcc (egcs) compiler */ 200 | /* #define MAC_OS_MKLINUX */ 201 | #ifndef PLATFORM_NAME 202 | #define PLATFORM_NAME "MkLinux" 203 | #endif 204 | 205 | #else 206 | 207 | #ifndef PLATFORM_NAME 208 | #define PLATFORM_NAME "Linux/PPC" 209 | #endif 210 | 211 | #endif 212 | 213 | #elif defined(linux) || defined(__linux__) 214 | /* generic Linux */ 215 | #define LINUX_OS 216 | #ifndef PLATFORM_NAME 217 | #define PLATFORM_NAME "Linux" 218 | #endif 219 | 220 | #endif 221 | 222 | /* Convenience defines for Solaris platforms */ 223 | 224 | #if defined(sun) 225 | #define SOLARIS_OS 226 | #ifndef PLATFORM_NAME 227 | #define PLATFORM_NAME "Solaris" 228 | #endif 229 | #endif 230 | 231 | /* Convenience defines for HPUX + gcc platforms */ 232 | 233 | #if defined(__hpux) 234 | #define HPUX_OS 235 | #ifndef PLATFORM_NAME 236 | #define PLATFORM_NAME "HPUX" 237 | #endif 238 | #endif 239 | 240 | /* Convenience defines for RISCOS + gcc platforms */ 241 | 242 | #if defined(__riscos__) 243 | #define RISC_OS 244 | #ifndef PLATFORM_NAME 245 | #define PLATFORM_NAME "RISC OS" 246 | #endif 247 | #endif 248 | 249 | /* Convenience defines for OS/2 + icc/gcc platforms */ 250 | 251 | #if defined(__OS2__) || defined(__EMX__) 252 | #define OS2_OS 253 | #ifndef PLATFORM_NAME 254 | #define PLATFORM_NAME "OS/2" 255 | #endif 256 | #define FILENAMES_CASE_SENSITIVE 0 257 | #define strcasecmp stricmp 258 | #endif 259 | 260 | /* Convenience defines for IRIX */ 261 | 262 | #if defined(__sgi) 263 | #define IRIX_OS 264 | #ifndef PLATFORM_NAME 265 | #define PLATFORM_NAME "SGI IRIX" 266 | #endif 267 | #endif 268 | 269 | /* Convenience defines for AIX */ 270 | 271 | #if defined(_AIX) 272 | #define AIX_OS 273 | #ifndef PLATFORM_NAME 274 | #define PLATFORM_NAME "IBM AIX" 275 | #endif 276 | #endif 277 | 278 | 279 | /* Convenience defines for BeOS platforms */ 280 | 281 | #if defined(__BEOS__) 282 | #define BE_OS 283 | #ifndef PLATFORM_NAME 284 | #define PLATFORM_NAME "BeOS" 
285 | #endif 286 | #endif 287 | 288 | /* Convenience defines for Cygwin platforms */ 289 | 290 | #if defined(__CYGWIN__) 291 | #define CYGWIN_OS 292 | #ifndef PLATFORM_NAME 293 | #define PLATFORM_NAME "Cygwin" 294 | #endif 295 | #define FILENAMES_CASE_SENSITIVE 0 296 | #endif 297 | 298 | /* Convenience defines for OpenVMS */ 299 | 300 | #if defined(__VMS) 301 | #define OPENVMS_OS 302 | #ifndef PLATFORM_NAME 303 | #define PLATFORM_NAME "OpenVMS" 304 | #endif 305 | #define FILENAMES_CASE_SENSITIVE 0 306 | #endif 307 | 308 | /* Convenience defines for DEC Alpha OSF + gcc platforms */ 309 | 310 | #if defined(__osf__) 311 | #define OSF_OS 312 | #ifndef PLATFORM_NAME 313 | #define PLATFORM_NAME "DEC Alpha OSF" 314 | #endif 315 | #endif 316 | 317 | /* Convenience defines for ARM platforms */ 318 | 319 | #if defined(__arm) 320 | #define ARM_OS 321 | 322 | #if defined(forARM) && defined(__NEWTON_H) 323 | 324 | /* Using Newton C++ Tools ARMCpp compiler */ 325 | #define NEWTON_OS 326 | #ifndef PLATFORM_NAME 327 | #define PLATFORM_NAME "Newton" 328 | #endif 329 | 330 | #else 331 | 332 | #ifndef PLATFORM_NAME 333 | #define PLATFORM_NAME "ARM" 334 | #endif 335 | 336 | #endif 337 | 338 | #endif 339 | 340 | #include 341 | #include 342 | #include /* for longjmp on error exit */ 343 | #include 344 | #include /* may need for Unix V */ 345 | #include 346 | #include 347 | 348 | #ifdef NEEDS_MALLOC_H 349 | #include 350 | #endif 351 | 352 | #ifdef SUPPORT_GETPWNAM 353 | #include 354 | #endif 355 | 356 | #ifdef NEEDS_UNISTD_H 357 | #include /* needed for unlink on some Unix systems */ 358 | #endif 359 | 360 | /* This can be set at compile time. Usually Windows, 361 | ** except for Macintosh builds. 362 | */ 363 | #ifndef DFLT_REPL_CHARENC 364 | #define DFLT_REPL_CHARENC WIN1252 365 | #endif 366 | 367 | /* By default, use case-sensitive filename comparison. 
368 | */ 369 | #ifndef FILENAMES_CASE_SENSITIVE 370 | #define FILENAMES_CASE_SENSITIVE 1 371 | #endif 372 | 373 | 374 | /* 375 | Tidy preserves the last modified time for the files it 376 | cleans up. 377 | */ 378 | 379 | /* 380 | If your platform doesn't support and the 381 | utime() function, or and the futime() 382 | function then set PRESERVE_FILE_TIMES to 0. 383 | 384 | If your platform doesn't support and the 385 | futime() function, then set HAS_FUTIME to 0. 386 | 387 | If your platform supports and the 388 | utime() function requires the file to be 389 | closed first, then set UTIME_NEEDS_CLOSED_FILE to 1. 390 | */ 391 | 392 | /* Keep old PRESERVEFILETIMES define for compatibility */ 393 | #ifdef PRESERVEFILETIMES 394 | #undef PRESERVE_FILE_TIMES 395 | #define PRESERVE_FILE_TIMES PRESERVEFILETIMES 396 | #endif 397 | 398 | #ifndef PRESERVE_FILE_TIMES 399 | #if defined(RISC_OS) || defined(OPENVMS_OS) || defined(OSF_OS) 400 | #define PRESERVE_FILE_TIMES 0 401 | #else 402 | #define PRESERVE_FILE_TIMES 1 403 | #endif 404 | #endif 405 | 406 | #if PRESERVE_FILE_TIMES 407 | 408 | #ifndef HAS_FUTIME 409 | #if defined(CYGWIN_OS) || defined(BE_OS) || defined(OS2_OS) || defined(HPUX_OS) || defined(SOLARIS_OS) || defined(LINUX_OS) || defined(BSD_BASED_OS) || defined(MAC_OS) || defined(__MSL__) || defined(IRIX_OS) || defined(AIX_OS) || defined(__BORLANDC__) 410 | #define HAS_FUTIME 0 411 | #else 412 | #define HAS_FUTIME 1 413 | #endif 414 | #endif 415 | 416 | #ifndef UTIME_NEEDS_CLOSED_FILE 417 | #if defined(SOLARIS_OS) || defined(BSD_BASED_OS) || defined(MAC_OS) || defined(__MSL__) || defined(LINUX_OS) 418 | #define UTIME_NEEDS_CLOSED_FILE 1 419 | #else 420 | #define UTIME_NEEDS_CLOSED_FILE 0 421 | #endif 422 | #endif 423 | 424 | #if defined(MAC_OS_X) || (!defined(MAC_OS_CLASSIC) && !defined(__MSL__)) 425 | #include 426 | #include 427 | #else 428 | #include 429 | #endif 430 | 431 | #if HAS_FUTIME 432 | #include 433 | #else 434 | #include 435 | #endif /* HASFUTIME */ 436 
| 437 | /* 438 | MS Windows needs _ prefix for Unix file functions. 439 | Not required by Metrowerks Standard Library (MSL). 440 | 441 | Tidy uses following for preserving the last modified time. 442 | 443 | WINDOWS automatically set by Win16 compilers. 444 | _WIN32 automatically set by Win32 compilers. 445 | */ 446 | #if defined(_WIN32) && !defined(__MSL__) && !defined(__BORLANDC__) 447 | 448 | #define futime _futime 449 | #define fstat _fstat 450 | #define utimbuf _utimbuf /* Windows seems to want utimbuf */ 451 | #define stat _stat 452 | #define utime _utime 453 | 454 | #endif /* _WIN32 */ 455 | 456 | #endif /* PRESERVE_FILE_TIMES */ 457 | 458 | /* 459 | MS Windows needs _ prefix for Unix file functions. 460 | Not required by Metrowerks Standard Library (MSL). 461 | 462 | WINDOWS automatically set by Win16 compilers. 463 | _WIN32 automatically set by Win32 compilers. 464 | */ 465 | #if defined(_WIN32) && !defined(__MSL__) && !defined(__BORLANDC__) 466 | 467 | #ifndef __WATCOMC__ 468 | #define fileno _fileno 469 | #define setmode _setmode 470 | #endif 471 | 472 | #define access _access 473 | #define strcasecmp _stricmp 474 | 475 | #if _MSC_VER > 1000 476 | #pragma warning( disable : 4189 ) /* local variable is initialized but not referenced */ 477 | #pragma warning( disable : 4100 ) /* unreferenced formal parameter */ 478 | #pragma warning( disable : 4706 ) /* assignment within conditional expression */ 479 | #endif 480 | 481 | #endif /* _WIN32 */ 482 | 483 | #if defined(_WIN32) 484 | 485 | #if defined(_USRDLL) && !defined(TIDY_EXPORT) 486 | #define TIDY_EXPORT __declspec( dllexport ) 487 | #endif 488 | 489 | #endif /* _WIN32 */ 490 | 491 | /* hack for gnu sys/types.h file which defines uint and ulong */ 492 | 493 | #if defined(BE_OS) || defined(SOLARIS_OS) || defined(BSD_BASED_OS) || defined(OSF_OS) || defined(IRIX_OS) || defined(AIX_OS) 494 | #include 495 | #endif 496 | #if !defined(HPUX_OS) && !defined(CYGWIN_OS) && !defined(MAC_OS_X) && !defined(BE_OS) && 
!defined(SOLARIS_OS) && !defined(BSD_BASED_OS) && !defined(OSF_OS) && !defined(IRIX_OS) && !defined(AIX_OS) 497 | typedef unsigned int uint; 498 | #endif 499 | #if defined(HPUX_OS) || defined(CYGWIN_OS) || defined(MAC_OS) || defined(BSD_BASED_OS) || defined(_WIN32) 500 | typedef unsigned long ulong; 501 | #endif 502 | 503 | #ifndef TIDY_EXPORT /* Define it away for most builds */ 504 | #define TIDY_EXPORT 505 | #endif 506 | 507 | #ifndef TIDY_STRUCT 508 | #define TIDY_STRUCT 509 | #endif 510 | 511 | typedef unsigned char byte; 512 | 513 | typedef uint tchar; /* single, full character */ 514 | typedef char tmbchar; /* single, possibly partial character */ 515 | #ifndef TMBSTR_DEFINED 516 | typedef tmbchar* tmbstr; /* pointer to buffer of possibly partial chars */ 517 | typedef const tmbchar* ctmbstr; /* Ditto, but const */ 518 | #define TMBSTR_DEFINED 519 | #endif 520 | 521 | 522 | /* 523 | bool is a reserved word in some but 524 | not all C++ compilers depending on age 525 | work around is to avoid bool altogether 526 | by introducing a new enum called Bool 527 | */ 528 | typedef enum 529 | { 530 | no, 531 | yes 532 | } Bool; 533 | 534 | /* for NULL pointers 535 | #define null ((const void*)0) 536 | extern void* null; 537 | */ 538 | 539 | #if defined(DMALLOC) 540 | #include "dmalloc.h" 541 | #endif 542 | 543 | void *MemAlloc(size_t size); 544 | void *MemRealloc(void *mem, size_t newsize); 545 | void MemFree(void *mem); 546 | void ClearMemory(void *, size_t size); 547 | void FatalError( ctmbstr msg ); 548 | 549 | /* Opaque data structure. 550 | * Cast to implementation type struct within lib. 551 | * This will reduce inter-dependencies/conflicts w/ application code. 552 | */ 553 | #if 1 554 | /* 555 | * Please note - this definition assumes your compiler uses 'int' for enums. 
556 | */ 557 | #define opaque( typenam )\ 558 | struct _##typenam { int _opaque; };\ 559 | typedef struct _##typenam* typenam 560 | #else 561 | #define opaque(typenam) typedef void* typenam 562 | #endif 563 | 564 | /* Opaque data structure used to pass back 565 | ** and forth to keep current position in a 566 | ** list or other collection. 567 | */ 568 | opaque( TidyIterator ); 569 | 570 | #ifdef __cplusplus 571 | } /* extern "C" */ 572 | #endif 573 | 574 | #endif /* __PLATFORM_H__ */ 575 | -------------------------------------------------------------------------------- /elementtidy-1.0-20050212/tidylib/src/utf8.c: -------------------------------------------------------------------------------- 1 | /* utf8.c -- convert characters to/from UTF-8 2 | 3 | (c) 1998-2003 (W3C) MIT, ERCIM, Keio University 4 | See tidy.h for the copyright notice. 5 | 6 | CVS Info : 7 | 8 | $LastChangedBy$ 9 | $LastChangedDate$ 10 | $LastChangedRevision$ 11 | 12 | Uses public interfaces to abstract input source and output 13 | sink, which may be user supplied or either FILE* or memory 14 | based Tidy implementations. Encoding support is uniform 15 | regardless of I/O mechanism. 16 | 17 | Note, UTF-8 encoding, by itself, does not affect the actual 18 | "codepoints" of the underlying character encoding. In the 19 | cases of ASCII, Latin1, Unicode (16-bit, BMP), these all 20 | refer to ISO-10646 "codepoints". For anything else, they 21 | refer to some other "codepoint" set. 22 | 23 | Put another way, UTF-8 is a variable length method to 24 | represent any non-negative integer value. The glyph 25 | that a integer value represents is unchanged and defined 26 | externally (e.g. by ISO-10646, Big5, Win1252, MacRoman, 27 | Latin2-9, and so on). 28 | 29 | Put still another way, UTF-8 is more of a _transfer_ encoding 30 | than a _character_ encoding, per se. 
31 | */ 32 | 33 | #include "tidy.h" 34 | #include "utf8.h" 35 | 36 | /* 37 | UTF-8 encoding/decoding functions 38 | Return # of bytes in UTF-8 sequence; result < 0 if illegal sequence 39 | 40 | Also see below for UTF-16 encoding/decoding functions 41 | 42 | References : 43 | 44 | 1) UCS Transformation Format 8 (UTF-8): 45 | ISO/IEC 10646-1:1996 Amendment 2 or ISO/IEC 10646-1:2000 Annex D 46 | 47 | 48 | 49 | Table 4 - Mapping from UCS-4 to UTF-8 50 | 51 | 2) Unicode standards: 52 | 53 | 54 | 3) Legal UTF-8 byte sequences: 55 | 56 | 57 | Code point 1st byte 2nd byte 3rd byte 4th byte 58 | ---------- -------- -------- -------- -------- 59 | U+0000..U+007F 00..7F 60 | U+0080..U+07FF C2..DF 80..BF 61 | U+0800..U+0FFF E0 A0..BF 80..BF 62 | U+1000..U+FFFF E1..EF 80..BF 80..BF 63 | U+10000..U+3FFFF F0 90..BF 80..BF 80..BF 64 | U+40000..U+FFFFF F1..F3 80..BF 80..BF 80..BF 65 | U+100000..U+10FFFF F4 80..8F 80..BF 80..BF 66 | 67 | The definition of UTF-8 in Annex D of ISO/IEC 10646-1:2000 also 68 | allows for the use of five- and six-byte sequences to encode 69 | characters that are outside the range of the Unicode character 70 | set; those five- and six-byte sequences are illegal for the use 71 | of UTF-8 as a transformation of Unicode characters. ISO/IEC 10646 72 | does not allow mapping of unpaired surrogates, nor U+FFFE and U+FFFF 73 | (but it does allow other noncharacters). 
74 | 75 | 4) RFC 2279: UTF-8, a transformation format of ISO 10646: 76 | 77 | 78 | 5) UTF-8 and Unicode FAQ: 79 | 80 | 81 | 6) Markus Kuhn's UTF-8 decoder stress test file: 82 | 83 | 84 | 7) UTF-8 Demo: 85 | 86 | 87 | 8) UTF-8 Sampler: 88 | 89 | 90 | 9) Transformation Format for 16 Planes of Group 00 (UTF-16): 91 | ISO/IEC 10646-1:1996 Amendment 1 or ISO/IEC 10646-1:2000 Annex C 92 | 93 | 94 | 95 | 10) RFC 2781: UTF-16, an encoding of ISO 10646: 96 | 97 | 98 | 11) UTF-16 invalid surrogate pairs: 99 | 100 | 101 | UTF-16 UTF-8 UCS-4 102 | D83F DFF* F0 9F BF B* 0001FFF* 103 | D87F DFF* F0 AF BF B* 0002FFF* 104 | D8BF DFF* F0 BF BF B* 0003FFF* 105 | D8FF DFF* F1 8F BF B* 0004FFF* 106 | D93F DFF* F1 9F BF B* 0005FFF* 107 | D97F DFF* F1 AF BF B* 0006FFF* 108 | ... 109 | DBBF DFF* F3 BF BF B* 000FFFF* 110 | DBFF DFF* F4 8F BF B* 0010FFF* 111 | 112 | * = E or F 113 | 114 | 1010 A 115 | 1011 B 116 | 1100 C 117 | 1101 D 118 | 1110 E 119 | 1111 F 120 | 121 | */ 122 | 123 | #define kNumUTF8Sequences 7 124 | #define kMaxUTF8Bytes 4 125 | 126 | #define kUTF8ByteSwapNotAChar 0xFFFE 127 | #define kUTF8NotAChar 0xFFFF 128 | 129 | #define kMaxUTF8FromUCS4 0x10FFFF 130 | 131 | #define kUTF16SurrogatesBegin 0x10000 132 | #define kMaxUTF16FromUCS4 0x10FFFF 133 | 134 | /* UTF-16 surrogate pair areas */ 135 | #define kUTF16LowSurrogateBegin 0xD800 136 | #define kUTF16LowSurrogateEnd 0xDBFF 137 | #define kUTF16HighSurrogateBegin 0xDC00 138 | #define kUTF16HighSurrogateEnd 0xDFFF 139 | 140 | 141 | /* offsets into validUTF8 table below */ 142 | static const int offsetUTF8Sequences[kMaxUTF8Bytes + 1] = 143 | { 144 | 0, /* 1 byte */ 145 | 1, /* 2 bytes */ 146 | 2, /* 3 bytes */ 147 | 4, /* 4 bytes */ 148 | kNumUTF8Sequences /* must be last */ 149 | }; 150 | 151 | static const struct validUTF8Sequence 152 | { 153 | uint lowChar; 154 | uint highChar; 155 | int numBytes; 156 | byte validBytes[8]; 157 | } validUTF8[kNumUTF8Sequences] = 158 | { 159 | /* low high #bytes byte 1 byte 2 byte 3 byte 4 
*/ 160 | {0x0000, 0x007F, 1, {0x00, 0x7F, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}}, 161 | {0x0080, 0x07FF, 2, {0xC2, 0xDF, 0x80, 0xBF, 0x00, 0x00, 0x00, 0x00}}, 162 | {0x0800, 0x0FFF, 3, {0xE0, 0xE0, 0xA0, 0xBF, 0x80, 0xBF, 0x00, 0x00}}, 163 | {0x1000, 0xFFFF, 3, {0xE1, 0xEF, 0x80, 0xBF, 0x80, 0xBF, 0x00, 0x00}}, 164 | {0x10000, 0x3FFFF, 4, {0xF0, 0xF0, 0x90, 0xBF, 0x80, 0xBF, 0x80, 0xBF}}, 165 | {0x40000, 0xFFFFF, 4, {0xF1, 0xF3, 0x80, 0xBF, 0x80, 0xBF, 0x80, 0xBF}}, 166 | {0x100000, 0x10FFFF, 4, {0xF4, 0xF4, 0x80, 0x8F, 0x80, 0xBF, 0x80, 0xBF}} 167 | }; 168 | 169 | int DecodeUTF8BytesToChar( uint* c, uint firstByte, tmbstr successorBytes, 170 | TidyInputSource* inp, int* count ) 171 | { 172 | byte tempbuf[10]; 173 | byte *buf = &tempbuf[0]; 174 | uint ch = 0, n = 0; 175 | int i, bytes = 0; 176 | Bool hasError = no; 177 | 178 | if ( successorBytes ) 179 | buf = (byte*) successorBytes; 180 | 181 | /* special check if we have been passed an EOF char */ 182 | if ( firstByte == EndOfStream ) 183 | { 184 | /* at present */ 185 | *c = firstByte; 186 | *count = 1; 187 | return 0; 188 | } 189 | 190 | ch = firstByte; /* first byte is passed in separately */ 191 | 192 | if (ch <= 0x7F) /* 0XXX XXXX one byte */ 193 | { 194 | n = ch; 195 | bytes = 1; 196 | } 197 | else if ((ch & 0xE0) == 0xC0) /* 110X XXXX two bytes */ 198 | { 199 | n = ch & 31; 200 | bytes = 2; 201 | } 202 | else if ((ch & 0xF0) == 0xE0) /* 1110 XXXX three bytes */ 203 | { 204 | n = ch & 15; 205 | bytes = 3; 206 | } 207 | else if ((ch & 0xF8) == 0xF0) /* 1111 0XXX four bytes */ 208 | { 209 | n = ch & 7; 210 | bytes = 4; 211 | } 212 | else if ((ch & 0xFC) == 0xF8) /* 1111 10XX five bytes */ 213 | { 214 | n = ch & 3; 215 | bytes = 5; 216 | hasError = yes; 217 | } 218 | else if ((ch & 0xFE) == 0xFC) /* 1111 110X six bytes */ 219 | { 220 | n = ch & 1; 221 | bytes = 6; 222 | hasError = yes; 223 | } 224 | else 225 | { 226 | /* not a valid first byte of a UTF-8 sequence */ 227 | n = ch; 228 | bytes = 1; 229 | hasError 
= yes; 230 | } 231 | 232 | /* successor bytes should have the form 10XX XXXX */ 233 | 234 | /* If caller supplied buffer, use it. Else see if caller 235 | ** supplied an input source, use that. 236 | */ 237 | if ( successorBytes ) 238 | { 239 | for ( i=0; i < bytes-1; ++i ) 240 | { 241 | if ( !buf[i] || (buf[i] & 0xC0) != 0x80 ) 242 | { 243 | hasError = yes; 244 | bytes = i; 245 | break; 246 | } 247 | n = (n << 6) | (buf[i] & 0x3F); 248 | } 249 | } 250 | else if ( inp ) 251 | { 252 | for ( i=0; i < bytes-1 && !inp->eof(inp->sourceData); ++i ) 253 | { 254 | int b = inp->getByte( inp->sourceData ); 255 | buf[i] = (tmbchar) b; 256 | 257 | /* End of data or illegal successor byte value */ 258 | if ( b == EOF || (buf[i] & 0xC0) != 0x80 ) 259 | { 260 | hasError = yes; 261 | bytes = i; 262 | if ( b != EOF ) 263 | inp->ungetByte( inp->sourceData, buf[i] ); 264 | break; 265 | } 266 | n = (n << 6) | (buf[i] & 0x3F); 267 | } 268 | } 269 | else if ( bytes > 1 ) 270 | { 271 | hasError = yes; 272 | bytes = 1; 273 | } 274 | 275 | if (!hasError && ((n == kUTF8ByteSwapNotAChar) || (n == kUTF8NotAChar))) 276 | hasError = yes; 277 | 278 | if (!hasError && (n > kMaxUTF8FromUCS4)) 279 | hasError = yes; 280 | 281 | #if 0 /* Breaks Big5 D8 - DF */ 282 | if (!hasError && (n >= kUTF16LowSurrogateBegin) && (n <= kUTF16HighSurrogateEnd)) 283 | /* unpaired surrogates not allowed */ 284 | hasError = yes; 285 | #endif 286 | 287 | if (!hasError) 288 | { 289 | int lo, hi; 290 | 291 | lo = offsetUTF8Sequences[bytes - 1]; 292 | hi = offsetUTF8Sequences[bytes] - 1; 293 | 294 | /* check for overlong sequences */ 295 | if ((n < validUTF8[lo].lowChar) || (n > validUTF8[hi].highChar)) 296 | hasError = yes; 297 | else 298 | { 299 | hasError = yes; /* assume error until proven otherwise */ 300 | 301 | for (i = lo; i <= hi; i++) 302 | { 303 | int tempCount; 304 | byte theByte; 305 | 306 | for (tempCount = 0; tempCount < bytes; tempCount++) 307 | { 308 | if (!tempCount) 309 | theByte = (tmbchar) firstByte; 
310 | else 311 | theByte = buf[tempCount - 1]; 312 | 313 | if ( theByte >= validUTF8[i].validBytes[(tempCount * 2)] && 314 | theByte <= validUTF8[i].validBytes[(tempCount * 2) + 1] ) 315 | hasError = no; 316 | if (hasError) 317 | break; 318 | } 319 | } 320 | } 321 | } 322 | 323 | #if 1 && defined(_DEBUG) 324 | if ( hasError ) 325 | { 326 | /* debug */ 327 | fprintf( stderr, "UTF-8 decoding error of %d bytes : ", bytes ); 328 | fprintf( stderr, "0x%02x ", firstByte ); 329 | for (i = 1; i < bytes; i++) 330 | fprintf( stderr, "0x%02x ", buf[i - 1] ); 331 | fprintf( stderr, " = U+%04lx\n", n ); 332 | } 333 | #endif 334 | 335 | *count = bytes; 336 | *c = n; 337 | if ( hasError ) 338 | return -1; 339 | return 0; 340 | } 341 | 342 | int EncodeCharToUTF8Bytes( uint c, tmbstr encodebuf, 343 | TidyOutputSink* outp, int* count ) 344 | { 345 | byte tempbuf[10] = {0}; 346 | byte* buf = &tempbuf[0]; 347 | int bytes = 0; 348 | Bool hasError = no; 349 | 350 | if ( encodebuf ) 351 | buf = (byte*) encodebuf; 352 | 353 | if (c <= 0x7F) /* 0XXX XXXX one byte */ 354 | { 355 | buf[0] = (tmbchar) c; 356 | bytes = 1; 357 | } 358 | else if (c <= 0x7FF) /* 110X XXXX two bytes */ 359 | { 360 | buf[0] = (tmbchar) ( 0xC0 | (c >> 6) ); 361 | buf[1] = (tmbchar) ( 0x80 | (c & 0x3F) ); 362 | bytes = 2; 363 | } 364 | else if (c <= 0xFFFF) /* 1110 XXXX three bytes */ 365 | { 366 | buf[0] = (tmbchar) (0xE0 | (c >> 12)); 367 | buf[1] = (tmbchar) (0x80 | ((c >> 6) & 0x3F)); 368 | buf[2] = (tmbchar) (0x80 | (c & 0x3F)); 369 | bytes = 3; 370 | if ( c == kUTF8ByteSwapNotAChar || c == kUTF8NotAChar ) 371 | hasError = yes; 372 | #if 0 /* Breaks Big5 D8 - DF */ 373 | else if ( c >= kUTF16LowSurrogateBegin && c <= kUTF16HighSurrogateEnd ) 374 | /* unpaired surrogates not allowed */ 375 | hasError = yes; 376 | #endif 377 | } 378 | else if (c <= 0x1FFFFF) /* 1111 0XXX four bytes */ 379 | { 380 | buf[0] = (tmbchar) (0xF0 | (c >> 18)); 381 | buf[1] = (tmbchar) (0x80 | ((c >> 12) & 0x3F)); 382 | buf[2] = (tmbchar) 
(0x80 | ((c >> 6) & 0x3F)); 383 | buf[3] = (tmbchar) (0x80 | (c & 0x3F)); 384 | bytes = 4; 385 | if (c > kMaxUTF8FromUCS4) 386 | hasError = yes; 387 | } 388 | else if (c <= 0x3FFFFFF) /* 1111 10XX five bytes */ 389 | { 390 | buf[0] = (tmbchar) (0xF8 | (c >> 24)); 391 | buf[1] = (tmbchar) (0x80 | (c >> 18)); 392 | buf[2] = (tmbchar) (0x80 | ((c >> 12) & 0x3F)); 393 | buf[3] = (tmbchar) (0x80 | ((c >> 6) & 0x3F)); 394 | buf[4] = (tmbchar) (0x80 | (c & 0x3F)); 395 | bytes = 5; 396 | hasError = yes; 397 | } 398 | else if (c <= 0x7FFFFFFF) /* 1111 110X six bytes */ 399 | { 400 | buf[0] = (tmbchar) (0xFC | (c >> 30)); 401 | buf[1] = (tmbchar) (0x80 | ((c >> 24) & 0x3F)); 402 | buf[2] = (tmbchar) (0x80 | ((c >> 18) & 0x3F)); 403 | buf[3] = (tmbchar) (0x80 | ((c >> 12) & 0x3F)); 404 | buf[4] = (tmbchar) (0x80 | ((c >> 6) & 0x3F)); 405 | buf[5] = (tmbchar) (0x80 | (c & 0x3F)); 406 | bytes = 6; 407 | hasError = yes; 408 | } 409 | else 410 | hasError = yes; 411 | 412 | /* don't output invalid UTF-8 byte sequence to a stream */ 413 | if ( !hasError && outp != NULL ) 414 | { 415 | int ix; 416 | for ( ix=0; ix < bytes; ++ix ) 417 | outp->putByte( outp->sinkData, buf[ix] ); 418 | } 419 | 420 | #if 1 && defined(_DEBUG) 421 | if ( hasError ) 422 | { 423 | int i; 424 | fprintf( stderr, "UTF-8 encoding error for U+%x : ", c ); 425 | for (i = 0; i < bytes; i++) 426 | fprintf( stderr, "0x%02x ", buf[i] ); 427 | fprintf( stderr, "\n" ); 428 | } 429 | #endif 430 | 431 | *count = bytes; 432 | if (hasError) 433 | return -1; 434 | return 0; 435 | } 436 | 437 | 438 | /* return one less than the number of bytes used by the UTF-8 byte sequence */ 439 | /* str points to the UTF-8 byte sequence */ 440 | /* the Unicode char is returned in *ch */ 441 | uint GetUTF8( tmbstr str, uint *ch ) 442 | { 443 | uint n; 444 | int bytes; 445 | 446 | int err; 447 | 448 | bytes = 0; 449 | 450 | /* first byte "str[0]" is passed in separately from the */ 451 | /* rest of the UTF-8 byte sequence starting at 
"str[1]" */ 452 | err = DecodeUTF8BytesToChar( &n, str[0], str+1, NULL, &bytes ); 453 | if (err) 454 | { 455 | #if 1 && defined(_DEBUG) 456 | fprintf(stderr, "pprint UTF-8 decoding error for U+%x : ", n); 457 | #endif 458 | n = 0xFFFD; /* replacement char */ 459 | } 460 | 461 | *ch = n; 462 | return bytes - 1; 463 | } 464 | 465 | /* store char c as UTF-8 encoded byte stream */ 466 | tmbstr PutUTF8( tmbstr buf, uint c ) 467 | { 468 | int err, count = 0; 469 | 470 | err = EncodeCharToUTF8Bytes( c, buf, NULL, &count ); 471 | if (err) 472 | { 473 | #if 1 && defined(_DEBUG) 474 | fprintf(stderr, "pprint UTF-8 encoding error for U+%x : ", c); 475 | #endif 476 | /* replacement char 0xFFFD encoded as UTF-8 */ 477 | buf[0] = (byte) 0xEF; 478 | buf[1] = (byte) 0xBF; 479 | buf[2] = (byte) 0xBD; 480 | count = 3; 481 | } 482 | 483 | buf += count; 484 | return buf; 485 | } 486 | 487 | Bool IsValidUTF16FromUCS4( tchar ucs4 ) 488 | { 489 | return ( ucs4 <= kMaxUTF16FromUCS4 ); 490 | } 491 | 492 | Bool IsHighSurrogate( tchar ch ) 493 | { 494 | return ( ch >= kUTF16HighSurrogateBegin && ch <= kUTF16HighSurrogateEnd ); 495 | } 496 | Bool IsLowSurrogate( tchar ch ) 497 | { 498 | return ( ch >= kUTF16LowSurrogateBegin && ch <= kUTF16LowSurrogateEnd ); 499 | } 500 | 501 | tchar CombineSurrogatePair( tchar high, tchar low ) 502 | { 503 | assert( IsHighSurrogate(high) && IsLowSurrogate(low) ); 504 | return ( ((low - kUTF16LowSurrogateBegin) * 0x400) + 505 | high - kUTF16HighSurrogateBegin + 0x10000 ); 506 | } 507 | 508 | Bool SplitSurrogatePair( tchar utf16, tchar* low, tchar* high ) 509 | { 510 | Bool ok = ( IsValidCombinedChar( utf16 ) && high && low ); 511 | if ( ok ) 512 | { 513 | *low = (utf16 - kUTF16SurrogatesBegin) / 0x400 + kUTF16LowSurrogateBegin; 514 | *high = (utf16 - kUTF16SurrogatesBegin) % 0x400 + kUTF16HighSurrogateBegin; 515 | } 516 | return ok; 517 | } 518 | 519 | Bool IsValidCombinedChar( tchar ch ) 520 | { 521 | return ( ch >= kUTF16SurrogatesBegin && 522 | (ch & 
0x0000FFFE) != 0x0000FFFE && 523 | (ch & 0x0000FFFF) != 0x0000FFFF ); 524 | } 525 | 526 | Bool IsCombinedChar( tchar ch ) 527 | { 528 | return ( ch >= kUTF16SurrogatesBegin ); 529 | } 530 | -------------------------------------------------------------------------------- /elementtidy-1.0-20050212/tidylib/src/lexer.h: -------------------------------------------------------------------------------- 1 | #ifndef __LEXER_H__ 2 | #define __LEXER_H__ 3 | 4 | /* lexer.h -- Lexer for html parser 5 | 6 | (c) 1998-2003 (W3C) MIT, ERCIM, Keio University 7 | See tidy.h for the copyright notice. 8 | 9 | CVS Info: 10 | $LastChangedBy$ 11 | $LastChangedDate$ 12 | $LastChangedRevision$ 13 | 14 | */ 15 | 16 | /* 17 | Given an input source, it returns a sequence of tokens. 18 | 19 | GetToken(source) gets the next token 20 | UngetToken(source) provides one level undo 21 | 22 | The tags include an attribute list: 23 | 24 | - linked list of attribute/value nodes 25 | - each node has 2 NULL-terminated strings. 26 | - entities are replaced in attribute values 27 | 28 | white space is compacted if not in preformatted mode 29 | If not in preformatted mode then leading white space 30 | is discarded and subsequent white space sequences 31 | compacted to single space characters. 32 | 33 | If XmlTags is no then Tag names are folded to upper 34 | case and attribute names to lower case. 
35 | 36 | Not yet done: 37 | - Doctype subset and marked sections 38 | */ 39 | 40 | #ifdef __cplusplus 41 | extern "C" { 42 | #endif 43 | 44 | #include "forward.h" 45 | 46 | /* lexer character types 47 | */ 48 | #define digit 1 49 | #define letter 2 50 | #define namechar 4 51 | #define white 8 52 | #define newline 16 53 | #define lowercase 32 54 | #define uppercase 64 55 | 56 | 57 | /* node->type is one of these values 58 | */ 59 | #define RootNode 0 60 | #define DocTypeTag 1 61 | #define CommentTag 2 62 | #define ProcInsTag 3 63 | #define TextNode 4 64 | #define StartTag 5 65 | #define EndTag 6 66 | #define StartEndTag 7 67 | #define CDATATag 8 68 | #define SectionTag 9 69 | #define AspTag 10 70 | #define JsteTag 11 71 | #define PhpTag 12 72 | #define XmlDecl 13 73 | 74 | 75 | 76 | /* lexer GetToken states 77 | */ 78 | #define LEX_CONTENT 0 79 | #define LEX_GT 1 80 | #define LEX_ENDTAG 2 81 | #define LEX_STARTTAG 3 82 | #define LEX_COMMENT 4 83 | #define LEX_DOCTYPE 5 84 | #define LEX_PROCINSTR 6 85 | #define LEX_ENDCOMMENT 7 86 | #define LEX_CDATA 8 87 | #define LEX_SECTION 9 88 | #define LEX_ASP 10 89 | #define LEX_JSTE 11 90 | #define LEX_PHP 12 91 | #define LEX_XMLDECL 13 92 | 93 | /* ParseDocTypeDecl state constants */ 94 | #define DT_INTERMEDIATE 0 95 | #define DT_DOCTYPENAME 1 96 | #define DT_PUBLICSYSTEM 2 97 | #define DT_QUOTEDSTRING 3 98 | #define DT_INTSUBSET 4 99 | 100 | /* content model shortcut encoding 101 | */ 102 | #define CM_UNKNOWN 0 103 | #define CM_EMPTY (1 << 0) 104 | #define CM_HTML (1 << 1) 105 | #define CM_HEAD (1 << 2) 106 | #define CM_BLOCK (1 << 3) 107 | #define CM_INLINE (1 << 4) 108 | #define CM_LIST (1 << 5) 109 | #define CM_DEFLIST (1 << 6) 110 | #define CM_TABLE (1 << 7) 111 | #define CM_ROWGRP (1 << 8) 112 | #define CM_ROW (1 << 9) 113 | #define CM_FIELD (1 << 10) 114 | #define CM_OBJECT (1 << 11) 115 | #define CM_PARAM (1 << 12) 116 | #define CM_FRAMES (1 << 13) 117 | #define CM_HEADING (1 << 14) 118 | #define CM_OPT (1 << 15) 
119 | #define CM_IMG (1 << 16) 120 | #define CM_MIXED (1 << 17) 121 | #define CM_NO_INDENT (1 << 18) 122 | #define CM_OBSOLETE (1 << 19) 123 | #define CM_NEW (1 << 20) 124 | #define CM_OMITST (1 << 21) 125 | 126 | /* If the document uses just HTML 2.0 tags and attributes described 127 | ** it as HTML 2.0 Similarly for HTML 3.2 and the 3 flavors of HTML 4.0. 128 | ** If there are proprietary tags and attributes then describe it as 129 | ** HTML Proprietary. If it includes the xml-lang or xmlns attributes 130 | ** but is otherwise HTML 2.0, 3.2 or 4.0 then describe it as one of the 131 | ** flavors of Voyager (strict, loose or frameset). 132 | */ 133 | 134 | /* unknown */ 135 | #define xxxx 0u 136 | 137 | /* W3C defined HTML/XHTML family document types */ 138 | #define HT20 1u 139 | #define HT32 2u 140 | #define H40S 4u 141 | #define H40T 8u 142 | #define H40F 16u 143 | #define H41S 32u 144 | #define H41T 64u 145 | #define H41F 128u 146 | #define X10S 256u 147 | #define X10T 512u 148 | #define X10F 1024u 149 | #define XH11 2048u 150 | #define XB10 4096u 151 | 152 | /* proprietary stuff */ 153 | #define VERS_SUN 8192u 154 | #define VERS_NETSCAPE 16384u 155 | #define VERS_MICROSOFT 32768u 156 | 157 | /* special flag */ 158 | #define VERS_XML 65536u 159 | 160 | /* compatibility symbols */ 161 | #define VERS_UNKNOWN (xxxx) 162 | #define VERS_HTML20 (HT20) 163 | #define VERS_HTML32 (HT32) 164 | #define VERS_HTML40_STRICT (H40S|H41S|X10S) 165 | #define VERS_HTML40_LOOSE (H40T|H41T|X10T) 166 | #define VERS_FRAMESET (H40F|H41F|X10F) 167 | #define VERS_XHTML11 (XH11) 168 | #define VERS_BASIC (XB10) 169 | 170 | /* meta symbols */ 171 | #define VERS_HTML40 (VERS_HTML40_STRICT|VERS_HTML40_LOOSE|VERS_FRAMESET) 172 | #define VERS_IFRAME (VERS_HTML40_LOOSE|VERS_FRAMESET) 173 | #define VERS_LOOSE (VERS_HTML20|VERS_HTML32|VERS_IFRAME) 174 | #define VERS_EVENTS (VERS_HTML40|VERS_XHTML11) 175 | #define VERS_FROM32 (VERS_HTML32|VERS_HTML40) 176 | #define VERS_FROM40 
(VERS_HTML40|VERS_XHTML11|VERS_BASIC) 177 | #define VERS_XHTML (X10S|X10T|X10F|XH11|XB10) 178 | 179 | /* all W3C defined document types */ 180 | #define VERS_ALL (VERS_HTML20|VERS_HTML32|VERS_FROM40) 181 | 182 | /* all proprietary types */ 183 | #define VERS_PROPRIETARY (VERS_NETSCAPE|VERS_MICROSOFT|VERS_SUN) 184 | 185 | /* Linked list of class names and styles 186 | */ 187 | struct _Style; 188 | typedef struct _Style Style; 189 | 190 | struct _Style 191 | { 192 | tmbstr tag; 193 | tmbstr tag_class; 194 | tmbstr properties; 195 | Style *next; 196 | }; 197 | 198 | 199 | /* Linked list of style properties 200 | */ 201 | struct _StyleProp; 202 | typedef struct _StyleProp StyleProp; 203 | 204 | struct _StyleProp 205 | { 206 | tmbstr name; 207 | tmbstr value; 208 | StyleProp *next; 209 | }; 210 | 211 | 212 | 213 | 214 | /* Attribute/Value linked list node 215 | */ 216 | 217 | struct _AttVal 218 | { 219 | AttVal* next; 220 | const Attribute* dict; 221 | Node* asp; 222 | Node* php; 223 | int delim; 224 | tmbstr attribute; 225 | tmbstr value; 226 | }; 227 | 228 | 229 | 230 | /* 231 | Mosaic handles inlines via a separate stack from other elements 232 | We duplicate this to recover from inline markup errors such as: 233 | 234 | italic text 235 |

<p>more italic text</b> normal text 236 | 237 | which for compatibility with Mosaic is mapped to: 238 | 239 | <i>italic text</i> 240 |

more italic text normal text 241 | 242 | Note that any inline end tag pop's the effect of the current 243 | inline start tag, so that pop's in the above example. 244 | */ 245 | struct _IStack 246 | { 247 | IStack* next; 248 | const Dict* tag; /* tag's dictionary definition */ 249 | tmbstr element; /* name (NULL for text nodes) */ 250 | AttVal* attributes; 251 | }; 252 | 253 | 254 | /* HTML/XHTML/XML Element, Comment, PI, DOCTYPE, XML Decl, 255 | ** etc. etc. 256 | */ 257 | 258 | struct _Node 259 | { 260 | Node* parent; /* tree structure */ 261 | Node* prev; 262 | Node* next; 263 | Node* content; 264 | Node* last; 265 | 266 | AttVal* attributes; 267 | const Dict* was; /* old tag when it was changed */ 268 | const Dict* tag; /* tag's dictionary definition */ 269 | 270 | tmbstr element; /* name (NULL for text nodes) */ 271 | 272 | uint start; /* start of span onto text array */ 273 | uint end; /* end of span onto text array */ 274 | uint type; /* TextNode, StartTag, EndTag etc. */ 275 | 276 | uint line; /* current line of document */ 277 | uint column; /* current column of document */ 278 | 279 | Bool closed; /* true if closed by explicit end tag */ 280 | Bool implicit; /* true if inferred */ 281 | Bool linebreak; /* true if followed by a line break */ 282 | }; 283 | 284 | 285 | /* 286 | The following are private to the lexer 287 | Use NewLexer() to create a lexer, and 288 | FreeLexer() to free it. 
289 | */ 290 | 291 | struct _Lexer 292 | { 293 | #if 0 /* Move to TidyDocImpl */ 294 | StreamIn* in; /* document content input */ 295 | StreamOut* errout; /* error output stream */ 296 | 297 | uint badAccess; /* for accessibility errors */ 298 | uint badLayout; /* for bad style errors */ 299 | uint badChars; /* for bad character encodings */ 300 | uint badForm; /* for mismatched/mispositioned form tags */ 301 | uint warnings; /* count of warnings in this document */ 302 | uint errors; /* count of errors */ 303 | #endif 304 | 305 | uint lines; /* lines seen */ 306 | uint columns; /* at start of current token */ 307 | Bool waswhite; /* used to collapse contiguous white space */ 308 | Bool pushed; /* true after token has been pushed back */ 309 | Bool insertspace; /* when space is moved after end tag */ 310 | Bool excludeBlocks; /* Netscape compatibility */ 311 | Bool exiled; /* true if moved out of table */ 312 | Bool isvoyager; /* true if xmlns attribute on html element */ 313 | uint versions; /* bit vector of HTML versions */ 314 | int doctype; /* version as given by doctype (if any) */ 315 | Bool bad_doctype; /* e.g. if html or PUBLIC is missing */ 316 | uint txtstart; /* start of current node */ 317 | uint txtend; /* end of current node */ 318 | uint state; /* state of lexer's finite state machine */ 319 | 320 | Node* token; /* current parse point */ 321 | Node* root; /* remember root node of the document */ 322 | 323 | Bool seenEndBody; /* true if a tag has been encountered */ 324 | Bool seenEndHtml; /* true if a tag has been encountered */ 325 | 326 | /* 327 | Lexer character buffer 328 | 329 | Parse tree nodes span onto this buffer 330 | which contains the concatenated text 331 | contents of all of the elements. 332 | 333 | lexsize must be reset for each file. 
334 | */ 335 | tmbstr lexbuf; /* MB character buffer */ 336 | uint lexlength; /* allocated */ 337 | uint lexsize; /* used */ 338 | 339 | /* Inline stack for compatibility with Mosaic */ 340 | Node* inode; /* for deferring text node */ 341 | IStack* insert; /* for inferring inline tags */ 342 | IStack* istack; 343 | uint istacklength; /* allocated */ 344 | uint istacksize; /* used */ 345 | uint istackbase; /* start of frame */ 346 | 347 | Style *styles; /* used for cleaning up presentation markup */ 348 | 349 | #if 0 350 | TidyDocImpl* doc; /* Pointer back to doc for error reporting */ 351 | #endif 352 | }; 353 | 354 | 355 | /* Lexer Functions 356 | */ 357 | Node *CommentToken( Lexer *lexer ); 358 | 359 | /* choose what version to use for new doctype */ 360 | int HTMLVersion( TidyDocImpl* doc ); 361 | 362 | /* everything is allowed in proprietary version of HTML */ 363 | /* this is handled here rather than in the tag/attr dicts */ 364 | 365 | void ConstrainVersion( TidyDocImpl* doc, uint vers ); 366 | 367 | Bool IsWhite(uint c); 368 | Bool IsDigit(uint c); 369 | Bool IsLetter(uint c); 370 | Bool IsNewline(uint c); 371 | Bool IsNamechar(uint c); 372 | Bool IsXMLLetter(uint c); 373 | Bool IsXMLNamechar(uint c); 374 | 375 | Bool IsLower(uint c); 376 | Bool IsUpper(uint c); 377 | uint ToLower(uint c); 378 | uint ToUpper(uint c); 379 | 380 | char FoldCase( TidyDocImpl* doc, tmbchar c, Bool tocaps ); 381 | 382 | 383 | Lexer* NewLexer( TidyDocImpl* doc ); 384 | Bool EndOfInput( TidyDocImpl* doc ); 385 | void FreeLexer( TidyDocImpl* doc ); 386 | 387 | /* store character c as UTF-8 encoded byte stream */ 388 | void AddCharToLexer( Lexer *lexer, uint c ); 389 | 390 | /* 391 | Used for elements and text nodes 392 | element name is NULL for text nodes 393 | start and end are offsets into lexbuf 394 | which contains the textual content of 395 | all elements in the parse tree. 396 | 397 | parent and content allow traversal 398 | of the parse tree in any direction. 
399 | attributes are represented as a linked 400 | list of AttVal nodes which hold the 401 | strings for attribute/value pairs. 402 | */ 403 | Node* NewNode( Lexer* lexer ); 404 | 405 | 406 | /* used to clone heading nodes when split by an <hr>


*/ 407 | Node *CloneNode( TidyDocImpl* doc, Node *element ); 408 | 409 | /* clones the given node using source node attributes, 410 | ** no lexer attributes */ 411 | Node *CloneNodeEx( TidyDocImpl* doc, Node *element ); 412 | 413 | /* free node's attributes */ 414 | void FreeAttrs( TidyDocImpl* doc, Node *node ); 415 | 416 | /* doesn't repair attribute list linkage */ 417 | void FreeAttribute( AttVal *av ); 418 | 419 | /* remove attribute from node then free it 420 | */ 421 | void RemoveAttribute( Node *node, AttVal *attr ); 422 | 423 | /* 424 | Free document nodes by iterating through peers and recursing 425 | through children. Set next to NULL before calling FreeNode() 426 | to avoid freeing peer nodes. Doesn't patch up prev/next links. 427 | */ 428 | void FreeNode( TidyDocImpl* doc, Node *node ); 429 | 430 | Node* TextToken( Lexer *lexer ); 431 | 432 | /* used for creating preformatted text from Word2000 */ 433 | Node *NewLineNode( Lexer *lexer ); 434 | 435 | /* used for adding a   for Word2000 */ 436 | Node *NewLiteralTextNode(Lexer *lexer, ctmbstr txt ); 437 | 438 | Node* CommentToken(Lexer *lexer); 439 | Node* GetCDATA( TidyDocImpl* doc, Node *container ); 440 | 441 | void AddByte( Lexer *lexer, tmbchar c ); 442 | void AddStringLiteral( Lexer* lexer, ctmbstr str ); 443 | void AddStringLiteralLen( Lexer* lexer, ctmbstr str, int len ); 444 | 445 | /* find element */ 446 | Node* FindDocType( TidyDocImpl* doc ); 447 | Node* FindHTML( TidyDocImpl* doc ); 448 | Node* FindHEAD( TidyDocImpl* doc ); 449 | Node* FindTITLE(TidyDocImpl* doc); 450 | Node* FindBody( TidyDocImpl* doc ); 451 | Node* FindXmlDecl(TidyDocImpl* doc); 452 | 453 | /* Returns containing block element, if any */ 454 | Node* FindContainer( Node* node ); 455 | 456 | /* add meta element for Tidy */ 457 | Bool AddGenerator( TidyDocImpl* doc ); 458 | 459 | /* examine to identify version */ 460 | int FindGivenVersion( TidyDocImpl* doc, Node* doctype ); 461 | int ApparentVersion( TidyDocImpl* doc ); 462 | 
463 | 464 | Bool CheckDocTypeKeyWords(Lexer *lexer, Node *doctype); 465 | 466 | ctmbstr HTMLVersionName( TidyDocImpl* doc ); 467 | ctmbstr HTMLVersionNameFromCode( uint vers, Bool isXhtml ); 468 | 469 | Bool SetXHTMLDocType( TidyDocImpl* doc ); 470 | 471 | 472 | /* fixup doctype if missing */ 473 | Bool FixDocType( TidyDocImpl* doc ); 474 | 475 | /* ensure XML document starts with */ 476 | /* add encoding attribute if not using ASCII or UTF-8 output */ 477 | Bool FixXmlDecl( TidyDocImpl* doc ); 478 | 479 | Node* InferredTag( TidyDocImpl* doc, ctmbstr name ); 480 | 481 | Bool ExpectsContent(Node *node); 482 | 483 | 484 | void UngetToken( TidyDocImpl* doc ); 485 | 486 | 487 | /* 488 | modes for GetToken() 489 | 490 | MixedContent -- for elements which don't accept PCDATA 491 | Preformatted -- white space preserved as is 492 | IgnoreMarkup -- for CDATA elements such as script, style 493 | */ 494 | #define IgnoreWhitespace 0 495 | #define MixedContent 1 496 | #define Preformatted 2 497 | #define IgnoreMarkup 3 498 | 499 | Node* GetToken( TidyDocImpl* doc, uint mode ); 500 | 501 | void InitMap(void); 502 | 503 | Bool IsValidAttrName( ctmbstr attr ); 504 | 505 | 506 | /* create a new attribute */ 507 | AttVal *NewAttribute(void); 508 | 509 | /* create a new attribute with given name and value */ 510 | AttVal *NewAttributeEx(ctmbstr name, ctmbstr value); 511 | 512 | /************************************* 513 | In-line Stack functions 514 | *************************************/ 515 | 516 | 517 | /* duplicate attributes */ 518 | AttVal* DupAttrs( TidyDocImpl* doc, AttVal* attrs ); 519 | 520 | /* 521 | push a copy of an inline node onto stack 522 | but don't push if implicit or OBJECT or APPLET 523 | (implicit tags are ones generated from the istack) 524 | 525 | One issue arises with pushing inlines when 526 | the tag is already pushed. For instance: 527 | 528 |

<p><em>text 529 |

<p><em>more text 530 | 531 | Shouldn't be mapped to 532 | 533 |

<p><em>text</em></p>

534 |

<p><em><em>more text 535 | */ 536 | void PushInline( TidyDocImpl* doc, Node* node ); 537 | 538 | /* pop inline stack */ 539 | void PopInline( TidyDocImpl* doc, Node* node ); 540 | 541 | Bool IsPushed( TidyDocImpl* doc, Node* node ); 542 | 543 | /* 544 | This has the effect of inserting "missing" inline 545 | elements around the contents of blocklevel elements 546 | such as P, TD, TH, DIV, PRE etc. This procedure is 547 | called at the start of ParseBlock when the inline 548 | stack is not empty, as will be the case in: 549 | 550 |

<i><h3>italic heading</h3></i>

551 | 552 | which is then treated as equivalent to 553 | 554 |

<h3><i>italic heading</i></h3>

555 | 556 | This is implemented by setting the lexer into a mode 557 | where it gets tokens from the inline stack rather than 558 | from the input stream. 559 | */ 560 | int InlineDup( TidyDocImpl* doc, Node *node ); 561 | 562 | /* 563 | defer duplicates when entering a table or other 564 | element where the inlines shouldn't be duplicated 565 | */ 566 | void DeferDup( TidyDocImpl* doc ); 567 | Node *InsertedToken( TidyDocImpl* doc ); 568 | 569 | #ifdef __cplusplus 570 | } 571 | #endif 572 | 573 | 574 | #endif /* __LEXER_H__ */ 575 | --------------------------------------------------------------------------------