├── elementtidy-1.0-20050212
    ├── elementtidy
    │   ├── __init__.py
    │   ├── __init__.pyc
    │   ├── TidyHTMLTreeBuilder.pyc
    │   └── TidyHTMLTreeBuilder.py
    ├── selftest.py
    ├── PKG-INFO
    ├── tidylib
    │   ├── src
    │   │   ├── iconvtc.h
    │   │   ├── charsets.h
    │   │   ├── win32tc.h
    │   │   ├── entities.h
    │   │   ├── forward.h
    │   │   ├── utf8.h
    │   │   ├── alloc.c
    │   │   ├── tmbstr.h
    │   │   ├── fileio.c
    │   │   ├── parser.h
    │   │   ├── clean.h
    │   │   ├── pprint.h
    │   │   ├── attrdict.h
    │   │   ├── buffio.c
    │   │   ├── tidy-int.h
    │   │   ├── attrask.c
    │   │   ├── streamio.h
    │   │   ├── tmbstr.c
    │   │   ├── config.h
    │   │   ├── attrget.c
    │   │   ├── message.h
    │   │   ├── istack.c
    │   │   ├── tagask.c
    │   │   ├── tags.h
    │   │   ├── access.h
    │   │   ├── entities.c
    │   │   ├── utf8.c
    │   │   └── lexer.h
    │   └── include
    │   │   ├── fileio.h
    │   │   ├── buffio.h
    │   │   └── platform.h
    ├── setup.py
    ├── _elementtidy.c
    └── README
├── TODO
├── README
└── juniperncprompt.py


/elementtidy-1.0-20050212/elementtidy/__init__.py:
--------------------------------------------------------------------------------
1 | # $Id: __init__.py 1764 2004-03-29 07:07:36Z fredrik $
2 | # package marker
3 | 


--------------------------------------------------------------------------------
/elementtidy-1.0-20050212/selftest.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/crimsonknave/juniperncprompt/HEAD/elementtidy-1.0-20050212/selftest.py


--------------------------------------------------------------------------------
/elementtidy-1.0-20050212/elementtidy/__init__.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/crimsonknave/juniperncprompt/HEAD/elementtidy-1.0-20050212/elementtidy/__init__.pyc


--------------------------------------------------------------------------------
/elementtidy-1.0-20050212/elementtidy/TidyHTMLTreeBuilder.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/crimsonknave/juniperncprompt/HEAD/elementtidy-1.0-20050212/elementtidy/TidyHTMLTreeBuilder.pyc


--------------------------------------------------------------------------------
/TODO:
--------------------------------------------------------------------------------
1 | In no particular order
2 | 
3 | Logging?
4 | Update README with troubleshooting?  (ncLinux*.jar, resolv.conf not moving /tmp partition)
5 | Allow the password fields to have printed names
6 | 


--------------------------------------------------------------------------------
/elementtidy-1.0-20050212/PKG-INFO:
--------------------------------------------------------------------------------
 1 | Metadata-Version: 1.0
 2 | Name: elementtidy
 3 | Version: 1.0-20050212
 4 | Summary: ElementTidy - a tidylib interface for ElementTree
 5 | Home-page: http://effbot.org/zone/element-tidylib.htm
 6 | Author: Fredrik Lundh
 7 | Author-email: fredrik@pythonware.com
 8 | License: UNKNOWN
 9 | Description: UNKNOWN
10 | Platform: UNKNOWN
11 | 


--------------------------------------------------------------------------------
/elementtidy-1.0-20050212/tidylib/src/iconvtc.h:
--------------------------------------------------------------------------------
 1 | #ifndef __ICONVTC_H__
 2 | #define __ICONVTC_H__
 3 | #ifdef TIDY_ICONV_SUPPORT
 4 | 
 5 | /* iconvtc.h -- Interface to iconv transcoding routines
 6 | 
 7 |   (c) 1998-2003 (W3C) MIT, ERCIM, Keio University
 8 |   See tidy.h for the copyright notice.
 9 | 
10 |   $Id$
11 | */
12 | 
13 | 
14 | #endif /* TIDY_ICONV_SUPPORT */
15 | #endif /* __ICONVTC_H__ */
16 | 


--------------------------------------------------------------------------------
/elementtidy-1.0-20050212/tidylib/src/charsets.h:
--------------------------------------------------------------------------------
 1 | /* charsets.h -- character set information and mappings
 2 | 
 3 |   (c) 1998-2003 (W3C) MIT, ERCIM, Keio University
 4 |   See tidy.h for the copyright notice.
 5 | 
 6 |   $Id$
 7 | */
 8 | 
 9 | uint GetEncodingIdFromName(ctmbstr name);
10 | uint GetEncodingIdFromCodePage(uint cp);
11 | uint GetEncodingCodePageFromName(ctmbstr name);
12 | uint GetEncodingCodePageFromId(uint id);
13 | ctmbstr GetEncodingNameFromId(uint id);
14 | ctmbstr GetEncodingNameFromCodePage(uint cp);
15 | 


--------------------------------------------------------------------------------
/elementtidy-1.0-20050212/tidylib/src/win32tc.h:
--------------------------------------------------------------------------------
 1 | #ifndef __WIN32TC_H__
 2 | #define __WIN32TC_H__
 3 | #ifdef TIDY_WIN32_MLANG_SUPPORT
 4 | 
 5 | /* win32tc.h -- Interface to Win32 transcoding routines
 6 | 
 7 |    (c) 1998-2003 (W3C) MIT, ERCIM, Keio University
 8 |    See tidy.h for the copyright notice.
 9 | 
10 |    $Id$
11 | */
12 | 
13 | uint Win32MLangGetCPFromName(ctmbstr encoding);
14 | Bool Win32MLangInitInputTranscoder(StreamIn * in, uint wincp);
15 | void Win32MLangUninitInputTranscoder(StreamIn * in);
16 | int Win32MLangGetChar(byte firstByte, StreamIn * in, uint * bytesRead);
17 | 
18 | #endif /* TIDY_WIN32_MLANG_SUPPORT */
19 | #endif /* __WIN32TC_H__ */
20 | 


--------------------------------------------------------------------------------
/elementtidy-1.0-20050212/tidylib/src/entities.h:
--------------------------------------------------------------------------------
 1 | #ifndef __ENTITIES_H__
 2 | #define __ENTITIES_H__
 3 | 
 4 | /* entities.h -- recognize character entities
 5 | 
 6 |   (c) 1998-2003 (W3C) MIT, ERCIM, Keio University
 7 |   See tidy.h for the copyright notice.
 8 | 
 9 |   CVS Info :
10 | 
11 |     $LastChangedBy$ 
12 |     $LastChangedDate$ 
13 |     $LastChangedRevision$ 
14 | 
15 | */
16 | 
17 | #include "forward.h"
18 | 
19 | /* entity starting with "&" returns zero on error */
20 | uint    EntityCode( ctmbstr name, uint versions );
21 | ctmbstr EntityName( uint charCode, uint versions );
22 | Bool    EntityInfo( ctmbstr name, Bool isXml, uint* code, uint* versions );
23 | 
24 | #endif /* __ENTITIES_H__ */
25 | 


--------------------------------------------------------------------------------
/elementtidy-1.0-20050212/tidylib/include/fileio.h:
--------------------------------------------------------------------------------
 1 | #ifndef __FILEIO_H__
 2 | #define __FILEIO_H__
 3 | 
 4 | /** @file fileio.h - does standard C I/O
 5 | 
 6 |   Implementation of a FILE* based TidyInputSource and 
 7 |   TidyOutputSink.
 8 | 
 9 |   (c) 1998-2003 (W3C) MIT, ERCIM, Keio University
10 |   See tidy.h for the copyright notice.
11 | 
12 |   CVS Info:
13 |     $LastChangedBy$ 
14 |     $LastChangedDate$ 
15 |     $LastChangedRevision$ 
16 | */
17 | 
18 | #include "buffio.h"
19 | #ifdef __cplusplus
20 | extern "C" {
21 | #endif
22 | 
23 | /** Allocate and initialize file input source */
24 | void initFileSource( TidyInputSource* source, FILE* fp );
25 | 
26 | /** Free file input source */
27 | void freeFileSource( TidyInputSource* source, Bool closeIt );
28 | 
29 | /** Initialize file output sink */
30 | void initFileSink( TidyOutputSink* sink, FILE* fp );
31 | 
32 | /* Needed for internal declarations */
33 | void filesink_putByte( ulong sinkData, byte bv );
34 | 
35 | #ifdef __cplusplus
36 | }
37 | #endif
38 | #endif /* __FILEIO_H__ */
39 | 


--------------------------------------------------------------------------------
/elementtidy-1.0-20050212/tidylib/src/forward.h:
--------------------------------------------------------------------------------
 1 | #ifndef __FORWARD_H__
 2 | #define __FORWARD_H__
 3 | 
 4 | /* forward.h -- Forward declarations for major Tidy structures
 5 | 
 6 |   (c) 1998-2003 (W3C) MIT, ERCIM, Keio University
 7 |   See tidy.h for the copyright notice.
 8 | 
 9 |   CVS Info :
10 | 
11 |     $LastChangedBy$ 
12 |     $LastChangedDate$ 
13 |     $LastChangedRevision$ 
14 | 
15 |   Avoids many include file circular dependencies.
16 | 
17 |   Try to keep this file down to the minimum to avoid
18 |   cross-talk between modules.
19 | 
20 |   Header files include this file.  C files include tidy-int.h.
21 | 
22 | */
23 | 
24 | #include "platform.h"
25 | #include "tidy.h"
26 | 
27 | struct _StreamIn;
28 | typedef struct _StreamIn StreamIn;
29 | 
30 | struct _StreamOut;
31 | typedef struct _StreamOut StreamOut;
32 | 
33 | struct _TidyDocImpl;
34 | typedef struct _TidyDocImpl TidyDocImpl;
35 | 
36 | 
37 | struct _Dict;
38 | typedef struct _Dict Dict;
39 | 
40 | struct _Attribute;
41 | typedef struct _Attribute Attribute;
42 | 
43 | struct _AttVal;
44 | typedef struct _AttVal AttVal;
45 | 
46 | struct _Node;
47 | typedef struct _Node Node;
48 | 
49 | struct _IStack;
50 | typedef struct _IStack IStack;
51 | 
52 | struct _Lexer;
53 | typedef struct _Lexer Lexer;
54 | 
55 | 
56 | 
57 | #endif /* __FORWARD_H__ */
58 | 


--------------------------------------------------------------------------------
/elementtidy-1.0-20050212/tidylib/src/utf8.h:
--------------------------------------------------------------------------------
 1 | #ifndef __UTF8_H__
 2 | #define __UTF8_H__
 3 | 
 4 | /* utf8.h -- convert characters to/from UTF-8
 5 | 
 6 |   (c) 1998-2003 (W3C) MIT, ERCIM, Keio University
 7 |   See tidy.h for the copyright notice.
 8 | 
 9 |   CVS Info :
10 | 
11 |     $LastChangedBy$ 
12 |     $LastChangedDate$ 
13 |     $LastChangedRevision$ 
14 | 
15 | */
16 | 
17 | #include "platform.h"
18 | #include "buffio.h"
19 | 
20 | /* UTF-8 encoding/decoding support
21 | ** Does not convert character "codepoints", i.e. to/from 10646.
22 | */
23 | 
24 | int DecodeUTF8BytesToChar( uint* c, uint firstByte, tmbstr successorBytes,
25 |                            TidyInputSource* inp, int* count );
26 | 
27 | int EncodeCharToUTF8Bytes( uint c, tmbstr encodebuf,
28 |                            TidyOutputSink* outp, int* count );
29 | 
30 | 
31 | uint  GetUTF8( tmbstr str, uint *ch );
32 | tmbstr PutUTF8( tmbstr buf, uint c );
33 | 
34 | #define UNICODE_BOM_BE   0xFEFF   /* big-endian (default) UNICODE BOM */
35 | #define UNICODE_BOM      UNICODE_BOM_BE
36 | #define UNICODE_BOM_LE   0xFFFE   /* little-endian UNICODE BOM */
37 | #define UNICODE_BOM_UTF8 0xEFBBBF /* UTF-8 UNICODE BOM */
38 | 
39 | 
40 | Bool    IsValidUTF16FromUCS4( tchar ucs4 );
41 | Bool    IsHighSurrogate( tchar ch );
42 | Bool    IsLowSurrogate( tchar ch );
43 | 
44 | Bool    IsCombinedChar( tchar ch );
45 | Bool    IsValidCombinedChar( tchar ch );
46 | 
47 | tchar   CombineSurrogatePair( tchar high, tchar low );
48 | Bool    SplitSurrogatePair( tchar utf16, tchar* high, tchar* low );
49 | 
50 | 
51 | 
52 | #endif /* __UTF8_H__ */
53 | 


--------------------------------------------------------------------------------
/elementtidy-1.0-20050212/setup.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | #
 3 | # Setup script for the elementtidy library
 4 | # $Id: setup.py 2275 2005-02-03 18:20:56Z fredrik $
 5 | #
 6 | # Usage: python setup.py install
 7 | #
 8 | 
 9 | from distutils.core import setup, Extension
10 | 
11 | NAME = "elementtidy"
12 | VERSION = "1.0-20050212"
13 | 
14 | TIDYFILES = [
15 |     "tidylib/src/access.c",
16 |     "tidylib/src/alloc.c",
17 |     "tidylib/src/attrask.c",
18 |     "tidylib/src/attrdict.c",
19 |     "tidylib/src/attrget.c",
20 |     "tidylib/src/attrs.c",
21 |     "tidylib/src/buffio.c",
22 |     "tidylib/src/clean.c",
23 |     "tidylib/src/config.c",
24 |     "tidylib/src/entities.c",
25 |     "tidylib/src/fileio.c",
26 |     "tidylib/src/istack.c",
27 |     "tidylib/src/lexer.c",
28 |     "tidylib/src/localize.c",
29 |     "tidylib/src/parser.c",
30 |     "tidylib/src/pprint.c",
31 |     "tidylib/src/streamio.c",
32 |     "tidylib/src/tagask.c",
33 |     "tidylib/src/tags.c",
34 |     "tidylib/src/tidylib.c",
35 |     "tidylib/src/tmbstr.c",
36 |     "tidylib/src/utf8.c",
37 | ]
38 | 
39 | setup(
40 |     name=NAME,
41 |     version=VERSION,
42 |     author="Fredrik Lundh",
43 |     author_email="fredrik@pythonware.com",
44 |     description="ElementTidy - a tidylib interface for ElementTree",
45 |     url="http://effbot.org/zone/element-tidylib.htm",
46 |     packages=["elementtidy"],
47 |     ext_modules = [
48 |         Extension(
49 |             "_elementtidy",
50 |             ["_elementtidy.c"] + TIDYFILES,
51 |             define_macros=[("NDEBUG", None)],
52 |             include_dirs=["tidylib/include"],
53 |             )
54 |         ]
55 |     )
56 | 


--------------------------------------------------------------------------------
/elementtidy-1.0-20050212/tidylib/src/alloc.c:
--------------------------------------------------------------------------------
 1 | /* alloc.c -- Default memory allocation routines.
 2 | 
 3 |   (c) 1998-2003 (W3C) MIT, ERCIM, Keio University
 4 |   See tidy.h for the copyright notice.
 5 | 
 6 |   CVS Info :
 7 | 
 8 |     $LastChangedBy$ 
 9 |     $LastChangedDate$ 
10 |     $LastChangedRevision$ 
11 | 
12 | */
13 | 
14 | #include "tidy.h"
15 | 
16 | static TidyMalloc  g_malloc  = NULL;
17 | static TidyRealloc g_realloc = NULL;
18 | static TidyFree    g_free    = NULL;
19 | static TidyPanic   g_panic   = NULL;
20 | 
/* Install a caller-supplied allocation routine.  While it is non-NULL,
** MemAlloc() calls it in place of malloc(); passing NULL restores the
** malloc() default.  Always returns yes.
*/
Bool        tidySetMallocCall( TidyMalloc fmalloc )
{
  g_malloc  = fmalloc;
  return yes;
}
/* Install a caller-supplied reallocation routine.  While it is non-NULL,
** MemRealloc() calls it in place of realloc() for non-NULL blocks;
** passing NULL restores the realloc() default.  Always returns yes.
*/
Bool        tidySetReallocCall( TidyRealloc frealloc )
{
  g_realloc = frealloc;
  return yes;
}
/* Install a caller-supplied release routine.  While it is non-NULL,
** MemFree() calls it in place of free(); passing NULL restores the
** free() default.  Always returns yes.
*/
Bool        tidySetFreeCall( TidyFree ffree )
{
  g_free    = ffree;
  return yes;
}
/* Install a caller-supplied fatal-error handler.  While it is non-NULL,
** FatalError() hands the message to it instead of printing to stderr
** and calling exit(2); passing NULL restores that default.
** Always returns yes.
*/
Bool        tidySetPanicCall( TidyPanic fpanic )
{
  g_panic   = fpanic;
  return yes;
}
41 | 
42 | void FatalError( ctmbstr msg )
43 | {
44 |   if ( g_panic )
45 |     g_panic( msg );
46 |   else
47 |   {
48 |     /* 2 signifies a serious error */
49 |     fprintf( stderr, "Fatal error: %s\n", msg );
50 |     exit(2);
51 |   }
52 | }
53 | 
54 | void* MemAlloc( size_t size )
55 | {
56 |     void *p = ( g_malloc ? g_malloc(size) : malloc(size) );
57 |     if ( !p )
58 |         FatalError("Out of memory!");
59 |     return p;
60 | }
61 | 
62 | void* MemRealloc( void* mem, size_t newsize )
63 | {
64 |     void *p;
65 |     if ( mem == NULL )
66 |         return MemAlloc( newsize );
67 | 
68 |     p = ( g_realloc ? g_realloc(mem, newsize) : realloc(mem, newsize) );
69 |     if (!p)
70 |         FatalError("Out of memory!");
71 |     return p;
72 | }
73 | 
74 | void MemFree( void* mem )
75 | {
76 |     if ( mem )
77 |     {
78 |         if ( g_free )
79 |             g_free( mem );
80 |         else
81 |             free( mem );
82 |     }
83 | }
84 | 
/* Fill size bytes starting at mem with zeros.  mem is handed straight
** to memset() without a NULL check.
*/
void ClearMemory( void *mem, size_t size )
{
    memset(mem, 0, size);
}
89 | 
90 | 


--------------------------------------------------------------------------------
/elementtidy-1.0-20050212/tidylib/src/tmbstr.h:
--------------------------------------------------------------------------------
 1 | #ifndef __TMBSTR_H__
 2 | #define __TMBSTR_H__
 3 | 
 4 | /* tmbstr.h - Tidy string utility functions
 5 | 
 6 |   (c) 1998-2003 (W3C) MIT, ERCIM, Keio University
 7 |   See tidy.h for the copyright notice.
 8 | 
 9 |   CVS Info :
10 | 
11 |     $LastChangedBy$ 
12 |     $LastChangedDate$ 
13 |     $LastChangedRevision$ 
14 | 
15 | */
16 | 
17 | #include "platform.h"
18 | 
19 | #ifdef __cplusplus
20 | extern "C"
21 | {
22 | #endif
23 | 
24 | /* like strdup but using MemAlloc */
25 | tmbstr tmbstrdup( ctmbstr str );
26 | 
27 | /* like strndup but using MemAlloc */
28 | tmbstr tmbstrndup( ctmbstr str, uint len);
29 | 
30 | /* exactly same as strncpy */
31 | uint tmbstrncpy( tmbstr s1, ctmbstr s2, uint size );
32 | 
33 | uint tmbstrcpy( tmbstr s1, ctmbstr s2 );
34 | 
35 | uint tmbstrcat( tmbstr s1, ctmbstr s2 );
36 | 
37 | /* exactly same as strcmp */
38 | int tmbstrcmp( ctmbstr s1, ctmbstr s2 );
39 | 
40 | /* returns byte count, not char count */
41 | uint tmbstrlen( ctmbstr str );
42 | 
43 | /*
44 |   MS C 4.2 doesn't include strcasecmp.
45 |   Note that tolower and toupper won't
46 |   work on chars > 127.
47 | 
48 |   Neither do Lexer.ToLower() or Lexer.ToUpper()!
49 | 
50 |   We get away with this because, except for XML tags,
51 |   we are always comparing to ascii element and
52 |   attribute names defined by HTML specs.
53 | */
54 | int tmbstrcasecmp( ctmbstr s1, ctmbstr s2 );
55 | 
56 | int tmbstrncmp( ctmbstr s1, ctmbstr s2, uint n );
57 | 
58 | int tmbstrncasecmp( ctmbstr s1, ctmbstr s2, uint n );
59 | 
60 | /* return offset of cc from beginning of s1,
61 | ** -1 if not found.
62 | */
63 | int tmbstrnchr( ctmbstr s1, uint len1, tmbchar cc );
64 | 
65 | ctmbstr tmbsubstrn( ctmbstr s1, uint len1, ctmbstr s2 );
66 | ctmbstr tmbsubstrncase( ctmbstr s1, uint len1, ctmbstr s2 );
67 | ctmbstr tmbsubstr( ctmbstr s1, ctmbstr s2 );
68 | 
69 | /* transform string to lower case */
70 | tmbstr tmbstrtolower( tmbstr s );
71 | 
72 | /* Transform ASCII chars in string to upper case */
73 | tmbstr tmbstrtoupper(tmbstr s);
74 | 
75 | Bool tmbsamefile( ctmbstr filename1, ctmbstr filename2 );
76 | 
77 | #ifdef __cplusplus
78 | }  /* extern "C" */
79 | #endif
80 | 
81 | #endif /* __TMBSTR_H__ */
82 | 


--------------------------------------------------------------------------------
/elementtidy-1.0-20050212/tidylib/src/fileio.c:
--------------------------------------------------------------------------------
 1 | /* fileio.c -- does standard I/O
 2 | 
 3 |   (c) 1998-2003 (W3C) MIT, ERCIM, Keio University
 4 |   See tidy.h for the copyright notice.
 5 | 
 6 |   CVS Info :
 7 | 
 8 |     $LastChangedBy$ 
 9 |     $LastChangedDate$ 
10 |     $LastChangedRevision$ 
11 | 
12 |   Default implementations of Tidy input sources
13 |   and output sinks based on standard C FILE*.
14 | 
15 | */
16 | 
17 | #include <stdio.h>
18 | 
19 | #include "fileio.h"
20 | #include "tidy.h"
21 | 
22 | 
/* State behind a FILE*-based TidyInputSource: the stream itself plus
** the bytes pushed back by filesrc_ungetByte(), which filesrc_getByte()
** returns before reading from the stream again.
*/
typedef struct _fp_input_source
{
    FILE*        fp;     /* stream read when no pushed-back bytes remain */
    TidyBuffer   unget;  /* bytes pushed back via filesrc_ungetByte() */
} FileSource;
28 | 
29 | int filesrc_getByte( ulong sourceData )
30 | {
31 |   FileSource* fin = (FileSource*) sourceData;
32 |   int bv;
33 |   if ( fin->unget.size > 0 )
34 |     bv = tidyBufPopByte( &fin->unget );
35 |   else
36 |     bv = fgetc( fin->fp );
37 |   return bv;
38 | }
39 | Bool filesrc_eof( ulong sourceData )
40 | {
41 |   FileSource* fin = (FileSource*) sourceData;
42 |   Bool isEOF = ( fin->unget.size == 0 );
43 |   if ( isEOF )
44 |     isEOF = feof( fin->fp );
45 |   return isEOF;
46 | }
/* Push byte bv back onto a FILE*-based source.  The byte is stored in
** the source's unget buffer, which filesrc_getByte() drains before it
** reads from the stream again.
*/
void filesrc_ungetByte( ulong sourceData, byte bv )
{
  FileSource* fin = (FileSource*) sourceData;
  tidyBufPutByte( &fin->unget, bv );
}
52 | 
53 | void initFileSource( TidyInputSource* inp, FILE* fp )
54 | {
55 |   FileSource* fin = NULL;
56 | 
57 |   inp->getByte    = filesrc_getByte;
58 |   inp->eof        = filesrc_eof;
59 |   inp->ungetByte  = filesrc_ungetByte;
60 | 
61 |   fin = (FileSource*) MemAlloc( sizeof(FileSource) );
62 |   ClearMemory( fin, sizeof(FileSource) );
63 |   fin->fp = fp;
64 |   inp->sourceData = (ulong) fin;
65 | }
66 | 
67 | void freeFileSource( TidyInputSource* inp, Bool closeIt )
68 | {
69 |     FileSource* fin = (FileSource*) inp->sourceData;
70 |     if ( closeIt && fin && fin->fp )
71 |       fclose( fin->fp );
72 |     tidyBufFree( &fin->unget );
73 |     MemFree( fin );
74 | }
75 | 
/* Write byte bv to a FILE*-based sink.  sinkData is the FILE* stored by
** initFileSink(), carried as a ulong.  The fputc() result is not checked.
*/
void filesink_putByte( ulong sinkData, byte bv )
{
  FILE* fout = (FILE*) sinkData;
  fputc( bv, fout );
}
81 | 
/* Set up outp as an output sink that writes to fp: installs
** filesink_putByte() as the callback and stores fp itself in sinkData
** (as a ulong).  Nothing is allocated.
*/
void  initFileSink( TidyOutputSink* outp, FILE* fp )
{
  outp->putByte  = filesink_putByte;
  outp->sinkData = (ulong) fp;
}
87 | 
88 | 


--------------------------------------------------------------------------------
/elementtidy-1.0-20050212/tidylib/src/parser.h:
--------------------------------------------------------------------------------
 1 | #ifndef __PARSER_H__
 2 | #define __PARSER_H__
 3 | 
 4 | /* parser.h -- HTML Parser
 5 | 
 6 |   (c) 1998-2003 (W3C) MIT, ERCIM, Keio University
 7 |   See tidy.h for the copyright notice.
 8 |   
 9 |   CVS Info :
10 | 
11 |     $LastChangedBy$ 
12 |     $LastChangedDate$ 
13 |     $LastChangedRevision$ 
14 | 
15 | */
16 | 
17 | #include "forward.h"
18 | 
19 | Bool CheckNodeIntegrity(Node *node);
20 | 
21 | /*
22 |  used to determine how attributes
23 |  without values should be printed
24 |  this was introduced to deal with
25 |  user defined tags e.g. Cold Fusion
26 | */
27 | Bool IsNewNode(Node *node);
28 | 
29 | void CoerceNode(TidyDocImpl* doc, Node *node, TidyTagId tid, Bool obsolete, Bool expected);
30 | 
31 | /* extract a node and its children from a markup tree */
32 | Node *RemoveNode(Node *node);
33 | 
34 | /* remove node from markup tree and discard it */
35 | Node *DiscardElement( TidyDocImpl* doc, Node *element);
36 | 
37 | /* insert node into markup tree */
38 | void InsertNodeAtStart(Node *element, Node *node);
39 | 
40 | /* insert node into markup tree */
41 | void InsertNodeAtEnd(Node *element, Node *node);
42 | 
43 | /* insert node into markup tree before element */
44 | void InsertNodeBeforeElement(Node *element, Node *node);
45 | 
46 | /* insert node into markup tree after element */
47 | void InsertNodeAfterElement(Node *element, Node *node);
48 | 
49 | Node *TrimEmptyElement( TidyDocImpl* doc, Node *element );
50 | 
51 | 
52 | 
53 | /* assumes node is a text node */
54 | Bool IsBlank(Lexer *lexer, Node *node);
55 | 
56 | 
57 | /*
58 |  duplicate name attribute as an id
59 |  and check if id and name match
60 | */
61 | void FixId( TidyDocImpl* doc, Node *node );
62 | 
63 | void FixXmlLang(TidyDocImpl* doc, Node* node);
64 | 
65 | /* acceptable content for pre elements */
66 | Bool PreContent( TidyDocImpl* doc, Node *node );
67 | 
68 | Bool IsJavaScript(Node *node);
69 | Bool DescendantOf(Node *element, TidyTagId tid);
70 | 
71 | /*
72 |   HTML is the top level element
73 | */
74 | void ParseDocument( TidyDocImpl* doc );
75 | 
76 | 
77 | 
78 | /*
79 |   XML documents
80 | */
81 | Bool XMLPreserveWhiteSpace( TidyDocImpl* doc, Node *element );
82 | 
83 | void ParseXMLDocument( TidyDocImpl* doc );
84 | 
85 | #endif /* __PARSER_H__ */
86 | 


--------------------------------------------------------------------------------
/README:
--------------------------------------------------------------------------------
 1 | Install steps (below) are compiled from http://makefile.com/.plan/2009/10/juniper-vpn-64-bit-linux-an-unsolved-mystery/
 2 | 
 3 | This script logs into a juniper network connect vpn website using authentication provided by you and retrieves the DSID cookie which can be passed to the ncui executable to create a VPN tunnel.  This allows users on linux to connect to the tunnel without using the web client and/or a 32 bit java.
 4 | 
 5 | I found it necessary to use this solution because my work required both a password and a SecurID token, and none of the tools provided by Juniper allow you to pass a second password.  If you only need a single password, the accepted answer seems to be http://mad-scientist.us/juniper.html, but my script should work as well (I haven't tested that case, but see no reason it wouldn't).
 6 | 
 7 | What you need to get this to work:
 8 | 1) xterm (the java app won't install the files without xterm)
 9 | 2) gcc multilib (I've only tested this on 64 bit)
10 | 3) 32bit zlib
11 | 4) /tmp and /etc on the same partition (the program can't swap your
12 | /etc/resolv.conf otherwise... sad, I know)
13 | 5) python and python-devel (I believe 2.7 is needed, it might run on 2.6)
14 | 
15 | To get the files we'll need do the following:
16 | 1) Log in to your vpn website
17 | 2) If you aren't redirected to the network connect page go there
18 | 3) A Java app should try to start; let it run (you will likely be asked twice)
19 | 4) Make sure that you have libncui.so and ncsvc in ~/.juniper_networks/network_connect/
20 | 5) If you don't have both of those make sure xterm is installed and that a java prompt didn't get hidden under a window
21 | 
22 | Set up the files:
23 | 1) cd ~/.juniper_networks/network_connect
24 | 2) gcc -m32 -Wl,-rpath,`pwd` -o ncui libncui.so
25 | 3) sudo chown root:root ncui
26 | 4) sudo chmod 4775 ncui
27 | 5) echo | openssl s_client -connect vpn.website.com:443 2>&1 | sed -ne '/-BEGIN CERTIFICATE-/,/-END CERTIFICATE-/p' | openssl x509 -outform der > ssl.crt  (replace vpn.website.com with your own VPN host, the same one you pass to juniperncprompt.py below)
28 | 6) cd <your git directory>/juniperncprompt/elementtidy-1.0-20050212 
29 | 7) sudo python setup.py install (make sure you use the correct version of python to setup and install if you have multiple versions on your system)
30 | 8) cd ../
31 | 9) ./juniperncprompt.py vpn.website.com
32 | 10) ifconfig (typically /sbin for those who don't have sbin in their path)
33 | You should see a tun0, if so everything worked and you are connected to the vpn
34 | 
35 | The default values are set up for the vpn setup at my work, they can all be adjusted with a command line flag or by changing the source file if you want.
36 | 
37 | If you have any questions or comments please feel free to email me at crimsonknave@gmail.com
38 | 
39 | This distribution includes the sources for elementtidy and tidylib.  You can get the latest elementtidy from http://effbot.org/downloads/ and tidylib from http://tidy.sourceforge.net/
40 | 


--------------------------------------------------------------------------------
/elementtidy-1.0-20050212/tidylib/src/clean.h:
--------------------------------------------------------------------------------
 1 | #ifndef __CLEAN_H__
 2 | #define __CLEAN_H__
 3 | 
 4 | /* clean.h -- clean up misuse of presentation markup
 5 | 
 6 |   (c) 1998-2003 (W3C) MIT, ERCIM, Keio University
 7 |   See tidy.h for the copyright notice.
 8 | 
 9 |   CVS Info:
10 |     $LastChangedBy$ 
11 |     $LastChangedDate$ 
12 |     $LastChangedRevision$ 
13 | 
14 | */
15 | 
16 | void RenameElem( Node* node, TidyTagId tid );
17 | 
18 | Node* CleanNode( TidyDocImpl* doc, Node* node );
19 | 
20 | void FreeStyles( TidyDocImpl* doc );
21 | 
22 | /* Add class="foo" to node
23 | */
24 | void AddClass( TidyDocImpl* doc, Node* node, ctmbstr classname );
25 | 
26 | void CleanDocument( TidyDocImpl* doc );
27 | 
28 | /* simplifies <b><b> ... </b> ...</b> etc. */
29 | void NestedEmphasis( TidyDocImpl* doc, Node* node );
30 | 
31 | /* replace i by em and b by strong */
32 | void EmFromI( TidyDocImpl* doc, Node* node );
33 | 
34 | /*
35 |  Some people use dir or ul without an li
36 |  to indent the content. The pattern to
37 |  look for is a list with a single implicit
38 |  li. This is recursively replaced by an
39 |  implicit blockquote.
40 | */
41 | void List2BQ( TidyDocImpl* doc, Node* node );
42 | 
43 | /*
44 |  Replace implicit blockquote by div with an indent
45 |  taking care to reduce nested blockquotes to a single
46 |  div with the indent set to match the nesting depth
47 | */
48 | void BQ2Div( TidyDocImpl* doc, Node* node );
49 | 
50 | 
51 | Node *FindEnclosingCell( TidyDocImpl* doc, Node* node );
52 | 
53 | void DropSections( TidyDocImpl* doc, Node* node );
54 | 
55 | /* used to hunt for hidden preformatted sections */
56 | Bool NoMargins(Node *node);
57 | 
58 | /* does element have a single space as its content? */
59 | Bool IsSingleSpace(Lexer *lexer, Node *node);
60 | 
61 | 
62 | /*
63 |  This is a major clean up to strip out all the extra stuff you get
64 |  when you save as web page from Word 2000. It doesn't yet know what
65 |  to do with VML tags, but these will appear as errors unless you
66 |  declare them as new tags, such as o:p which needs to be declared
67 |  as inline.
68 | */
69 | void CleanWord2000( TidyDocImpl* doc, Node *node);
70 | 
71 | Bool IsWord2000( TidyDocImpl* doc );
72 | 
73 | /* where appropriate move object elements from head to body */
74 | void BumpObject( TidyDocImpl* doc, Node *html );
75 | 
76 | void FixBrakes( TidyDocImpl* pDoc, Node *pParent );
77 | 
78 | void VerifyHTTPEquiv( TidyDocImpl* pDoc, Node *pParent );
79 | 
80 | void DropComments(TidyDocImpl* doc, Node* node);
81 | void DropFontElements(TidyDocImpl* doc, Node* node, Node **pnode);
82 | void WbrToSpace(TidyDocImpl* doc, Node* node);
83 | void DowngradeTypography(TidyDocImpl* doc, Node* node);
84 | void ReplacePreformattedSpaces(TidyDocImpl* doc, Node* node);
85 | void NormalizeSpaces(Lexer *lexer, Node *node);
86 | void ConvertCDATANodes(TidyDocImpl* doc, Node* node);
87 | 
88 | #endif /* __CLEAN_H__ */
89 | 


--------------------------------------------------------------------------------
/elementtidy-1.0-20050212/tidylib/src/pprint.h:
--------------------------------------------------------------------------------
  1 | #ifndef __PPRINT_H__
  2 | #define __PPRINT_H__
  3 | 
  4 | /* pprint.h -- pretty print parse tree  
  5 |   
  6 |    (c) 1998-2003 (W3C) MIT, ERCIM, Keio University
  7 |    See tidy.h for the copyright notice.
  8 |   
  9 |    CVS Info:
 10 |      $LastChangedBy$ 
 11 |      $LastChangedDate$ 
 12 |      $LastChangedRevision$ 
 13 | 
 14 | */
 15 | 
 16 | #include "forward.h"
 17 | 
 18 | /*
 19 |   Block-level and unknown elements are printed on
 20 |   new lines and their contents indented 2 spaces
 21 | 
 22 |   Inline elements are printed inline.
 23 | 
 24 |   Inline content is wrapped on spaces (except in
 25 |   attribute values or preformatted text, after
 26 |   start tags and before end tags).
 27 | */
 28 | 
 29 | #define NORMAL        0
 30 | #define PREFORMATTED  1
 31 | #define COMMENT       2
 32 | #define ATTRIBVALUE   4
 33 | #define NOWRAP        8
 34 | #define CDATA         16
 35 | 
 36 | 
 37 | /* The pretty printer keeps at most two lines of text in the
 38 | ** buffer before flushing output.  We need to capture the
 39 | ** indent state (indent level) at the _beginning_ of _each_
 40 | ** line, not the end of just the second line.
 41 | **
 42 | ** We must also keep track of the "In Attribute" and "In String"
 43 | ** states at the _end_ of each line.
 44 | */
 45 | 
/* Indent state recorded for one buffered output line; TidyPrintImpl
** holds two of these, one per buffered line.
** NOTE(review): field roles below are inferred from their names and the
** comment preceding this struct -- confirm against pprint.c.
*/
typedef struct _TidyIndent
{
    int spaces;           /* presumably the line's indent level */
    int attrValStart;     /* presumably the "In Attribute" state -- TODO confirm */
    int attrStringStart;  /* presumably the "In String" state -- TODO confirm */
} TidyIndent;
 52 | 
/* Working state of the pretty printer: the line buffer and its
** bookkeeping, plus the indent state of the two buffered lines.
** NOTE(review): the per-field roles below are inferred from the field
** names only -- confirm against pprint.c before relying on them.
*/
typedef struct _TidyPrintImpl
{
    uint *linebuf;     /* buffered output, stored as uint values */
    uint lbufsize;     /* presumably the capacity of linebuf */
    uint linelen;      /* presumably the number of entries in use */
    uint wraphere;     /* presumably the position where the line may wrap */
    uint linecount;    /* presumably a count of lines -- TODO confirm */
  
    uint ixInd;        /* presumably an index into indent[] */
    TidyIndent indent[2];  /* Two lines worth of indent state */

} TidyPrintImpl;
 65 | 
 66 | void PPrintDocument( TidyDocImpl* doc );
 67 | 
 68 | 
 69 | #if SUPPORT_ASIAN_ENCODINGS
 70 | /* #431953 - start RJ Wraplen adjusted for smooth international ride */
 71 | uint CWrapLen( TidyDocImpl* doc, uint ind );
 72 | #endif
 73 | 
 74 | void InitPrintBuf( TidyDocImpl* doc );
 75 | void FreePrintBuf( TidyDocImpl* doc );
 76 | 
 77 | void PFlushLine( TidyDocImpl* doc, uint indent );
 78 | void PCondFlushLine( TidyDocImpl* doc, uint indent );
 79 | 
 80 | void PPrintScriptStyle( TidyDocImpl* doc, uint mode, uint indent, Node* node );
 81 | 
 82 | /* print just the content of the body element.
 83 | ** useful when you want to reuse material from
 84 | ** other documents.
 85 | ** 
 86 | ** -- Sebastiano Vigna <vigna@dsi.unimi.it>
 87 | */
 88 | 
 89 | void PrintPreamble( TidyDocImpl* doc );   /* Between these 3, */
 90 | void PrintBody( TidyDocImpl* doc );       /* you can print an entire document */
 91 | void PrintPostamble( TidyDocImpl* doc );  /* or you can substitute another */
 92 |                                           /* node as body using PPrintTree() */
 93 | 
 94 | void PPrintTree( TidyDocImpl* doc, uint mode, uint indent, Node *node );
 95 | 
 96 | void PPrintXMLTree( TidyDocImpl* doc, uint mode, uint indent, Node *node );
 97 | 
 98 | 
 99 | #endif /* __PPRINT_H__ */
100 | 


--------------------------------------------------------------------------------
/elementtidy-1.0-20050212/tidylib/include/buffio.h:
--------------------------------------------------------------------------------
 1 | #ifndef __BUFFIO_H__
 2 | #define __BUFFIO_H__
 3 | 
 4 | /** @file buffio.h - Treat buffer as an I/O stream.
 5 | 
 6 |   (c) 1998-2003 (W3C) MIT, ERCIM, Keio University
 7 |   See tidy.h for the copyright notice.
 8 | 
 9 |   CVS Info :
10 | 
11 |     $LastChangedBy$ 
12 |     $LastChangedDate$ 
13 |     $LastChangedRevision$ 
14 | 
15 |   Requires buffer to automatically grow as bytes are added.
16 |   Must keep track of current read and write points.
17 | 
18 | */
19 | 
20 | #include "platform.h"
21 | #include "tidy.h"
22 | 
23 | #ifdef __cplusplus
24 | extern "C" {
25 | #endif
26 | 
/** TidyBuffer - A growable chunk of memory with separate write and
**  read positions: output is appended at offset `size`, input is
**  consumed from offset `next`.  The bytes at `bp` are NOT guaranteed
**  to be NUL-terminated (when size == allocated there is no spare
**  byte), so always pair `bp` with `size` when copying data out.
*/
TIDY_STRUCT
struct _TidyBuffer 
{
    byte* bp;           /**< Pointer to bytes */
    uint  size;         /**< # bytes currently in use */
    uint  allocated;    /**< # bytes allocated */ 
    uint  next;         /**< Offset of current input position */
};
36 | 
37 | /** Zero out data structure */
38 | TIDY_EXPORT void tidyBufInit( TidyBuffer* buf );
39 | 
 40 | /** Zero out buffer (existing storage is NOT freed -- call tidyBufFree first), allocate given amount, reset input pointer */
41 | TIDY_EXPORT void tidyBufAlloc( TidyBuffer* buf, uint allocSize );
42 | 
43 | /** Expand buffer to given size. 
44 | **  Chunk size is minimum growth. Pass 0 for default of 256 bytes.
45 | */
46 | TIDY_EXPORT void tidyBufCheckAlloc( TidyBuffer* buf,
47 |                                     uint allocSize, uint chunkSize );
48 | 
49 | /** Free current contents and zero out */
50 | TIDY_EXPORT void tidyBufFree( TidyBuffer* buf );
51 | 
52 | /** Set buffer bytes to 0 */
53 | TIDY_EXPORT void tidyBufClear( TidyBuffer* buf );
54 | 
55 | /** Attach to existing buffer */
56 | TIDY_EXPORT void tidyBufAttach( TidyBuffer* buf, void* bp, uint size );
57 | 
58 | /** Detach from buffer.  Caller must free. */
59 | TIDY_EXPORT void tidyBufDetach( TidyBuffer* buf );
60 | 
61 | 
62 | /** Append bytes to buffer.  Expand if necessary. */
63 | TIDY_EXPORT void tidyBufAppend( TidyBuffer* buf, void* vp, uint size );
64 | 
65 | /** Append one byte to buffer.  Expand if necessary. */
66 | TIDY_EXPORT void tidyBufPutByte( TidyBuffer* buf, byte bv );
67 | 
68 | /** Get byte from end of buffer */
69 | TIDY_EXPORT int  tidyBufPopByte( TidyBuffer* buf );
70 | 
71 | 
72 | /** Get byte from front of buffer.  Increment input offset. */
73 | TIDY_EXPORT int  tidyBufGetByte( TidyBuffer* buf );
74 | 
75 | /** At end of buffer? */
76 | TIDY_EXPORT Bool tidyBufEndOfInput( TidyBuffer* buf );
77 | 
78 | /** Put a byte back into the buffer.  Decrement input offset. */
79 | TIDY_EXPORT void tidyBufUngetByte( TidyBuffer* buf, byte bv );
80 | 
81 | 
82 | /**************
83 |    TIDY
84 | **************/
85 | 
86 | /* Forward declarations
87 | */
88 | 
89 | /** Initialize a buffer input source */
90 | TIDY_EXPORT void initInputBuffer( TidyInputSource* inp, TidyBuffer* buf );
91 | 
92 | /** Initialize a buffer output sink */
93 | TIDY_EXPORT void initOutputBuffer( TidyOutputSink* outp, TidyBuffer* buf );
94 | 
95 | #ifdef __cplusplus
96 | }
97 | #endif
98 | #endif /* __BUFFIO_H__ */
99 | 


--------------------------------------------------------------------------------
/elementtidy-1.0-20050212/elementtidy/TidyHTMLTreeBuilder.py:
--------------------------------------------------------------------------------
  1 | #
  2 | # ElementTree
  3 | # $Id: TidyHTMLTreeBuilder.py 2276 2005-02-03 19:21:25Z fredrik $
  4 | #
  5 | # tree builder based on the _elementtidy tidylib wrapper
  6 | #
  7 | # history:
  8 | # 2003-07-06 fl   created
  9 | # 2003-09-17 fl   capture stderr as well
 10 | # 2005-02-03 fl   added encoding support
 11 | #
 12 | # Copyright (c) 1999-2005 by Fredrik Lundh.  All rights reserved.
 13 | #
 14 | # fredrik@pythonware.com
 15 | # http://www.pythonware.com
 16 | #
 17 | # --------------------------------------------------------------------
 18 | # The ElementTree toolkit is
 19 | #
 20 | # Copyright (c) 1999-2005 by Fredrik Lundh
 21 | #
 22 | # By obtaining, using, and/or copying this software and/or its
 23 | # associated documentation, you agree that you have read, understood,
 24 | # and will comply with the following terms and conditions:
 25 | #
 26 | # Permission to use, copy, modify, and distribute this software and
 27 | # its associated documentation for any purpose and without fee is
 28 | # hereby granted, provided that the above copyright notice appears in
 29 | # all copies, and that both that copyright notice and this permission
 30 | # notice appear in supporting documentation, and that the name of
 31 | # Secret Labs AB or the author not be used in advertising or publicity
 32 | # pertaining to distribution of the software without specific, written
 33 | # prior permission.
 34 | #
 35 | # SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD
 36 | # TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANT-
 37 | # ABILITY AND FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR
 38 | # BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY
 39 | # DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
 40 | # WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
 41 | # ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
 42 | # OF THIS SOFTWARE.
 43 | # --------------------------------------------------------------------
 44 | 
 45 | # note: route all elementtree access via ElementTree, so that external
 46 | # users can "patch in" another implementation if they want to (such as
 47 | # cElementTree)
 48 | 
 49 | # Support for python >= 2.5
 50 | try:
 51 |   from elementtree import ElementTree
 52 | except ImportError:
 53 |   from xml.etree import ElementTree
 54 | 
 55 | import _elementtidy
 56 | import string
 57 | 
 58 | ##
 59 | # ElementTree builder for HTML source code.  This builder converts an
 60 | # HTML document or fragment to an XHTML ElementTree, by running it
 61 | # through the _elementtidy processor.
 62 | #
 63 | # @kwparam encoding Optional source document encoding.
 64 | #
 65 | # @see elementtree.ElementTree
 66 | 
 67 | class TidyHTMLTreeBuilder:
 68 | 
 69 |     def __init__(self, encoding=None):
 70 |         self.__data = []
 71 |         if encoding:
 72 |             if encoding == "iso-8859-1":
 73 |                 encoding = "latin1"
 74 |             else:
 75 |                 encoding = string.replace(encoding, "-", "")
 76 |         self.__encoding = encoding
 77 |         self.errlog = None
 78 | 
 79 |     ##
 80 |     # Add data to parser buffers.
 81 | 
 82 |     def feed(self, text):
 83 |         self.__data.append(text)
 84 | 
 85 |     ##
 86 |     # Flush parser buffers, and return the root element.
 87 |     #
 88 |     # @return An Element instance.
 89 | 
 90 |     def close(self):
 91 |         args = [string.join(self.__data, "")]
 92 |         if self.__encoding:
 93 |             args.append(self.__encoding)
 94 |         stdout, stderr = _elementtidy.fixup(*args)
 95 |         self.errlog = stderr
 96 |         return ElementTree.XML(stdout)
 97 | 
 98 | ##
 99 | # An alias for the <b>TidyHTMLTreeBuilder</b> class.
100 | 
101 | TreeBuilder = TidyHTMLTreeBuilder
102 | 
##
# Parse an HTML document into an XHTML-style element tree.
#
# @param source A filename or file object containing HTML data.
# @param encoding Optional source document encoding.  If omitted,
#     the builder is created without arguments, as before.
# @return An ElementTree instance

def parse(source, encoding=None):
    # only pass the encoding when one was given, so a patched-in
    # TreeBuilder that takes no arguments keeps working
    if encoding:
        builder = TreeBuilder(encoding=encoding)
    else:
        builder = TreeBuilder()
    return ElementTree.parse(source, builder)
111 | 
##
# Parse an HTML document into an XHTML-style element tree, and return
# both the tree and the error log.
#
# @param source A filename or file object containing HTML data.
# @param encoding Optional source document encoding.  If omitted,
#     the builder is created without arguments, as before.
# @return A 2-tuple containing an ElementTree instance and a string
#     with TidyLib's error log.

def parse2(source, encoding=None):
    # only pass the encoding when one was given, so a patched-in
    # TreeBuilder that takes no arguments keeps working
    if encoding:
        builder = TreeBuilder(encoding=encoding)
    else:
        builder = TreeBuilder()
    tree = ElementTree.parse(source, builder)
    # the builder's errlog attribute is filled in by its close() method
    return tree, builder.errlog
124 | 
# Command-line use: tidy the HTML file named by the first argument and
# dump the resulting element tree.
if __name__ == "__main__":
    import sys
    source = open(sys.argv[1])
    # try/finally (rather than "with") closes the file even if parsing
    # fails, and needs no __future__ import on Python 2.5
    try:
        ElementTree.dump(parse(source))
    finally:
        source.close()
128 | 


--------------------------------------------------------------------------------
/elementtidy-1.0-20050212/_elementtidy.c:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * ElementTree
  3 |  * $Id: _elementtidy.c 2276 2005-02-03 19:21:25Z fredrik $
  4 |  *
  5 |  * TidyHTMLTreeBuilder driver for the ElementTree package, based
  6 |  * on tidylib (from http://tidy.sourceforge.net)
  7 |  *
  8 |  * Copyright (c) 2003-2005 by Fredrik Lundh.  All rights reserved.
  9 |  */
 10 | 
 11 | /* --------------------------------------------------------------------
 12 |    Copyright (c) 2003-2005 by Fredrik Lundh
 13 | 
 14 |    By obtaining, using, and/or copying this software and/or its
 15 |    associated documentation, you agree that you have read, understood,
 16 |    and will comply with the following terms and conditions:
 17 | 
 18 |    Permission to use, copy, modify, and distribute this software and its
 19 |    associated documentation for any purpose and without fee is hereby
 20 |    granted, provided that the above copyright notice appears in all
 21 |    copies, and that both that copyright notice and this permission notice
 22 |    appear in supporting documentation, and that the name of Secret Labs
 23 |    AB or the author not be used in advertising or publicity pertaining to
 24 |    distribution of the software without specific, written prior
 25 |    permission.
 26 | 
 27 |    SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
 28 |    THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
 29 |    FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
 30 |    ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 31 |    WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 32 |    ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
 33 |    OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 34 |    -------------------------------------------------------------------- */
 35 | 
 36 | #include "Python.h"
 37 | 
 38 | /* TODO: instead of saving to string, generate tree events */
 39 | 
 40 | #include "tidy.h"
 41 | #include "buffio.h"
 42 | 
 43 | static PyObject*
 44 | elementtidy_fixup(PyObject* self, PyObject* args)
 45 | {
 46 |     int rc;
 47 |     TidyDoc doc;
 48 |     TidyBuffer out = {0};
 49 |     TidyBuffer err = {0};
 50 |     PyObject* pyout;
 51 |     PyObject* pyerr;
 52 | 
 53 |     char* text;
 54 |     char* encoding = NULL;
 55 |     if (!PyArg_ParseTuple(args, "s|s:fixup", &text, &encoding))
 56 |         return NULL;
 57 | 
 58 |     doc = tidyCreate();
 59 | 
 60 |     /* options for nice XHTML output */
 61 |     if (encoding)
 62 |         /* if an encoding is given, use it for both input and output */
 63 |         tidyOptSetValue(doc, TidyCharEncoding, encoding);
 64 |     else
 65 |         /* if no encoding is given, use default input and utf-8 output */
 66 |         tidyOptSetValue(doc, TidyOutCharEncoding, "utf8");
 67 |     tidyOptSetBool(doc, TidyForceOutput, yes);
 68 |     tidyOptSetInt(doc, TidyWrapLen, 0);
 69 |     tidyOptSetBool(doc, TidyQuiet, yes);
 70 |     tidyOptSetBool(doc, TidyXhtmlOut, yes);
 71 |     tidyOptSetBool(doc, TidyXmlDecl, yes);
 72 |     tidyOptSetInt(doc, TidyIndentContent, 0);
 73 |     tidyOptSetBool(doc, TidyNumEntities, yes);
 74 | 
 75 |     rc = tidySetErrorBuffer(doc, &err);
 76 |     if (rc < 0) {
 77 |         PyErr_SetString(PyExc_IOError, "tidySetErrorBuffer failed");
 78 |         goto error;
 79 |     }
 80 | 
 81 |     rc = tidyParseString(doc, text);
 82 |     if (rc < 0) {
 83 |         PyErr_SetString(PyExc_IOError, "tidyParseString failed");
 84 |         goto error;
 85 |     }
 86 | 
 87 |     rc = tidyCleanAndRepair(doc);
 88 |     if (rc < 0) {
 89 |         PyErr_SetString(PyExc_IOError, "tidyCleanAndRepair failed");
 90 |         goto error;
 91 |     }
 92 | 
 93 |     rc = tidyRunDiagnostics(doc);
 94 |     if (rc < 0) {
 95 |         PyErr_SetString(PyExc_IOError, "tidyRunDiagnostics failed");
 96 |         goto error;
 97 |     }
 98 | 
 99 |     rc = tidySaveBuffer(doc, &out);
100 |     if (rc < 0) {
101 |         PyErr_SetString(PyExc_IOError, "tidyRunDiagnostics failed");
102 |         goto error;
103 |     }
104 | 
105 | 
106 |     pyout = PyString_FromString(out.bp ? out.bp : "");
107 |     if (!pyout)
108 |         goto error;
109 |     pyerr = PyString_FromString(err.bp ? err.bp : "");
110 |     if (!pyerr) {
111 |         Py_DECREF(pyout);
112 |         goto error;
113 |     }
114 | 
115 |     tidyBufFree(&out);
116 |     tidyBufFree(&err);
117 | 
118 |     tidyRelease(doc);
119 | 
120 |     return Py_BuildValue("NN", pyout, pyerr);
121 | 
122 |   error:
123 |     tidyBufFree(&out);
124 |     tidyBufFree(&err);
125 | 
126 |     tidyRelease(doc);
127 | 
128 |     return NULL;
129 | }
130 | 
131 | static PyMethodDef _functions[] = {
132 |     {"fixup", elementtidy_fixup, 1},
133 |     {NULL, NULL}
134 | };
135 | 
136 | void
137 | #ifdef WIN32
138 | __declspec(dllexport)
139 | #endif
140 | init_elementtidy()
141 | {
142 |     Py_InitModule("_elementtidy", _functions);
143 | }
144 | 


--------------------------------------------------------------------------------
/elementtidy-1.0-20050212/tidylib/src/attrdict.h:
--------------------------------------------------------------------------------
  1 | #ifndef __ATTRDICT_H__
  2 | #define __ATTRDICT_H__
  3 | 
  4 | /* attrdict.h -- extended attribute information
  5 | 
  6 |    (c) 1998-2003 (W3C) MIT, ERCIM, Keio University
  7 |    See tidy.h for the copyright notice.
  8 | 
  9 |    $Id$
 10 | */
 11 | 
 12 | #include "tidy.h"
 13 | 
/* One entry of a per-element attribute table (the W3CAttrsFor_*
** arrays declared below are arrays of these).
*/
typedef struct _AttrVersion
{
    uint attribute;  /* NOTE(review): presumably an attribute identifier; confirm against attrs.h */
    uint versions;   /* NOTE(review): presumably a bit mask of the HTML versions that allow the attribute; confirm */
} AttrVersion;
 19 | 
 20 | extern AttrVersion W3CAttrsFor_A[];
 21 | extern AttrVersion W3CAttrsFor_ABBR[];
 22 | extern AttrVersion W3CAttrsFor_ACRONYM[];
 23 | extern AttrVersion W3CAttrsFor_ADDRESS[];
 24 | extern AttrVersion W3CAttrsFor_APPLET[];
 25 | extern AttrVersion W3CAttrsFor_AREA[];
 26 | extern AttrVersion W3CAttrsFor_B[];
 27 | extern AttrVersion W3CAttrsFor_BASE[];
 28 | extern AttrVersion W3CAttrsFor_BASEFONT[];
 29 | extern AttrVersion W3CAttrsFor_BDO[];
 30 | extern AttrVersion W3CAttrsFor_BIG[];
 31 | extern AttrVersion W3CAttrsFor_BLOCKQUOTE[];
 32 | extern AttrVersion W3CAttrsFor_BODY[];
 33 | extern AttrVersion W3CAttrsFor_BR[];
 34 | extern AttrVersion W3CAttrsFor_BUTTON[];
 35 | extern AttrVersion W3CAttrsFor_CAPTION[];
 36 | extern AttrVersion W3CAttrsFor_CENTER[];
 37 | extern AttrVersion W3CAttrsFor_CITE[];
 38 | extern AttrVersion W3CAttrsFor_CODE[];
 39 | extern AttrVersion W3CAttrsFor_COL[];
 40 | extern AttrVersion W3CAttrsFor_COLGROUP[];
 41 | extern AttrVersion W3CAttrsFor_DD[];
 42 | extern AttrVersion W3CAttrsFor_DEL[];
 43 | extern AttrVersion W3CAttrsFor_DFN[];
 44 | extern AttrVersion W3CAttrsFor_DIR[];
 45 | extern AttrVersion W3CAttrsFor_DIV[];
 46 | extern AttrVersion W3CAttrsFor_DL[];
 47 | extern AttrVersion W3CAttrsFor_DT[];
 48 | extern AttrVersion W3CAttrsFor_EM[];
 49 | extern AttrVersion W3CAttrsFor_FIELDSET[];
 50 | extern AttrVersion W3CAttrsFor_FONT[];
 51 | extern AttrVersion W3CAttrsFor_FORM[];
 52 | extern AttrVersion W3CAttrsFor_FRAME[];
 53 | extern AttrVersion W3CAttrsFor_FRAMESET[];
 54 | extern AttrVersion W3CAttrsFor_H1[];
 55 | extern AttrVersion W3CAttrsFor_H2[];
 56 | extern AttrVersion W3CAttrsFor_H3[];
 57 | extern AttrVersion W3CAttrsFor_H4[];
 58 | extern AttrVersion W3CAttrsFor_H5[];
 59 | extern AttrVersion W3CAttrsFor_H6[];
 60 | extern AttrVersion W3CAttrsFor_HEAD[];
 61 | extern AttrVersion W3CAttrsFor_HR[];
 62 | extern AttrVersion W3CAttrsFor_HTML[];
 63 | extern AttrVersion W3CAttrsFor_I[];
 64 | extern AttrVersion W3CAttrsFor_IFRAME[];
 65 | extern AttrVersion W3CAttrsFor_IMG[];
 66 | extern AttrVersion W3CAttrsFor_INPUT[];
 67 | extern AttrVersion W3CAttrsFor_INS[];
 68 | extern AttrVersion W3CAttrsFor_ISINDEX[];
 69 | extern AttrVersion W3CAttrsFor_KBD[];
 70 | extern AttrVersion W3CAttrsFor_LABEL[];
 71 | extern AttrVersion W3CAttrsFor_LEGEND[];
 72 | extern AttrVersion W3CAttrsFor_LI[];
 73 | extern AttrVersion W3CAttrsFor_LINK[];
 74 | extern AttrVersion W3CAttrsFor_LISTING[];
 75 | extern AttrVersion W3CAttrsFor_MAP[];
 76 | extern AttrVersion W3CAttrsFor_MENU[];
 77 | extern AttrVersion W3CAttrsFor_META[];
 78 | extern AttrVersion W3CAttrsFor_NEXTID[];
 79 | extern AttrVersion W3CAttrsFor_NOFRAMES[];
 80 | extern AttrVersion W3CAttrsFor_NOSCRIPT[];
 81 | extern AttrVersion W3CAttrsFor_OBJECT[];
 82 | extern AttrVersion W3CAttrsFor_OL[];
 83 | extern AttrVersion W3CAttrsFor_OPTGROUP[];
 84 | extern AttrVersion W3CAttrsFor_OPTION[];
 85 | extern AttrVersion W3CAttrsFor_P[];
 86 | extern AttrVersion W3CAttrsFor_PARAM[];
 87 | extern AttrVersion W3CAttrsFor_PLAINTEXT[];
 88 | extern AttrVersion W3CAttrsFor_PRE[];
 89 | extern AttrVersion W3CAttrsFor_Q[];
 90 | extern AttrVersion W3CAttrsFor_RB[];
 91 | extern AttrVersion W3CAttrsFor_RBC[];
 92 | extern AttrVersion W3CAttrsFor_RP[];
 93 | extern AttrVersion W3CAttrsFor_RT[];
 94 | extern AttrVersion W3CAttrsFor_RTC[];
 95 | extern AttrVersion W3CAttrsFor_RUBY[];
 96 | extern AttrVersion W3CAttrsFor_S[];
 97 | extern AttrVersion W3CAttrsFor_SAMP[];
 98 | extern AttrVersion W3CAttrsFor_SCRIPT[];
 99 | extern AttrVersion W3CAttrsFor_SELECT[];
100 | extern AttrVersion W3CAttrsFor_SMALL[];
101 | extern AttrVersion W3CAttrsFor_SPAN[];
102 | extern AttrVersion W3CAttrsFor_STRIKE[];
103 | extern AttrVersion W3CAttrsFor_STRONG[];
104 | extern AttrVersion W3CAttrsFor_STYLE[];
105 | extern AttrVersion W3CAttrsFor_SUB[];
106 | extern AttrVersion W3CAttrsFor_SUP[];
107 | extern AttrVersion W3CAttrsFor_TABLE[];
108 | extern AttrVersion W3CAttrsFor_TBODY[];
109 | extern AttrVersion W3CAttrsFor_TD[];
110 | extern AttrVersion W3CAttrsFor_TEXTAREA[];
111 | extern AttrVersion W3CAttrsFor_TFOOT[];
112 | extern AttrVersion W3CAttrsFor_TH[];
113 | extern AttrVersion W3CAttrsFor_THEAD[];
114 | extern AttrVersion W3CAttrsFor_TITLE[];
115 | extern AttrVersion W3CAttrsFor_TR[];
116 | extern AttrVersion W3CAttrsFor_TT[];
117 | extern AttrVersion W3CAttrsFor_U[];
118 | extern AttrVersion W3CAttrsFor_UL[];
119 | extern AttrVersion W3CAttrsFor_VAR[];
120 | extern AttrVersion W3CAttrsFor_XMP[];
121 | 
122 | #endif /* __ATTRDICT_H__ */
123 | 


--------------------------------------------------------------------------------
/elementtidy-1.0-20050212/tidylib/src/buffio.c:
--------------------------------------------------------------------------------
  1 | /* buffio.c -- Treat buffer as an I/O stream.
  2 | 
  3 |   (c) 1998-2003 (W3C) MIT, ERCIM, Keio University
  4 |   See tidy.h for the copyright notice.
  5 | 
  6 |   CVS Info :
  7 | 
  8 |     $LastChangedBy$ 
  9 |     $LastChangedDate$ 
 10 |     $LastChangedRevision$ 
 11 | 
 12 |   Requires buffer to automatically grow as bytes are added.
 13 |   Must keep track of current read and write points.
 14 | 
 15 | */
 16 | 
 17 | #include "tidy.h"
 18 | #include "buffio.h"
 19 | 
 20 | 
 21 | /**************
 22 |    TIDY
 23 | **************/
 24 | 
 25 | static int insrc_getByte( ulong appData )
 26 | {
 27 |   TidyBuffer* buf = (TidyBuffer*) appData;
 28 |   return tidyBufGetByte( buf );
 29 | }
 30 | static Bool insrc_eof( ulong appData )
 31 | {
 32 |   TidyBuffer* buf = (TidyBuffer*) appData;
 33 |   return tidyBufEndOfInput( buf );
 34 | }
 35 | static void insrc_ungetByte( ulong appData, byte bv )
 36 | {
 37 |   TidyBuffer* buf = (TidyBuffer*) appData;
 38 |   tidyBufUngetByte( buf, bv );
 39 | }
 40 | 
 41 | void  initInputBuffer( TidyInputSource* inp, TidyBuffer* buf )
 42 | {
 43 |   inp->getByte    = insrc_getByte;
 44 |   inp->eof        = insrc_eof;
 45 |   inp->ungetByte  = insrc_ungetByte;
 46 |   inp->sourceData = (ulong) buf;
 47 | }
 48 | 
 49 | static void outsink_putByte( ulong appData, byte bv )
 50 | {
 51 |   TidyBuffer* buf = (TidyBuffer*) appData;
 52 |   tidyBufPutByte( buf, bv );
 53 | }
 54 | 
 55 | void  initOutputBuffer( TidyOutputSink* outp, TidyBuffer* buf )
 56 | {
 57 |   outp->putByte  = outsink_putByte;
 58 |   outp->sinkData = (ulong) buf;
 59 | }
 60 | 
 61 | 
 62 | void      tidyBufInit( TidyBuffer* buf )
 63 | {
 64 |     assert( buf != NULL );
 65 |     ClearMemory( buf, sizeof(TidyBuffer) );
 66 | }
 67 | 
 68 | void      tidyBufAlloc( TidyBuffer* buf, uint allocSize )
 69 | {
 70 |     tidyBufInit( buf );
 71 |     tidyBufCheckAlloc( buf, allocSize, 0 );
 72 |     buf->next = 0;
 73 | }
 74 | void      tidyBufFree( TidyBuffer* buf )
 75 | {
 76 |     assert( buf != NULL );
 77 |     MemFree( buf->bp );
 78 |     tidyBufInit( buf );
 79 | }
 80 | 
 81 | void      tidyBufClear( TidyBuffer* buf )
 82 | {
 83 |     assert( buf != NULL );
 84 |     if ( buf->bp )
 85 |     {
 86 |         ClearMemory( buf->bp, buf->allocated );
 87 |         buf->size = 0;
 88 |     }
 89 |     buf->next = 0;
 90 | }
 91 | 
 92 | /* Avoid thrashing memory by doubling buffer size
 93 | ** until larger than requested size.
 94 | */
 95 | void tidyBufCheckAlloc( TidyBuffer* buf, uint allocSize, uint chunkSize )
 96 | {
 97 |     assert( buf != NULL );
 98 |     if ( 0 == chunkSize )
 99 |         chunkSize = 256;
100 |     if ( allocSize > buf->allocated )
101 |     {
102 |         byte* bp;
103 |         uint allocAmt = chunkSize;
104 |         if ( buf->allocated > 0 )
105 |             allocAmt = buf->allocated;
106 |         while ( allocAmt < allocSize )
107 |             allocAmt *= 2;
108 | 
109 |         bp = MemRealloc( buf->bp, allocAmt );
110 |         if ( bp != NULL )
111 |         {
112 |             ClearMemory( bp + buf->allocated, allocAmt - buf->allocated );
113 |             buf->bp = bp;
114 |             buf->allocated = allocAmt;
115 |         }
116 |     }
117 | }
118 | 
119 | /* Attach buffer to a chunk O' memory w/out allocation */
120 | void      tidyBufAttach( TidyBuffer* buf, void* bp, uint size )
121 | {
122 |     assert( buf != NULL );
123 |     buf->bp = bp;
124 |     buf->size = buf->allocated = size;
125 |     buf->next = 0;
126 | }
127 | 
128 | /* Clear pointer to memory w/out deallocation */
129 | void      tidyBufDetach( TidyBuffer* buf )
130 | {
131 |     tidyBufInit( buf );
132 | }
133 | 
134 | 
135 | /**************
136 |    OUTPUT
137 | **************/
138 | 
139 | void      tidyBufAppend( TidyBuffer* buf, void* vp, uint size )
140 | {
141 |     assert( buf != NULL );
142 |     if ( vp != NULL && size > 0 )
143 |     {
144 |         tidyBufCheckAlloc( buf, buf->size + size, 0 );
145 |         memcpy( buf->bp + buf->size, vp, size );
146 |         buf->size += size;
147 |     }
148 | }
149 | 
150 | void      tidyBufPutByte( TidyBuffer* buf, byte bv )
151 | {
152 |     assert( buf != NULL );
153 |     tidyBufCheckAlloc( buf, buf->size + 1, 0 );
154 |     buf->bp[ buf->size++ ] = bv;
155 | }
156 | 
157 | 
158 | int      tidyBufPopByte( TidyBuffer* buf )
159 | {
160 |     int bv = EOF;
161 |     assert( buf != NULL );
162 |     if ( buf->size > 0 )
163 |       bv = buf->bp[ --buf->size ];
164 |     return bv;
165 | }
166 | 
167 | /**************
168 |    INPUT
169 | **************/
170 | 
171 | int       tidyBufGetByte( TidyBuffer* buf )
172 | {
173 |     int bv = EOF;
174 |     if ( ! tidyBufEndOfInput(buf) )
175 |       bv = buf->bp[ buf->next++ ];
176 |     return bv;
177 | }
178 | 
179 | Bool      tidyBufEndOfInput( TidyBuffer* buf )
180 | {
181 |     return ( buf->next >= buf->size );
182 | }
183 | 
184 | void      tidyBufUngetByte( TidyBuffer* buf, byte bv )
185 | {
186 |     if ( buf->next > 0 )
187 |     {
188 |         --buf->next;
189 |         assert( bv == buf->bp[ buf->next ] );
190 |     }
191 | }
192 | 
193 | 


--------------------------------------------------------------------------------
/elementtidy-1.0-20050212/tidylib/src/tidy-int.h:
--------------------------------------------------------------------------------
  1 | #ifndef __TIDY_INT_H__
  2 | #define __TIDY_INT_H__
  3 | 
  4 | /* tidy-int.h -- internal library declarations
  5 | 
  6 |   (c) 1998-2003 (W3C) MIT, ERCIM, Keio University
  7 |   See tidy.h for the copyright notice.
  8 | 
  9 |   CVS Info :
 10 | 
 11 |     $LastChangedBy$ 
 12 |     $LastChangedDate$ 
 13 |     $LastChangedRevision$ 
 14 | 
 15 | */
 16 | 
 17 | #include "tidy.h"
 18 | #include "config.h"
 19 | #include "tags.h"
 20 | #include "attrs.h"
 21 | #include "lexer.h"
 22 | #include "pprint.h"
 23 | #include "access.h"
 24 | 
 25 | #ifndef MAX
 26 | #define MAX(a,b) (((a) > (b))?(a):(b))
 27 | #endif
 28 | #ifndef MIN
 29 | #define MIN(a,b) (((a) < (b))?(a):(b))
 30 | #endif
 31 | 
/* Internal representation of a Tidy document.  The opaque TidyDoc
** handle used by the public API is this structure behind a cast (see
** the tidyDocToImpl / tidyImplToDoc macros below).
*/
struct _TidyDocImpl
{
    /* The Document Tree (and backing store buffer) */
    Node                root;       /* This MUST remain the first declared 
                                       variable in this structure */
    Lexer*              lexer;

    /* Config + Markup Declarations */
    TidyConfigImpl      config;
    TidyTagImpl         tags;
    TidyAttribImpl      attribs;

#if SUPPORT_ACCESSIBILITY_CHECKS
    /* Accessibility Checks state */
    TidyAccessImpl      access;
#endif

    /* The Pretty Print buffer */
    TidyPrintImpl       pprint;

    /* I/O */
    StreamIn*           docIn;
    StreamOut*          docOut;
    StreamOut*          errout;
    TidyReportFilter    mssgFilt;
    TidyOptCallback     pOptCallback;

    /* Parse + Repair Results */
    uint                optionErrors;
    uint                errors;
    uint                warnings;
    uint                accessErrors;
    uint                infoMessages;
    uint                docErrors;
    int                 parseStatus;

    uint                badAccess;   /* for accessibility errors */
    uint                badLayout;   /* for bad style errors */
    uint                badChars;    /* for bad char encodings */
    uint                badForm;     /* for badly placed form tags */

    /* Miscellaneous */
    ulong               appData;     /* NOTE(review): presumably opaque application data attached to the document; confirm */
    uint                nClassId;
    Bool                inputHadBOM;

#if PRESERVE_FILE_TIMES
    struct utimbuf      filetimes;
#endif
    Node*               givenDoctype;
};
 83 | 
 84 | 
 85 | /* Twizzle internal/external types */
 86 | #ifdef NEVER
 87 | TidyDocImpl* tidyDocToImpl( TidyDoc tdoc );
 88 | TidyDoc      tidyImplToDoc( TidyDocImpl* impl );
 89 | 
 90 | Node*        tidyNodeToImpl( TidyNode tnod );
 91 | TidyNode     tidyImplToNode( Node* node );
 92 | 
 93 | AttVal*      tidyAttrToImpl( TidyAttr tattr );
 94 | TidyAttr     tidyImplToAttr( AttVal* attval );
 95 | 
 96 | const TidyOptionImpl* tidyOptionToImpl( TidyOption topt );
 97 | TidyOption   tidyImplToOption( const TidyOptionImpl* option );
 98 | #else
 99 | 
100 | #define tidyDocToImpl( tdoc )       ((TidyDocImpl*)(tdoc))
101 | #define tidyImplToDoc( doc )        ((TidyDoc)(doc))
102 | 
103 | #define tidyNodeToImpl( tnod )      ((Node*)(tnod))
104 | #define tidyImplToNode( node )      ((TidyNode)(node))
105 | 
106 | #define tidyAttrToImpl( tattr )     ((AttVal*)(tattr))
107 | #define tidyImplToAttr( attval )    ((TidyAttr)(attval))
108 | 
109 | #define tidyOptionToImpl( topt )    ((const TidyOptionImpl*)(topt))
110 | #define tidyImplToOption( option )  ((TidyOption)(option))
111 | 
112 | #endif
113 | 
114 | /* Create/Destroy a Tidy "document" object */
115 | TidyDocImpl* tidyDocCreate(void);
116 | void         tidyDocRelease( TidyDocImpl* impl );
117 | 
118 | int          tidyDocStatus( TidyDocImpl* impl );
119 | 
120 | /* Parse Markup */
121 | int          tidyDocParseFile( TidyDocImpl* impl, ctmbstr htmlfil );
122 | int          tidyDocParseStdin( TidyDocImpl* impl );
123 | int          tidyDocParseString( TidyDocImpl* impl, ctmbstr content );
124 | int          tidyDocParseBuffer( TidyDocImpl* impl, TidyBuffer* inbuf );
125 | int          tidyDocParseSource( TidyDocImpl* impl, TidyInputSource* docIn );
126 | int          tidyDocParseStream( TidyDocImpl* impl, StreamIn* in );
127 | 
128 | 
129 | /* Execute post-parse diagnostics and cleanup.
130 | ** Note, the order is important.  You will get different
131 | ** results from the diagnostics depending on if they are run
132 | ** pre-or-post repair.
133 | */
134 | int          tidyDocRunDiagnostics( TidyDocImpl* doc );
135 | int          tidyDocCleanAndRepair( TidyDocImpl* doc );
136 | 
137 | 
138 | /* Save cleaned up file to file/buffer/sink */
139 | int          tidyDocSaveFile( TidyDocImpl* impl, ctmbstr htmlfil );
140 | int          tidyDocSaveStdout( TidyDocImpl* impl );
141 | int          tidyDocSaveString( TidyDocImpl* impl, tmbstr buffer, uint* buflen );
142 | int          tidyDocSaveBuffer( TidyDocImpl* impl, TidyBuffer* outbuf );
143 | int          tidyDocSaveSink( TidyDocImpl* impl, TidyOutputSink* docOut );
144 | int          tidyDocSaveStream( TidyDocImpl* impl, StreamOut* out );
145 | 
146 | #endif /* __TIDY_INT_H__ */
147 | 


--------------------------------------------------------------------------------
/elementtidy-1.0-20050212/README:
--------------------------------------------------------------------------------
  1 | $Id: README 2275 2005-02-03 18:20:56Z fredrik $
  2 | 
  3 | ======================
  4 | The elementtidy module
  5 | ======================
  6 | 
  7 | This module provides an ElementTree builder based on the HTML TidyLib
  8 | library.  For more information, see:
  9 | 
 10 |     http://effbot.org/zone/element-tidylib.htm
 11 | 
 12 | For more information on the ElementTree library, see:
 13 | 
 14 |     http://effbot.org/zone/element-index.htm
 15 | 
 16 | This distribution includes a snapshot of the HTML Tidy sources.  You
 17 | can get the latest version from:
 18 | 
 19 |     http://tidy.sourceforge.net
 20 | 
 21 | Enjoy /F
 22 | 
 23 | fredrik@pythonware.com
 24 | http://www.pythonware.com
 25 | 
 26 | --------------------------------------------------------------------
 27 | Changes
 28 | --------------------------------------------------------------------
 29 | 
 30 | (1.0 final released)
 31 | 
 32 | - Improved error checking.  The library may now raise IOError
 33 |   exceptions if the underlying Tidy library fails.
 34 | 
 35 | (1.0 beta 1 released)
 36 | 
 37 | - Use 'ForceOutput' flag to force output even for badly malformed
 38 |   HTML.
 39 | 
 40 | - Added source encoding support (based on code by Kevin Dangoor).
 41 | 
 42 | (1.0 alpha 3 released)
 43 | 
 44 | - Fixed core dump when pages are broken beyond repair (reported by
 45 |   many, fix proposed by Brad Clements)
 46 | 
 47 | (1.0 alpha 2 released)
 48 | 
 49 | - Capture error output (available via the 'errlog' attribute on the
 50 |   parser instance).
 51 | 
 52 | (1.0 alpha 1 released -- initial release)
 53 | 
 54 | --------------------------------------------------------------------
 55 | Software License
 56 | --------------------------------------------------------------------
 57 | 
 58 | The software components in this package are copyrighted, but can all
 59 | be used freely in all sorts of applications.
 60 | 
 61 | The _elementtidy binding, and associated Python code, is
 62 | 
 63 |    Copyright (c) 2003-2005 by Fredrik Lundh.  All rights reserved.
 64 | 
 65 |    By obtaining, using, and/or copying this software and/or its
 66 |    associated documentation, you agree that you have read, understood,
 67 |    and will comply with the following terms and conditions:
 68 | 
 69 |    Permission to use, copy, modify, and distribute this software and its
 70 |    associated documentation for any purpose and without fee is hereby
 71 |    granted, provided that the above copyright notice appears in all
 72 |    copies, and that both that copyright notice and this permission notice
 73 |    appear in supporting documentation, and that the name of Secret Labs
 74 |    AB or the author not be used in advertising or publicity pertaining to
 75 |    distribution of the software without specific, written prior
 76 |    permission.
 77 | 
 78 |    SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
 79 |    THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
 80 |    FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
 81 |    ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 82 |    WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 83 |    ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
 84 |    OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 85 | 
 86 | --------------------------------------------------------------------
 87 | 
 88 | The TidyLib library is copyright (c) 1998-2003 by the World Wide Web
 89 | Consortium.  Software license:
 90 | 
 91 |    HTML Tidy
 92 | 
 93 |    HTML parser and pretty printer
 94 | 
 95 |    Copyright (c) 1998-2003 World Wide Web Consortium
 96 |    (Massachusetts Institute of Technology, European Research 
 97 |    Consortium for Informatics and Mathematics, Keio University).
 98 |    All Rights Reserved.
 99 | 
100 |    This software and documentation is provided "as is," and
101 |    the copyright holders and contributing author(s) make no
102 |    representations or warranties, express or implied, including
103 |    but not limited to, warranties of merchantability or fitness
104 |    for any particular purpose or that the use of the software or
105 |    documentation will not infringe any third party patents,
106 |    copyrights, trademarks or other rights. 
107 | 
108 |    The copyright holders and contributing author(s) will not be held
109 |    liable for any direct, indirect, special or consequential damages
110 |    arising out of any use of the software or documentation, even if
111 |    advised of the possibility of such damage.
112 | 
113 |    Permission is hereby granted to use, copy, modify, and distribute
114 |    this source code, or portions hereof, documentation and executables,
115 |    for any purpose, without fee, subject to the following restrictions:
116 | 
117 |    1. The origin of this source code must not be misrepresented.
118 |    2. Altered versions must be plainly marked as such and must
119 |       not be misrepresented as being the original source.
120 |    3. This Copyright notice may not be removed or altered from any
121 |       source or altered source distribution.
122 | 
123 |    The copyright holders and contributing author(s) specifically
124 |    permit, without fee, and encourage the use of this source code
125 |    as a component for supporting the Hypertext Markup Language in
126 |    commercial products. If you use this source code in a product,
127 |    acknowledgment is not required but would be appreciated.
128 | 
129 | (from http://tidy.sourceforge.net/license.html)
130 | 
131 | --------------------------------------------------------------------
132 | 


--------------------------------------------------------------------------------
/elementtidy-1.0-20050212/tidylib/src/attrask.c:
--------------------------------------------------------------------------------
  1 | /* attrask.c -- Interrogate attribute type
  2 | 
  3 |   (c) 1998-2003 (W3C) MIT, ERCIM, Keio University
  4 |   See tidy.h for the copyright notice.
  5 |   
  6 |   CVS Info:
  7 |     $LastChangedBy$ 
  8 |     $LastChangedDate$ 
  9 |     $LastChangedRevision$ 
 10 | 
 11 | */
 12 | 
 13 | #include "tidy-int.h"
 14 | #include "tidy.h"
 15 | #include "attrs.h"
 16 | /* Public attribute-type predicates: each one converts the opaque TidyAttr handle with tidyAttrToImpl() and returns, as Bool, the result of the matching attrIsXXX() test. */
 17 | Bool tidyAttrIsHREF( TidyAttr tattr )
 18 | {
 19 |     return attrIsHREF( tidyAttrToImpl(tattr) );
 20 | }
 21 | Bool tidyAttrIsSRC( TidyAttr tattr )
 22 | {
 23 |     return attrIsSRC( tidyAttrToImpl(tattr) );
 24 | }
 25 | Bool tidyAttrIsID( TidyAttr tattr )
 26 | {
 27 |     return attrIsID( tidyAttrToImpl(tattr) );
 28 | }
 29 | Bool tidyAttrIsNAME( TidyAttr tattr )
 30 | {
 31 |     return attrIsNAME( tidyAttrToImpl(tattr) );
 32 | }
 33 | Bool tidyAttrIsSUMMARY( TidyAttr tattr )
 34 | {
 35 |     return attrIsSUMMARY( tidyAttrToImpl(tattr) );
 36 | }
 37 | Bool tidyAttrIsALT( TidyAttr tattr )
 38 | {
 39 |     return attrIsALT( tidyAttrToImpl(tattr) );
 40 | }
 41 | Bool tidyAttrIsLONGDESC( TidyAttr tattr )
 42 | {
 43 |     return attrIsLONGDESC( tidyAttrToImpl(tattr) );
 44 | }
 45 | Bool tidyAttrIsUSEMAP( TidyAttr tattr )
 46 | {
 47 |     return attrIsUSEMAP( tidyAttrToImpl(tattr) );
 48 | }
 49 | Bool tidyAttrIsISMAP( TidyAttr tattr )
 50 | {
 51 |     return attrIsISMAP( tidyAttrToImpl(tattr) );
 52 | }
 53 | Bool tidyAttrIsLANGUAGE( TidyAttr tattr )
 54 | {
 55 |     return attrIsLANGUAGE( tidyAttrToImpl(tattr) );
 56 | }
 57 | Bool tidyAttrIsTYPE( TidyAttr tattr )
 58 | {
 59 |     return attrIsTYPE( tidyAttrToImpl(tattr) );
 60 | }
 61 | Bool tidyAttrIsVALUE( TidyAttr tattr )
 62 | {
 63 |     return attrIsVALUE( tidyAttrToImpl(tattr) );
 64 | }
 65 | Bool tidyAttrIsCONTENT( TidyAttr tattr )
 66 | {
 67 |     return attrIsCONTENT( tidyAttrToImpl(tattr) );
 68 | }
 69 | Bool tidyAttrIsTITLE( TidyAttr tattr )
 70 | {
 71 |     return attrIsTITLE( tidyAttrToImpl(tattr) );
 72 | }
 73 | Bool tidyAttrIsXMLNS( TidyAttr tattr )
 74 | {
 75 |     return attrIsXMLNS( tidyAttrToImpl(tattr) );
 76 | }
 77 | Bool tidyAttrIsDATAFLD( TidyAttr tattr )
 78 | {
 79 |     return attrIsDATAFLD( tidyAttrToImpl(tattr) );
 80 | }
 81 | Bool tidyAttrIsWIDTH( TidyAttr tattr )
 82 | {
 83 |     return attrIsWIDTH( tidyAttrToImpl(tattr) );
 84 | }
 85 | Bool tidyAttrIsHEIGHT( TidyAttr tattr )
 86 | {
 87 |     return attrIsHEIGHT( tidyAttrToImpl(tattr) );
 88 | }
 89 | Bool tidyAttrIsFOR( TidyAttr tattr )
 90 | {
 91 |     return attrIsFOR( tidyAttrToImpl(tattr) );
 92 | }
 93 | Bool tidyAttrIsSELECTED( TidyAttr tattr )
 94 | {
 95 |     return attrIsSELECTED( tidyAttrToImpl(tattr) );
 96 | }
 97 | Bool tidyAttrIsCHECKED( TidyAttr tattr )
 98 | {
 99 |     return attrIsCHECKED( tidyAttrToImpl(tattr) );
100 | }
101 | Bool tidyAttrIsLANG( TidyAttr tattr )
102 | {
103 |     return attrIsLANG( tidyAttrToImpl(tattr) );
104 | }
105 | Bool tidyAttrIsTARGET( TidyAttr tattr )
106 | {
107 |     return attrIsTARGET( tidyAttrToImpl(tattr) );
108 | }
109 | Bool tidyAttrIsHTTP_EQUIV( TidyAttr tattr )
110 | {
111 |     return attrIsHTTP_EQUIV( tidyAttrToImpl(tattr) );
112 | }
113 | Bool tidyAttrIsREL( TidyAttr tattr )
114 | {
115 |     return attrIsREL( tidyAttrToImpl(tattr) );
116 | }
117 | Bool tidyAttrIsEvent( TidyAttr tattr )
118 | {
119 |     return attrIsEvent( tidyAttrToImpl(tattr) );
120 | }
121 | Bool tidyAttrIsOnMOUSEMOVE( TidyAttr tattr )
122 | {
123 |     return attrIsOnMOUSEMOVE( tidyAttrToImpl(tattr) );
124 | }
125 | Bool tidyAttrIsOnMOUSEDOWN( TidyAttr tattr )
126 | {
127 |     return attrIsOnMOUSEDOWN( tidyAttrToImpl(tattr) );
128 | }
129 | Bool tidyAttrIsOnMOUSEUP( TidyAttr tattr )
130 | {
131 |     return attrIsOnMOUSEUP( tidyAttrToImpl(tattr) );
132 | }
133 | Bool tidyAttrIsOnCLICK( TidyAttr tattr )
134 | {
135 |     return attrIsOnCLICK( tidyAttrToImpl(tattr) );
136 | }
137 | Bool tidyAttrIsOnMOUSEOVER( TidyAttr tattr )
138 | {
139 |     return attrIsOnMOUSEOVER( tidyAttrToImpl(tattr) );
140 | }
141 | Bool tidyAttrIsOnMOUSEOUT( TidyAttr tattr )
142 | {
143 |     return attrIsOnMOUSEOUT( tidyAttrToImpl(tattr) );
144 | }
145 | Bool tidyAttrIsOnKEYDOWN( TidyAttr tattr )
146 | {
147 |     return attrIsOnKEYDOWN( tidyAttrToImpl(tattr) );
148 | }
149 | Bool tidyAttrIsOnKEYUP( TidyAttr tattr )
150 | {
151 |     return attrIsOnKEYUP( tidyAttrToImpl(tattr) );
152 | }
153 | Bool tidyAttrIsOnKEYPRESS( TidyAttr tattr )
154 | {
155 |     return attrIsOnKEYPRESS( tidyAttrToImpl(tattr) );
156 | }
157 | Bool tidyAttrIsOnFOCUS( TidyAttr tattr )
158 | {
159 |     return attrIsOnFOCUS( tidyAttrToImpl(tattr) );
160 | }
161 | Bool tidyAttrIsOnBLUR( TidyAttr tattr )
162 | {
163 |     return attrIsOnBLUR( tidyAttrToImpl(tattr) );
164 | }
165 | Bool tidyAttrIsBGCOLOR( TidyAttr tattr )
166 | {
167 |     return attrIsBGCOLOR( tidyAttrToImpl(tattr) );
168 | }
169 | Bool tidyAttrIsLINK( TidyAttr tattr )
170 | {
171 |     return attrIsLINK( tidyAttrToImpl(tattr) );
172 | }
173 | Bool tidyAttrIsALINK( TidyAttr tattr )
174 | {
175 |     return attrIsALINK( tidyAttrToImpl(tattr) );
176 | }
177 | Bool tidyAttrIsVLINK( TidyAttr tattr )
178 | {
179 |     return attrIsVLINK( tidyAttrToImpl(tattr) );
180 | }
181 | Bool tidyAttrIsTEXT( TidyAttr tattr )
182 | {
183 |     return attrIsTEXT( tidyAttrToImpl(tattr) );
184 | }
185 | Bool tidyAttrIsSTYLE( TidyAttr tattr )
186 | {
187 |     return attrIsSTYLE( tidyAttrToImpl(tattr) );
188 | }
189 | Bool tidyAttrIsABBR( TidyAttr tattr )
190 | {
191 |     return attrIsABBR( tidyAttrToImpl(tattr) );
192 | }
193 | Bool tidyAttrIsCOLSPAN( TidyAttr tattr )
194 | {
195 |     return attrIsCOLSPAN( tidyAttrToImpl(tattr) );
196 | }
197 | Bool tidyAttrIsROWSPAN( TidyAttr tattr )
198 | {
199 |     return attrIsROWSPAN( tidyAttrToImpl(tattr) );
200 | }
201 | 


--------------------------------------------------------------------------------
/elementtidy-1.0-20050212/tidylib/src/streamio.h:
--------------------------------------------------------------------------------
  1 | #ifndef __STREAMIO_H__
  2 | #define __STREAMIO_H__
  3 | 
  4 | /* streamio.h -- handles character stream I/O
  5 | 
  6 |   (c) 1998-2003 (W3C) MIT, ERCIM, Keio University
  7 |   See tidy.h for the copyright notice.
  8 | 
  9 |   CVS Info :
 10 | 
 11 |     $LastChangedBy$ 
 12 |     $LastChangedDate$ 
 13 |     $LastChangedRevision$ 
 14 | 
 15 |   Wrapper around Tidy input source and output sink
 16 |   that calls appropriate interfaces, and applies 
 17 |   necessary char encoding transformations: to/from
 18 |   ISO-10646 and/or UTF-8.
 19 | 
 20 | */
 21 | 
 22 | #include "forward.h"
 23 | #include "buffio.h"
 24 | #include "fileio.h"
 25 | 
 26 | #ifdef __cplusplus
 27 | extern "C"
 28 | {
 29 | #endif
 30 | typedef enum    /* kind of backend a stream is attached to; stored in the iotype member of the stream structs */
 31 | {
 32 |   FileIO,       /* NOTE(review): presumably the kind set up by FileInput()/FileOutput() -- confirm in streamio.c */
 33 |   BufferIO,     /* NOTE(review): presumably the kind set up by BufferInput()/BufferOutput() -- confirm */
 34 |   UserIO        /* NOTE(review): presumably the kind set up by UserInput()/UserOutput() -- confirm */
 35 | } IOType;
 36 | 
 37 | /************************
 38 | ** Source
 39 | ************************/
 40 | 
 41 | #define CHARBUF_SIZE 5  /* number of slots in _StreamIn.charbuf */
 42 | 
 43 | /* State kept for one input stream.  Non-raw input is cleaned up. */
 44 | struct _StreamIn
 45 | {
 46 |     int  state;     /* FSM for ISO2022 (see the FSM_* states defined later in this header) */
 47 |     Bool pushed;    /* NOTE(review): presumably true while charbuf holds pushed-back chars -- confirm in streamio.c */
 48 |     uint charbuf[ CHARBUF_SIZE ];   /* NOTE(review): looks like the storage behind UngetChar()/PopChar() -- confirm */
 49 |     int  bufpos;    /* NOTE(review): presumably the current position within charbuf -- confirm */
 50 |     int  tabs;      /* NOTE(review): tab-handling state; exact meaning is not visible in this header */
 51 |     int  lastcol;   /* NOTE(review): presumably the column before the last char read -- confirm */
 52 |     int  curcol;    /* NOTE(review): presumably the current source column -- confirm */
 53 |     int  curline;   /* NOTE(review): presumably the current source line -- confirm */
 54 | 
 55 |     int  encoding;  /* NOTE(review): presumably one of the character encoding ids defined later in this header (RAW, ASCII, ...) */
 56 |     IOType iotype;  /* kind of backend: FileIO, BufferIO or UserIO */
 57 |     TidyInputSource source;     /* the input source itself; its type is declared outside this header */
 58 | 
 59 | #ifdef TIDY_WIN32_MLANG_SUPPORT
 60 |     ulong mlang;    /* member exists only when TIDY_WIN32_MLANG_SUPPORT is defined */
 61 | #endif
 62 | 
 63 |     /* Pointer back to document for error reporting */
 64 |     TidyDocImpl* doc;
 65 | };
 66 | 
 67 | StreamIn* FileInput( TidyDocImpl* doc, FILE* fp, int encoding );
 68 | StreamIn* BufferInput( TidyDocImpl* doc, TidyBuffer* content, int encoding );
 69 | StreamIn* UserInput( TidyDocImpl* doc, TidyInputSource* source, int encoding );
 70 | 
 71 | int       ReadBOMEncoding(StreamIn *in);
 72 | uint      ReadChar( StreamIn* in );
 73 | void      UngetChar( uint c, StreamIn* in );
 74 | uint      PopChar( StreamIn *in );
 75 | Bool      IsEOF( StreamIn* in );
 76 | 
 77 | 
 78 | /************************
 79 | ** Sink
 80 | ************************/
 81 | 
 82 | struct _StreamOut   /* state kept for one output stream */
 83 | {
 84 |     int   encoding;  /* NOTE(review): presumably one of the character encoding ids defined later in this header -- confirm */
 85 |     int   state;     /* for ISO 2022 */
 86 |     uint  nl;        /* NOTE(review): presumably the newline setting passed as newln to the *Output() constructors -- confirm */
 87 | 
 88 | #ifdef TIDY_WIN32_MLANG_SUPPORT
 89 |     ulong mlang;     /* member exists only when TIDY_WIN32_MLANG_SUPPORT is defined */
 90 | #endif
 91 | 
 92 |     IOType iotype;   /* kind of backend: FileIO, BufferIO or UserIO */
 93 |     TidyOutputSink sink;    /* the output sink itself; its type is declared outside this header */
 94 | };
 95 | 
 96 | StreamOut* FileOutput( FILE* fp, int encoding, uint newln );
 97 | StreamOut* BufferOutput( TidyBuffer* buf, int encoding, uint newln );
 98 | StreamOut* UserOutput( TidyOutputSink* sink, int encoding, uint newln );
 99 | 
100 | StreamOut* StdErrOutput(void);
101 | StreamOut* StdOutOutput(void);
102 | void       ReleaseStreamOut( StreamOut* out );
103 | 
104 | void WriteChar( uint c, StreamOut* out );
105 | void outBOM( StreamOut *out );
106 | 
107 | ctmbstr GetEncodingNameFromTidyId(uint id);
108 | 
109 | /************************
110 | ** Misc
111 | ************************/
112 | 
113 | /* character encoding ids; which ids exist, and the values of BIG5 and
114 | ** SHIFTJIS, depend on the SUPPORT_* build switches tested below */
115 | #define RAW         0
116 | #define ASCII       1
117 | #define LATIN0      2
118 | #define LATIN1      3
119 | #define UTF8        4
120 | #define ISO2022     5
121 | #define MACROMAN    6
122 | #define WIN1252     7
123 | #define IBM858      8
124 | 
125 | #if SUPPORT_UTF16_ENCODINGS
126 | #define UTF16LE     9
127 | #define UTF16BE     10
128 | #define UTF16       11
129 | #endif
130 | 
131 | /* Note that Big5 and SHIFTJIS are not converted to ISO 10646 codepoints
132 | ** (i.e., to Unicode) before being recoded into UTF-8. This may be
133 | ** confusing: usually UTF-8 implies ISO10646 codepoints.
134 | */
135 | #if SUPPORT_ASIAN_ENCODINGS
136 | #if SUPPORT_UTF16_ENCODINGS
137 | #define BIG5        12  /* numbered after the three UTF-16 ids */
138 | #define SHIFTJIS    13
139 | #else
140 | #define BIG5        9   /* no UTF-16 ids in this build, so 9 and 10 are used here */
141 | #define SHIFTJIS    10
142 | #endif
143 | #endif
144 | 
145 | #ifdef TIDY_WIN32_MLANG_SUPPORT
146 | /* hack: windows code page numbers start at 37 */
147 | #define WIN32MLANG  36
148 | #endif
149 | 
150 | /* states for ISO 2022
151 | 
152 |  A document in an ISO-2022 based encoding uses ESC sequences called
153 |  "designators" to switch character sets. The designators defined and
154 |  used in ISO-2022-JP are:
155 | 
156 |     "ESC" + "(" + ?     for ISO646 variants
157 | 
158 |     "ESC" + "$" + ?     and
159 |     "ESC" + "$" + "(" + ?   for multibyte character sets
160 | */
161 | #define FSM_ASCII    0
162 | #define FSM_ESC      1
163 | #define FSM_ESCD     2
164 | #define FSM_ESCDP    3
165 | #define FSM_ESCP     4
166 | #define FSM_NONASCII 5
167 | 
168 | 
169 | /* char encoding used when replacing illegal SGML chars,
170 | ** regardless of specified encoding.  Set at compile time
171 | ** to either Windows or Mac.
172 | */
173 | extern const int ReplacementCharEncoding;
174 | 
175 | /* Function for conversion from Windows-1252 to Unicode */
176 | uint DecodeWin1252(uint c);
177 | 
178 | /* Function to convert from MacRoman to Unicode */
179 | uint DecodeMacRoman(uint c);
180 | 
181 | /* Function for conversion from OS/2-850 to Unicode */
182 | uint DecodeIbm850(uint c);
183 | 
184 | /* Function for conversion from Latin0 to Unicode */
185 | uint DecodeLatin0(uint c);
186 | 
187 | /* Function to convert from Symbol Font chars to Unicode */
188 | uint DecodeSymbolFont(uint c);
189 | #ifdef __cplusplus
190 | }
191 | #endif
192 | 
193 | 
194 | /* Use numeric constants as opposed to escape chars (\r, \n)
195 | ** to avoid conflicts with Mac compilers that may re-define these.
196 | */
197 | #define CR    0xD
198 | #define LF    0xA
199 | 
200 | #if   defined(MAC_OS_CLASSIC)
201 | #define DEFAULT_NL_CONFIG TidyCR
202 | #elif defined(_WIN32) || defined(OS2_OS)
203 | #define DEFAULT_NL_CONFIG TidyCRLF
204 | #else
205 | #define DEFAULT_NL_CONFIG TidyLF
206 | #endif
207 | 
208 | 
209 | #endif /* __STREAMIO_H__ */
210 | 


--------------------------------------------------------------------------------
/elementtidy-1.0-20050212/tidylib/src/tmbstr.c:
--------------------------------------------------------------------------------
  1 | /* tmbstr.c -- Tidy string utility functions
  2 | 
  3 |   (c) 1998-2003 (W3C) MIT, ERCIM, Keio University
  4 |   See tidy.h for the copyright notice.
  5 | 
  6 |   CVS Info :
  7 | 
  8 |     $LastChangedBy$ 
  9 |     $LastChangedDate$ 
 10 |     $LastChangedRevision$ 
 11 | 
 12 | */
 13 | 
 14 | #include "tmbstr.h"
 15 | #include "lexer.h"
 16 | 
 17 | /* Duplicate str, terminator included, into a MemAlloc'd buffer; a NULL str yields NULL. */
 18 | tmbstr tmbstrdup( ctmbstr str )
 19 | {
 20 |     tmbstr copy, dst;
 21 | 
 22 |     if ( str == NULL )      /* nothing to duplicate */
 23 |         return NULL;
 24 | 
 25 |     copy = dst = (tmbstr) MemAlloc( tmbstrlen(str) + 1 );
 26 |     do { *dst = *str++; }   /* byte-wise copy; the terminator is copied too */
 27 |     while ( *dst++ != 0 );
 28 |     return copy;
 29 | }
 30 | 
 31 | /* Copy at most len bytes of str into a MemAlloc'd, 0-terminated buffer; NULL if str is NULL or len is 0. */
 32 | tmbstr tmbstrndup( ctmbstr str, uint len )
 33 | {
 34 |     tmbstr copy, dst;
 35 |     if ( str == NULL || len == 0 )
 36 |         return NULL;
 37 |     copy = dst = (tmbstr) MemAlloc( len + 1 );
 38 |     for ( ; len > 0; --len )
 39 |         if ( (*dst++ = *str++) == 0 )   /* source ended before len bytes */
 40 |             break;
 41 |     *dst = 0;
 42 |     return copy;
 43 | }
 44 | 
 45 | /* Bounded copy: unlike strncpy it always 0-terminates s1 (size includes the terminator) and returns the unused rest of size, 0 if s2 was truncated. */
 46 | uint tmbstrncpy( tmbstr s1, ctmbstr s2, uint size )
 47 | {
 48 |     /* size == 0 leaves no room even for the terminator, and "--size" would wrap to a huge count: copy nothing */
 49 |     if ( s1 != NULL && s2 != NULL && size > 0 )
 50 |     {
 51 |         while ( *s2 && --size )  /* Predecrement: reserve byte for the terminator */
 52 |             *s1++ = *s2++;
 53 |         *s1 = 0;
 54 |     }
 55 |     return size;
 56 | }
 57 | 
 58 | /* Copy s2, terminator included, to s1 and return the number of bytes copied
 59 | ** (terminator excluded).  Allows:  cp += tmbstrcpy( cp, "joebob" ); */
 60 | uint tmbstrcpy( tmbstr s1, ctmbstr s2 )
 61 | {
 62 |     ctmbstr start = s2;
 63 |     while ( (*s1 = *s2) != 0 )
 64 |         ++s1, ++s2;
 65 |     return (uint)( s2 - start );
 66 | }
 67 | 
 68 | /* Append s2 to the end of s1; returns the number of bytes appended (terminator
 69 | ** excluded).  Allows expressions like:  cp += tmbstrcat( cp, "joebob" ); */
 70 | uint tmbstrcat( tmbstr s1, ctmbstr s2 )
 71 | {
 72 |     uint appended;
 73 | 
 74 |     for ( ; *s1 != 0; ++s1 )        /* walk to the current end of s1 */
 75 |         /**/;
 76 |     for ( appended = 0; (*s1 = *s2) != 0; ++appended )
 77 |         ++s1, ++s2;
 78 |     return appended;
 79 | }
 80 | 
 81 | /* strcmp work-alike that returns exactly -1, 0 or 1.  The first differing
 82 | ** pair is ordered with the plain > operator (no unsigned cast is applied). */
 83 | int tmbstrcmp( ctmbstr s1, ctmbstr s2 )
 84 | {
 85 |     for ( ; *s1 == *s2; ++s1, ++s2 )
 86 |     {
 87 |         if ( *s1 == '\0' )      /* both strings ended together */
 88 |             return 0;
 89 |     }
 90 | 
 91 |     if ( *s1 > *s2 )
 92 |         return 1;
 93 | 
 94 |     return -1;
 95 | }
 96 | 
 97 | /* Length of str in bytes (not characters), terminator excluded; 0 for NULL. */
 98 | uint tmbstrlen( ctmbstr str )
 99 | {
100 |     ctmbstr end = str;
101 | 
102 |     if ( str == NULL )
103 |         return 0;
104 |     while ( *end != 0 )
105 |         ++end;
106 |     return (uint)( end - str );
107 | }
108 | 
109 | /*
110 |  Case-insensitive string compare (MS C 4.2 doesn't include strcasecmp).
111 |  Characters are folded with ToLower(); note that tolower, toupper and
112 |  ToLower() won't work on chars > 127.
113 | 
114 |  Returns 0 on a match, else 1 or -1 taken from the raw, unfolded chars.
115 | */
116 | int tmbstrcasecmp( ctmbstr s1, ctmbstr s2 )
117 | {
118 |     for ( ;; ++s1, ++s2 )
119 |     {
120 |         uint c1 = (uint)(*s1);
121 |         uint c2 = (uint)(*s2);
122 | 
123 |         if ( ToLower(c1) != ToLower(c2) )
124 |             break;
125 |         if ( c1 == '\0' )
126 |             return 0;
127 |     }
128 | 
129 |     return (*s1 > *s2 ? 1 : -1);
130 | }
131 | 
132 | /* Compare at most n bytes of s1 and s2.
133 | **
134 | ** Returns 0 if the first n bytes match (or both strings end earlier, or
135 | ** n is 0), otherwise 1 or -1.  Bytes are tested for equality as unsigned
136 | ** values; the sign of a mismatch is taken from the plain characters.
137 | */
138 | int tmbstrncmp( ctmbstr s1, ctmbstr s2, uint n )
139 | {
140 |     /* n is checked before anything is dereferenced: tmbsubstrn() passes
141 |     ** windows of a length-delimited buffer, so once n bytes have matched
142 |     ** the next byte may lie outside the caller's data.
143 |     */
144 |     for ( ; n > 0; --n, ++s1, ++s2 )
145 |     {
146 |         if ( (byte)*s1 != (byte)*s2 )
147 |             return (*s1 > *s2 ? 1 : -1);
148 | 
149 |         if ( *s1 == '\0' )
150 |             return 0;
151 |     }
152 |     return 0;
153 | }
154 | 
155 | /* Case-insensitive tmbstrncmp: compare at most n bytes, folding each with
156 | ** tolower().  Returns 0 on a match (or when n is 0), otherwise 1 or -1
157 | ** taken from the raw, unfolded characters.
158 | */
159 | int tmbstrncasecmp( ctmbstr s1, ctmbstr s2, uint n )
160 | {
161 |     /* n is checked before anything is dereferenced, so no byte beyond the
162 |     ** first n is read (tmbsubstrncase() passes length-delimited windows).
163 |     */
164 |     for ( ; n > 0; --n, ++s1, ++s2 )
165 |     {
166 |         /* tolower() is undefined for negative arguments: go through byte */
167 |         int c1 = tolower( (byte)*s1 );
168 |         int c2 = tolower( (byte)*s2 );
169 | 
170 |         if ( c1 != c2 )
171 |             return (*s1 > *s2 ? 1 : -1);
172 |         if ( c1 == '\0' )
173 |             return 0;
174 |     }
175 |     return 0;
176 | }
177 | 
178 | /* Return the offset of the first cc within the first maxlen bytes of s1,
179 | ** or -1 if not found.  The scan does not stop at a 0 byte.
180 | */
181 | int tmbstrnchr( ctmbstr s1, uint maxlen, tmbchar cc )
182 | {
183 |     uint pos = 0;
184 | 
185 |     while ( pos < maxlen )
186 |     {
187 |         if ( s1[pos] == cc )
188 |             return (int) pos;
189 |         ++pos;
190 |     }
191 | 
192 |     return -1;
193 | }
194 | 
195 | /* First place in the len1 bytes at s1 where s2 fits and matches (per tmbstrncmp), or NULL if there is none. */
196 | ctmbstr tmbsubstrn( ctmbstr s1, uint len1, ctmbstr s2 )
197 | {
198 |     uint len2 = tmbstrlen(s2);
199 |     int last = len1 - len2;     /* last candidate offset; expected to be negative when s2 is longer */
200 |     int start = 0;
201 | 
202 |     for ( ; start <= last; ++start )
203 |         if ( tmbstrncmp(s1+start, s2, len2) == 0 )
204 |             return (ctmbstr) s1+start;
205 |     return NULL;
206 | }
207 | 
208 | /* First place in the len1 bytes at s1 where s2 fits and matches ignoring case (per tmbstrncasecmp), or NULL. */
209 | ctmbstr tmbsubstrncase( ctmbstr s1, uint len1, ctmbstr s2 )
210 | {
211 |     uint len2 = tmbstrlen(s2);
212 |     int last = len1 - len2;     /* last candidate offset; expected to be negative when s2 is longer */
213 |     int start = 0;
214 | 
215 |     for ( ; start <= last; ++start )
216 |         if ( tmbstrncasecmp(s1+start, s2, len2) == 0 )
217 |             return (ctmbstr) s1+start;
218 |     return NULL;
219 | }
220 | 
221 | /* Find s2 in the 0-terminated string s1; returns the first match or NULL.
222 | ** NOTE: despite the name, candidates are matched with tmbstrncasecmp, i.e. ignoring case. */
223 | ctmbstr tmbsubstr( ctmbstr s1, ctmbstr s2 )
224 | {
225 |     uint len1 = tmbstrlen(s1), len2 = tmbstrlen(s2);
226 |     int start, last = len1 - len2;  /* last candidate offset */
227 | 
228 |     for ( start = 0; start <= last; ++start )
229 |         if ( tmbstrncasecmp(s1+start, s2, len2) == 0 )
230 |             return (ctmbstr) s1+start;
231 |     return NULL;
232 | }
233 | 
234 | /* Transform ASCII chars in s to lower case, in place, via ToLower(); returns s. */
235 | tmbstr tmbstrtolower( tmbstr s )
236 | {
237 |     uint i;
238 |     for ( i = 0; s[i] != 0; ++i )
239 |         s[i] = (tmbchar) ToLower( s[i] );
240 |     return s;
241 | }
242 | 
243 | /* Transform ASCII chars in s to upper case, in place, via ToUpper(); returns s. */
244 | tmbstr tmbstrtoupper(tmbstr s)
245 | {
246 |     uint ix = 0;
247 | 
248 |     for ( ; s[ix]; ++ix )
249 |         s[ix] = (tmbchar)ToUpper( s[ix] );
250 | 
251 |     return s;
252 | }
253 | /* Do both names spell the same file name?  Case is ignored unless FILENAMES_CASE_SENSITIVE is non-zero. */
254 | Bool tmbsamefile( ctmbstr filename1, ctmbstr filename2 )
255 | {
256 | #if !FILENAMES_CASE_SENSITIVE
257 |     return ( tmbstrcasecmp( filename1, filename2 ) == 0 );
258 | #else
259 |     return ( tmbstrcmp( filename1, filename2 ) == 0 );
260 | #endif
261 | }
262 | 


--------------------------------------------------------------------------------
/elementtidy-1.0-20050212/tidylib/src/config.h:
--------------------------------------------------------------------------------
  1 | #ifndef __CONFIG_H__
  2 | #define __CONFIG_H__
  3 | 
  4 | /* config.h -- read config file and manage config properties
  5 |   
  6 |   (c) 1998-2003 (W3C) MIT, ERCIM, Keio University
  7 |   See tidy.h for the copyright notice.
  8 | 
  9 |   CVS Info :
 10 | 
 11 |     $LastChangedBy$ 
 12 |     $LastChangedDate$ 
 13 |     $LastChangedRevision$ 
 14 | 
 15 |   config files associate a property name with a value.
 16 | 
 17 |   // comments can start at the beginning of a line
 18 |   # comments can start at the beginning of a line
 19 |   name: short values fit onto one line
 20 |   name: a really long value that
 21 |    continues on the next line
 22 | 
 23 |   property names are case insensitive, should be less than
 24 |   60 characters in length, and must start at the beginning of
 25 |   the line, as whitespace at the start of a line signifies a
 26 |   line continuation.
 27 | 
 28 | */
 29 | 
 30 | #include "forward.h"
 31 | #include "tidy.h"
 32 | #include "streamio.h"
 33 | 
 34 | struct _tidy_option;
 35 | typedef struct _tidy_option TidyOptionImpl;
 36 | 
 37 | typedef Bool (ParseProperty)( TidyDocImpl* doc, const TidyOptionImpl* opt );  /* signature shared by all option value parsers */
 38 | 
 39 | struct _tidy_option     /* static description of one configuration option */
 40 | {
 41 |     TidyOptionId        id;         /* option identifier; NOTE(review): presumably also the index into TidyConfigImpl.value -- confirm */
 42 |     TidyConfigCategory  category;   /* put 'em in groups */
 43 |     ctmbstr             name;       /* property name */
 44 |     TidyOptionType      type;       /* string, int or bool */
 45 |     ulong               dflt;       /* factory default */
 46 |     ParseProperty*      parser;     /* parsing method, read-only if NULL */
 47 |     const ctmbstr*      pickList;   /* pick list; NOTE(review): presumably the allowed value names, see getOptionPickList() -- confirm */
 48 | };
 49 | 
 50 | 
 51 | typedef struct _tidy_config    /* per-document configuration state */
 52 | {
 53 |     ulong value[ N_TIDY_OPTIONS + 1 ];     /* current config values; read by the release-build cfg() macro as value[id] */
 54 |     ulong snapshot[ N_TIDY_OPTIONS + 1 ];  /* Snapshot of values to be restored later */
 55 | 
 56 |     /* track what tags user has defined to eliminate unnecessary searches */
 57 |     uint  defined_tags;
 58 | 
 59 |     uint c;           /* current char in input stream */
 60 |     StreamIn* cfgIn;  /* current input source */
 61 | 
 62 | } TidyConfigImpl;
 63 | 
 64 | 
 65 | const TidyOptionImpl* lookupOption( ctmbstr optnam );
 66 | const TidyOptionImpl* getOption( TidyOptionId optId );
 67 | 
 68 | TidyIterator getOptionList( TidyDocImpl* doc );
 69 | const TidyOptionImpl*  getNextOption( TidyDocImpl* doc, TidyIterator* iter );
 70 | 
 71 | TidyIterator getOptionPickList( const TidyOptionImpl* option );
 72 | ctmbstr getNextOptionPick( const TidyOptionImpl* option, TidyIterator* iter );
 73 | 
 74 | void InitConfig( TidyDocImpl* doc );
 75 | void FreeConfig( TidyDocImpl* doc );
 76 | 
 77 | Bool SetOptionValue( TidyDocImpl* doc, TidyOptionId optId, ctmbstr val );
 78 | Bool SetOptionInt( TidyDocImpl* doc, TidyOptionId optId, ulong val );
 79 | Bool SetOptionBool( TidyDocImpl* doc, TidyOptionId optId, Bool val );
 80 | 
 81 | Bool ResetOptionToDefault( TidyDocImpl* doc, TidyOptionId optId );
 82 | void ResetConfigToDefault( TidyDocImpl* doc );
 83 | void TakeConfigSnapshot( TidyDocImpl* doc );
 84 | void ResetConfigToSnapshot( TidyDocImpl* doc );
 85 | 
 86 | void CopyConfig( TidyDocImpl* docTo, TidyDocImpl* docFrom );
 87 | 
 88 | 
 89 | #ifdef SUPPORT_GETPWNAM
 90 | /*
 91 |  Tod Lewis contributed this code for expanding
 92 |  ~/foo or ~your/foo according to $HOME and your
 93 |  user name. This will only work on Unix systems.
 94 | */
 95 | ctmbstr ExpandTilde(ctmbstr filename);
 96 | #endif /* SUPPORT_GETPWNAM */
 97 | 
 98 | int  ParseConfigFile( TidyDocImpl* doc, ctmbstr cfgfil );
 99 | int  ParseConfigFileEnc( TidyDocImpl* doc,
100 |                          ctmbstr cfgfil, ctmbstr charenc );
101 | 
102 | int  SaveConfigFile( TidyDocImpl* doc, ctmbstr cfgfil );
103 | int  SaveConfigSink( TidyDocImpl* doc, TidyOutputSink* sink );
104 | 
105 | /* returns false if unknown option, missing parameter, or
106 |    option doesn't use parameter
107 | */
108 | Bool  ParseConfigOption( TidyDocImpl* doc, ctmbstr optnam, ctmbstr optVal );
109 | Bool  ParseConfigValue( TidyDocImpl* doc, TidyOptionId optId, ctmbstr optVal );
110 | 
111 | /* ensure that char encodings are self consistent */
112 | Bool  AdjustCharEncoding( TidyDocImpl* doc, int encoding );
113 | 
114 | /* ensure that config is self consistent */
115 | void AdjustConfig( TidyDocImpl* doc );
116 | 
117 | Bool  ConfigDiffThanDefault( TidyDocImpl* doc );
118 | Bool  ConfigDiffThanSnapshot( TidyDocImpl* doc );
119 | 
120 | int CharEncodingId( ctmbstr charenc );
121 | ctmbstr CharEncodingName( int encoding );
122 | 
123 | void SetEmacsFilename( TidyDocImpl* doc, ctmbstr filename );
124 | 
125 | 
126 | #ifdef _DEBUG
127 | 
128 | /* Debug lookup functions will be type-safe and assert option type match */
129 | ulong   _cfgGet( TidyDocImpl* doc, TidyOptionId optId );
130 | Bool    _cfgGetBool( TidyDocImpl* doc, TidyOptionId optId );
131 | ctmbstr _cfgGetString( TidyDocImpl* doc, TidyOptionId optId );
132 | 
133 | #define cfg(doc, id)            _cfgGet( (doc), (id) )
134 | #define cfgBool(doc, id)        _cfgGetBool( (doc), (id) )
135 | #define cfgStr(doc, id)         _cfgGetString( (doc), (id) )
136 | 
137 | #else
138 | 
139 | /* Release build macros for speed: index doc->config.value directly and cast, with no checking */
140 | #define cfg(doc, id)            ((doc)->config.value[ (id) ])
141 | #define cfgBool(doc, id)        ((Bool) cfg(doc, id))
142 | #define cfgStr(doc, id)         ((ctmbstr) cfg(doc, id))
143 | 
144 | #endif /* _DEBUG */
145 | 
146 | 
147 | 
148 | /* parser for integer values */
149 | ParseProperty ParseInt;
150 | 
151 | /* parser for 't'/'f', 'true'/'false', 'y'/'n', 'yes'/'no' or '1'/'0' */
152 | ParseProperty ParseBool;
153 | 
154 | /* a string excluding whitespace */
155 | ParseProperty ParseName;
156 | 
157 | /* a CSS1 selector - CSS class naming for -clean option */
158 | ParseProperty ParseCSS1Selector;
159 | 
160 | /* a string including whitespace */
161 | ParseProperty ParseString;
162 | 
163 | /* a space or comma separated list of tag names */
164 | ParseProperty ParseTagNames;
165 | 
166 | /* RAW, ASCII, LATIN0, LATIN1, UTF8, ISO2022, MACROMAN, 
167 |    WIN1252, IBM858, UTF16LE, UTF16BE, UTF16, BIG5, SHIFTJIS
168 | */
169 | ParseProperty ParseCharEnc;
170 | ParseProperty ParseNewline;
171 | 
172 | /* specific to the indent option - Bool and 'auto' */
173 | ParseProperty ParseIndent;
174 | 
175 | /* omit | auto | strict | loose | <fpi> */
176 | ParseProperty ParseDocType;
177 | 
178 | /* keep-first or keep-last? */
179 | ParseProperty ParseRepeatAttr;
180 | 
181 | /* specific to the output-bom option - Bool and 'auto' */
182 | ParseProperty ParseBOM;
183 | 
184 | #endif /* __CONFIG_H__ */
185 | 


--------------------------------------------------------------------------------
/elementtidy-1.0-20050212/tidylib/src/attrget.c:
--------------------------------------------------------------------------------
  1 | /* attrget.c -- Locate attribute value by type
  2 | 
  3 |   (c) 1998-2003 (W3C) MIT, ERCIM, Keio University
  4 |   See tidy.h for the copyright notice.
  5 |   
  6 |   CVS Info:
  7 |     $LastChangedBy$ 
  8 |     $LastChangedDate$ 
  9 |     $LastChangedRevision$ 
 10 | 
 11 | */
 12 | 
 13 | #include "tidy-int.h"
 14 | #include "tags.h"
 15 | #include "attrs.h"
 16 | #include "tidy.h"
 17 | 
 18 | TidyAttr tidyAttrGetHREF( TidyNode tnod )
 19 | {
 20 |     return tidyImplToAttr( attrGetHREF( tidyNodeToImpl(tnod) ) );
 21 | }
 22 | TidyAttr tidyAttrGetSRC( TidyNode tnod )
 23 | {
 24 |     return tidyImplToAttr( attrGetSRC( tidyNodeToImpl(tnod) ) );
 25 | }
 26 | TidyAttr tidyAttrGetID( TidyNode tnod )
 27 | {
 28 |     return tidyImplToAttr( attrGetID( tidyNodeToImpl(tnod) ) );
 29 | }
 30 | TidyAttr tidyAttrGetNAME( TidyNode tnod )
 31 | {
 32 |     return tidyImplToAttr( attrGetNAME( tidyNodeToImpl(tnod) ) );
 33 | }
 34 | TidyAttr tidyAttrGetSUMMARY( TidyNode tnod )
 35 | {
 36 |     return tidyImplToAttr( attrGetSUMMARY( tidyNodeToImpl(tnod) ) );
 37 | }
 38 | TidyAttr tidyAttrGetALT( TidyNode tnod )
 39 | {
 40 |     return tidyImplToAttr( attrGetALT( tidyNodeToImpl(tnod) ) );
 41 | }
 42 | TidyAttr tidyAttrGetLONGDESC( TidyNode tnod )
 43 | {
 44 |     return tidyImplToAttr( attrGetLONGDESC( tidyNodeToImpl(tnod) ) );
 45 | }
 46 | TidyAttr tidyAttrGetUSEMAP( TidyNode tnod )
 47 | {
 48 |     return tidyImplToAttr( attrGetUSEMAP( tidyNodeToImpl(tnod) ) );
 49 | }
 50 | TidyAttr tidyAttrGetISMAP( TidyNode tnod )
 51 | {
 52 |     return tidyImplToAttr( attrGetISMAP( tidyNodeToImpl(tnod) ) );
 53 | }
 54 | TidyAttr tidyAttrGetLANGUAGE( TidyNode tnod )
 55 | {
 56 |     return tidyImplToAttr( attrGetLANGUAGE( tidyNodeToImpl(tnod) ) );
 57 | }
 58 | TidyAttr tidyAttrGetTYPE( TidyNode tnod )
 59 | {
 60 |     return tidyImplToAttr( attrGetTYPE( tidyNodeToImpl(tnod) ) );
 61 | }
 62 | TidyAttr tidyAttrGetVALUE( TidyNode tnod )
 63 | {
 64 |     return tidyImplToAttr( attrGetVALUE( tidyNodeToImpl(tnod) ) );
 65 | }
 66 | TidyAttr tidyAttrGetCONTENT( TidyNode tnod )
 67 | {
 68 |     return tidyImplToAttr( attrGetCONTENT( tidyNodeToImpl(tnod) ) );
 69 | }
 70 | TidyAttr tidyAttrGetTITLE( TidyNode tnod )
 71 | {
 72 |     return tidyImplToAttr( attrGetTITLE( tidyNodeToImpl(tnod) ) );
 73 | }
 74 | TidyAttr tidyAttrGetXMLNS( TidyNode tnod )
 75 | {
 76 |     return tidyImplToAttr( attrGetXMLNS( tidyNodeToImpl(tnod) ) );
 77 | }
 78 | TidyAttr tidyAttrGetDATAFLD( TidyNode tnod )
 79 | {
 80 |     return tidyImplToAttr( attrGetDATAFLD( tidyNodeToImpl(tnod) ) );
 81 | }
 82 | TidyAttr tidyAttrGetWIDTH( TidyNode tnod )
 83 | {
 84 |     return tidyImplToAttr( attrGetWIDTH( tidyNodeToImpl(tnod) ) );
 85 | }
 86 | TidyAttr tidyAttrGetHEIGHT( TidyNode tnod )
 87 | {
 88 |     return tidyImplToAttr( attrGetHEIGHT( tidyNodeToImpl(tnod) ) );
 89 | }
 90 | TidyAttr tidyAttrGetFOR( TidyNode tnod )
 91 | {
 92 |     return tidyImplToAttr( attrGetFOR( tidyNodeToImpl(tnod) ) );
 93 | }
 94 | TidyAttr tidyAttrGetSELECTED( TidyNode tnod )
 95 | {
 96 |     return tidyImplToAttr( attrGetSELECTED( tidyNodeToImpl(tnod) ) );
 97 | }
 98 | TidyAttr tidyAttrGetCHECKED( TidyNode tnod )
 99 | {
100 |     return tidyImplToAttr( attrGetCHECKED( tidyNodeToImpl(tnod) ) );
101 | }
102 | TidyAttr tidyAttrGetLANG( TidyNode tnod )
103 | {
104 |     return tidyImplToAttr( attrGetLANG( tidyNodeToImpl(tnod) ) );
105 | }
106 | TidyAttr tidyAttrGetTARGET( TidyNode tnod )
107 | {
108 |     return tidyImplToAttr( attrGetTARGET( tidyNodeToImpl(tnod) ) );
109 | }
110 | TidyAttr tidyAttrGetHTTP_EQUIV( TidyNode tnod )
111 | {
112 |     return tidyImplToAttr( attrGetHTTP_EQUIV( tidyNodeToImpl(tnod) ) );
113 | }
114 | TidyAttr tidyAttrGetREL( TidyNode tnod )
115 | {
116 |     return tidyImplToAttr( attrGetREL( tidyNodeToImpl(tnod) ) );
117 | }
118 | 
119 | TidyAttr tidyAttrGetOnMOUSEMOVE( TidyNode tnod )
120 | {
121 |     return tidyImplToAttr( attrGetOnMOUSEMOVE( tidyNodeToImpl(tnod) ) );
122 | }
123 | TidyAttr tidyAttrGetOnMOUSEDOWN( TidyNode tnod )
124 | {
125 |     return tidyImplToAttr( attrGetOnMOUSEDOWN( tidyNodeToImpl(tnod) ) );
126 | }
127 | TidyAttr tidyAttrGetOnMOUSEUP( TidyNode tnod )
128 | {
129 |     return tidyImplToAttr( attrGetOnMOUSEUP( tidyNodeToImpl(tnod) ) );
130 | }
131 | TidyAttr tidyAttrGetOnCLICK( TidyNode tnod )
132 | {
133 |     return tidyImplToAttr( attrGetOnCLICK( tidyNodeToImpl(tnod) ) );
134 | }
135 | TidyAttr tidyAttrGetOnMOUSEOVER( TidyNode tnod )
136 | {
137 |     return tidyImplToAttr( attrGetOnMOUSEOVER( tidyNodeToImpl(tnod) ) );
138 | }
139 | TidyAttr tidyAttrGetOnMOUSEOUT( TidyNode tnod )
140 | {
141 |     return tidyImplToAttr( attrGetOnMOUSEOUT( tidyNodeToImpl(tnod) ) );
142 | }
143 | TidyAttr tidyAttrGetOnKEYDOWN( TidyNode tnod )
144 | {
145 |     return tidyImplToAttr( attrGetOnKEYDOWN( tidyNodeToImpl(tnod) ) );
146 | }
147 | TidyAttr tidyAttrGetOnKEYUP( TidyNode tnod )
148 | {
149 |     return tidyImplToAttr( attrGetOnKEYUP( tidyNodeToImpl(tnod) ) );
150 | }
151 | TidyAttr tidyAttrGetOnKEYPRESS( TidyNode tnod )
152 | {
153 |     return tidyImplToAttr( attrGetOnKEYPRESS( tidyNodeToImpl(tnod) ) );
154 | }
155 | TidyAttr tidyAttrGetOnFOCUS( TidyNode tnod )
156 | {
157 |     return tidyImplToAttr( attrGetOnFOCUS( tidyNodeToImpl(tnod) ) );
158 | }
159 | TidyAttr tidyAttrGetOnBLUR( TidyNode tnod )
160 | {
161 |     return tidyImplToAttr( attrGetOnBLUR( tidyNodeToImpl(tnod) ) );
162 | }
163 | TidyAttr tidyAttrGetBGCOLOR( TidyNode tnod )
164 | {
165 |     return tidyImplToAttr( attrGetBGCOLOR( tidyNodeToImpl(tnod) ) );
166 | }
167 | TidyAttr tidyAttrGetLINK( TidyNode tnod )
168 | {
169 |     return tidyImplToAttr( attrGetLINK( tidyNodeToImpl(tnod) ) );
170 | }
171 | TidyAttr tidyAttrGetALINK( TidyNode tnod )
172 | {
173 |     return tidyImplToAttr( attrGetALINK( tidyNodeToImpl(tnod) ) );
174 | }
175 | TidyAttr tidyAttrGetVLINK( TidyNode tnod )
176 | {
177 |     return tidyImplToAttr( attrGetVLINK( tidyNodeToImpl(tnod) ) );
178 | }
179 | 
180 | TidyAttr tidyAttrGetTEXT( TidyNode tnod )
181 | {
182 |     return tidyImplToAttr( attrGetTEXT( tidyNodeToImpl(tnod) ) );
183 | }
184 | TidyAttr tidyAttrGetSTYLE( TidyNode tnod )
185 | {
186 |     return tidyImplToAttr( attrGetSTYLE( tidyNodeToImpl(tnod) ) );
187 | }
188 | TidyAttr tidyAttrGetABBR( TidyNode tnod )
189 | {
190 |     return tidyImplToAttr( attrGetABBR( tidyNodeToImpl(tnod) ) );
191 | }
192 | TidyAttr tidyAttrGetCOLSPAN( TidyNode tnod )
193 | {
194 |     return tidyImplToAttr( attrGetCOLSPAN( tidyNodeToImpl(tnod) ) );
195 | }
196 | TidyAttr tidyAttrGetROWSPAN( TidyNode tnod )
197 | {
198 |     return tidyImplToAttr( attrGetROWSPAN( tidyNodeToImpl(tnod) ) );
199 | }
200 | 


--------------------------------------------------------------------------------
/elementtidy-1.0-20050212/tidylib/src/message.h:
--------------------------------------------------------------------------------
  1 | #ifndef __MESSAGE_H__
  2 | #define __MESSAGE_H__
  3 | 
  4 | /* message.h -- general message writing routines
  5 | 
  6 |   (c) 1998-2003 (W3C) MIT, ERCIM, Keio University
  7 |   See tidy.h for the copyright notice.
  8 |   
  9 |   CVS Info :
 10 | 
 11 |     $LastChangedBy$ 
 12 |     $LastChangedDate$ 
 13 |     $LastChangedRevision$ 
 14 | 
 15 | */
 16 | 
 17 | #include "forward.h"
 18 | #include "tidy.h"  /* For TidyReportLevel */
 19 | 
 20 | /* General message writing routines.
 21 | ** Each message is a single warning, error, etc.
 22 | **
 23 | ** This routine will keep track of counts and,
 24 | ** if the caller has set a filter, it will be
 25 | ** called.  The new preferred way of handling
 26 | ** Tidy diagnostics output is either a) define
 27 | ** a new output sink or b) install a message
 28 | ** filter routine.
 29 | **
 30 | ** Keeps track of ShowWarnings, ShowErrors, etc.
 31 | */
 32 | 
/* NOTE(review): judging by the name this returns the library release
** date as a string -- confirm in message.c.
*/
ctmbstr ReleaseDate(void);

/* The three message*() routines below take a report level and a message
** string followed by variadic arguments (presumably printf-style
** substitutions for msg -- confirm in message.c).
*/

/* Reports error at current Lexer line/column. */ 
void message( TidyDocImpl* doc, TidyReportLevel level, ctmbstr msg, ... );

/* Reports error at node line/column. */ 
void messageNode( TidyDocImpl* doc, TidyReportLevel level,
                  Node* node, ctmbstr msg, ... );

/* Reports error at given line/column. */ 
void messageLexer( TidyDocImpl* doc, TidyReportLevel level, 
                   ctmbstr msg, ... );

/* For general reporting.  Emits nothing if --quiet yes */
void tidy_out( TidyDocImpl* doc, ctmbstr msg, ... );


/* Fixed-purpose reports; each one works on the given document. */
void ShowVersion( TidyDocImpl* doc );
void ReportUnknownOption( TidyDocImpl* doc, ctmbstr option );
void ReportBadArgument( TidyDocImpl* doc, ctmbstr option );
void NeedsAuthorIntervention( TidyDocImpl* doc );

void HelloMessage( TidyDocImpl* doc, ctmbstr date, ctmbstr filename );
void ReportMarkupVersion( TidyDocImpl* doc );
void ReportNumWarnings( TidyDocImpl* doc );

void GeneralInfo( TidyDocImpl* doc );
void UnknownOption( TidyDocImpl* doc, char c );
void UnknownFile( TidyDocImpl* doc, ctmbstr program, ctmbstr file );
void FileError( TidyDocImpl* doc, ctmbstr file, TidyReportLevel level );

void ErrorSummary( TidyDocImpl* doc );

/* Reports selected by a numeric `code` argument; presumably one of the
** report codes #defined further down in this header -- confirm against
** the callers.
*/
void ReportEncodingWarning(TidyDocImpl* doc, uint code, uint encoding);
void ReportEncodingError(TidyDocImpl* doc, uint code, uint c, Bool discarded);
void ReportEntityError( TidyDocImpl* doc, uint code, ctmbstr entity, int c );
void ReportAttrError( TidyDocImpl* doc, Node* node, AttVal* av, uint code );
void ReportMissingAttr( TidyDocImpl* doc, Node* node, ctmbstr name );

/* Element reports: same parameter list, one routine per severity
** named in the function name (notice, warning, error, fatal).
*/
void ReportNotice(TidyDocImpl* doc, Node *element, Node *node, uint code);
void ReportWarning(TidyDocImpl* doc, Node *element, Node *node, uint code);
void ReportError(TidyDocImpl* doc, Node* element, Node* node, uint code);
void ReportFatal(TidyDocImpl* doc, Node* element, Node* node, uint code);
 76 | 
/* Numeric report codes.
**
** The values in this section are unique across all four groups:
**   1-5            entities / numeric character references
**   6-47, 83-85    element messages
**   48-75          attribute messages (67 is not defined here)
**   76-82          character encoding errors
*/

/* error codes for entities/numeric character references */

#define MISSING_SEMICOLON            1
#define MISSING_SEMICOLON_NCR        2
#define UNKNOWN_ENTITY               3
#define UNESCAPED_AMPERSAND          4
#define APOS_UNDEFINED               5

/* error codes for element messages
** (6-47 are contiguous; the three most recent codes, 83-85, are
** numbered after the character encoding codes further down)
*/

#define MISSING_ENDTAG_FOR           6
#define MISSING_ENDTAG_BEFORE        7
#define DISCARDING_UNEXPECTED        8
#define NESTED_EMPHASIS              9
#define NON_MATCHING_ENDTAG          10
#define TAG_NOT_ALLOWED_IN           11
#define MISSING_STARTTAG             12
#define UNEXPECTED_ENDTAG            13
#define USING_BR_INPLACE_OF          14
#define INSERTING_TAG                15
#define SUSPECTED_MISSING_QUOTE      16
#define MISSING_TITLE_ELEMENT        17
#define DUPLICATE_FRAMESET           18
#define CANT_BE_NESTED               19
#define OBSOLETE_ELEMENT             20
#define PROPRIETARY_ELEMENT          21
#define UNKNOWN_ELEMENT              22
#define TRIM_EMPTY_ELEMENT           23
#define COERCE_TO_ENDTAG             24
#define ILLEGAL_NESTING              25
#define NOFRAMES_CONTENT             26
#define CONTENT_AFTER_BODY           27
#define INCONSISTENT_VERSION         28
#define MALFORMED_COMMENT            29
#define BAD_COMMENT_CHARS            30
#define BAD_XML_COMMENT              31
#define BAD_CDATA_CONTENT            32
#define INCONSISTENT_NAMESPACE       33
#define DOCTYPE_AFTER_TAGS           34
#define MALFORMED_DOCTYPE            35
#define UNEXPECTED_END_OF_FILE       36
#define DTYPE_NOT_UPPER_CASE         37
#define TOO_MANY_ELEMENTS            38
#define UNESCAPED_ELEMENT            39
#define NESTED_QUOTATION             40
#define ELEMENT_NOT_EMPTY            41
#define ENCODING_IO_CONFLICT         42
#define MIXED_CONTENT_IN_BLOCK       43
#define MISSING_DOCTYPE              44
#define SPACE_PRECEDING_XMLDECL      45
#define TOO_MANY_ELEMENTS_IN         46
#define UNEXPECTED_ENDTAG_IN         47
#define REPLACING_ELEMENT            83
#define REPLACING_UNEX_ELEMENT       84
#define COERCE_TO_ENDTAG_WARN        85 /* last */

/* error codes used for attribute messages (48-75; 67 is unused here) */

#define UNKNOWN_ATTRIBUTE            48
#define INSERTING_ATTRIBUTE          49
#define MISSING_ATTR_VALUE           50
#define BAD_ATTRIBUTE_VALUE          51
#define UNEXPECTED_GT                52
#define PROPRIETARY_ATTRIBUTE        53
#define PROPRIETARY_ATTR_VALUE       54
#define REPEATED_ATTRIBUTE           55
#define MISSING_IMAGEMAP             56
#define XML_ATTRIBUTE_VALUE          57
#define UNEXPECTED_QUOTEMARK         58
#define MISSING_QUOTEMARK            59
#define ID_NAME_MISMATCH             60

#define BACKSLASH_IN_URI             61
#define FIXED_BACKSLASH              62
#define ILLEGAL_URI_REFERENCE        63
#define ESCAPED_ILLEGAL_URI          64

#define NEWLINE_IN_URI               65
#define ANCHOR_NOT_UNIQUE            66

#define JOINING_ATTRIBUTE            68
#define UNEXPECTED_EQUALSIGN         69
#define ATTR_VALUE_NOT_LCASE         70
#define XML_ID_SYNTAX                71

#define INVALID_ATTRIBUTE            72

#define BAD_ATTRIBUTE_VALUE_REPLACED 73

#define INVALID_XML_ID               74
#define UNEXPECTED_END_OF_FILE_ATTR  75


/* character encoding errors (76-82) */

#define VENDOR_SPECIFIC_CHARS        76
#define INVALID_SGML_CHARS           77
#define INVALID_UTF8                 78
#define INVALID_UTF16                79
#define ENCODING_MISMATCH            80
#define INVALID_URI                  81
#define INVALID_NCR                  82
179 | 
/* The three flag groups below (accessibility, presentation, badchar)
** each use distinct powers of two, so the values within one group can
** be combined in a single bit mask.  The groups reuse the same bit
** values and also overlap the numeric report codes above, so a value
** is only meaningful together with the field it is stored in.
** NOTE(review): the fields that hold these masks are not visible in
** this header -- confirm against the users of these constants.
*/

/* accessibility flaws */

#define MISSING_IMAGE_ALT       1
#define MISSING_LINK_ALT        2
#define MISSING_SUMMARY         4
#define MISSING_IMAGE_MAP       8
#define USING_FRAMES            16
#define USING_NOFRAMES          32

/* presentation flaws */

#define USING_SPACER            1
#define USING_LAYER             2
#define USING_NOBR              4
#define USING_FONT              8
#define USING_BODY              16

/* NOTE(review): presumably the values passed as the `discarded`
** argument of ReportEncodingError() -- confirm against the callers.
*/
#define REPLACED_CHAR           0
#define DISCARDED_CHAR          1

/* badchar bit field */

#define BC_VENDOR_SPECIFIC_CHARS   1
#define BC_INVALID_SGML_CHARS      2
#define BC_INVALID_UTF8            4
#define BC_INVALID_UTF16           8
#define BC_ENCODING_MISMATCH       16 /* fatal error */
#define BC_INVALID_URI             32
#define BC_INVALID_NCR             64
209 | 
210 | #endif /* __MESSAGE_H__ */
211 | 


--------------------------------------------------------------------------------
/elementtidy-1.0-20050212/tidylib/src/istack.c:
--------------------------------------------------------------------------------
  1 | /* istack.c -- inline stack for compatibility with Mosaic
  2 | 
  3 |   (c) 1998-2003 (W3C) MIT, ERCIM, Keio University
  4 |   See tidy.h for the copyright notice.
  5 |   
  6 |   CVS Info :
  7 | 
  8 |     $LastChangedBy$ 
  9 |     $LastChangedDate$ 
 10 |     $LastChangedRevision$ 
 11 | 
 12 | */
 13 | 
 14 | #include "tidy-int.h"
 15 | #include "lexer.h"
 16 | #include "attrs.h"
 17 | #include "streamio.h"
 18 | #include "tmbstr.h"
 19 | 
 20 | extern Bool   debug_flag;
 21 | extern Node  *debug_element;
 22 | extern Lexer *debug_lexer;
 23 | 
 24 | /* duplicate attributes */
 25 | AttVal *DupAttrs( TidyDocImpl* doc, AttVal *attrs)
 26 | {
 27 |     AttVal *newattrs;
 28 | 
 29 |     if (attrs == NULL)
 30 |         return attrs;
 31 | 
 32 |     newattrs = NewAttribute();
 33 |     *newattrs = *attrs;
 34 |     newattrs->next = DupAttrs( doc, attrs->next );
 35 |     newattrs->attribute = tmbstrdup(attrs->attribute);
 36 |     newattrs->value = tmbstrdup(attrs->value);
 37 |     newattrs->dict = FindAttribute(doc, newattrs);
 38 |     return newattrs;
 39 | }
 40 | 
 41 | /*
 42 |   push a copy of an inline node onto stack
 43 |   but don't push if implicit or OBJECT or APPLET
 44 |   (implicit tags are ones generated from the istack)
 45 | 
 46 |   One issue arises with pushing inlines when
 47 |   the tag is already pushed. For instance:
 48 | 
 49 |       <p><em>text
 50 |       <p><em>more text
 51 | 
 52 |   Shouldn't be mapped to
 53 | 
 54 |       <p><em>text</em></p>
 55 |       <p><em><em>more text</em></em>
 56 | */
 57 | void PushInline( TidyDocImpl* doc, Node *node)
 58 | {
 59 |     Lexer* lexer = doc->lexer;
 60 |     IStack *istack;
 61 | 
 62 |     if (node->implicit)
 63 |         return;
 64 | 
 65 |     if (node->tag == NULL)
 66 |         return;
 67 | 
 68 |     if (!(node->tag->model & CM_INLINE))
 69 |         return;
 70 | 
 71 |     if (node->tag->model & CM_OBJECT)
 72 |         return;
 73 | 
 74 |     if ( !nodeIsFONT(node) && IsPushed(doc, node) )
 75 |         return;
 76 | 
 77 |     /* make sure there is enough space for the stack */
 78 |     if (lexer->istacksize + 1 > lexer->istacklength)
 79 |     {
 80 |         if (lexer->istacklength == 0)
 81 |             lexer->istacklength = 6;   /* this is perhaps excessive */
 82 | 
 83 |         lexer->istacklength = lexer->istacklength * 2;
 84 |         lexer->istack = (IStack *)MemRealloc(lexer->istack,
 85 |                             sizeof(IStack)*(lexer->istacklength));
 86 |     }
 87 | 
 88 |     istack = &(lexer->istack[lexer->istacksize]);
 89 |     istack->tag = node->tag;
 90 | 
 91 |     istack->element = tmbstrdup(node->element);
 92 |     istack->attributes = DupAttrs( doc, node->attributes );
 93 |     ++(lexer->istacksize);
 94 | }
 95 | 
 96 | /* pop inline stack */
 97 | void PopInline( TidyDocImpl* doc, Node *node )
 98 | {
 99 |     Lexer* lexer = doc->lexer;
100 |     AttVal *av;
101 |     IStack *istack;
102 | 
103 |     if (node)
104 |     {
105 |         if (node->tag == NULL)
106 |             return;
107 | 
108 |         if (!(node->tag->model & CM_INLINE))
109 |             return;
110 | 
111 |         if (node->tag->model & CM_OBJECT)
112 |             return;
113 | 
114 |         /* if node is </a> then pop until we find an <a> */
115 |         if ( nodeIsA(node) )
116 |         {
117 |             while (lexer->istacksize > 0)
118 |             {
119 |                 --(lexer->istacksize);
120 |                 istack = &(lexer->istack[lexer->istacksize]);
121 | 
122 |                 while (istack->attributes)
123 |                 {
124 |                     av = istack->attributes;
125 | 
126 |                     if (av->attribute)
127 |                         MemFree(av->attribute);
128 |                     if (av->value)
129 |                         MemFree(av->value);
130 | 
131 |                     istack->attributes = av->next;
132 |                     MemFree(av);
133 |                 }
134 | 
135 |                 if ( istack->tag->id == TidyTag_A )
136 |                 {
137 |                     MemFree(istack->element);
138 |                     break;
139 |                 }
140 | 
141 |                 MemFree(istack->element);
142 |             }
143 | 
144 |             return;
145 |         }
146 |     }
147 | 
148 |     if (lexer->istacksize > 0)
149 |     {
150 |         --(lexer->istacksize);
151 |         istack = &(lexer->istack[lexer->istacksize]);
152 | 
153 |         while (istack->attributes)
154 |         {
155 |             av = istack->attributes;
156 | 
157 |             if (av->attribute)
158 |                 MemFree(av->attribute);
159 |             if (av->value)
160 |                 MemFree(av->value);
161 | 
162 |             istack->attributes = av->next;
163 |             MemFree(av);
164 |         }
165 | 
166 |         MemFree(istack->element);
167 | 
168 |         /* #427822 - fix by Randy Waki 7 Aug 00 */
169 |         if (lexer->insert >= lexer->istack + lexer->istacksize)
170 |             lexer->insert = NULL;
171 |     }
172 | }
173 | 
174 | Bool IsPushed( TidyDocImpl* doc, Node *node)
175 | {
176 |     Lexer* lexer = doc->lexer;
177 |     int i;
178 | 
179 |     for (i = lexer->istacksize - 1; i >= 0; --i)
180 |     {
181 |         if (lexer->istack[i].tag == node->tag)
182 |             return yes;
183 |     }
184 | 
185 |     return no;
186 | }
187 | 
188 | /*
189 |   This has the effect of inserting "missing" inline
190 |   elements around the contents of blocklevel elements
191 |   such as P, TD, TH, DIV, PRE etc. This procedure is
192 |   called at the start of ParseBlock. when the inline
193 |   stack is not empty, as will be the case in:
194 | 
195 |     <i><h1>italic heading</h1></i>
196 | 
197 |   which is then treated as equivalent to
198 | 
199 |     <h1><i>italic heading</i></h1>
200 | 
201 |   This is implemented by setting the lexer into a mode
202 |   where it gets tokens from the inline stack rather than
203 |   from the input stream.
204 | */
205 | int InlineDup( TidyDocImpl* doc, Node* node )
206 | {
207 |     Lexer* lexer = doc->lexer;
208 |     int n;
209 | 
210 |     if ((n = lexer->istacksize - lexer->istackbase) > 0)
211 |     {
212 |         lexer->insert = &(lexer->istack[lexer->istackbase]);
213 |         lexer->inode = node;
214 |     }
215 | 
216 |     return n;
217 | }
218 | 
219 | /*
220 |  defer duplicates when entering a table or other
221 |  element where the inlines shouldn't be duplicated
222 | */
223 | void DeferDup( TidyDocImpl* doc )
224 | {
225 |     doc->lexer->insert = NULL;
226 |     doc->lexer->inode = NULL;
227 | }
228 | 
229 | Node *InsertedToken( TidyDocImpl* doc )
230 | {
231 |     Lexer* lexer = doc->lexer;
232 |     Node *node;
233 |     IStack *istack;
234 |     uint n;
235 | 
236 |     /* this will only be NULL if inode != NULL */
237 |     if (lexer->insert == NULL)
238 |     {
239 |         node = lexer->inode;
240 |         lexer->inode = NULL;
241 |         return node;
242 |     }
243 | 
244 |     /*
245 |     
246 |       is this is the "latest" node then update
247 |       the position, otherwise use current values
248 |     */
249 | 
250 |     if (lexer->inode == NULL)
251 |     {
252 |         lexer->lines = doc->docIn->curline;
253 |         lexer->columns = doc->docIn->curcol;
254 |     }
255 | 
256 |     node = NewNode(lexer);
257 |     node->type = StartTag;
258 |     node->implicit = yes;
259 |     node->start = lexer->txtstart;
260 |     /* #431734 [JTidy bug #226261 (was 126261)] - fix by Gary Peskin 20 Dec 00 */ 
261 |     node->end = lexer->txtend; /* was : lexer->txtstart; */
262 |     istack = lexer->insert;
263 | 
264 | #if 0 && defined(_DEBUG)
265 |     if ( lexer->istacksize == 0 )
266 |         fprintf( stderr, "0-size istack!\n" );
267 | #endif
268 | 
269 |     node->element = tmbstrdup(istack->element);
270 |     node->tag = istack->tag;
271 |     node->attributes = DupAttrs( doc, istack->attributes );
272 | 
273 |     /* advance lexer to next item on the stack */
274 |     n = (uint)(lexer->insert - &(lexer->istack[0]));
275 | 
276 |     /* and recover state if we have reached the end */
277 |     if (++n < lexer->istacksize)
278 |         lexer->insert = &(lexer->istack[n]);
279 |     else
280 |         lexer->insert = NULL;
281 | 
282 |     return node;
283 | }
284 | 
285 | 
286 | 
287 | 
288 | 


--------------------------------------------------------------------------------
/elementtidy-1.0-20050212/tidylib/src/tagask.c:
--------------------------------------------------------------------------------
  1 | /* tagask.c -- Interrogate node type
  2 | 
  3 |   (c) 1998-2003 (W3C) MIT, ERCIM, Keio University
  4 |   See tidy.h for the copyright notice.
  5 | 
  6 |   CVS Info :
  7 | 
  8 |     $LastChangedBy$ 
  9 |     $LastChangedDate$ 
 10 |     $LastChangedRevision$ 
 11 | 
 12 | */
 13 | 
 14 | #include "tidy-int.h"
 15 | #include "tags.h"
 16 | #include "tidy.h"
 17 | 
 18 | Bool tidyNodeIsText( TidyNode tnod )
 19 | { return nodeIsText( tidyNodeToImpl(tnod) );
 20 | }
 21 | Bool tidyNodeCMIsBlock( TidyNode tnod )
 22 | { return nodeCMIsBlock( tidyNodeToImpl(tnod) );
 23 | }
 24 | Bool tidyNodeCMIsInline( TidyNode tnod )
 25 | { return nodeCMIsInline( tidyNodeToImpl(tnod) );
 26 | }
 27 | Bool tidyNodeCMIsEmpty( TidyNode tnod )
 28 | { return nodeCMIsEmpty( tidyNodeToImpl(tnod) );
 29 | }
 30 | Bool tidyNodeIsHeader( TidyNode tnod )
 31 | { return nodeIsHeader( tidyNodeToImpl(tnod) );
 32 | }
 33 | 
 34 | Bool tidyNodeIsHTML( TidyNode tnod )
 35 | { return nodeIsHTML( tidyNodeToImpl(tnod) );
 36 | }
 37 | Bool tidyNodeIsHEAD( TidyNode tnod )
 38 | { return nodeIsHEAD( tidyNodeToImpl(tnod) );
 39 | }
 40 | Bool tidyNodeIsTITLE( TidyNode tnod )
 41 | { return nodeIsTITLE( tidyNodeToImpl(tnod) );
 42 | }
 43 | Bool tidyNodeIsBASE( TidyNode tnod )
 44 | { return nodeIsBASE( tidyNodeToImpl(tnod) );
 45 | }
 46 | Bool tidyNodeIsMETA( TidyNode tnod )
 47 | { return nodeIsMETA( tidyNodeToImpl(tnod) );
 48 | }
 49 | Bool tidyNodeIsBODY( TidyNode tnod )
 50 | { return nodeIsBODY( tidyNodeToImpl(tnod) );
 51 | }
 52 | Bool tidyNodeIsFRAMESET( TidyNode tnod )
 53 | { return nodeIsFRAMESET( tidyNodeToImpl(tnod) );
 54 | }
 55 | Bool tidyNodeIsFRAME( TidyNode tnod )
 56 | { return nodeIsFRAME( tidyNodeToImpl(tnod) );
 57 | }
 58 | Bool tidyNodeIsIFRAME( TidyNode tnod )
 59 | { return nodeIsIFRAME( tidyNodeToImpl(tnod) );
 60 | }
 61 | Bool tidyNodeIsNOFRAMES( TidyNode tnod )
 62 | { return nodeIsNOFRAMES( tidyNodeToImpl(tnod) );
 63 | }
 64 | Bool tidyNodeIsHR( TidyNode tnod )
 65 | { return nodeIsHR( tidyNodeToImpl(tnod) );
 66 | }
 67 | Bool tidyNodeIsH1( TidyNode tnod )
 68 | { return nodeIsH1( tidyNodeToImpl(tnod) );
 69 | }
 70 | Bool tidyNodeIsH2( TidyNode tnod )
 71 | { return nodeIsH2( tidyNodeToImpl(tnod) );
 72 | }
 73 | Bool tidyNodeIsPRE( TidyNode tnod )
 74 | { return nodeIsPRE( tidyNodeToImpl(tnod) );
 75 | }
 76 | Bool tidyNodeIsLISTING( TidyNode tnod )
 77 | { return nodeIsLISTING( tidyNodeToImpl(tnod) );
 78 | }
 79 | Bool tidyNodeIsP( TidyNode tnod )
 80 | { return nodeIsP( tidyNodeToImpl(tnod) );
 81 | }
 82 | Bool tidyNodeIsUL( TidyNode tnod )
 83 | { return nodeIsUL( tidyNodeToImpl(tnod) );
 84 | }
 85 | Bool tidyNodeIsOL( TidyNode tnod )
 86 | { return nodeIsOL( tidyNodeToImpl(tnod) );
 87 | }
 88 | Bool tidyNodeIsDL( TidyNode tnod )
 89 | { return nodeIsDL( tidyNodeToImpl(tnod) );
 90 | }
 91 | Bool tidyNodeIsDIR( TidyNode tnod )
 92 | { return nodeIsDIR( tidyNodeToImpl(tnod) );
 93 | }
 94 | Bool tidyNodeIsLI( TidyNode tnod )
 95 | { return nodeIsLI( tidyNodeToImpl(tnod) );
 96 | }
 97 | Bool tidyNodeIsDT( TidyNode tnod )
 98 | { return nodeIsDT( tidyNodeToImpl(tnod) );
 99 | }
100 | Bool tidyNodeIsDD( TidyNode tnod )
101 | { return nodeIsDD( tidyNodeToImpl(tnod) );
102 | }
103 | Bool tidyNodeIsTABLE( TidyNode tnod )
104 | { return nodeIsTABLE( tidyNodeToImpl(tnod) );
105 | }
106 | Bool tidyNodeIsCAPTION( TidyNode tnod )
107 | { return nodeIsCAPTION( tidyNodeToImpl(tnod) );
108 | }
109 | Bool tidyNodeIsTD( TidyNode tnod )
110 | { return nodeIsTD( tidyNodeToImpl(tnod) );
111 | }
112 | Bool tidyNodeIsTH( TidyNode tnod )
113 | { return nodeIsTH( tidyNodeToImpl(tnod) );
114 | }
115 | Bool tidyNodeIsTR( TidyNode tnod )
116 | { return nodeIsTR( tidyNodeToImpl(tnod) );
117 | }
118 | Bool tidyNodeIsCOL( TidyNode tnod )
119 | { return nodeIsCOL( tidyNodeToImpl(tnod) );
120 | }
121 | Bool tidyNodeIsCOLGROUP( TidyNode tnod )
122 | { return nodeIsCOLGROUP( tidyNodeToImpl(tnod) );
123 | }
124 | Bool tidyNodeIsBR( TidyNode tnod )
125 | { return nodeIsBR( tidyNodeToImpl(tnod) );
126 | }
127 | Bool tidyNodeIsA( TidyNode tnod )
128 | { return nodeIsA( tidyNodeToImpl(tnod) );
129 | }
130 | Bool tidyNodeIsLINK( TidyNode tnod )
131 | { return nodeIsLINK( tidyNodeToImpl(tnod) );
132 | }
133 | Bool tidyNodeIsB( TidyNode tnod )
134 | { return nodeIsB( tidyNodeToImpl(tnod) );
135 | }
136 | Bool tidyNodeIsI( TidyNode tnod )
137 | { return nodeIsI( tidyNodeToImpl(tnod) );
138 | }
139 | Bool tidyNodeIsSTRONG( TidyNode tnod )
140 | { return nodeIsSTRONG( tidyNodeToImpl(tnod) );
141 | }
142 | Bool tidyNodeIsEM( TidyNode tnod )
143 | { return nodeIsEM( tidyNodeToImpl(tnod) );
144 | }
145 | Bool tidyNodeIsBIG( TidyNode tnod )
146 | { return nodeIsBIG( tidyNodeToImpl(tnod) );
147 | }
148 | Bool tidyNodeIsSMALL( TidyNode tnod )
149 | { return nodeIsSMALL( tidyNodeToImpl(tnod) );
150 | }
151 | Bool tidyNodeIsPARAM( TidyNode tnod )
152 | { return nodeIsPARAM( tidyNodeToImpl(tnod) );
153 | }
154 | Bool tidyNodeIsOPTION( TidyNode tnod )
155 | { return nodeIsOPTION( tidyNodeToImpl(tnod) );
156 | }
157 | Bool tidyNodeIsOPTGROUP( TidyNode tnod )
158 | { return nodeIsOPTGROUP( tidyNodeToImpl(tnod) );
159 | }
160 | Bool tidyNodeIsIMG( TidyNode tnod )
161 | { return nodeIsIMG( tidyNodeToImpl(tnod) );
162 | }
163 | Bool tidyNodeIsMAP( TidyNode tnod )
164 | { return nodeIsMAP( tidyNodeToImpl(tnod) );
165 | }
166 | Bool tidyNodeIsAREA( TidyNode tnod )
167 | { return nodeIsAREA( tidyNodeToImpl(tnod) );
168 | }
169 | Bool tidyNodeIsNOBR( TidyNode tnod )
170 | { return nodeIsNOBR( tidyNodeToImpl(tnod) );
171 | }
172 | Bool tidyNodeIsWBR( TidyNode tnod )
173 | { return nodeIsWBR( tidyNodeToImpl(tnod) );
174 | }
175 | Bool tidyNodeIsFONT( TidyNode tnod )
176 | { return nodeIsFONT( tidyNodeToImpl(tnod) );
177 | }
178 | Bool tidyNodeIsLAYER( TidyNode tnod )
179 | { return nodeIsLAYER( tidyNodeToImpl(tnod) );
180 | }
181 | Bool tidyNodeIsSPACER( TidyNode tnod )
182 | { return nodeIsSPACER( tidyNodeToImpl(tnod) );
183 | }
184 | Bool tidyNodeIsCENTER( TidyNode tnod )
185 | { return nodeIsCENTER( tidyNodeToImpl(tnod) );
186 | }
187 | Bool tidyNodeIsSTYLE( TidyNode tnod )
188 | { return nodeIsSTYLE( tidyNodeToImpl(tnod) );
189 | }
190 | Bool tidyNodeIsSCRIPT( TidyNode tnod )
191 | { return nodeIsSCRIPT( tidyNodeToImpl(tnod) );
192 | }
193 | Bool tidyNodeIsNOSCRIPT( TidyNode tnod )
194 | { return nodeIsNOSCRIPT( tidyNodeToImpl(tnod) );
195 | }
196 | Bool tidyNodeIsFORM( TidyNode tnod )
197 | { return nodeIsFORM( tidyNodeToImpl(tnod) );
198 | }
199 | Bool tidyNodeIsTEXTAREA( TidyNode tnod )
200 | { return nodeIsTEXTAREA( tidyNodeToImpl(tnod) );
201 | }
202 | Bool tidyNodeIsBLOCKQUOTE( TidyNode tnod )
203 | { return nodeIsBLOCKQUOTE( tidyNodeToImpl(tnod) );
204 | }
205 | Bool tidyNodeIsAPPLET( TidyNode tnod )
206 | { return nodeIsAPPLET( tidyNodeToImpl(tnod) );
207 | }
208 | Bool tidyNodeIsOBJECT( TidyNode tnod )
209 | { return nodeIsOBJECT( tidyNodeToImpl(tnod) );
210 | }
211 | Bool tidyNodeIsDIV( TidyNode tnod )
212 | { return nodeIsDIV( tidyNodeToImpl(tnod) );
213 | }
214 | Bool tidyNodeIsSPAN( TidyNode tnod )
215 | { return nodeIsSPAN( tidyNodeToImpl(tnod) );
216 | }
217 | Bool tidyNodeIsINPUT( TidyNode tnod )
218 | { return nodeIsINPUT( tidyNodeToImpl(tnod) );
219 | }
220 | Bool tidyNodeIsQ( TidyNode tnod )
221 | { return nodeIsQ( tidyNodeToImpl(tnod) );
222 | }
223 | Bool tidyNodeIsLABEL( TidyNode tnod )
224 | { return nodeIsLABEL( tidyNodeToImpl(tnod) );
225 | }
226 | Bool tidyNodeIsH3( TidyNode tnod )
227 | { return nodeIsH3( tidyNodeToImpl(tnod) );
228 | }
229 | Bool tidyNodeIsH4( TidyNode tnod )
230 | { return nodeIsH4( tidyNodeToImpl(tnod) );
231 | }
232 | Bool tidyNodeIsH5( TidyNode tnod )
233 | { return nodeIsH5( tidyNodeToImpl(tnod) );
234 | }
235 | Bool tidyNodeIsH6( TidyNode tnod )
236 | { return nodeIsH6( tidyNodeToImpl(tnod) );
237 | }
238 | Bool tidyNodeIsADDRESS( TidyNode tnod )
239 | { return nodeIsADDRESS( tidyNodeToImpl(tnod) );
240 | }
241 | Bool tidyNodeIsXMP( TidyNode tnod )
242 | { return nodeIsXMP( tidyNodeToImpl(tnod) );
243 | }
244 | Bool tidyNodeIsSELECT( TidyNode tnod )
245 | { return nodeIsSELECT( tidyNodeToImpl(tnod) );
246 | }
247 | Bool tidyNodeIsBLINK( TidyNode tnod )
248 | { return nodeIsBLINK( tidyNodeToImpl(tnod) );
249 | }
250 | Bool tidyNodeIsMARQUEE( TidyNode tnod )
251 | { return nodeIsMARQUEE( tidyNodeToImpl(tnod) );
252 | }
253 | Bool tidyNodeIsEMBED( TidyNode tnod )
254 | { return nodeIsEMBED( tidyNodeToImpl(tnod) );
255 | }
256 | Bool tidyNodeIsBASEFONT( TidyNode tnod )
257 | { return nodeIsBASEFONT( tidyNodeToImpl(tnod) );
258 | }
259 | Bool tidyNodeIsISINDEX( TidyNode tnod )
260 | { return nodeIsISINDEX( tidyNodeToImpl(tnod) );
261 | }
262 | Bool tidyNodeIsS( TidyNode tnod )
263 | { return nodeIsS( tidyNodeToImpl(tnod) );
264 | }
265 | Bool tidyNodeIsSTRIKE( TidyNode tnod )
266 | { return nodeIsSTRIKE( tidyNodeToImpl(tnod) );
267 | }
268 | Bool tidyNodeIsU( TidyNode tnod )
269 | { return nodeIsU( tidyNodeToImpl(tnod) );
270 | }
271 | Bool tidyNodeIsMENU( TidyNode tnod )
272 | { return nodeIsMENU( tidyNodeToImpl(tnod) );
273 | }
274 | 
275 | 


--------------------------------------------------------------------------------
/elementtidy-1.0-20050212/tidylib/src/tags.h:
--------------------------------------------------------------------------------
  1 | #ifndef __TAGS_H__
  2 | #define __TAGS_H__
  3 | 
  4 | /* tags.h -- recognize HTML tags
  5 | 
  6 |   (c) 1998-2003 (W3C) MIT, ERCIM, Keio University
  7 |   See tidy.h for the copyright notice.
  8 | 
  9 |   CVS Info :
 10 | 
 11 |     $LastChangedBy$ 
 12 |     $LastChangedDate$ 
 13 |     $LastChangedRevision$ 
 14 | 
 15 |   The HTML tags are stored as 8 bit ASCII strings.
 16 |   Use lookupw() to find a tag given a wide char string.
 17 | 
 18 | */
 19 | 
 20 | #include "forward.h"
 21 | #include "attrdict.h"
 22 | 
/* Function type shared by the per-tag parse routines declared further
** down in this header (ParseHTML, ParseHead, ...).  Dict.parser points
** at a function of this type.
*/
typedef void (Parser)( TidyDocImpl* doc, Node *node, uint mode );
/* Function type shared by the attribute checking routines declared
** further down in this header (CheckAttributes, CheckIMG, ...).
** Dict.chkattrs points at a function of this type.
*/
typedef void (CheckAttribs)( TidyDocImpl* doc, Node *node );
 25 | 
 26 | /*
 27 |  Tag dictionary node
 28 | */
 29 | 
 30 | /* types of tags that the user can define */
 31 | #define tagtype_empty     1
 32 | #define tagtype_inline    2
 33 | #define tagtype_block     4
 34 | #define tagtype_pre       8
 35 | 
/* One entry of the tag dictionary. */
struct _Dict
{
    TidyTagId       id;         /* tag identifier; this is what TagId()/TagIsId() read */
    tmbstr          name;       /* tag name string */
    uint            versions;   /* NOTE(review): presumably a bitmask of HTML versions - confirm */
    AttrVersion*    attrvers;   /* NOTE(review): presumably per-attribute version data - confirm */
    uint            model;      /* NOTE(review): presumably the content model bits used by nodeHasCM() - confirm */
    Parser*         parser;     /* parse routine for this tag */
    CheckAttribs*   chkattrs;   /* attribute checking routine for this tag */
    Dict*           next;       /* link to another dictionary entry */
};
 47 | 
#ifdef ELEMENT_HASH_LOOKUP
/* Number of slots in TidyTagImpl.hashtab */
#define ELEMENT_HASH_SIZE 178
#endif

/* Tag lookup state: the XML placeholder entry, the list of user declared
** tags and, when ELEMENT_HASH_LOOKUP is defined, a table of Dict pointers.
*/
struct _TidyTagImpl
{
    Dict* xml_tags;                /* placeholder for all xml tags */
    Dict* declared_tag_list;       /* User declared tags */
#ifdef ELEMENT_HASH_LOOKUP
    Dict* hashtab[ELEMENT_HASH_SIZE];  /* only compiled in with ELEMENT_HASH_LOOKUP */
#endif
};

typedef struct _TidyTagImpl TidyTagImpl;
 62 | 
/* interface for finding tag by name */
const Dict* LookupTagDef( TidyTagId tid );
Bool    FindTag( TidyDocImpl* doc, Node *node );
Parser* FindParser( TidyDocImpl* doc, Node *node );
/* NOTE(review): tagType is presumably one of the tagtype_* values
** defined above - confirm against the implementation.
*/
void    DefineTag( TidyDocImpl* doc, int tagType, ctmbstr name );
void    FreeDeclaredTags( TidyDocImpl* doc, int tagType ); /* 0 to free all */

/* Iteration over the user declared tags: obtain an iterator with
** GetDeclaredTagList(), then pass its address to the GetNext* calls.
*/
TidyIterator   GetDeclaredTagList( TidyDocImpl* doc );
Dict*          GetNextDeclaredDict( TidyDocImpl* doc, TidyIterator* iter );
ctmbstr        GetNextDeclaredTag( TidyDocImpl* doc, int tagType,
                                   TidyIterator* iter );

/* Set up / tear down the tag state of a document */
void InitTags( TidyDocImpl* doc );
void FreeTags( TidyDocImpl* doc );
 77 | 
 78 | 
/* Parser methods for tags */
/* Each name below is declared as a function of type Parser, i.e.
** void f( TidyDocImpl* doc, Node *node, uint mode ).
*/

Parser ParseHTML;
Parser ParseHead;
Parser ParseTitle;
Parser ParseScript;
Parser ParseFrameSet;
Parser ParseNoFrames;
Parser ParseBody;
Parser ParsePre;
Parser ParseList;
Parser ParseLI;
Parser ParseDefList;
Parser ParseBlock;
Parser ParseInline;
Parser ParseEmpty;
Parser ParseTableTag;
Parser ParseColGroup;
Parser ParseRowGroup;
Parser ParseRow;
Parser ParseSelect;
Parser ParseOptGroup;
Parser ParseText;
Parser ParseObject;
Parser ParseMap;

/* Attribute checking methods */
/* Each name below is declared as a function of type CheckAttribs, i.e.
** void f( TidyDocImpl* doc, Node *node ).
*/

CheckAttribs CheckAttributes;
CheckAttribs CheckIMG;
CheckAttribs CheckLINK;
CheckAttribs CheckAREA;
CheckAttribs CheckTABLE;
CheckAttribs CheckCaption;
CheckAttribs CheckSCRIPT;
CheckAttribs CheckSTYLE;
CheckAttribs CheckHTML;
CheckAttribs CheckFORM;
CheckAttribs CheckMETA;
118 | 
/* 0 == TidyTag_UNKNOWN */
/* TagId: tag id of node, or TidyTag_UNKNOWN when node or node->tag is NULL. */
#define TagId(node)        ((node) && (node)->tag ? (node)->tag->id : TidyTag_UNKNOWN)
/* TagIsId: true when node and node->tag are non-NULL and the tag id equals
** tid.  tid is parenthesized so that an expression argument (e.g. a
** conditional expression) is compared as a whole instead of being split
** by the precedence of ==.
*/
#define TagIsId(node, tid) ((node) && (node)->tag && (node)->tag->id == (tid))
122 | 
/* Node kind tests */
Bool nodeIsText( Node* node );
Bool nodeIsElement( Node* node );

Bool nodeHasText( TidyDocImpl* doc, Node* node );

/* Compare & result to operand.  If equal, then all bits
** requested are set.
*/
Bool nodeMatchCM( Node* node, uint contentModel );

/* True if any of the bits requested are set.
*/
Bool nodeHasCM( Node* node, uint contentModel );

/* NOTE(review): presumably shorthands that test one specific content
** model bit each (block / inline / empty) - confirm in the implementation.
*/
Bool nodeCMIsBlock( Node* node );
Bool nodeCMIsInline( Node* node );
Bool nodeCMIsEmpty( Node* node );


Bool nodeIsHeader( Node* node );     /* H1, H2, ..., H6 */
uint nodeHeaderLevel( Node* node );  /* 1, 2, ..., 6 */
144 | 
/* Per-element predicates.  Each macro expands to TagIsId() with the
** matching TidyTag_* id, so it yields false for a NULL node and for a
** node without a tag.  The node argument is expanded more than once by
** TagIsId(), so avoid passing expressions with side effects.
*/
#define nodeIsHTML( node )       TagIsId( node, TidyTag_HTML )
#define nodeIsHEAD( node )       TagIsId( node, TidyTag_HEAD )
#define nodeIsTITLE( node )      TagIsId( node, TidyTag_TITLE )
#define nodeIsBASE( node )       TagIsId( node, TidyTag_BASE )
#define nodeIsMETA( node )       TagIsId( node, TidyTag_META )
#define nodeIsBODY( node )       TagIsId( node, TidyTag_BODY )
#define nodeIsFRAMESET( node )   TagIsId( node, TidyTag_FRAMESET )
#define nodeIsFRAME( node )      TagIsId( node, TidyTag_FRAME )
#define nodeIsIFRAME( node )     TagIsId( node, TidyTag_IFRAME )
#define nodeIsNOFRAMES( node )   TagIsId( node, TidyTag_NOFRAMES )
#define nodeIsHR( node )         TagIsId( node, TidyTag_HR )
#define nodeIsH1( node )         TagIsId( node, TidyTag_H1 )
#define nodeIsH2( node )         TagIsId( node, TidyTag_H2 )
#define nodeIsPRE( node )        TagIsId( node, TidyTag_PRE )
#define nodeIsLISTING( node )    TagIsId( node, TidyTag_LISTING )
#define nodeIsP( node )          TagIsId( node, TidyTag_P )
#define nodeIsUL( node )         TagIsId( node, TidyTag_UL )
#define nodeIsOL( node )         TagIsId( node, TidyTag_OL )
#define nodeIsDL( node )         TagIsId( node, TidyTag_DL )
#define nodeIsDIR( node )        TagIsId( node, TidyTag_DIR )
#define nodeIsLI( node )         TagIsId( node, TidyTag_LI )
#define nodeIsDT( node )         TagIsId( node, TidyTag_DT )
#define nodeIsDD( node )         TagIsId( node, TidyTag_DD )
#define nodeIsTABLE( node )      TagIsId( node, TidyTag_TABLE )
#define nodeIsCAPTION( node )    TagIsId( node, TidyTag_CAPTION )
#define nodeIsTD( node )         TagIsId( node, TidyTag_TD )
#define nodeIsTH( node )         TagIsId( node, TidyTag_TH )
#define nodeIsTR( node )         TagIsId( node, TidyTag_TR )
#define nodeIsCOL( node )        TagIsId( node, TidyTag_COL )
#define nodeIsCOLGROUP( node )   TagIsId( node, TidyTag_COLGROUP )
#define nodeIsBR( node )         TagIsId( node, TidyTag_BR )
#define nodeIsA( node )          TagIsId( node, TidyTag_A )
#define nodeIsLINK( node )       TagIsId( node, TidyTag_LINK )
#define nodeIsB( node )          TagIsId( node, TidyTag_B )
#define nodeIsI( node )          TagIsId( node, TidyTag_I )
#define nodeIsSTRONG( node )     TagIsId( node, TidyTag_STRONG )
#define nodeIsEM( node )         TagIsId( node, TidyTag_EM )
#define nodeIsBIG( node )        TagIsId( node, TidyTag_BIG )
#define nodeIsSMALL( node )      TagIsId( node, TidyTag_SMALL )
#define nodeIsPARAM( node )      TagIsId( node, TidyTag_PARAM )
#define nodeIsOPTION( node )     TagIsId( node, TidyTag_OPTION )
#define nodeIsOPTGROUP( node )   TagIsId( node, TidyTag_OPTGROUP )
#define nodeIsIMG( node )        TagIsId( node, TidyTag_IMG )
#define nodeIsMAP( node )        TagIsId( node, TidyTag_MAP )
#define nodeIsAREA( node )       TagIsId( node, TidyTag_AREA )
#define nodeIsNOBR( node )       TagIsId( node, TidyTag_NOBR )
#define nodeIsWBR( node )        TagIsId( node, TidyTag_WBR )
#define nodeIsFONT( node )       TagIsId( node, TidyTag_FONT )
#define nodeIsLAYER( node )      TagIsId( node, TidyTag_LAYER )
#define nodeIsSPACER( node )     TagIsId( node, TidyTag_SPACER )
#define nodeIsCENTER( node )     TagIsId( node, TidyTag_CENTER )
#define nodeIsSTYLE( node )      TagIsId( node, TidyTag_STYLE )
#define nodeIsSCRIPT( node )     TagIsId( node, TidyTag_SCRIPT )
#define nodeIsNOSCRIPT( node )   TagIsId( node, TidyTag_NOSCRIPT )
#define nodeIsFORM( node )       TagIsId( node, TidyTag_FORM )
#define nodeIsTEXTAREA( node )   TagIsId( node, TidyTag_TEXTAREA )
#define nodeIsBLOCKQUOTE( node ) TagIsId( node, TidyTag_BLOCKQUOTE )
#define nodeIsAPPLET( node )     TagIsId( node, TidyTag_APPLET )
#define nodeIsOBJECT( node )     TagIsId( node, TidyTag_OBJECT )
#define nodeIsDIV( node )        TagIsId( node, TidyTag_DIV )
#define nodeIsSPAN( node )       TagIsId( node, TidyTag_SPAN )
#define nodeIsINPUT( node )      TagIsId( node, TidyTag_INPUT )
#define nodeIsQ( node )          TagIsId( node, TidyTag_Q )
#define nodeIsLABEL( node )      TagIsId( node, TidyTag_LABEL )
#define nodeIsH3( node )         TagIsId( node, TidyTag_H3 )
#define nodeIsH4( node )         TagIsId( node, TidyTag_H4 )
#define nodeIsH5( node )         TagIsId( node, TidyTag_H5 )
#define nodeIsH6( node )         TagIsId( node, TidyTag_H6 )
#define nodeIsADDRESS( node )    TagIsId( node, TidyTag_ADDRESS )
#define nodeIsXMP( node )        TagIsId( node, TidyTag_XMP )
#define nodeIsSELECT( node )     TagIsId( node, TidyTag_SELECT )
#define nodeIsBLINK( node )      TagIsId( node, TidyTag_BLINK )
#define nodeIsMARQUEE( node )    TagIsId( node, TidyTag_MARQUEE )
#define nodeIsEMBED( node )      TagIsId( node, TidyTag_EMBED )
#define nodeIsBASEFONT( node )   TagIsId( node, TidyTag_BASEFONT )
#define nodeIsISINDEX( node )    TagIsId( node, TidyTag_ISINDEX )
#define nodeIsS( node )          TagIsId( node, TidyTag_S )
#define nodeIsSTRIKE( node )     TagIsId( node, TidyTag_STRIKE )
#define nodeIsU( node )          TagIsId( node, TidyTag_U )
#define nodeIsMENU( node )       TagIsId( node, TidyTag_MENU )
225 | 
226 | 
227 | #endif /* __TAGS_H__ */
228 | 


--------------------------------------------------------------------------------
/elementtidy-1.0-20050212/tidylib/src/access.h:
--------------------------------------------------------------------------------
  1 | #ifndef __ACCESS_H__
  2 | #define __ACCESS_H__
  3 | 
  4 | /* access.h -- carry out accessibility checks
  5 | 
  6 |   Copyright University of Toronto
  7 |   Portions (c) 1998-2003 (W3C) MIT, ERCIM, Keio University
  8 |   See tidy.h for the copyright notice.
  9 |   
 10 |   CVS Info :
 11 | 
 12 |     $LastChangedBy$ 
 13 |     $LastChangedDate$ 
 14 |     $LastChangedRevision$ 
 15 | 
 16 | */
 17 | 
 18 | /*********************************************************************
 19 | * AccessibilityChecks
 20 | *
 21 | * Carries out processes for all accessibility checks.  Traverses
 22 | * through all the content within the tree and evaluates the tags for
 23 | * accessibility.
 24 | *
 25 | * To perform the following checks, 'AccessibilityChecks' must be
 26 | * called AFTER the tree structure has been formed.
 27 | *
 28 | * If, in the command prompt, there is no specification of which
 29 | * accessibility priorities to check, no accessibility checks will be 
 30 | * performed.  (i.e. '1' for priority 1, '2' for priorities 1 and 2,
 31 | *                  and '3' for priorities 1, 2 and 3.)
 32 | *
 33 | * Copyright University of Toronto
 34 | * Programmed by: Mike Lam and Chris Ridpath
 35 | * Modifications by : Terry Teague (TRT)
 36 | *
 37 | *********************************************************************/
 38 | 
 39 | 
 40 | #include "forward.h"
 41 | 
 42 | #if SUPPORT_ACCESSIBILITY_CHECKS
 43 | 
 44 | /* The accessibility checks to perform depending on user's desire.
 45 | 
 46 |    1. priority 1
 47 |    2. priority 1 & 2
 48 |    3. priority 1, 2, & 3
 49 | */
 50 | 
 51 | /* Determines if the client-side text link is found within the document
 52 | typedef struct AreaLinks
 53 | {
 54 |     struct AreaLinks* next;
 55 |     char* link;
 56 |     Bool HasBeenFound;
 57 | } AreaLinks;
 58 | */
 59 | 
/* Size, in characters, of the textNode and text buffers below */
#define TEXTBUF_SIZE    128

struct _TidyAccessImpl;
typedef struct _TidyAccessImpl TidyAccessImpl;

/* Working state of the accessibility checks: the requested priority
** level, two fixed-size text buffers and a set of counters and flags
** that are filled in while the checks run.
*/
struct _TidyAccessImpl
{
    /* gets set from Tidy variable AccessibilityCheckLevel */
    int PRIORITYCHK;

    /* Number of characters that are found within the concatenated text */
    int counter;

    /* list of characters in the text nodes found within a container element */
    tmbchar textNode[ TEXTBUF_SIZE ]; 

    /* The list of characters found within one text node */
    tmbchar text[ TEXTBUF_SIZE ]; 

    /* Number of frame elements found within a frameset */
    int numFrames; 

    /* Number of 'longdesc' attributes found within a frameset */
    int HasCheckedLongDesc; 

    /* NOTE(review): the three counters below are undocumented here;
    ** presumably header and list bookkeeping - confirm in access.c
    */
    int  CheckedHeaders;
    int  ListElements;
    int  OtherListElements;

    /* For 'USEMAP' identifier */
    Bool HasUseMap; 
    Bool HasName; 
    Bool HasMap;

    /* For tracking nodes that are deleted from the original parse tree - TRT */
    /* Node *access_tree; */

    /* NOTE(review): presumably results of the table header and
    ** label for/id checks - confirm in access.c
    */
    Bool HasTH;
    Bool HasValidFor;
    Bool HasValidId;
    Bool HasValidRowHeaders;
    Bool HasValidColumnHeaders;
    Bool HasInvalidRowHeader;
    Bool HasInvalidColumnHeader;
    int  ForID;

    /* List containing map-links
    AreaLinks* links;
    AreaLinks* start;
    AreaLinks* current;
    */

};
113 | 
114 | 
/* 
    Determines which error/warning message should be displayed,
    depending on the error code that was called.

    The bracketed number in front of each enumerator is the label of the
    check it belongs to.  Enumerators take consecutive values starting at
    0, so the order of this list is significant; LAST_ACCESS_ERR must stay
    last and therefore equals the number of codes.

    NOTE(review): LANGUAGE_NOT_IDENTIFIED and LANGUAGE_INVALID both carry
    the label [4.3.1.1]; the second one is presumably [4.3.1.2] - confirm
    against the message table before changing it.
*/
enum accessErrorCodes
{
    /* [1.1.1.1] */        IMG_MISSING_ALT,
    /* [1.1.1.2] */        IMG_ALT_SUSPICIOUS_FILENAME,
    /* [1.1.1.3] */        IMG_ALT_SUSPICIOUS_FILE_SIZE,
    /* [1.1.1.4] */        IMG_ALT_SUSPICIOUS_PLACEHOLDER,
    /* [1.1.1.10] */       IMG_ALT_SUSPICIOUS_TOO_LONG,
    /* [1.1.1.11] */       IMG_MISSING_ALT_BULLET,
    /* [1.1.1.12] */       IMG_MISSING_ALT_H_RULE,
    /* [1.1.2.1] */        IMG_MISSING_LONGDESC_DLINK,
    /* [1.1.2.2] */        IMG_MISSING_DLINK,
    /* [1.1.2.3] */        IMG_MISSING_LONGDESC,
    /* [1.1.2.5] */        LONGDESC_NOT_REQUIRED,
    /* [1.1.3.1] */        IMG_BUTTON_MISSING_ALT, 
    /* [1.1.4.1] */        APPLET_MISSING_ALT,
    /* [1.1.5.1] */        OBJECT_MISSING_ALT,
    /* [1.1.6.1] */        AUDIO_MISSING_TEXT_WAV,
    /* [1.1.6.2] */        AUDIO_MISSING_TEXT_AU,
    /* [1.1.6.3] */        AUDIO_MISSING_TEXT_AIFF,
    /* [1.1.6.4] */        AUDIO_MISSING_TEXT_SND,
    /* [1.1.6.5] */        AUDIO_MISSING_TEXT_RA,
    /* [1.1.6.6] */        AUDIO_MISSING_TEXT_RM,
    /* [1.1.8.1] */        FRAME_MISSING_LONGDESC,
    /* [1.1.9.1] */        AREA_MISSING_ALT,
    /* [1.1.10.1] */       SCRIPT_MISSING_NOSCRIPT,
    /* [1.1.12.1] */       ASCII_REQUIRES_DESCRIPTION,
    /* [1.2.1.1] */        IMG_MAP_SERVER_REQUIRES_TEXT_LINKS,
    /* [1.4.1.1] */        MULTIMEDIA_REQUIRES_TEXT,
    /* [1.5.1.1] */        IMG_MAP_CLIENT_MISSING_TEXT_LINKS,
    /* [2.1.1.1] */        INFORMATION_NOT_CONVEYED_IMAGE,
    /* [2.1.1.2] */        INFORMATION_NOT_CONVEYED_APPLET,
    /* [2.1.1.3] */        INFORMATION_NOT_CONVEYED_OBJECT,
    /* [2.1.1.4] */        INFORMATION_NOT_CONVEYED_SCRIPT,
    /* [2.1.1.5] */        INFORMATION_NOT_CONVEYED_INPUT,
    /* [2.2.1.1] */        COLOR_CONTRAST_TEXT,
    /* [2.2.1.2] */        COLOR_CONTRAST_LINK,
    /* [2.2.1.3] */        COLOR_CONTRAST_ACTIVE_LINK,
    /* [2.2.1.4] */        COLOR_CONTRAST_VISITED_LINK,
    /* [3.2.1.1] */        DOCTYPE_MISSING,
    /* [3.3.1.1] */        STYLE_SHEET_CONTROL_PRESENTATION,
    /* [3.5.1.1] */        HEADERS_IMPROPERLY_NESTED,
    /* [3.5.2.1] */        POTENTIAL_HEADER_BOLD,
    /* [3.5.2.2] */        POTENTIAL_HEADER_ITALICS,
    /* [3.5.2.3] */        POTENTIAL_HEADER_UNDERLINE,
    /* [3.5.3.1] */        HEADER_USED_FORMAT_TEXT,
    /* [3.6.1.1] */        LIST_USAGE_INVALID_UL,
    /* [3.6.1.2] */        LIST_USAGE_INVALID_OL,
    /* [3.6.1.4] */        LIST_USAGE_INVALID_LI,
    /* [4.1.1.1] */        INDICATE_CHANGES_IN_LANGUAGE,
    /* [4.3.1.1] */        LANGUAGE_NOT_IDENTIFIED,
    /* [4.3.1.1] */        LANGUAGE_INVALID,
    /* [5.1.2.1] */        DATA_TABLE_MISSING_HEADERS,
    /* [5.1.2.2] */        DATA_TABLE_MISSING_HEADERS_COLUMN,
    /* [5.1.2.3] */        DATA_TABLE_MISSING_HEADERS_ROW,
    /* [5.2.1.1] */        DATA_TABLE_REQUIRE_MARKUP_COLUMN_HEADERS,
    /* [5.2.1.2] */        DATA_TABLE_REQUIRE_MARKUP_ROW_HEADERS,
    /* [5.3.1.1] */        LAYOUT_TABLES_LINEARIZE_PROPERLY,
    /* [5.4.1.1] */        LAYOUT_TABLE_INVALID_MARKUP,
    /* [5.5.1.1] */        TABLE_MISSING_SUMMARY,
    /* [5.5.1.2] */        TABLE_SUMMARY_INVALID_NULL,
    /* [5.5.1.3] */        TABLE_SUMMARY_INVALID_SPACES,
    /* [5.5.1.6] */        TABLE_SUMMARY_INVALID_PLACEHOLDER,
    /* [5.5.2.1] */        TABLE_MISSING_CAPTION,
    /* [5.6.1.1] */        TABLE_MAY_REQUIRE_HEADER_ABBR,
    /* [5.6.1.2] */        TABLE_MAY_REQUIRE_HEADER_ABBR_NULL,
    /* [5.6.1.3] */        TABLE_MAY_REQUIRE_HEADER_ABBR_SPACES,
    /* [6.1.1.1] */        STYLESHEETS_REQUIRE_TESTING_LINK,
    /* [6.1.1.2] */        STYLESHEETS_REQUIRE_TESTING_STYLE_ELEMENT,
    /* [6.1.1.3] */        STYLESHEETS_REQUIRE_TESTING_STYLE_ATTR,
    /* [6.2.1.1] */        FRAME_SRC_INVALID,
    /* [6.2.2.1] */        TEXT_EQUIVALENTS_REQUIRE_UPDATING_APPLET,
    /* [6.2.2.2] */        TEXT_EQUIVALENTS_REQUIRE_UPDATING_SCRIPT,
    /* [6.2.2.3] */        TEXT_EQUIVALENTS_REQUIRE_UPDATING_OBJECT,
    /* [6.3.1.1] */        PROGRAMMATIC_OBJECTS_REQUIRE_TESTING_SCRIPT,
    /* [6.3.1.2] */        PROGRAMMATIC_OBJECTS_REQUIRE_TESTING_OBJECT,
    /* [6.3.1.3] */        PROGRAMMATIC_OBJECTS_REQUIRE_TESTING_EMBED,
    /* [6.3.1.4] */        PROGRAMMATIC_OBJECTS_REQUIRE_TESTING_APPLET,
    /* [6.5.1.1] */        FRAME_MISSING_NOFRAMES,
    /* [6.5.1.2] */        NOFRAMES_INVALID_NO_VALUE,
    /* [6.5.1.3] */        NOFRAMES_INVALID_CONTENT,
    /* [6.5.1.4] */        NOFRAMES_INVALID_LINK,
    /* [7.1.1.1] */        REMOVE_FLICKER_SCRIPT,
    /* [7.1.1.2] */        REMOVE_FLICKER_OBJECT,
    /* [7.1.1.3] */        REMOVE_FLICKER_EMBED,
    /* [7.1.1.4] */        REMOVE_FLICKER_APPLET,
    /* [7.1.1.5] */        REMOVE_FLICKER_ANIMATED_GIF,
    /* [7.2.1.1] */        REMOVE_BLINK_MARQUEE,
    /* [7.4.1.1] */        REMOVE_AUTO_REFRESH,
    /* [7.5.1.1] */        REMOVE_AUTO_REDIRECT,
    /* [8.1.1.1] */        ENSURE_PROGRAMMATIC_OBJECTS_ACCESSIBLE_SCRIPT,
    /* [8.1.1.2] */        ENSURE_PROGRAMMATIC_OBJECTS_ACCESSIBLE_OBJECT,
    /* [8.1.1.3] */        ENSURE_PROGRAMMATIC_OBJECTS_ACCESSIBLE_APPLET,
    /* [8.1.1.4] */        ENSURE_PROGRAMMATIC_OBJECTS_ACCESSIBLE_EMBED,
    /* [9.1.1.1] */        IMAGE_MAP_SERVER_SIDE_REQUIRES_CONVERSION,
    /* [9.3.1.1] */        SCRIPT_NOT_KEYBOARD_ACCESSIBLE_ON_MOUSE_DOWN,
    /* [9.3.1.2] */        SCRIPT_NOT_KEYBOARD_ACCESSIBLE_ON_MOUSE_UP,
    /* [9.3.1.3] */        SCRIPT_NOT_KEYBOARD_ACCESSIBLE_ON_CLICK,
    /* [9.3.1.4] */        SCRIPT_NOT_KEYBOARD_ACCESSIBLE_ON_MOUSE_OVER,
    /* [9.3.1.5] */        SCRIPT_NOT_KEYBOARD_ACCESSIBLE_ON_MOUSE_OUT,
    /* [9.3.1.6] */        SCRIPT_NOT_KEYBOARD_ACCESSIBLE_ON_MOUSE_MOVE,
    /* [10.1.1.1] */       NEW_WINDOWS_REQUIRE_WARNING_NEW,
    /* [10.1.1.2] */       NEW_WINDOWS_REQUIRE_WARNING_BLANK,
    /* [10.2.1.1] */       LABEL_NEEDS_REPOSITIONING_BEFORE_INPUT,
    /* [10.2.1.2] */       LABEL_NEEDS_REPOSITIONING_AFTER_INPUT,
    /* [10.4.1.1] */       FORM_CONTROL_REQUIRES_DEFAULT_TEXT,
    /* [10.4.1.2] */       FORM_CONTROL_DEFAULT_TEXT_INVALID_NULL,
    /* [10.4.1.3] */       FORM_CONTROL_DEFAULT_TEXT_INVALID_SPACES,
    /* [11.2.1.1] */       REPLACE_DEPRECATED_HTML_APPLET,
    /* [11.2.1.2] */       REPLACE_DEPRECATED_HTML_BASEFONT,
    /* [11.2.1.3] */       REPLACE_DEPRECATED_HTML_CENTER,
    /* [11.2.1.4] */       REPLACE_DEPRECATED_HTML_DIR,
    /* [11.2.1.5] */       REPLACE_DEPRECATED_HTML_FONT,
    /* [11.2.1.6] */       REPLACE_DEPRECATED_HTML_ISINDEX,
    /* [11.2.1.7] */       REPLACE_DEPRECATED_HTML_MENU,
    /* [11.2.1.8] */       REPLACE_DEPRECATED_HTML_S,
    /* [11.2.1.9] */       REPLACE_DEPRECATED_HTML_STRIKE,
    /* [11.2.1.10] */      REPLACE_DEPRECATED_HTML_U,
    /* [12.1.1.1] */       FRAME_MISSING_TITLE,
    /* [12.1.1.2] */       FRAME_TITLE_INVALID_NULL,
    /* [12.1.1.3] */       FRAME_TITLE_INVALID_SPACES,
    /* [12.4.1.1] */       ASSOCIATE_LABELS_EXPLICITLY,
    /* [12.4.1.2] */       ASSOCIATE_LABELS_EXPLICITLY_FOR,
    /* [12.4.1.3] */       ASSOCIATE_LABELS_EXPLICITLY_ID,
    /* [13.1.1.1] */       LINK_TEXT_NOT_MEANINGFUL,
    /* [13.1.1.2] */       LINK_TEXT_MISSING,
    /* [13.1.1.3] */       LINK_TEXT_TOO_LONG,
    /* [13.1.1.4] */       LINK_TEXT_NOT_MEANINGFUL_CLICK_HERE,
    /* [13.1.1.5] */       LINK_TEXT_NOT_MEANINGFUL_MORE,
    /* [13.1.1.6] */       LINK_TEXT_NOT_MEANINGFUL_FOLLOW_THIS,
    /* [13.2.1.1] */       METADATA_MISSING,
    /* [13.2.1.2] */       METADATA_MISSING_LINK,
    /* [13.2.1.3] */       METADATA_MISSING_REDIRECT_AUTOREFRESH,
    /* [13.10.1.1] */      SKIPOVER_ASCII_ART,
    
    LAST_ACCESS_ERR    /* must be last */
};
255 | 
256 | 
257 | /************************************************************
258 | * AccessibilityChecks
259 | *
260 | * Traverses through the individual nodes of the tree
261 | * and checks attributes and elements for accessibility.
262 | * Must be called after the tree structure has been formed.
263 | ************************************************************/
264 | 
265 | void AccessibilityChecks( TidyDocImpl* doc );
266 | 
267 | 
268 | #endif /* SUPPORT_ACCESSIBILITY_CHECKS */
269 | #endif /* __ACCESS_H__ */
270 | 


--------------------------------------------------------------------------------
/juniperncprompt.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python2
  2 | # If this doesn't run for you, replace python2 with python
  3 | 
  4 | ###################
  5 | # Copyright 2011 Joseph Henrich (crimsonknave@gmail.com)
  6 | #
  7 | # This program is free software: you can redistribute it and/or modify
  8 | # it under the terms of the GNU General Public License as published by
  9 | # the Free Software Foundation, either version 3 of the License, or
 10 | # (at your option) any later version.
 11 | #
 12 | # This program is distributed in the hope that it will be useful,
 13 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
 14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 15 | # GNU General Public License for more details.
 16 | #
 17 | # You should have received a copy of the GNU General Public License
 18 | # along with this program.  If not, see <http://www.gnu.org/licenses/>
 19 | #
 20 | ######################
 21 | 
 22 | import sys
 23 | import getpass
 24 | import argparse
 25 | import urllib2, urllib, cookielib
 26 | import os
 27 | import pexpect
 28 | import time
 29 | from elementtidy import TidyHTMLTreeBuilder
 30 | 
 31 | 
 32 | 
 33 | # The defaults are what my set up expected, change them if you want
 34 | parser = argparse.ArgumentParser(description="Set up the vpn tunnel.")
 35 | parser.add_argument('hostname', help="The hostname of the vpn server")
 36 | parser.add_argument('-u', '--username', default=None)
 37 | parser.add_argument('--password-fields', help="What are the password fields required by your vpn site.  Delimited by commas please (no spaces)", default="password,password#2")
 38 | parser.add_argument('-r', '--realm', help="What realm are we using.  This will be a hidden field in the web form on the vpn site", default="Internal Users")
 39 | parser.add_argument('--login-path', help="The path to the login page (What the submit button points to)", default="/dana-na/auth/url_2/login.cgi")
 40 | parser.add_argument('--nc-path', help="Where the juniper network connect files are located", default="{}/.juniper_networks/network_connect".format(os.environ["HOME"]))
 41 | parser.add_argument('--logout-path', help="The path to the logout call, so we don't leave sessions trailing behind us.", default="/dana-na/auth/logout.cgi")
 42 | parser.add_argument('--cert', help="The location of the cert file to use with ncui", default="{}/.juniper_networks/network_connect/ssl.crt".format(os.environ["HOME"]))
 43 | parser.add_argument('--out-file', help="If an error occurs where should the page be written to for review", default="/tmp/juniperncprompt_error.html")
 44 | 
 45 | 
 46 | def find_sessions(base, par=None):
 47 |   values = [x.text for x in base.getchildren()]
 48 |   if values[1:5] == ['Login IP Address', 'Login Time', 'Idle Time', 'Browser']:
 49 |     return par
 50 | 
 51 |   for child in base.getchildren():
 52 |     answer = find_sessions(child, base)
 53 |     if answer is not None:
 54 |       return answer
 55 | 
 56 | 
 57 | def find_by_name(base, name):
 58 |   try:
 59 |     if dict(base.items())['name'] == name:
 60 |       return base
 61 |   except KeyError, AttributeError:
 62 |     pass
 63 | 
 64 |   for child in base.getchildren():
 65 |     answer = find_by_name(child, name)
 66 |     if answer is not None:
 67 |       return answer
 68 | 
 69 | def find_session_values(table):
 70 |   return [dict(tr.getchildren()[0].getchildren()[0].items()) for tr in table.getchildren()[1:]]
 71 | 
 72 | def display_session(table):
 73 |   i = 0
 74 |   for tr in table.getchildren():
 75 |     if i == 0:
 76 |       print(u"     "+u"".join([u"{:<30}".format(text) for text in [td.text for td in tr.getchildren()][1:5]]))
 77 |     else:
 78 |       print(u"{:<3}: ".format(i)+u"".join([u"{:<30}".format(text) for text in [td.text for td in tr.getchildren()][1:5]]))
 79 |     i += 1
 80 | 
 81 | 
 82 | class JuniperNCPrompt:
 83 |   def __init__(self):
 84 |     self.args = parser.parse_args()
 85 |     if not self.args.username:
 86 |       self.get_user()
 87 | 
 88 |     self.passwords = self.get_passwords()
 89 |     self.data = self.configure_data(self.passwords)
 90 | 
 91 |     self.cj = cookielib.CookieJar()
 92 |     self.opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(self.cj))
 93 | 
 94 |   def parse_error(self, html):
 95 |     tree = TidyHTMLTreeBuilder.parse(html)
 96 |     prefix = '{http://www.w3.org/1999/xhtml}'
 97 |     root = tree.getroot()
 98 | 
 99 |     self.form = root.find('{}body/{}form'.format(prefix, prefix))
100 |     # Some of the returned html has a blockquote before the form, some don't
101 |     if self.form is None:
102 |       self.form = root.find('{}body/{}blockquote/{}form'.format(*[prefix]*3))
103 |     if self.form is not None:
104 |       fields = dict(self.form.items())
105 |       if fields['name'] == 'frmConfirmation':
106 |         # Existing sessions open
107 |         print("There are existing sessions open")
108 |         #submit = form.find('{}table/{}tr/{}td/{}table/{}tr/{}td/{}table/{}tr/{}td/{}table/{}tr/{}td/{}input'.format(*[prefix]*13))
109 |         submit = find_by_name(self.form, 'btnContinue')
110 |         values = dict(submit.items())
111 |         table = find_sessions(self.form)
112 |         display_session(table)
113 |         if values["value"] == "Close Selected Sessions and Log in":
114 |           self.close_sessions(table, True)
115 |         elif values["value"] == "Log in (and optionally Close Selected Sessions)":
116 |           self.close_sessions(table)
117 |         session = self.get_session()
118 |         if session is not None:
119 |           return session
120 |         else:
121 |           return self.parse_error(self.args.out_file)
122 |       elif fields['name'] == 'frmLogin':
123 | 	print "Found login form, looking post-auth message"
124 | 	# This may be the post-login form
125 | 	# check whether there's a hidden field with a key
126 | 	#print tostring(self.form)
127 | 	temp = self.form.getiterator('{}input'.format(prefix))
128 | 	isSecondary = 0
129 | 	if temp is not None:
130 | 	  for t in temp:
131 | 	    #print tostring(t)
132 | 	    if t.get('name') == 'key':
133 | 	      key = t.get('value')
134 | 	      isSecondary = 1
135 | 	      break
136 | 
137 | 	if isSecondary == 1:
138 |           dat = {"key": key, "sn-postauth-proceed": "Proceed"}
139 | 	  #print dat
140 | 	  self.opener.open("https://{}{}".format(self.args.hostname, self.args.login_path), urllib.urlencode(dat))
141 | 	  #print 'Attempting to get session'
142 | 	  session = self.get_session()
143 | 	  if session is not None:
144 | 	    return session
145 | 	  else:
146 | 	    return self.parse_error(self.args.out_file)
147 | 	else:
148 | 	  # Invalid user/pass try again
149 | 	  print("Invalid user/pass, please try again")
150 | 	  self.get_user()
151 | 	  passwords = self.get_passwords()
152 | 	  self.data = self.configure_data(passwords)
153 | 	  self.log_in()
154 | 	  session = self.get_session()
155 | 	  if session is not None:
156 | 	    return session
157 | 	  else:
158 | 	    return self.parse_error(self.args.out_file)
159 |       elif fields['name'] == 'frmNextToken':
160 |         # Wait till the next token pops up and then enter it
161 |         temp = self.form.find('{}/input'.format(prefix))
162 |         if temp is not None:
163 |           values = dict(temp.items())
164 |           try:
165 |             if values['name'] == 'key':
166 |               key = values['value']
167 |             else:
168 |               print("Unable to find the key for the next token form, either we detected the form incorrectly or somthing went wrong.")
169 |               return
170 |           except KeyError:
171 |             return
172 |           password = getpass("Please enter the next securID token to appear on your fob")
173 |           self.data = self.configure_data({"password":password})
174 |           #self.opener.open("https://{}{}".format(self.args.hostname, self.args.login_path), self.data)
175 |           self.log_in()
176 |           session = self.get_session()
177 | 
178 | 
179 |       else:
180 |         # Unknown case, note where the file is so they can see what happened
181 |         print("An unhandled case has come up.  Please view the page at {}".format(self.args.out_file))
182 |     else:
183 |       print("Unable to parse the html, please view it at {}".format(self.args.out_file))
184 | 
185 |   def close_sessions(self, table, required=False):
186 |     if required:
187 |       reply = raw_input("Sessions maxed out, select at least one to close (space delimited)")
188 |     else:
189 |       reply = raw_input("Close any sessions you wish to, or log in with out closing sessions by typing 'n' (space delimited)")
190 |       if reply.strip() == 'n':
191 |         reply = ""
192 | 
193 |     try:
194 |       to_close = [int(x)-1 for x in reply.split()]
195 |     except ValueError:
196 |       reply = False
197 |       display_session(table)
198 |       self.close_sessions(table, required)
199 |       return
200 | 
201 |     button = dict(find_by_name(self.form, "btnContinue").items())
202 |     form_data_str = dict(find_by_name(self.form, "FormDataStr").items())
203 | 
204 |     sessions = [dict(tr.getchildren()[0].getchildren()[0].items()) for tr in table.getchildren()[1:]]
205 |     sessions_to_close = [(x['name'], x['value']) for x in [sessions[y] for y in to_close]]
206 |     print("Closing {} which turns out to be {}".format(to_close, sessions_to_close))
207 |     if to_close:
208 |       # We want the FormDataStr to be the last parameter
209 |       base_data = [(button['name'], button['value'])]
210 |       base_data.extend(sessions_to_close)
211 |       base_data.append((form_data_str['name'], form_data_str['value']))
212 |     else:
213 |       base_data = [(button['name'],button['value']), (form_data_str['name'],form_data_str['value'])]
214 |     self.data = urllib.urlencode(base_data)
215 |     self.log_in()
216 | 
217 |     
218 | 
219 | 
220 | 
221 |   def log_out(self):
222 |     print("Logging out now")
223 |     try:
224 |       self.latest_response = self.opener.open("https://{}{}".format(self.args.hostname, self.args.logout_path))
225 |       if self.latest_response.getcode() != 200:
226 |         print("Got a non 200 back ({}), there may be a session still around.".format(self.latest_response.getcode()))
227 |     except Exception, e:
228 |       print("We tried to log out, but were unable to, there may be a lingering session...")
229 | 
230 |   def get_passwords(self):
231 |     passwords = {}
232 |     for pass_name in self.args.password_fields.split(','):
233 |       passwords[pass_name] = getpass.getpass("'"+pass_name+"':")
234 |     return passwords
235 | 
236 |   def get_user(self):
237 |     self.args.username = raw_input("Please enter your username: ")
238 | 
239 |   def log_in(self):
240 |     self.latest_response = self.opener.open("https://{}{}".format(self.args.hostname, self.args.login_path), self.data)
241 | 
242 |   def get_session(self):
243 |     cookies = self.cj._cookies
244 |     try:
245 |       session = cookies[self.args.hostname]['/']['DSID'].value
246 |       return session
247 |     except KeyError, e:
248 |       out_file = open(self.args.out_file, 'w')
249 |       out_file.write(self.latest_response.read())
250 |       out_file.close()
251 |       print("Something went wrong, html of last page is written to {}".format(self.args.out_file))
252 |       session = self.parse_error(self.args.out_file)
253 |       if session:
254 |         return session
255 |       else:
256 |         print("Couldn't find any cookies for {}, code was {}, body written to {}".format(self.args.hostname, self.latest_response.getcode(), self.args.out_file))
257 |         sys.exit(1)
258 | 
259 |   def run_ncui(self, session):
260 |     command = "{}/ncui -h {} -c DSID={} -f {}".format(self.args.nc_path, self.args.hostname, session, self.args.cert)
261 |     print("Got the session ({}) creating the tunnel now, use Ctrl+C when you are done.".format(session))
262 |     child = pexpect.spawn(command)
263 |     child.expect('Password:')
264 |     child.sendline("")
265 |     #print child.read()
266 |     while child.isalive():
267 |       #We don't expect the child to die, but we certainly should exit if it does
268 |       time.sleep(1)
269 | 
270 |   def configure_data(self, passwords):
271 |     # By not putting this in the __init__ method we don't risk something wonky
272 |     # happening and we have extra password fields from one time to another
273 |     # This is only likly to happen if this is imported as a module
274 |     params = {"username": self.args.username, "realm": self.args.realm, "btnSubmit": "Sign In"}
275 |     params.update(passwords)
276 |     return urllib.urlencode(params)
277 | 
278 | 
if __name__ == "__main__":
  # Bound before the try block so the handlers below can tell whether the
  # constructor got far enough for there to be anything to log out of.
  attempt = None
  try:
    attempt = JuniperNCPrompt()

    attempt.log_in()
    session = attempt.get_session()
    attempt.run_ncui(session)

    print("Done!")
    attempt.log_out()
  except Exception as e:
    print("Uh-oh, we got an exception: {} cleaning up now".format(e))
    print(sys.exc_info()[0])
    if attempt is not None:
      attempt.log_out()
  except KeyboardInterrupt:
    # Ctrl+C is the normal way to end the tunnel; just clean up
    if attempt is not None:
      attempt.log_out()
297 | 
298 | 


--------------------------------------------------------------------------------
/elementtidy-1.0-20050212/tidylib/src/entities.c:
--------------------------------------------------------------------------------
  1 | /* entities.c -- recognize HTML ISO entities
  2 | 
  3 |   (c) 1998-2003 (W3C) MIT, ERCIM, Keio University
  4 |   See tidy.h for the copyright notice.
  5 | 
  6 |   CVS Info :
  7 | 
  8 |     $LastChangedBy$ 
  9 |     $LastChangedDate$ 
 10 |     $LastChangedRevision$ 
 11 | 
 12 |   Entity handling can be static because there are no config or
 13 |   document-specific values.  Lookup table is 100% defined at 
 14 |   compile time.
 15 | 
 16 | */
 17 | 
 18 | #include <stdio.h>
 19 | #include "entities.h"
 20 | #include "tidy-int.h"
 21 | #include "tmbstr.h"
 22 | 
 23 | struct _entity;
 24 | typedef struct _entity entity;
 25 | 
 26 | struct _entity
 27 | {
 28 |     ctmbstr name;
 29 |     uint    versions;
 30 |     uint    code;
 31 | };
 32 | 
 33 | 
/* Table of all named entities known to this file.
** Each row gives the entity name (no leading '&'), the VERS_* bits
** that are tested against the caller's versions, and the character code.
** The table is terminated by a row whose name is NULL.
*/
static const entity entities[] =
{
    /*
    ** Markup pre-defined character entities
    */
    { "quot",    VERS_ALL|VERS_XML,    34 },
    { "amp",     VERS_ALL|VERS_XML,    38 },
    { "apos",    VERS_FROM40|VERS_XML, 39 },
    { "lt",      VERS_ALL|VERS_XML,    60 },
    { "gt",      VERS_ALL|VERS_XML,    62 },

    /*
    ** Latin-1 character entities
    */
    { "nbsp",     VERS_ALL,      160 },
    { "iexcl",    VERS_ALL,      161 },
    { "cent",     VERS_ALL,      162 },
    { "pound",    VERS_ALL,      163 },
    { "curren",   VERS_ALL,      164 },
    { "yen",      VERS_ALL,      165 },
    { "brvbar",   VERS_ALL,      166 },
    { "sect",     VERS_ALL,      167 },
    { "uml",      VERS_ALL,      168 },
    { "copy",     VERS_ALL,      169 },
    { "ordf",     VERS_ALL,      170 },
    { "laquo",    VERS_ALL,      171 },
    { "not",      VERS_ALL,      172 },
    { "shy",      VERS_ALL,      173 },
    { "reg",      VERS_ALL,      174 },
    { "macr",     VERS_ALL,      175 },
    { "deg",      VERS_ALL,      176 },
    { "plusmn",   VERS_ALL,      177 },
    { "sup2",     VERS_ALL,      178 },
    { "sup3",     VERS_ALL,      179 },
    { "acute",    VERS_ALL,      180 },
    { "micro",    VERS_ALL,      181 },
    { "para",     VERS_ALL,      182 },
    { "middot",   VERS_ALL,      183 },
    { "cedil",    VERS_ALL,      184 },
    { "sup1",     VERS_ALL,      185 },
    { "ordm",     VERS_ALL,      186 },
    { "raquo",    VERS_ALL,      187 },
    { "frac14",   VERS_ALL,      188 },
    { "frac12",   VERS_ALL,      189 },
    { "frac34",   VERS_ALL,      190 },
    { "iquest",   VERS_ALL,      191 },
    { "Agrave",   VERS_ALL,      192 },
    { "Aacute",   VERS_ALL,      193 },
    { "Acirc",    VERS_ALL,      194 },
    { "Atilde",   VERS_ALL,      195 },
    { "Auml",     VERS_ALL,      196 },
    { "Aring",    VERS_ALL,      197 },
    { "AElig",    VERS_ALL,      198 },
    { "Ccedil",   VERS_ALL,      199 },
    { "Egrave",   VERS_ALL,      200 },
    { "Eacute",   VERS_ALL,      201 },
    { "Ecirc",    VERS_ALL,      202 },
    { "Euml",     VERS_ALL,      203 },
    { "Igrave",   VERS_ALL,      204 },
    { "Iacute",   VERS_ALL,      205 },
    { "Icirc",    VERS_ALL,      206 },
    { "Iuml",     VERS_ALL,      207 },
    { "ETH",      VERS_ALL,      208 },
    { "Ntilde",   VERS_ALL,      209 },
    { "Ograve",   VERS_ALL,      210 },
    { "Oacute",   VERS_ALL,      211 },
    { "Ocirc",    VERS_ALL,      212 },
    { "Otilde",   VERS_ALL,      213 },
    { "Ouml",     VERS_ALL,      214 },
    { "times",    VERS_ALL,      215 },
    { "Oslash",   VERS_ALL,      216 },
    { "Ugrave",   VERS_ALL,      217 },
    { "Uacute",   VERS_ALL,      218 },
    { "Ucirc",    VERS_ALL,      219 },
    { "Uuml",     VERS_ALL,      220 },
    { "Yacute",   VERS_ALL,      221 },
    { "THORN",    VERS_ALL,      222 },
    { "szlig",    VERS_ALL,      223 },
    { "agrave",   VERS_ALL,      224 },
    { "aacute",   VERS_ALL,      225 },
    { "acirc",    VERS_ALL,      226 },
    { "atilde",   VERS_ALL,      227 },
    { "auml",     VERS_ALL,      228 },
    { "aring",    VERS_ALL,      229 },
    { "aelig",    VERS_ALL,      230 },
    { "ccedil",   VERS_ALL,      231 },
    { "egrave",   VERS_ALL,      232 },
    { "eacute",   VERS_ALL,      233 },
    { "ecirc",    VERS_ALL,      234 },
    { "euml",     VERS_ALL,      235 },
    { "igrave",   VERS_ALL,      236 },
    { "iacute",   VERS_ALL,      237 },
    { "icirc",    VERS_ALL,      238 },
    { "iuml",     VERS_ALL,      239 },
    { "eth",      VERS_ALL,      240 },
    { "ntilde",   VERS_ALL,      241 },
    { "ograve",   VERS_ALL,      242 },
    { "oacute",   VERS_ALL,      243 },
    { "ocirc",    VERS_ALL,      244 },
    { "otilde",   VERS_ALL,      245 },
    { "ouml",     VERS_ALL,      246 },
    { "divide",   VERS_ALL,      247 },
    { "oslash",   VERS_ALL,      248 },
    { "ugrave",   VERS_ALL,      249 },
    { "uacute",   VERS_ALL,      250 },
    { "ucirc",    VERS_ALL,      251 },
    { "uuml",     VERS_ALL,      252 },
    { "yacute",   VERS_ALL,      253 },
    { "thorn",    VERS_ALL,      254 },
    { "yuml",     VERS_ALL,      255 },

    /*
    ** Extended Entities defined in HTML 4: Symbols
    */
    { "fnof",     VERS_FROM40,   402 },
    { "Alpha",    VERS_FROM40,   913 },
    { "Beta",     VERS_FROM40,   914 },
    { "Gamma",    VERS_FROM40,   915 },
    { "Delta",    VERS_FROM40,   916 },
    { "Epsilon",  VERS_FROM40,   917 },
    { "Zeta",     VERS_FROM40,   918 },
    { "Eta",      VERS_FROM40,   919 },
    { "Theta",    VERS_FROM40,   920 },
    { "Iota",     VERS_FROM40,   921 },
    { "Kappa",    VERS_FROM40,   922 },
    { "Lambda",   VERS_FROM40,   923 },
    { "Mu",       VERS_FROM40,   924 },
    { "Nu",       VERS_FROM40,   925 },
    { "Xi",       VERS_FROM40,   926 },
    { "Omicron",  VERS_FROM40,   927 },
    { "Pi",       VERS_FROM40,   928 },
    { "Rho",      VERS_FROM40,   929 },
    { "Sigma",    VERS_FROM40,   931 },
    { "Tau",      VERS_FROM40,   932 },
    { "Upsilon",  VERS_FROM40,   933 },
    { "Phi",      VERS_FROM40,   934 },
    { "Chi",      VERS_FROM40,   935 },
    { "Psi",      VERS_FROM40,   936 },
    { "Omega",    VERS_FROM40,   937 },
    { "alpha",    VERS_FROM40,   945 },
    { "beta",     VERS_FROM40,   946 },
    { "gamma",    VERS_FROM40,   947 },
    { "delta",    VERS_FROM40,   948 },
    { "epsilon",  VERS_FROM40,   949 },
    { "zeta",     VERS_FROM40,   950 },
    { "eta",      VERS_FROM40,   951 },
    { "theta",    VERS_FROM40,   952 },
    { "iota",     VERS_FROM40,   953 },
    { "kappa",    VERS_FROM40,   954 },
    { "lambda",   VERS_FROM40,   955 },
    { "mu",       VERS_FROM40,   956 },
    { "nu",       VERS_FROM40,   957 },
    { "xi",       VERS_FROM40,   958 },
    { "omicron",  VERS_FROM40,   959 },
    { "pi",       VERS_FROM40,   960 },
    { "rho",      VERS_FROM40,   961 },
    { "sigmaf",   VERS_FROM40,   962 },
    { "sigma",    VERS_FROM40,   963 },
    { "tau",      VERS_FROM40,   964 },
    { "upsilon",  VERS_FROM40,   965 },
    { "phi",      VERS_FROM40,   966 },
    { "chi",      VERS_FROM40,   967 },
    { "psi",      VERS_FROM40,   968 },
    { "omega",    VERS_FROM40,   969 },
    { "thetasym", VERS_FROM40,   977 },
    { "upsih",    VERS_FROM40,   978 },
    { "piv",      VERS_FROM40,   982 },
    { "bull",     VERS_FROM40,  8226 },
    { "hellip",   VERS_FROM40,  8230 },
    { "prime",    VERS_FROM40,  8242 },
    { "Prime",    VERS_FROM40,  8243 },
    { "oline",    VERS_FROM40,  8254 },
    { "frasl",    VERS_FROM40,  8260 },
    { "weierp",   VERS_FROM40,  8472 },
    { "image",    VERS_FROM40,  8465 },
    { "real",     VERS_FROM40,  8476 },
    { "trade",    VERS_FROM40,  8482 },
    { "alefsym",  VERS_FROM40,  8501 },
    { "larr",     VERS_FROM40,  8592 },
    { "uarr",     VERS_FROM40,  8593 },
    { "rarr",     VERS_FROM40,  8594 },
    { "darr",     VERS_FROM40,  8595 },
    { "harr",     VERS_FROM40,  8596 },
    { "crarr",    VERS_FROM40,  8629 },
    { "lArr",     VERS_FROM40,  8656 },
    { "uArr",     VERS_FROM40,  8657 },
    { "rArr",     VERS_FROM40,  8658 },
    { "dArr",     VERS_FROM40,  8659 },
    { "hArr",     VERS_FROM40,  8660 },
    { "forall",   VERS_FROM40,  8704 },
    { "part",     VERS_FROM40,  8706 },
    { "exist",    VERS_FROM40,  8707 },
    { "empty",    VERS_FROM40,  8709 },
    { "nabla",    VERS_FROM40,  8711 },
    { "isin",     VERS_FROM40,  8712 },
    { "notin",    VERS_FROM40,  8713 },
    { "ni",       VERS_FROM40,  8715 },
    { "prod",     VERS_FROM40,  8719 },
    { "sum",      VERS_FROM40,  8721 },
    { "minus",    VERS_FROM40,  8722 },
    { "lowast",   VERS_FROM40,  8727 },
    { "radic",    VERS_FROM40,  8730 },
    { "prop",     VERS_FROM40,  8733 },
    { "infin",    VERS_FROM40,  8734 },
    { "ang",      VERS_FROM40,  8736 },
    { "and",      VERS_FROM40,  8743 },
    { "or",       VERS_FROM40,  8744 },
    { "cap",      VERS_FROM40,  8745 },
    { "cup",      VERS_FROM40,  8746 },
    { "int",      VERS_FROM40,  8747 },
    { "there4",   VERS_FROM40,  8756 },
    { "sim",      VERS_FROM40,  8764 },
    { "cong",     VERS_FROM40,  8773 },
    { "asymp",    VERS_FROM40,  8776 },
    { "ne",       VERS_FROM40,  8800 },
    { "equiv",    VERS_FROM40,  8801 },
    { "le",       VERS_FROM40,  8804 },
    { "ge",       VERS_FROM40,  8805 },
    { "sub",      VERS_FROM40,  8834 },
    { "sup",      VERS_FROM40,  8835 },
    { "nsub",     VERS_FROM40,  8836 },
    { "sube",     VERS_FROM40,  8838 },
    { "supe",     VERS_FROM40,  8839 },
    { "oplus",    VERS_FROM40,  8853 },
    { "otimes",   VERS_FROM40,  8855 },
    { "perp",     VERS_FROM40,  8869 },
    { "sdot",     VERS_FROM40,  8901 },
    { "lceil",    VERS_FROM40,  8968 },
    { "rceil",    VERS_FROM40,  8969 },
    { "lfloor",   VERS_FROM40,  8970 },
    { "rfloor",   VERS_FROM40,  8971 },
    { "lang",     VERS_FROM40,  9001 },
    { "rang",     VERS_FROM40,  9002 },
    { "loz",      VERS_FROM40,  9674 },
    { "spades",   VERS_FROM40,  9824 },
    { "clubs",    VERS_FROM40,  9827 },
    { "hearts",   VERS_FROM40,  9829 },
    { "diams",    VERS_FROM40,  9830 },

    /*
    ** Extended Entities defined in HTML 4: Special (less Markup at top)
    */
    { "OElig",    VERS_FROM40,   338 },
    { "oelig",    VERS_FROM40,   339 },
    { "Scaron",   VERS_FROM40,   352 },
    { "scaron",   VERS_FROM40,   353 },
    { "Yuml",     VERS_FROM40,   376 },
    { "circ",     VERS_FROM40,   710 },
    { "tilde",    VERS_FROM40,   732 },
    { "ensp",     VERS_FROM40,  8194 },
    { "emsp",     VERS_FROM40,  8195 },
    { "thinsp",   VERS_FROM40,  8201 },
    { "zwnj",     VERS_FROM40,  8204 },
    { "zwj",      VERS_FROM40,  8205 },
    { "lrm",      VERS_FROM40,  8206 },
    { "rlm",      VERS_FROM40,  8207 },
    { "ndash",    VERS_FROM40,  8211 },
    { "mdash",    VERS_FROM40,  8212 },
    { "lsquo",    VERS_FROM40,  8216 },
    { "rsquo",    VERS_FROM40,  8217 },
    { "sbquo",    VERS_FROM40,  8218 },
    { "ldquo",    VERS_FROM40,  8220 },
    { "rdquo",    VERS_FROM40,  8221 },
    { "bdquo",    VERS_FROM40,  8222 },
    { "dagger",   VERS_FROM40,  8224 },
    { "Dagger",   VERS_FROM40,  8225 },
    { "permil",   VERS_FROM40,  8240 },
    { "lsaquo",   VERS_FROM40,  8249 },
    { "rsaquo",   VERS_FROM40,  8250 },
    { "euro",     VERS_FROM40,  8364 },
    { NULL,       0,               0 }
};
306 | 
307 | 
308 | /* Pure static implementation.  Trades off lookup speed
309 | ** for faster setup time (well, none actually).
310 | ** Optimization of comparing 1st character buys enough
311 | ** speed that hash doesn't improve things without > 500
312 | ** items in list.
313 | */
314 | static const entity* lookup( ctmbstr s )
315 | {
316 |     tmbchar ch = (tmbchar)( s ? *s : 0 );
317 |     const entity *np;
318 |     for ( np = entities; ch && np && np->name; ++np )
319 |         if ( ch == *np->name && tmbstrcmp(s, np->name) == 0 )
320 |             return np;
321 |     return NULL;
322 | }
323 | 
324 | /* entity starting with "&" returns zero on error */
325 | uint EntityCode( ctmbstr name, uint versions )
326 | {
327 |     const entity* np;
328 |     assert( name && name[0] == '&' );
329 | 
330 |     /* numeric entitity: name = "&#" followed by number */
331 |     if ( name[1] == '#' )
332 |     {
333 |         uint c = 0;  /* zero on missing/bad number */
334 |         Bool isXml = ( (versions & VERS_XML) == VERS_XML );
335 | 
336 |         /* 'x' prefix denotes hexadecimal number format */
337 |         if ( name[2] == 'x' || (!isXml && name[2] == 'X') )
338 |             sscanf( name+3, "%x", &c );
339 |         else
340 |             sscanf( name+2, "%d", &c );
341 | 
342 |         return (uint) c;
343 |     }
344 | 
345 |    /* Named entity: name ="&" followed by a name */
346 |     if ( np = lookup(name+1) )
347 |     {
348 |         /* Only recognize entity name if version supports it.  */
349 |         if ( np->versions & versions )
350 |             return np->code;
351 |     }
352 | 
353 |     return 0;   /* zero signifies unknown entity name */
354 | }
355 | 
356 | Bool EntityInfo( ctmbstr name, Bool isXml, uint* code, uint* versions )
357 | {
358 |     const entity* np;
359 |     assert( name && name[0] == '&' );
360 |     assert( code != NULL );
361 |     assert( versions != NULL );
362 | 
363 |     /* numeric entitity: name = "&#" followed by number */
364 |     if ( name[1] == '#' )
365 |     {
366 |         uint c = 0;  /* zero on missing/bad number */
367 | 
368 |         /* 'x' prefix denotes hexadecimal number format */
369 |         if ( name[2] == 'x' || (!isXml && name[2] == 'X') )
370 |             sscanf( name+3, "%x", &c );
371 |         else
372 |             sscanf( name+2, "%d", &c );
373 | 
374 |         *code = c;
375 |         *versions = VERS_ALL;
376 |         return yes;
377 |     }
378 | 
379 |     /* Named entity: name ="&" followed by a name */
380 |     if ( np = lookup(name+1) )
381 |     {
382 |         *code = np->code;
383 |         *versions = np->versions;
384 |         return yes;
385 |     }
386 | 
387 |     *code = 0;
388 |     *versions = ( isXml ? VERS_XML : VERS_PROPRIETARY );
389 |     return no;
390 | }
391 | 
392 | 
393 | ctmbstr EntityName( uint ch, uint versions )
394 | {
395 |     ctmbstr entnam = NULL;
396 |     const entity *ep;
397 | 
398 |     for ( ep = entities; ep->name != NULL; ++ep )
399 |     {
400 |         if ( ep->code == ch )
401 |         {
402 |             if ( ep->versions & versions )
403 |                 entnam = ep->name;
404 |             break; /* Found code. Stop search. */
405 |         }
406 |     }
407 |     return entnam;
408 | }
409 | 


--------------------------------------------------------------------------------
/elementtidy-1.0-20050212/tidylib/include/platform.h:
--------------------------------------------------------------------------------
  1 | #ifndef __PLATFORM_H__
  2 | #define __PLATFORM_H__
  3 | 
  4 | /* platform.h -- Platform specifics
  5 | 
  6 |   (c) 1998-2003 (W3C) MIT, ERCIM, Keio University
  7 |   See tidy.h for the copyright notice.
  8 | 
  9 |   CVS Info :
 10 | 
 11 |     $LastChangedBy$ 
 12 |     $LastChangedDate$ 
 13 |     $LastChangedRevision$ 
 14 | 
 15 | */
 16 | 
 17 | #ifdef __cplusplus
 18 | extern "C" {
 19 | #endif
 20 | 
 21 | /*
 22 |   Uncomment and edit one of the following #defines if you
 23 |   want to specify the config file at compile-time.
 24 | */
 25 | 
 26 | /* #define CONFIG_FILE "/etc/tidy_config.txt" */ /* original */
 27 | /* #define CONFIG_FILE "/etc/tidyrc" */
 28 | /* #define CONFIG_FILE "/etc/tidy.conf" */
 29 | 
 30 | /*
 31 |   Uncomment the following #define if you are on a system
 32 |   supporting the HOME environment variable.
 33 |   It enables tidy to find config files named ~/.tidyrc if 
 34 |   the HTML_TIDY environment variable is not set.
 35 | */
 36 | /* #define USER_CONFIG_FILE "~/.tidyrc" */
 37 | 
 38 | /*
 39 |   Uncomment the following #define if your
 40 |   system supports the call getpwnam(). 
 41 |   E.g. Unix and Linux.
 42 | 
 43 |   It enables tidy to find files named 
 44 |   ~your/foo for use in the HTML_TIDY environment
 45 |   variable or CONFIG_FILE or USER_CONFIG_FILE or
 46 |   on the command line: -config ~joebob/tidy.cfg
 47 | 
 48 |   Contributed by Todd Lewis.
 49 | */
 50 | 
 51 | /* #define SUPPORT_GETPWNAM */
 52 | 
 53 | 
 54 | /* Enable/disable support for Big5 and Shift_JIS character encodings */
 55 | #ifndef SUPPORT_ASIAN_ENCODINGS
 56 | #define SUPPORT_ASIAN_ENCODINGS 1
 57 | #endif
 58 | 
 59 | /* Enable/disable support for UTF-16 character encodings */
 60 | #ifndef SUPPORT_UTF16_ENCODINGS
 61 | #define SUPPORT_UTF16_ENCODINGS 1
 62 | #endif
 63 | 
 64 | /* Enable/disable support for additional accessibility checks */
 65 | #ifndef SUPPORT_ACCESSIBILITY_CHECKS
 66 | #define SUPPORT_ACCESSIBILITY_CHECKS 1
 67 | #endif
 68 | 
 69 | 
 70 | /* Convenience defines for Mac platforms */
 71 | 
 72 | #if defined(macintosh)
 73 | /* Mac OS 6.x/7.x/8.x/9.x, with or without CarbonLib - MPW or Metrowerks 68K/PPC compilers */
 74 | #define MAC_OS_CLASSIC
 75 | #ifndef PLATFORM_NAME
 76 | #define PLATFORM_NAME "Mac OS"
 77 | #endif
 78 | 
 79 | /* needed for access() */
 80 | #if !defined(_POSIX) && !defined(NO_ACCESS_SUPPORT)
 81 | #define NO_ACCESS_SUPPORT
 82 | #endif
 83 | 
 84 | #ifdef SUPPORT_GETPWNAM
 85 | #undef SUPPORT_GETPWNAM
 86 | #endif
 87 | 
 88 | #elif defined(__APPLE__) && defined(__MACH__)
 89 | /* Mac OS X (client) 10.x (or server 1.x/10.x) - gcc or Metrowerks MachO compilers */
 90 | #define MAC_OS_X
 91 | #ifndef PLATFORM_NAME
 92 | #define PLATFORM_NAME "Mac OS X"
 93 | #endif
 94 | #endif
 95 | 
 96 | #if defined(MAC_OS_CLASSIC) || defined(MAC_OS_X)
 97 | /* Any OS on Mac platform */
 98 | #define MAC_OS
 99 | #define FILENAMES_CASE_SENSITIVE 0
100 | #define strcasecmp strcmp
101 | #ifndef DFLT_REPL_CHARENC
102 | #define DFLT_REPL_CHARENC MACROMAN
103 | #endif
104 | #endif
105 | 
106 | /* Convenience defines for BSD like platforms */
107 |  
108 | #if defined(__FreeBSD__)
109 | #define BSD_BASED_OS
110 | #ifndef PLATFORM_NAME
111 | #define PLATFORM_NAME "FreeBSD"
112 | #endif
113 | 
114 | #elif defined(__NetBSD__)
115 | #define BSD_BASED_OS
116 | #ifndef PLATFORM_NAME
117 | #define PLATFORM_NAME "NetBSD"
118 | #endif
119 | 
120 | #elif defined(__OpenBSD__)
121 | #define BSD_BASED_OS
122 | #ifndef PLATFORM_NAME
123 | #define PLATFORM_NAME "OpenBSD"
124 | #endif
125 | 
126 | #elif defined(__MINT__)
127 | #define BSD_BASED_OS
128 | #ifndef PLATFORM_NAME
129 | #define PLATFORM_NAME "FreeMiNT"
130 | #endif
131 | 
132 | #elif defined(__bsdi__)
133 | #define BSD_BASED_OS
134 | #ifndef PLATFORM_NAME
135 | #define PLATFORM_NAME "BSD/OS"
136 | #endif
137 | 
138 | #endif
139 | 
140 | /* Convenience defines for Windows platforms */
141 |  
142 | #if defined(WINDOWS) || defined(_WIN32)
143 | 
144 | #define WINDOWS_OS
145 | #ifndef PLATFORM_NAME
146 | #define PLATFORM_NAME "Windows"
147 | #endif
148 | 
149 | #if defined(__MWERKS__) || defined(__MSL__)
150 | /* not available with Metrowerks Standard Library */
151 | 
152 | #ifdef SUPPORT_GETPWNAM
153 | #undef SUPPORT_GETPWNAM
154 | #endif
155 | 
156 | /* needed for setmode() */
157 | #if !defined(NO_SETMODE_SUPPORT)
158 | #define NO_SETMODE_SUPPORT
159 | #endif
160 | 
161 | #define strcasecmp _stricmp
162 | 
163 | #endif
164 | 
165 | #define FILENAMES_CASE_SENSITIVE 0
166 | 
167 | #endif
168 | 
169 | /* Convenience defines for Linux platforms */
170 |  
171 | #if defined(linux) && defined(__alpha__)
172 | /* Linux on Alpha - gcc compiler */
173 | #define LINUX_OS
174 | #ifndef PLATFORM_NAME
175 | #define PLATFORM_NAME "Linux/Alpha"
176 | #endif
177 | 
178 | #elif defined(linux) && defined(__sparc__)
179 | /* Linux on Sparc - gcc compiler */
180 | #define LINUX_OS
181 | #ifndef PLATFORM_NAME
182 | #define PLATFORM_NAME "Linux/Sparc"
183 | #endif
184 | 
185 | #elif defined(linux) && (defined(__i386__) || defined(__i486__) || defined(__i586__) || defined(__i686__))
186 | /* Linux on x86 - gcc compiler */
187 | #define LINUX_OS
188 | #ifndef PLATFORM_NAME
189 | #define PLATFORM_NAME "Linux/x86"
190 | #endif
191 | 
192 | #elif defined(linux) && defined(__powerpc__)
193 | /* Linux on PPC - gcc compiler */
194 | #define LINUX_OS
195 | 
196 | #if defined(__linux__) && defined(__powerpc__)
197 | 
198 | /* #if #system(linux) */
199 | /* MkLinux on PPC  - gcc (egcs) compiler */
200 | /* #define MAC_OS_MKLINUX */
201 | #ifndef PLATFORM_NAME
202 | #define PLATFORM_NAME "MkLinux"
203 | #endif
204 | 
205 | #else
206 | 
207 | #ifndef PLATFORM_NAME
208 | #define PLATFORM_NAME "Linux/PPC"
209 | #endif
210 | 
211 | #endif
212 | 
213 | #elif defined(linux) || defined(__linux__)
214 | /* generic Linux */
215 | #define LINUX_OS
216 | #ifndef PLATFORM_NAME
217 | #define PLATFORM_NAME "Linux"
218 | #endif
219 | 
220 | #endif
221 | 
222 | /* Convenience defines for Solaris platforms */
223 |  
224 | #if defined(sun)
225 | #define SOLARIS_OS
226 | #ifndef PLATFORM_NAME
227 | #define PLATFORM_NAME "Solaris"
228 | #endif
229 | #endif
230 | 
231 | /* Convenience defines for HPUX + gcc platforms */
232 | 
233 | #if defined(__hpux)
234 | #define HPUX_OS
235 | #ifndef PLATFORM_NAME
236 | #define PLATFORM_NAME "HPUX"
237 | #endif
238 | #endif
239 | 
240 | /* Convenience defines for RISCOS + gcc platforms */
241 | 
242 | #if defined(__riscos__)
243 | #define RISC_OS
244 | #ifndef PLATFORM_NAME
245 | #define PLATFORM_NAME "RISC OS"
246 | #endif
247 | #endif
248 | 
249 | /* Convenience defines for OS/2 + icc/gcc platforms */
250 | 
251 | #if defined(__OS2__) || defined(__EMX__)
252 | #define OS2_OS
253 | #ifndef PLATFORM_NAME
254 | #define PLATFORM_NAME "OS/2"
255 | #endif
256 | #define FILENAMES_CASE_SENSITIVE 0
257 | #define strcasecmp stricmp
258 | #endif
259 | 
260 | /* Convenience defines for IRIX */
261 | 
262 | #if defined(__sgi)
263 | #define IRIX_OS
264 | #ifndef PLATFORM_NAME
265 | #define PLATFORM_NAME "SGI IRIX"
266 | #endif
267 | #endif
268 | 
269 | /* Convenience defines for AIX */
270 | 
271 | #if defined(_AIX)
272 | #define AIX_OS
273 | #ifndef PLATFORM_NAME
274 | #define PLATFORM_NAME "IBM AIX"
275 | #endif
276 | #endif
277 | 
278 | 
279 | /* Convenience defines for BeOS platforms */
280 | 
281 | #if defined(__BEOS__)
282 | #define BE_OS
283 | #ifndef PLATFORM_NAME
284 | #define PLATFORM_NAME "BeOS"
285 | #endif
286 | #endif
287 | 
288 | /* Convenience defines for Cygwin platforms */
289 | 
290 | #if defined(__CYGWIN__)
291 | #define CYGWIN_OS
292 | #ifndef PLATFORM_NAME
293 | #define PLATFORM_NAME "Cygwin"
294 | #endif
295 | #define FILENAMES_CASE_SENSITIVE 0
296 | #endif
297 | 
298 | /* Convenience defines for OpenVMS */
299 | 
300 | #if defined(__VMS)
301 | #define OPENVMS_OS
302 | #ifndef PLATFORM_NAME
303 | #define PLATFORM_NAME "OpenVMS"
304 | #endif
305 | #define FILENAMES_CASE_SENSITIVE 0
306 | #endif
307 | 
308 | /* Convenience defines for DEC Alpha OSF + gcc platforms */
309 | 
310 | #if defined(__osf__)
311 | #define OSF_OS
312 | #ifndef PLATFORM_NAME
313 | #define PLATFORM_NAME "DEC Alpha OSF"
314 | #endif
315 | #endif
316 | 
317 | /* Convenience defines for ARM platforms */
318 | 
319 | #if defined(__arm)
320 | #define ARM_OS
321 | 
322 | #if defined(forARM) && defined(__NEWTON_H)
323 | 
324 | /* Using Newton C++ Tools ARMCpp compiler */
325 | #define NEWTON_OS
326 | #ifndef PLATFORM_NAME
327 | #define PLATFORM_NAME "Newton"
328 | #endif
329 | 
330 | #else
331 | 
332 | #ifndef PLATFORM_NAME
333 | #define PLATFORM_NAME "ARM"
334 | #endif
335 | 
336 | #endif
337 | 
338 | #endif
339 | 
340 | #include <ctype.h>
341 | #include <stdio.h>
342 | #include <setjmp.h>  /* for longjmp on error exit */
343 | #include <stdlib.h>
344 | #include <stdarg.h>  /* may need <varargs.h> for Unix V */
345 | #include <string.h>
346 | #include <assert.h>
347 | 
348 | #ifdef NEEDS_MALLOC_H
349 | #include <malloc.h>
350 | #endif
351 | 
352 | #ifdef SUPPORT_GETPWNAM
353 | #include <pwd.h>
354 | #endif
355 | 
356 | #ifdef NEEDS_UNISTD_H
357 | #include <unistd.h>  /* needed for unlink on some Unix systems */
358 | #endif
359 | 
360 | /* This can be set at compile time.  Usually Windows,
361 | ** except for Macintosh builds.
362 | */
363 | #ifndef DFLT_REPL_CHARENC
364 | #define DFLT_REPL_CHARENC WIN1252
365 | #endif
366 | 
367 | /* By default, use case-sensitive filename comparison.
368 | */
369 | #ifndef FILENAMES_CASE_SENSITIVE
370 | #define FILENAMES_CASE_SENSITIVE 1
371 | #endif
372 | 
373 | 
374 | /*
375 |   Tidy preserves the last modified time for the files it
376 |   cleans up.
377 | */
378 | 
379 | /*
  If your platform doesn't support <utime.h> and the
  utime() function, or <sys/utime.h> and the futime()
  function, then set PRESERVE_FILE_TIMES to 0.
383 |   
384 |   If your platform doesn't support <sys/utime.h> and the
385 |   futime() function, then set HAS_FUTIME to 0.
386 |   
387 |   If your platform supports <utime.h> and the
388 |   utime() function requires the file to be
389 |   closed first, then set UTIME_NEEDS_CLOSED_FILE to 1.
390 | */
391 | 
392 | /* Keep old PRESERVEFILETIMES define for compatibility */
393 | #ifdef PRESERVEFILETIMES
394 | #undef PRESERVE_FILE_TIMES
395 | #define PRESERVE_FILE_TIMES PRESERVEFILETIMES
396 | #endif
397 | 
398 | #ifndef PRESERVE_FILE_TIMES
399 | #if defined(RISC_OS) || defined(OPENVMS_OS) || defined(OSF_OS)
400 | #define PRESERVE_FILE_TIMES 0
401 | #else
402 | #define PRESERVE_FILE_TIMES 1
403 | #endif
404 | #endif
405 | 
406 | #if PRESERVE_FILE_TIMES
407 | 
408 | #ifndef HAS_FUTIME
409 | #if defined(CYGWIN_OS) || defined(BE_OS) || defined(OS2_OS) || defined(HPUX_OS) || defined(SOLARIS_OS) || defined(LINUX_OS) || defined(BSD_BASED_OS) || defined(MAC_OS) || defined(__MSL__) || defined(IRIX_OS) || defined(AIX_OS) || defined(__BORLANDC__)
410 | #define HAS_FUTIME 0
411 | #else
412 | #define HAS_FUTIME 1
413 | #endif
414 | #endif
415 | 
416 | #ifndef UTIME_NEEDS_CLOSED_FILE
417 | #if defined(SOLARIS_OS) || defined(BSD_BASED_OS) || defined(MAC_OS) || defined(__MSL__) || defined(LINUX_OS)
418 | #define UTIME_NEEDS_CLOSED_FILE 1
419 | #else
420 | #define UTIME_NEEDS_CLOSED_FILE 0
421 | #endif
422 | #endif
423 | 
424 | #if defined(MAC_OS_X) || (!defined(MAC_OS_CLASSIC) && !defined(__MSL__))
425 | #include <sys/types.h> 
426 | #include <sys/stat.h>
427 | #else
428 | #include <stat.h>
429 | #endif
430 | 
431 | #if HAS_FUTIME
432 | #include <sys/utime.h>
433 | #else
434 | #include <utime.h>
#endif /* HAS_FUTIME */
436 | 
437 | /*
438 |   MS Windows needs _ prefix for Unix file functions.
439 |   Not required by Metrowerks Standard Library (MSL).
440 |   
441 |   Tidy uses following for preserving the last modified time.
442 | 
443 |   WINDOWS automatically set by Win16 compilers.
444 |   _WIN32 automatically set by Win32 compilers.
445 | */
446 | #if defined(_WIN32) && !defined(__MSL__) && !defined(__BORLANDC__)
447 | 
448 | #define futime _futime
449 | #define fstat _fstat
450 | #define utimbuf _utimbuf /* Windows seems to want utimbuf */
451 | #define stat _stat
452 | #define utime _utime
453 | 
454 | #endif /* _WIN32 */
455 | 
456 | #endif /* PRESERVE_FILE_TIMES */
457 | 
458 | /*
459 |   MS Windows needs _ prefix for Unix file functions.
460 |   Not required by Metrowerks Standard Library (MSL).
461 |   
462 |   WINDOWS automatically set by Win16 compilers.
463 |   _WIN32 automatically set by Win32 compilers.
464 | */
465 | #if defined(_WIN32) && !defined(__MSL__) && !defined(__BORLANDC__)
466 | 
467 | #ifndef __WATCOMC__
468 | #define fileno _fileno
469 | #define setmode _setmode
470 | #endif
471 | 
472 | #define access _access
473 | #define strcasecmp _stricmp
474 | 
475 | #if _MSC_VER > 1000
476 | #pragma warning( disable : 4189 ) /* local variable is initialized but not referenced */
477 | #pragma warning( disable : 4100 ) /* unreferenced formal parameter */
478 | #pragma warning( disable : 4706 ) /* assignment within conditional expression */
479 | #endif
480 | 
481 | #endif /* _WIN32 */
482 | 
483 | #if defined(_WIN32)
484 | 
485 | #if defined(_USRDLL) && !defined(TIDY_EXPORT)
486 | #define TIDY_EXPORT __declspec( dllexport ) 
487 | #endif
488 | 
489 | #endif /* _WIN32 */
490 | 
491 | /* hack for gnu sys/types.h file  which defines uint and ulong */
492 | 
493 | #if defined(BE_OS) || defined(SOLARIS_OS) || defined(BSD_BASED_OS) || defined(OSF_OS) || defined(IRIX_OS) || defined(AIX_OS)
494 | #include <sys/types.h>
495 | #endif
496 | #if !defined(HPUX_OS) && !defined(CYGWIN_OS) && !defined(MAC_OS_X) && !defined(BE_OS) && !defined(SOLARIS_OS) && !defined(BSD_BASED_OS) && !defined(OSF_OS) && !defined(IRIX_OS) && !defined(AIX_OS)
497 | typedef unsigned int uint;
498 | #endif
499 | #if defined(HPUX_OS) || defined(CYGWIN_OS) || defined(MAC_OS) || defined(BSD_BASED_OS) || defined(_WIN32)
500 | typedef unsigned long ulong;
501 | #endif
502 | 
503 | #ifndef TIDY_EXPORT /* Define it away for most builds */
504 | #define TIDY_EXPORT
505 | #endif
506 | 
507 | #ifndef TIDY_STRUCT
508 | #define TIDY_STRUCT
509 | #endif
510 | 
511 | typedef unsigned char byte;
512 | 
513 | typedef uint tchar;         /* single, full character */
514 | typedef char tmbchar;       /* single, possibly partial character */
515 | #ifndef TMBSTR_DEFINED
516 | typedef tmbchar* tmbstr;    /* pointer to buffer of possibly partial chars */
517 | typedef const tmbchar* ctmbstr; /* Ditto, but const */
518 | #define TMBSTR_DEFINED
519 | #endif
520 | 
521 |            
/*
  Depending on their age, some C++ compilers treat bool as a
  reserved word and others do not.  To sidestep the problem
  Tidy never uses bool; it uses its own two-valued enum,
  Bool, with the enumerators no (0) and yes (1).
*/
typedef enum
{
   no  = 0,
   yes = 1
} Bool;
533 | 
534 | /* for NULL pointers 
535 | #define null ((const void*)0)
536 | extern void* null;
537 | */
538 | 
539 | #if defined(DMALLOC)
540 | #include "dmalloc.h"
541 | #endif
542 | 
/* Memory management and fatal-error entry points.
** Only the prototypes are visible in this header; the one-line notes
** below are inferred from names and signatures -- NOTE(review): confirm
** against the implementation before relying on them.
*/
/* presumably allocates `size` bytes -- failure behaviour not visible here */
void *MemAlloc(size_t size);
/* presumably resizes the block `mem` to `newsize` bytes */
void *MemRealloc(void *mem, size_t newsize);
/* presumably releases a block obtained from MemAlloc/MemRealloc */
void MemFree(void *mem);
/* presumably zero-fills `size` bytes starting at the given address */
void ClearMemory(void *, size_t size);
/* presumably reports `msg` and aborts processing -- TODO confirm */
void FatalError( ctmbstr msg );
548 | 
/* Opaque data structure.
*  Cast to implementation type struct within lib.
*  This will reduce inter-dependencies/conflicts w/ application code.
*
*  opaque( T ) declares a dummy struct _T holding a single int and
*  typedefs T as a pointer to that struct, so every opaque handle is
*  a distinct pointer type.  The disabled #else branch would instead
*  typedef every handle as a plain void*.
*/
#if 1
/*
*  Please note - this definition assumes your compiler uses 'int' for enums.
*/
#define opaque( typenam )\
struct _##typenam { int _opaque; };\
typedef struct _##typenam* typenam
#else
#define opaque(typenam) typedef void* typenam
#endif
563 | 
564 | /* Opaque data structure used to pass back
565 | ** and forth to keep current position in a
566 | ** list or other collection.
567 | */
568 | opaque( TidyIterator );
569 | 
570 | #ifdef __cplusplus
571 | } /* extern "C" */
572 | #endif
573 | 
574 | #endif /* __PLATFORM_H__ */
575 | 


--------------------------------------------------------------------------------
/elementtidy-1.0-20050212/tidylib/src/utf8.c:
--------------------------------------------------------------------------------
  1 | /* utf8.c -- convert characters to/from UTF-8
  2 | 
  3 |   (c) 1998-2003 (W3C) MIT, ERCIM, Keio University
  4 |   See tidy.h for the copyright notice.
  5 | 
  6 |   CVS Info :
  7 | 
  8 |     $LastChangedBy$ 
  9 |     $LastChangedDate$ 
 10 |     $LastChangedRevision$ 
 11 | 
 12 |   Uses public interfaces to abstract input source and output
 13 |   sink, which may be user supplied or either FILE* or memory
 14 |   based Tidy implementations.  Encoding support is uniform
 15 |   regardless of I/O mechanism.
 16 | 
 17 |   Note, UTF-8 encoding, by itself, does not affect the actual
 18 |   "codepoints" of the underlying character encoding.  In the
 19 |   cases of ASCII, Latin1, Unicode (16-bit, BMP), these all 
 20 |   refer to ISO-10646 "codepoints".  For anything else, they
 21 |   refer to some other "codepoint" set.
 22 | 
  Put another way, UTF-8 is a variable length method to 
  represent any non-negative integer value.  The glyph 
  that an integer value represents is unchanged and defined
  externally (e.g. by ISO-10646, Big5, Win1252, MacRoman,
  Latin2-9, and so on).
 28 | 
 29 |   Put still another way, UTF-8 is more of a _transfer_ encoding
 30 |   than a _character_ encoding, per se.
 31 | */
 32 | 
 33 | #include "tidy.h"
 34 | #include "utf8.h"
 35 | 
 36 | /* 
 37 | UTF-8 encoding/decoding functions
Return 0 on success and -1 for an illegal sequence; the number of bytes
in the UTF-8 sequence is passed back through the count argument
 39 | 
 40 | Also see below for UTF-16 encoding/decoding functions
 41 | 
 42 | References :
 43 | 
 44 | 1) UCS Transformation Format 8 (UTF-8):
 45 | ISO/IEC 10646-1:1996 Amendment 2 or ISO/IEC 10646-1:2000 Annex D
 46 | <http://anubis.dkuug.dk/JTC1/SC2/WG2/docs/n1335>
 47 | <http://www.cl.cam.ac.uk/~mgk25/ucs/ISO-10646-UTF-8.html>
 48 | 
 49 | Table 4 - Mapping from UCS-4 to UTF-8
 50 | 
 51 | 2) Unicode standards:
 52 | <http://www.unicode.org/unicode/standard/standard.html>
 53 | 
 54 | 3) Legal UTF-8 byte sequences:
 55 | <http://www.unicode.org/unicode/uni2errata/UTF-8_Corrigendum.html>
 56 | 
 57 | Code point          1st byte    2nd byte    3rd byte    4th byte
 58 | ----------          --------    --------    --------    --------
 59 | U+0000..U+007F      00..7F
 60 | U+0080..U+07FF      C2..DF      80..BF
 61 | U+0800..U+0FFF      E0          A0..BF      80..BF
 62 | U+1000..U+FFFF      E1..EF      80..BF      80..BF
 63 | U+10000..U+3FFFF    F0          90..BF      80..BF      80..BF
 64 | U+40000..U+FFFFF    F1..F3      80..BF      80..BF      80..BF
 65 | U+100000..U+10FFFF  F4          80..8F      80..BF      80..BF
 66 | 
 67 | The definition of UTF-8 in Annex D of ISO/IEC 10646-1:2000 also
 68 | allows for the use of five- and six-byte sequences to encode
 69 | characters that are outside the range of the Unicode character
 70 | set; those five- and six-byte sequences are illegal for the use
 71 | of UTF-8 as a transformation of Unicode characters. ISO/IEC 10646
 72 | does not allow mapping of unpaired surrogates, nor U+FFFE and U+FFFF
 73 | (but it does allow other noncharacters).
 74 | 
 75 | 4) RFC 2279: UTF-8, a transformation format of ISO 10646:
 76 | <http://www.ietf.org/rfc/rfc2279.txt>
 77 | 
 78 | 5) UTF-8 and Unicode FAQ:
 79 | <http://www.cl.cam.ac.uk/~mgk25/unicode.html>
 80 | 
 81 | 6) Markus Kuhn's UTF-8 decoder stress test file:
 82 | <http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt>
 83 | 
 84 | 7) UTF-8 Demo:
 85 | <http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-demo.txt>
 86 | 
 87 | 8) UTF-8 Sampler:
 88 | <http://www.columbia.edu/kermit/utf8.html>
 89 | 
 90 | 9) Transformation Format for 16 Planes of Group 00 (UTF-16):
 91 | ISO/IEC 10646-1:1996 Amendment 1 or ISO/IEC 10646-1:2000 Annex C
 92 | <http://anubis.dkuug.dk/JTC1/SC2/WG2/docs/n2005/n2005.pdf>
 93 | <http://www.cl.cam.ac.uk/~mgk25/ucs/ISO-10646-UTF-16.html>
 94 | 
 95 | 10) RFC 2781: UTF-16, an encoding of ISO 10646:
 96 | <http://www.ietf.org/rfc/rfc2781.txt>
 97 | 
 98 | 11) UTF-16 invalid surrogate pairs:
 99 | <http://www.unicode.org/unicode/faq/utf_bom.html#16>
100 | 
101 | UTF-16       UTF-8          UCS-4
102 | D83F DFF*    F0 9F BF B*    0001FFF*
103 | D87F DFF*    F0 AF BF B*    0002FFF*
104 | D8BF DFF*    F0 BF BF B*    0003FFF*
105 | D8FF DFF*    F1 8F BF B*    0004FFF*
106 | D93F DFF*    F1 9F BF B*    0005FFF*
107 | D97F DFF*    F1 AF BF B*    0006FFF*
108 |                 ...
109 | DBBF DFF*    F3 BF BF B*    000FFFF*
110 | DBFF DFF*    F4 8F BF B*    0010FFF*
111 | 
112 | * = E or F
113 |                                    
114 | 1010  A
115 | 1011  B
116 | 1100  C
117 | 1101  D
118 | 1110  E
119 | 1111  F
120 | 
121 | */
122 | 
/* Number of rows in the validUTF8 table, and the longest byte
** sequence (in bytes) that is accepted as legal UTF-8.
*/
#define kNumUTF8Sequences        7
#define kMaxUTF8Bytes            4

/* Values that are rejected by both the UTF-8 decoder and encoder */
#define kUTF8ByteSwapNotAChar    0xFFFE
#define kUTF8NotAChar            0xFFFF

/* Largest value accepted for UTF-8 encoding/decoding */
#define kMaxUTF8FromUCS4         0x10FFFF

/* First value that needs a surrogate pair in UTF-16, and the
** largest value that can be represented in UTF-16.
*/
#define kUTF16SurrogatesBegin    0x10000
#define kMaxUTF16FromUCS4        0x10FFFF

/* UTF-16 surrogate pair areas */
/* NOTE: the naming here is the reverse of the Unicode Standard's, which
** calls D800..DBFF the "high" (leading) surrogates and DC00..DFFF the
** "low" (trailing) surrogates.  In this file "Low" means D800..DBFF and
** "High" means DC00..DFFF; the surrogate helpers use the names this way.
*/
#define kUTF16LowSurrogateBegin  0xD800
#define kUTF16LowSurrogateEnd    0xDBFF
#define kUTF16HighSurrogateBegin 0xDC00
#define kUTF16HighSurrogateEnd   0xDFFF
139 | 
140 | 
/* offsets into validUTF8 table below */
/* Entry [len - 1] is the index of the first validUTF8 row describing
** sequences of `len` bytes; entry [len] - 1 is the index of the last
** such row.  The final entry is a sentinel (one past the last row) so
** that the longest sequence length also has an upper bound.
*/
static const int offsetUTF8Sequences[kMaxUTF8Bytes + 1] =
{
    0, /* 1 byte */
    1, /* 2 bytes */
    2, /* 3 bytes */
    4, /* 4 bytes */
    kNumUTF8Sequences /* must be last */
};
150 | 
/* Table of legal UTF-8 byte sequences, one row per range of values.
** It mirrors the "Legal UTF-8 byte sequences" table in the comment at
** the top of this file.
**
**   lowChar/highChar - smallest and largest value covered by the row
**   numBytes         - length of the byte sequence for those values
**   validBytes       - (min, max) pairs, one pair per byte position:
**                      validBytes[2*k] and validBytes[2*k + 1] bound
**                      the k-th byte of the sequence; unused trailing
**                      pairs are zero
*/
static const struct validUTF8Sequence
{
     uint lowChar;
     uint highChar;
     int  numBytes;
     byte validBytes[8];
} validUTF8[kNumUTF8Sequences] =
{
/*   low       high   #bytes  byte 1      byte 2      byte 3      byte 4 */
    {0x0000,   0x007F,   1, {0x00, 0x7F, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}},
    {0x0080,   0x07FF,   2, {0xC2, 0xDF, 0x80, 0xBF, 0x00, 0x00, 0x00, 0x00}},
    {0x0800,   0x0FFF,   3, {0xE0, 0xE0, 0xA0, 0xBF, 0x80, 0xBF, 0x00, 0x00}},
    {0x1000,   0xFFFF,   3, {0xE1, 0xEF, 0x80, 0xBF, 0x80, 0xBF, 0x00, 0x00}},
    {0x10000,  0x3FFFF,  4, {0xF0, 0xF0, 0x90, 0xBF, 0x80, 0xBF, 0x80, 0xBF}},
    {0x40000,  0xFFFFF,  4, {0xF1, 0xF3, 0x80, 0xBF, 0x80, 0xBF, 0x80, 0xBF}},
    {0x100000, 0x10FFFF, 4, {0xF4, 0xF4, 0x80, 0x8F, 0x80, 0xBF, 0x80, 0xBF}} 
};
168 | 
169 | int DecodeUTF8BytesToChar( uint* c, uint firstByte, tmbstr successorBytes,
170 |                            TidyInputSource* inp, int* count )
171 | {
172 |     byte tempbuf[10];
173 |     byte *buf = &tempbuf[0];
174 |     uint ch = 0, n = 0;
175 |     int i, bytes = 0;
176 |     Bool hasError = no;
177 |     
178 |     if ( successorBytes )
179 |         buf = (byte*) successorBytes;
180 |         
181 |     /* special check if we have been passed an EOF char */
182 |     if ( firstByte == EndOfStream )
183 |     {
184 |         /* at present */
185 |         *c = firstByte;
186 |         *count = 1;
187 |         return 0;
188 |     }
189 | 
190 |     ch = firstByte; /* first byte is passed in separately */
191 |     
192 |     if (ch <= 0x7F) /* 0XXX XXXX one byte */
193 |     {
194 |         n = ch;
195 |         bytes = 1;
196 |     }
197 |     else if ((ch & 0xE0) == 0xC0)  /* 110X XXXX  two bytes */
198 |     {
199 |         n = ch & 31;
200 |         bytes = 2;
201 |     }
202 |     else if ((ch & 0xF0) == 0xE0)  /* 1110 XXXX  three bytes */
203 |     {
204 |         n = ch & 15;
205 |         bytes = 3;
206 |     }
207 |     else if ((ch & 0xF8) == 0xF0)  /* 1111 0XXX  four bytes */
208 |     {
209 |         n = ch & 7;
210 |         bytes = 4;
211 |     }
212 |     else if ((ch & 0xFC) == 0xF8)  /* 1111 10XX  five bytes */
213 |     {
214 |         n = ch & 3;
215 |         bytes = 5;
216 |         hasError = yes;
217 |     }
218 |     else if ((ch & 0xFE) == 0xFC)  /* 1111 110X  six bytes */
219 |     {
220 |         n = ch & 1;
221 |         bytes = 6;
222 |         hasError = yes;
223 |     }
224 |     else
225 |     {
226 |         /* not a valid first byte of a UTF-8 sequence */
227 |         n = ch;
228 |         bytes = 1;
229 |         hasError = yes;
230 |     }
231 | 
232 |     /* successor bytes should have the form 10XX XXXX */
233 | 
234 |     /* If caller supplied buffer, use it.  Else see if caller
235 |     ** supplied an input source, use that.
236 |     */
237 |     if ( successorBytes )
238 |     {
239 |         for ( i=0; i < bytes-1; ++i )
240 |         {
241 |             if ( !buf[i] || (buf[i] & 0xC0) != 0x80 )
242 |             {
243 |                 hasError = yes;
244 |                 bytes = i;
245 |                 break;
246 |             }
247 |             n = (n << 6) | (buf[i] & 0x3F);
248 |         }
249 |     }
250 |     else if ( inp )
251 |     {
252 |         for ( i=0; i < bytes-1 && !inp->eof(inp->sourceData); ++i )
253 |         {
254 |             int b = inp->getByte( inp->sourceData );
255 |             buf[i] = (tmbchar) b;
256 | 
257 |             /* End of data or illegal successor byte value */
258 |             if ( b == EOF || (buf[i] & 0xC0) != 0x80 )
259 |             {
260 |                 hasError = yes;
261 |                 bytes = i;
262 |                 if ( b != EOF )
263 |                     inp->ungetByte( inp->sourceData, buf[i] );
264 |                 break;
265 |             }
266 |             n = (n << 6) | (buf[i] & 0x3F);
267 |         }
268 |     }
269 |     else if ( bytes > 1 )
270 |     {
271 |         hasError = yes;
272 |         bytes = 1;
273 |     }
274 |     
275 |     if (!hasError && ((n == kUTF8ByteSwapNotAChar) || (n == kUTF8NotAChar)))
276 |         hasError = yes;
277 |         
278 |     if (!hasError && (n > kMaxUTF8FromUCS4))
279 |         hasError = yes;
280 | 
281 | #if 0 /* Breaks Big5 D8 - DF */
282 |     if (!hasError && (n >= kUTF16LowSurrogateBegin) && (n <= kUTF16HighSurrogateEnd))
283 |         /* unpaired surrogates not allowed */
284 |         hasError = yes;
285 | #endif
286 | 
287 |     if (!hasError)
288 |     {
289 |         int lo, hi;
290 |         
291 |         lo = offsetUTF8Sequences[bytes - 1];
292 |         hi = offsetUTF8Sequences[bytes] - 1;
293 |         
294 |         /* check for overlong sequences */
295 |         if ((n < validUTF8[lo].lowChar) || (n > validUTF8[hi].highChar))
296 |             hasError = yes;
297 |         else
298 |         {
299 |             hasError = yes; /* assume error until proven otherwise */
300 |         
301 |             for (i = lo; i <= hi; i++)
302 |             {
303 |                 int tempCount;
304 |                 byte theByte;
305 |                 
306 |                 for (tempCount = 0; tempCount < bytes; tempCount++)
307 |                 {
308 |                     if (!tempCount)
309 |                         theByte = (tmbchar) firstByte;
310 |                     else
311 |                         theByte = buf[tempCount - 1];
312 |                         
313 |                     if ( theByte >= validUTF8[i].validBytes[(tempCount * 2)] &&
314 |                          theByte <= validUTF8[i].validBytes[(tempCount * 2) + 1] )
315 |                         hasError = no;
316 |                     if (hasError)
317 |                         break;
318 |                 }
319 |             }
320 |         }
321 |     }
322 | 
323 | #if 1 && defined(_DEBUG)
324 |     if ( hasError )
325 |     {
326 |        /* debug */
327 |        fprintf( stderr, "UTF-8 decoding error of %d bytes : ", bytes );
328 |        fprintf( stderr, "0x%02x ", firstByte );
329 |        for (i = 1; i < bytes; i++)
330 |            fprintf( stderr, "0x%02x ", buf[i - 1] );
331 |        fprintf( stderr, " = U+%04lx\n", n );
332 |     }
333 | #endif
334 | 
335 |     *count = bytes;
336 |     *c = n;
337 |     if ( hasError )
338 |         return -1;
339 |     return 0;
340 | }
341 | 
342 | int EncodeCharToUTF8Bytes( uint c, tmbstr encodebuf,
343 |                            TidyOutputSink* outp, int* count )
344 | {
345 |     byte tempbuf[10] = {0};
346 |     byte* buf = &tempbuf[0];
347 |     int bytes = 0;
348 |     Bool hasError = no;
349 |     
350 |     if ( encodebuf )
351 |         buf = (byte*) encodebuf;
352 |         
353 |     if (c <= 0x7F)  /* 0XXX XXXX one byte */
354 |     {
355 |         buf[0] = (tmbchar) c;
356 |         bytes = 1;
357 |     }
358 |     else if (c <= 0x7FF)  /* 110X XXXX  two bytes */
359 |     {
360 |         buf[0] = (tmbchar) ( 0xC0 | (c >> 6) );
361 |         buf[1] = (tmbchar) ( 0x80 | (c & 0x3F) );
362 |         bytes = 2;
363 |     }
364 |     else if (c <= 0xFFFF)  /* 1110 XXXX  three bytes */
365 |     {
366 |         buf[0] = (tmbchar) (0xE0 | (c >> 12));
367 |         buf[1] = (tmbchar) (0x80 | ((c >> 6) & 0x3F));
368 |         buf[2] = (tmbchar) (0x80 | (c & 0x3F));
369 |         bytes = 3;
370 |         if ( c == kUTF8ByteSwapNotAChar || c == kUTF8NotAChar )
371 |             hasError = yes;
372 | #if 0 /* Breaks Big5 D8 - DF */
373 |         else if ( c >= kUTF16LowSurrogateBegin && c <= kUTF16HighSurrogateEnd )
374 |             /* unpaired surrogates not allowed */
375 |             hasError = yes;
376 | #endif
377 |     }
378 |     else if (c <= 0x1FFFFF)  /* 1111 0XXX  four bytes */
379 |     {
380 |         buf[0] = (tmbchar) (0xF0 | (c >> 18));
381 |         buf[1] = (tmbchar) (0x80 | ((c >> 12) & 0x3F));
382 |         buf[2] = (tmbchar) (0x80 | ((c >> 6) & 0x3F));
383 |         buf[3] = (tmbchar) (0x80 | (c & 0x3F));
384 |         bytes = 4;
385 |         if (c > kMaxUTF8FromUCS4)
386 |             hasError = yes;
387 |     }
388 |     else if (c <= 0x3FFFFFF)  /* 1111 10XX  five bytes */
389 |     {
390 |         buf[0] = (tmbchar) (0xF8 | (c >> 24));
391 |         buf[1] = (tmbchar) (0x80 | (c >> 18));
392 |         buf[2] = (tmbchar) (0x80 | ((c >> 12) & 0x3F));
393 |         buf[3] = (tmbchar) (0x80 | ((c >> 6) & 0x3F));
394 |         buf[4] = (tmbchar) (0x80 | (c & 0x3F));
395 |         bytes = 5;
396 |         hasError = yes;
397 |     }
398 |     else if (c <= 0x7FFFFFFF)  /* 1111 110X  six bytes */
399 |     {
400 |         buf[0] = (tmbchar) (0xFC | (c >> 30));
401 |         buf[1] = (tmbchar) (0x80 | ((c >> 24) & 0x3F));
402 |         buf[2] = (tmbchar) (0x80 | ((c >> 18) & 0x3F));
403 |         buf[3] = (tmbchar) (0x80 | ((c >> 12) & 0x3F));
404 |         buf[4] = (tmbchar) (0x80 | ((c >> 6) & 0x3F));
405 |         buf[5] = (tmbchar) (0x80 | (c & 0x3F));
406 |         bytes = 6;
407 |         hasError = yes;
408 |     }
409 |     else
410 |         hasError = yes;
411 |         
412 |     /* don't output invalid UTF-8 byte sequence to a stream */
413 |     if ( !hasError && outp != NULL )
414 |     {
415 |         int ix;
416 |         for ( ix=0; ix < bytes; ++ix )
417 |           outp->putByte( outp->sinkData, buf[ix] );
418 |     }
419 | 
420 | #if 1 && defined(_DEBUG)
421 |     if ( hasError )
422 |     {
423 |         int i;
424 |         fprintf( stderr, "UTF-8 encoding error for U+%x : ", c );
425 |         for (i = 0; i < bytes; i++)
426 |             fprintf( stderr, "0x%02x ", buf[i] );
427 |         fprintf( stderr, "\n" );
428 |     }
429 | #endif
430 |     
431 |     *count = bytes;
432 |     if (hasError)
433 |         return -1;
434 |     return 0;
435 | }
436 | 
437 | 
438 | /* return one less than the number of bytes used by the UTF-8 byte sequence */
439 | /* str points to the UTF-8 byte sequence */
440 | /* the Unicode char is returned in *ch */
441 | uint GetUTF8( tmbstr str, uint *ch )
442 | {
443 |     uint n;
444 |     int bytes;
445 | 
446 |     int err;
447 |     
448 |     bytes = 0;
449 |     
450 |     /* first byte "str[0]" is passed in separately from the */
451 |     /* rest of the UTF-8 byte sequence starting at "str[1]" */
452 |     err = DecodeUTF8BytesToChar( &n, str[0], str+1, NULL, &bytes );
453 |     if (err)
454 |     {
455 | #if 1 && defined(_DEBUG)
456 |         fprintf(stderr, "pprint UTF-8 decoding error for U+%x : ", n);
457 | #endif
458 |         n = 0xFFFD; /* replacement char */
459 |     }
460 | 
461 |     *ch = n;
462 |     return bytes - 1;
463 | }
464 | 
465 | /* store char c as UTF-8 encoded byte stream */
466 | tmbstr PutUTF8( tmbstr buf, uint c )
467 | {
468 |     int err, count = 0;
469 |         
470 |     err = EncodeCharToUTF8Bytes( c, buf, NULL, &count );
471 |     if (err)
472 |     {
473 | #if 1 && defined(_DEBUG)
474 |         fprintf(stderr, "pprint UTF-8 encoding error for U+%x : ", c);
475 | #endif
476 |         /* replacement char 0xFFFD encoded as UTF-8 */
477 |         buf[0] = (byte) 0xEF;
478 |         buf[1] = (byte) 0xBF;
479 |         buf[2] = (byte) 0xBD;
480 |         count = 3;
481 |     }
482 |     
483 |     buf += count;
484 |     return buf;
485 | }
486 | 
487 | Bool    IsValidUTF16FromUCS4( tchar ucs4 )
488 | {
489 |   return ( ucs4 <= kMaxUTF16FromUCS4 );
490 | }
491 | 
492 | Bool    IsHighSurrogate( tchar ch )
493 | {
494 |     return ( ch >= kUTF16HighSurrogateBegin && ch <= kUTF16HighSurrogateEnd );
495 | }
496 | Bool    IsLowSurrogate( tchar ch )
497 | {
498 |     return ( ch >= kUTF16LowSurrogateBegin && ch <= kUTF16LowSurrogateEnd );
499 | }
500 | 
501 | tchar   CombineSurrogatePair( tchar high, tchar low )
502 | {
503 |     assert( IsHighSurrogate(high) && IsLowSurrogate(low) );
504 |     return ( ((low - kUTF16LowSurrogateBegin) * 0x400) + 
505 |              high - kUTF16HighSurrogateBegin + 0x10000 );
506 | }
507 | 
508 | Bool   SplitSurrogatePair( tchar utf16, tchar* low, tchar* high )
509 | {
510 |     Bool ok = ( IsValidCombinedChar( utf16 ) && high && low );
511 |     if ( ok )
512 |     {
513 |         *low  = (utf16 - kUTF16SurrogatesBegin) / 0x400 + kUTF16LowSurrogateBegin;
514 |         *high = (utf16 - kUTF16SurrogatesBegin) % 0x400 + kUTF16HighSurrogateBegin;
515 |     }
516 |     return ok;
517 | }
518 | 
519 | Bool    IsValidCombinedChar( tchar ch )
520 | {
521 |     return ( ch >= kUTF16SurrogatesBegin &&
522 |              (ch & 0x0000FFFE) != 0x0000FFFE &&
523 |              (ch & 0x0000FFFF) != 0x0000FFFF );
524 | }
525 | 
526 | Bool    IsCombinedChar( tchar ch )
527 | {
528 |     return ( ch >= kUTF16SurrogatesBegin );
529 | }
530 | 


--------------------------------------------------------------------------------
/elementtidy-1.0-20050212/tidylib/src/lexer.h:
--------------------------------------------------------------------------------
  1 | #ifndef __LEXER_H__
  2 | #define __LEXER_H__
  3 | 
  4 | /* lexer.h -- Lexer for html parser
  5 |   
  6 |    (c) 1998-2003 (W3C) MIT, ERCIM, Keio University
  7 |    See tidy.h for the copyright notice.
  8 |   
  9 |    CVS Info:
 10 |     $LastChangedBy$ 
 11 |     $LastChangedDate$ 
 12 |     $LastChangedRevision$ 
 13 | 
 14 | */
 15 | 
 16 | /*
 17 |   Given an input source, it returns a sequence of tokens.
 18 | 
 19 |      GetToken(source) gets the next token
 20 |      UngetToken(source) provides one level undo
 21 | 
 22 |   The tags include an attribute list:
 23 | 
 24 |     - linked list of attribute/value nodes
 25 |     - each node has 2 NULL-terminated strings.
 26 |     - entities are replaced in attribute values
 27 | 
 28 |   white space is compacted if not in preformatted mode
 29 |   If not in preformatted mode then leading white space
 30 |   is discarded and subsequent white space sequences
 31 |   compacted to single space characters.
 32 | 
 33 |   If XmlTags is no then Tag names are folded to upper
 34 |   case and attribute names to lower case.
 35 | 
 36 |  Not yet done:
 37 |     -   Doctype subset and marked sections
 38 | */
 39 | 
 40 | #ifdef __cplusplus
 41 | extern "C" {
 42 | #endif
 43 | 
 44 | #include "forward.h"
 45 | 
 46 | /* lexer character types
 47 | */
 48 | #define digit       1
 49 | #define letter      2
 50 | #define namechar    4
 51 | #define white       8
 52 | #define newline     16
 53 | #define lowercase   32
 54 | #define uppercase   64
 55 | 
 56 | 
 57 | /* node->type is one of these values
 58 | */
 59 | #define RootNode        0
 60 | #define DocTypeTag      1
 61 | #define CommentTag      2
 62 | #define ProcInsTag      3
 63 | #define TextNode        4
 64 | #define StartTag        5
 65 | #define EndTag          6
 66 | #define StartEndTag     7
 67 | #define CDATATag        8
 68 | #define SectionTag      9
 69 | #define AspTag          10
 70 | #define JsteTag         11
 71 | #define PhpTag          12
 72 | #define XmlDecl         13
 73 | 
 74 | 
 75 | 
 76 | /* lexer GetToken states
 77 | */
 78 | #define LEX_CONTENT     0
 79 | #define LEX_GT          1
 80 | #define LEX_ENDTAG      2
 81 | #define LEX_STARTTAG    3
 82 | #define LEX_COMMENT     4
 83 | #define LEX_DOCTYPE     5
 84 | #define LEX_PROCINSTR   6
 85 | #define LEX_ENDCOMMENT  7
 86 | #define LEX_CDATA       8
 87 | #define LEX_SECTION     9
 88 | #define LEX_ASP         10
 89 | #define LEX_JSTE        11
 90 | #define LEX_PHP         12
 91 | #define LEX_XMLDECL     13
 92 | 
 93 | /* ParseDocTypeDecl state constants */
 94 | #define DT_INTERMEDIATE 0
 95 | #define DT_DOCTYPENAME  1
 96 | #define DT_PUBLICSYSTEM 2
 97 | #define DT_QUOTEDSTRING 3
 98 | #define DT_INTSUBSET    4
 99 | 
100 | /* content model shortcut encoding
101 | */
102 | #define CM_UNKNOWN      0
103 | #define CM_EMPTY        (1 << 0)
104 | #define CM_HTML         (1 << 1)
105 | #define CM_HEAD         (1 << 2)
106 | #define CM_BLOCK        (1 << 3)
107 | #define CM_INLINE       (1 << 4)
108 | #define CM_LIST         (1 << 5)
109 | #define CM_DEFLIST      (1 << 6)
110 | #define CM_TABLE        (1 << 7)
111 | #define CM_ROWGRP       (1 << 8)
112 | #define CM_ROW          (1 << 9)
113 | #define CM_FIELD        (1 << 10)
114 | #define CM_OBJECT       (1 << 11)
115 | #define CM_PARAM        (1 << 12)
116 | #define CM_FRAMES       (1 << 13)
117 | #define CM_HEADING      (1 << 14)
118 | #define CM_OPT          (1 << 15)
119 | #define CM_IMG          (1 << 16)
120 | #define CM_MIXED        (1 << 17)
121 | #define CM_NO_INDENT    (1 << 18)
122 | #define CM_OBSOLETE     (1 << 19)
123 | #define CM_NEW          (1 << 20)
124 | #define CM_OMITST       (1 << 21)
125 | 
/* If the document uses just HTML 2.0 tags and attributes, describe
** it as HTML 2.0. Similarly for HTML 3.2 and the 3 flavors of HTML 4.0.
** If there are proprietary tags and attributes then describe it as
** HTML Proprietary. If it includes the xml-lang or xmlns attributes
** but is otherwise HTML 2.0, 3.2 or 4.0 then describe it as one of the
** flavors of Voyager (strict, loose or frameset).
*/
133 | 
134 | /* unknown: no version bits set */
135 | #define xxxx                   0u
136 | 
137 | /* W3C defined HTML/XHTML family document types (one distinct bit each) */
138 | #define HT20                   1u
139 | #define HT32                   2u
140 | #define H40S                   4u
141 | #define H40T                   8u
142 | #define H40F                  16u
143 | #define H41S                  32u
144 | #define H41T                  64u
145 | #define H41F                 128u
146 | #define X10S                 256u
147 | #define X10T                 512u
148 | #define X10F                1024u
149 | #define XH11                2048u
150 | #define XB10                4096u
151 | 
152 | /* proprietary stuff (one distinct bit per vendor) */
153 | #define VERS_SUN            8192u
154 | #define VERS_NETSCAPE      16384u
155 | #define VERS_MICROSOFT     32768u
156 | 
157 | /* special flag (its own bit, above the document-type and vendor bits) */
158 | #define VERS_XML           65536u
159 | 
160 | /* compatibility symbols: longer names for (unions of) the bits above */
161 | #define VERS_UNKNOWN       (xxxx)
162 | #define VERS_HTML20        (HT20)
163 | #define VERS_HTML32        (HT32)
164 | #define VERS_HTML40_STRICT (H40S|H41S|X10S)
165 | #define VERS_HTML40_LOOSE  (H40T|H41T|X10T)
166 | #define VERS_FRAMESET      (H40F|H41F|X10F)
167 | #define VERS_XHTML11       (XH11)
168 | #define VERS_BASIC         (XB10)
169 | 
170 | /* meta symbols: broader unions built from the symbols above */
171 | #define VERS_HTML40        (VERS_HTML40_STRICT|VERS_HTML40_LOOSE|VERS_FRAMESET)
172 | #define VERS_IFRAME        (VERS_HTML40_LOOSE|VERS_FRAMESET)
173 | #define VERS_LOOSE         (VERS_HTML20|VERS_HTML32|VERS_IFRAME)
174 | #define VERS_EVENTS        (VERS_HTML40|VERS_XHTML11)
175 | #define VERS_FROM32        (VERS_HTML32|VERS_HTML40)
176 | #define VERS_FROM40        (VERS_HTML40|VERS_XHTML11|VERS_BASIC)
177 | #define VERS_XHTML         (X10S|X10T|X10F|XH11|XB10)
178 | 
179 | /* all W3C defined document types (every bit from HT20 through XB10) */
180 | #define VERS_ALL           (VERS_HTML20|VERS_HTML32|VERS_FROM40)
181 | 
182 | /* all proprietary types (union of the three vendor bits) */
183 | #define VERS_PROPRIETARY   (VERS_NETSCAPE|VERS_MICROSOFT|VERS_SUN)
184 | 
185 | /* Linked list of class names and styles
186 | ** (Lexer.styles is a Style*, "used for cleaning up presentation markup") */
187 | struct _Style;
188 | typedef struct _Style Style;
189 | 
190 | struct _Style
191 | {
192 |     tmbstr tag;         /* presumably the element (tag) name -- TODO confirm */
193 |     tmbstr tag_class;   /* presumably the class name paired with it -- TODO confirm */
194 |     tmbstr properties;  /* presumably the style property text -- TODO confirm */
195 |     Style *next;        /* next entry in the list */
196 | };
197 | 
198 | 
199 | /* Linked list of style properties (one name/value pair per node)
200 | */
201 | struct _StyleProp;
202 | typedef struct _StyleProp StyleProp;
203 | 
204 | struct _StyleProp
205 | {
206 |     tmbstr name;        /* property name */
207 |     tmbstr value;       /* property value */
208 |     StyleProp *next;    /* next property in the list */
209 | };
210 | 
211 | 
212 | 
213 | 
214 | /* Attribute/Value linked list node
215 | */
216 | 
217 | struct _AttVal
218 | {
219 |     AttVal*           next;       /* next node in the list */
220 |     const Attribute*  dict;       /* presumably the attribute's dictionary definition -- TODO confirm */
221 |     Node*             asp;        /* presumably an ASP node attached to the attribute -- TODO confirm */
222 |     Node*             php;        /* presumably a PHP node attached to the attribute -- TODO confirm */
223 |     int               delim;      /* presumably the value's quote delimiter -- TODO confirm */
224 |     tmbstr            attribute;  /* attribute name string */
225 |     tmbstr            value;      /* attribute value string */
226 | };
227 | 
228 | 
229 | 
230 | /*
231 |   Mosaic handles inlines via a separate stack from other elements
232 |   We duplicate this to recover from inline markup errors such as:
233 | 
234 |      <i>italic text
235 |      <p>more italic text</b> normal text
236 | 
237 |   which for compatibility with Mosaic is mapped to:
238 | 
239 |      <i>italic text</i>
240 |      <p><i>more italic text</i> normal text
241 | 
242 |   Note that any inline end tag pops the effect of the current
243 |   inline start tag, so that </b> pops <i> in the above example.
244 | */
245 | struct _IStack
246 | {
247 |     IStack*     next;       /* link to another entry; usage not visible in this header */
248 |     const Dict* tag;        /* tag's dictionary definition */
249 |     tmbstr      element;    /* name (NULL for text nodes) */
250 |     AttVal*     attributes; /* attribute list (AttVal nodes) */
251 | };
252 | 
253 | 
254 | /* HTML/XHTML/XML Element, Comment, PI, DOCTYPE, XML Decl,
255 | ** etc. etc.
256 | */
257 | 
258 | struct _Node
259 | {
260 |     Node*       parent;         /* tree structure */
261 |     Node*       prev;           /* presumably the previous peer -- TODO confirm */
262 |     Node*       next;           /* next peer (FreeNode() iterates peers) */
263 |     Node*       content;        /* children; presumably the first child -- TODO confirm */
264 |     Node*       last;           /* presumably the last child -- TODO confirm */
265 | 
266 |     AttVal*     attributes;     /* linked list of AttVal nodes */
267 |     const Dict* was;            /* old tag when it was changed */
268 |     const Dict* tag;            /* tag's dictionary definition */
269 | 
270 |     tmbstr      element;        /* name (NULL for text nodes) */
271 | 
272 |     uint        start;          /* start of span onto text array */
273 |     uint        end;            /* end of span onto text array */
274 |     uint        type;           /* TextNode, StartTag, EndTag etc. */
275 | 
276 |     uint        line;           /* current line of document */
277 |     uint        column;         /* current column of document */
278 | 
279 |     Bool        closed;         /* true if closed by explicit end tag */
280 |     Bool        implicit;       /* true if inferred */
281 |     Bool        linebreak;      /* true if followed by a line break */
282 | };
283 | 
284 | 
285 | /*
286 |   The following are private to the lexer.
287 |   Use NewLexer() to create a lexer, and
288 |   FreeLexer() to free it.
289 | */
290 | 
291 | struct _Lexer
292 | {
293 | #if 0  /* Move to TidyDocImpl */
294 |     StreamIn* in;           /* document content input */
295 |     StreamOut* errout;      /* error output stream */
296 | 
297 |     uint badAccess;         /* for accessibility errors */
298 |     uint badLayout;         /* for bad style errors */
299 |     uint badChars;          /* for bad character encodings */
300 |     uint badForm;           /* for mismatched/mispositioned form tags */
301 |     uint warnings;          /* count of warnings in this document */
302 |     uint errors;            /* count of errors */
303 | #endif
304 | 
305 |     uint lines;             /* lines seen */
306 |     uint columns;           /* at start of current token */
307 |     Bool waswhite;          /* used to collapse contiguous white space */
308 |     Bool pushed;            /* true after token has been pushed back */
309 |     Bool insertspace;       /* when space is moved after end tag */
310 |     Bool excludeBlocks;     /* Netscape compatibility */
311 |     Bool exiled;            /* true if moved out of table */
312 |     Bool isvoyager;         /* true if xmlns attribute on html element */
313 |     uint versions;          /* bit vector of HTML versions */
314 |     int  doctype;           /* version as given by doctype (if any) */
315 |     Bool bad_doctype;       /* e.g. if html or PUBLIC is missing */
316 |     uint txtstart;          /* start of current node */
317 |     uint txtend;            /* end of current node */
318 |     uint state;             /* state of lexer's finite state machine */
319 | 
320 |     Node* token;            /* current parse point */
321 |     Node* root;             /* remember root node of the document */
322 |     
323 |     Bool seenEndBody;       /* true if a </body> tag has been encountered */
324 |     Bool seenEndHtml;       /* true if a </html> tag has been encountered */
325 | 
326 |     /*
327 |       Lexer character buffer
328 | 
329 |       Parse tree nodes span onto this buffer
330 |       which contains the concatenated text
331 |       contents of all of the elements.
332 | 
333 |       lexsize must be reset for each file.
334 |     */
335 |     tmbstr lexbuf;          /* MB character buffer */
336 |     uint lexlength;         /* allocated */
337 |     uint lexsize;           /* used */
338 | 
339 |     /* Inline stack for compatibility with Mosaic */
340 |     Node* inode;            /* for deferring text node */
341 |     IStack* insert;         /* for inferring inline tags */
342 |     IStack* istack;         /* inline stack entries; presumably sized by the three counters below */
343 |     uint istacklength;      /* allocated */
344 |     uint istacksize;        /* used */
345 |     uint istackbase;        /* start of frame */
346 | 
347 |     Style *styles;          /* used for cleaning up presentation markup */
348 | 
349 | #if 0
350 |     TidyDocImpl* doc;       /* Pointer back to doc for error reporting */
351 | #endif 
352 | };
353 | 
354 | 
355 | /* Lexer Functions
356 | */
357 | Node *CommentToken( Lexer *lexer );
358 | 
359 | /* choose what version to use for new doctype */
360 | int HTMLVersion( TidyDocImpl* doc );
361 | 
362 | /* everything is allowed in proprietary version of HTML */
363 | /* this is handled here rather than in the tag/attr dicts */
364 | 
365 | void ConstrainVersion( TidyDocImpl* doc, uint vers );
366 | 
367 | Bool IsWhite(uint c);
368 | Bool IsDigit(uint c);
369 | Bool IsLetter(uint c);
370 | Bool IsNewline(uint c);
371 | Bool IsNamechar(uint c);
372 | Bool IsXMLLetter(uint c);
373 | Bool IsXMLNamechar(uint c);
374 | 
375 | Bool IsLower(uint c);
376 | Bool IsUpper(uint c);
377 | uint ToLower(uint c);
378 | uint ToUpper(uint c);
379 | 
380 | char FoldCase( TidyDocImpl* doc, tmbchar c, Bool tocaps );
381 | 
382 | 
383 | Lexer* NewLexer( TidyDocImpl* doc );
384 | Bool EndOfInput( TidyDocImpl* doc );
385 | void FreeLexer( TidyDocImpl* doc );
386 | 
387 | /* store character c as UTF-8 encoded byte stream */
388 | void AddCharToLexer( Lexer *lexer, uint c );
389 | 
390 | /*
391 |   Used for elements and text nodes
392 |   element name is NULL for text nodes
393 |   start and end are offsets into lexbuf
394 |   which contains the textual content of
395 |   all elements in the parse tree.
396 | 
397 |   parent and content allow traversal
398 |   of the parse tree in any direction.
399 |   attributes are represented as a linked
400 |   list of AttVal nodes which hold the
401 |   strings for attribute/value pairs.
402 | */
403 | Node* NewNode( Lexer* lexer );
404 | 
405 | 
406 | /* used to clone heading nodes when split by an <HR> */
407 | Node *CloneNode( TidyDocImpl* doc, Node *element );
408 | 
409 | /* clones the given node using source node attributes,
410 | ** no lexer attributes */
411 | Node *CloneNodeEx( TidyDocImpl* doc, Node *element );
412 | 
413 | /* free node's attributes */
414 | void FreeAttrs( TidyDocImpl* doc, Node *node );
415 | 
416 | /* doesn't repair attribute list linkage */
417 | void FreeAttribute( AttVal *av );
418 | 
419 | /* remove attribute from node then free it
420 | */
421 | void RemoveAttribute( Node *node, AttVal *attr );
422 | 
423 | /*
424 |   Free document nodes by iterating through peers and recursing
425 |   through children. Set next to NULL before calling FreeNode()
426 |   to avoid freeing peer nodes. Doesn't patch up prev/next links.
427 |  */
428 | void FreeNode( TidyDocImpl* doc, Node *node );
429 | 
430 | Node* TextToken( Lexer *lexer );
431 | 
432 | /* used for creating preformatted text from Word2000 */
433 | Node *NewLineNode( Lexer *lexer );
434 | 
435 | /* used for adding a &nbsp; for Word2000 */
436 | Node *NewLiteralTextNode(Lexer *lexer, ctmbstr txt );
437 | 
438 | Node* CommentToken(Lexer *lexer);   /* NOTE: duplicates the prototype already declared under "Lexer Functions" */
439 | Node* GetCDATA( TidyDocImpl* doc, Node *container );
440 | 
441 | void AddByte( Lexer *lexer, tmbchar c );
442 | void AddStringLiteral( Lexer* lexer, ctmbstr str );
443 | void AddStringLiteralLen( Lexer* lexer, ctmbstr str, int len );
444 | 
445 | /* find element */
446 | Node* FindDocType( TidyDocImpl* doc );
447 | Node* FindHTML( TidyDocImpl* doc );
448 | Node* FindHEAD( TidyDocImpl* doc );
449 | Node* FindTITLE(TidyDocImpl* doc);
450 | Node* FindBody( TidyDocImpl* doc );
451 | Node* FindXmlDecl(TidyDocImpl* doc);
452 | 
453 | /* Returns containing block element, if any */
454 | Node* FindContainer( Node* node );
455 | 
456 | /* add meta element for Tidy */
457 | Bool AddGenerator( TidyDocImpl* doc );
458 | 
459 | /* examine <!DOCTYPE> to identify version */
460 | int FindGivenVersion( TidyDocImpl* doc, Node* doctype );
461 | int ApparentVersion( TidyDocImpl* doc );
462 | 
463 | 
464 | Bool CheckDocTypeKeyWords(Lexer *lexer, Node *doctype);
465 | 
466 | ctmbstr HTMLVersionName( TidyDocImpl* doc );
467 | ctmbstr HTMLVersionNameFromCode( uint vers, Bool isXhtml );
468 | 
469 | Bool SetXHTMLDocType( TidyDocImpl* doc );
470 | 
471 | 
472 | /* fixup doctype if missing */
473 | Bool FixDocType( TidyDocImpl* doc );
474 | 
475 | /* ensure XML document starts with <?xml version="1.0"?> */
476 | /* add encoding attribute if not using ASCII or UTF-8 output */
477 | Bool FixXmlDecl( TidyDocImpl* doc );
478 | 
479 | Node* InferredTag( TidyDocImpl* doc, ctmbstr name );
480 | 
481 | Bool ExpectsContent(Node *node);
482 | 
483 | 
484 | void UngetToken( TidyDocImpl* doc );
485 | 
486 | 
487 | /*
488 |   modes for GetToken() (passed as its `mode' argument)
489 |   IgnoreWhitespace -- not described by the original comment; TODO document
490 |   MixedContent   -- for elements which don't accept PCDATA
491 |   Preformatted   -- white space preserved as is
492 |   IgnoreMarkup   -- for CDATA elements such as script, style
493 | */
494 | #define IgnoreWhitespace    0
495 | #define MixedContent        1
496 | #define Preformatted        2
497 | #define IgnoreMarkup        3
498 | 
499 | Node* GetToken( TidyDocImpl* doc, uint mode );
500 | 
501 | void InitMap(void);
502 | 
503 | Bool IsValidAttrName( ctmbstr attr );
504 | 
505 | 
506 | /* create a new attribute */
507 | AttVal *NewAttribute(void);
508 | 
509 | /* create a new attribute with given name and value */
510 | AttVal *NewAttributeEx(ctmbstr name, ctmbstr value);
511 | 
512 | /*************************************
513 |   In-line Stack functions
514 | *************************************/
515 | 
516 | 
517 | /* duplicate attributes */
518 | AttVal* DupAttrs( TidyDocImpl* doc, AttVal* attrs );
519 | 
520 | /*
521 |   push a copy of an inline node onto stack
522 |   but don't push if implicit or OBJECT or APPLET
523 |   (implicit tags are ones generated from the istack)
524 | 
525 |   One issue arises with pushing inlines when
526 |   the tag is already pushed. For instance:
527 | 
528 |       <p><em>text
529 |       <p><em>more text
530 | 
531 |   Shouldn't be mapped to
532 | 
533 |       <p><em>text</em></p>
534 |       <p><em><em>more text</em></em>
535 | */
536 | void PushInline( TidyDocImpl* doc, Node* node );
537 | 
538 | /* pop inline stack */
539 | void PopInline( TidyDocImpl* doc, Node* node );
540 | 
541 | Bool IsPushed( TidyDocImpl* doc, Node* node );
542 | 
543 | /*
544 |   This has the effect of inserting "missing" inline
545 |   elements around the contents of blocklevel elements
546 |   such as P, TD, TH, DIV, PRE etc. This procedure is
547 |   called at the start of ParseBlock when the inline
548 |   stack is not empty, as will be the case in:
549 | 
550 |     <i><h1>italic heading</h1></i>
551 | 
552 |   which is then treated as equivalent to
553 | 
554 |     <h1><i>italic heading</i></h1>
555 | 
556 |   This is implemented by setting the lexer into a mode
557 |   where it gets tokens from the inline stack rather than
558 |   from the input stream.
559 | */
560 | int InlineDup( TidyDocImpl* doc, Node *node );
561 | 
562 | /*
563 |  defer duplicates when entering a table or other
564 |  element where the inlines shouldn't be duplicated
565 | */
566 | void DeferDup( TidyDocImpl* doc );
567 | Node *InsertedToken( TidyDocImpl* doc );
568 | 
569 | #ifdef __cplusplus
570 | }
571 | #endif
572 | 
573 | 
574 | #endif /* __LEXER_H__ */
575 | 


--------------------------------------------------------------------------------