├── NEWS
├── AUTHORS
├── html
    ├── tld.list
    ├── uriTests.cc
    ├── ParserSax.cc
    ├── wincstring.h
    ├── Extensions.h
    ├── CharsetConverter.h
    ├── Extensions.cc
    ├── ParserDom.h
    ├── Makefile.am
    ├── gen_tld.pl
    ├── ci_string.h
    ├── utils.h
    ├── debug.h
    ├── CharsetConverter.cc
    ├── ParserSax.h
    ├── Node.h
    ├── Node.cc
    ├── Uri.h
    ├── ParserDom.cc
    ├── profile.h
    ├── tld.h
    ├── tests.cc
    ├── ParserSax.tcc
    ├── utils.cc
    └── Uri.cc
├── css
    ├── css_lex.h
    ├── Makefile.am
    ├── parser.c
    ├── parser.h
    ├── parser_pp.h
    ├── css_syntax.h
    ├── css_lex.l
    ├── default.css
    ├── parser_pp.cc
    └── css_syntax.y
├── htmlcxx.pc.in
├── Makefile.am
├── COPYING
├── README.md
├── wingetopt.h
├── configure.ac
├── ChangeLog
├── htmlcxx.spec
├── htmlcxxapp.vcproj
├── README
├── htmlcxx.vcproj
├── htmlcxx.cc
├── wingetopt.c
├── ASF-2.0
├── INSTALL
└── LGPL_V2


/NEWS:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/AUTHORS:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/barmi/htmlcxx/HEAD/AUTHORS


--------------------------------------------------------------------------------
/html/tld.list:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/barmi/htmlcxx/HEAD/html/tld.list


--------------------------------------------------------------------------------
/html/uriTests.cc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/barmi/htmlcxx/HEAD/html/uriTests.cc


--------------------------------------------------------------------------------
/css/css_lex.h:
--------------------------------------------------------------------------------
1 | #ifndef __CSS_LEX_H__
2 | #define __CSS_LEX_H__
3 | 
4 | int init_yylex(const char *buffer, int buf_len);
5 | int end_yylex();
6 | 
7 | #endif
8 | 


--------------------------------------------------------------------------------
/html/ParserSax.cc:
--------------------------------------------------------------------------------
1 | #include "ParserSax.h"
2 | 
3 | void htmlcxx::HTML::ParserSax::parse(const std::string &html)
4 | {
5 | //	std::cerr << "Parsing string" << std::endl;
6 | 	parse(html.c_str(), html.c_str() + html.length());
7 | }
8 | 


--------------------------------------------------------------------------------
/htmlcxx.pc.in:
--------------------------------------------------------------------------------
 1 | url=http://htmlcxx.sourceforge.net/
 2 | prefix=@prefix@
 3 | exec_prefix=@exec_prefix@
 4 | libdir=@libdir@
 5 | includedir=@includedir@
 6 | 
 7 | Name: htmlcxx
 8 | Description: html and css apis for c++
 9 | Version: @VERSION@
10 | Libs: -L${libdir} -lhtmlcxx -lcss_parser_pp -lcss_parser
11 | Cflags: -I${includedir} 
12 | URL: ${url}
13 | 


--------------------------------------------------------------------------------
/Makefile.am:
--------------------------------------------------------------------------------
 1 | SUBDIRS = html css
 2 | 
 3 | bin_PROGRAMS = htmlcxx
 4 | htmlcxx_SOURCES = htmlcxx.cc wingetopt.h 
 5 | 
 6 | htmlcxx_LDADD = html/libhtmlcxx.la css/libcss_parser_pp.la css/libcss_parser.la
 7 | 
 8 | EXTRA_DIST = ASF-2.0 LGPL_V2 wingetopt.c htmlcxx.spec htmlcxx.vcproj htmlcxxapp.vcproj htmlcxx.pc.in
 9 | 
10 | pkgconfigdir = $(libdir)/pkgconfig
11 | pkgconfig_DATA = htmlcxx.pc
12 | 


--------------------------------------------------------------------------------
/COPYING:
--------------------------------------------------------------------------------
1 | The htmlcxx code is covered by the LGPL license found in the LGPL_V2 file in
2 | this distribution. The tree.hh code, used in this code, is covered by the
3 | license in the original distribution at http://tree.phi-sci.com/. Only
 4 | HTML::Node instances, which are the only instances used by htmlcxx, are available
5 | under the LGPL. The uri parsing code is a derivative work of Apache web server
6 | uri parsing routine and covered by the ASF-2.0 file in this distribution.
7 | 


--------------------------------------------------------------------------------
/html/wincstring.h:
--------------------------------------------------------------------------------
 1 | #ifndef __WINCSTRING_H__
 2 | #define __WINCSTRING_H__
 3 | 
 4 | #include <cstring>
 5 | 
 6 | #if defined(WIN32) && !defined(__MINGW32__)
 7 | 
 8 | /*
 9 |  * some functions have strange names on windows
10 |  */
11 | #ifndef strcasecmp
12 | #define strcasecmp   _stricmp
13 | #endif
14 | #ifndef strncasecmp
15 | #define strncasecmp  _strnicmp
16 | #endif
17 | #ifndef snprintf
18 | #define snprintf     _snprintf
19 | #endif
20 | 
21 | #endif
22 | 
23 | #endif
24 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | htmlcxx
 2 | =======
 3 | * a simple html and css parser written in C++. clone of http://htmlcxx.sourceforge.net/
 4 | * new code repository : https://sourceforge.net/p/htmlcxx/code/
 5 | 
 6 | install
 7 | -------
 8 | * sudo apt install autoconf libtool
 9 | 
10 | * ref: https://stackoverflow.com/questions/22603163/automake-error-ltmain-sh-not-found
11 | 
12 | * libtoolize
13 | * aclocal
14 | * autoheader
15 | * autoconf
16 | * automake --add-missing
17 | 
18 | * ./configure
19 | * make
20 | 


--------------------------------------------------------------------------------
/html/Extensions.h:
--------------------------------------------------------------------------------
 1 | #ifndef __EXTENSIONS_H__
 2 | #define __EXTENSIONS_H__
 3 | 
 4 | #include <set>
 5 | #include <string>
 6 | #include "ci_string.h"
 7 | 
 8 | namespace htmlcxx
 9 | {
	/**
	 * Case-insensitive set of file extensions that can be tested against
	 * URLs.
	 *
	 * Entries are compared against the text from the last '.' of a URL
	 * onward (the dot included), so a stored entry has to carry its
	 * leading dot (e.g. ".html") in order to match.
	 */
	class Extensions
	{
		public:
			/** Create an empty set. */
			Extensions() {}

			/** Create a set from a list of extensions separated by
			 *  space characters; runs of spaces are skipped. */
			Extensions(const std::string &exts);

			~Extensions() {}

			/** @return true when the suffix of url starting at its last
			 *  '.' is in the set. Returns false when the url contains
			 *  a '?', has no '.', or its last '.' precedes its last '/'. */
			bool check(const std::string &url);
			/** Add a single extension to the set. */
			void insert(const ci_string &ext) { mExts.insert(ext); }

		private:
			// Registered extensions; ci_string makes lookups case-insensitive.
			std::set<ci_string> mExts;
	};
25 | }
26 | 
27 | #endif
28 | 


--------------------------------------------------------------------------------
/css/Makefile.am:
--------------------------------------------------------------------------------
 1 | lib_LTLIBRARIES = libcss_parser_pp.la libcss_parser.la
 2 | 
 3 | libcss_parser_pp_la_SOURCES = parser_pp.h parser_pp.cc parser.c
 4 | libcss_parser_pp_la_LDFLAGS = -version-info 0:0:0
 5 | 
 6 | libcss_parser_la_SOURCES = css_syntax.y css_lex.l css_lex.h css_syntax.h parser.c parser.h
 7 | libcss_parser_la_LDFLAGS = -version-info 0:0:0
 8 | 
 9 | AM_YFLAGS = -d
10 | AM_LFLAGS = -i
11 | 
12 | includedir = $(prefix)/include/htmlcxx/css
13 | include_HEADERS = parser.h parser_pp.h
14 | 
15 | EXTRA_DIST = default.css
16 | cssdir = $(pkgdatadir)/css
17 | css_DATA = default.css
18 | 


--------------------------------------------------------------------------------
/html/CharsetConverter.h:
--------------------------------------------------------------------------------
 1 | #ifndef __CHARSET_CONVERTER_H__
 2 | #define __CHARSET_CONVERTER_H__
 3 | 
 4 | #include <iconv.h>
 5 | #include <string>
 6 | #include <stdexcept>
 7 | 
 8 | namespace htmlcxx
 9 | {
10 | 	class CharsetConverter
11 | 	{
12 | 		public:
13 | 			class Exception : public std::runtime_error
14 | 			{
15 | 				public:
16 | 					Exception(const std::string &arg)
17 | 						: std::runtime_error(arg) {}
18 | 			};
19 | 			
20 | 			CharsetConverter(const std::string &from, const std::string &to) throw (Exception);
21 | 			~CharsetConverter();
22 | 			
23 | 			std::string convert(const std::string &input);
24 | 
25 | 		private:
26 | 			iconv_t mIconvDescriptor;;
27 | 	};
28 | }
29 | 
30 | #endif
31 | 


--------------------------------------------------------------------------------
/wingetopt.h:
--------------------------------------------------------------------------------
 1 | #ifdef __cplusplus
 2 | extern "C" {
 3 | #endif
 4 | 
 5 | #if !defined(WIN32) || defined(__MINGW32__)
 6 | 	#include <getopt.h>
 7 | #else
 8 | 	#ifndef _GETOPT_
 9 | 	#define _GETOPT_
10 | 
11 | 	#include <stdio.h>                  /* for EOF */ 
12 | 	#include <string.h>                 /* for strchr() */ 
13 | 
14 | 	char *optarg = NULL;    /* pointer to the start of the option argument  */ 
15 | 	int   optind = 1;       /* number of the next argv[] to be evaluated    */ 
16 | 	int   opterr = 1;       /* non-zero if a question mark should be returned */
17 | 
18 | 	int getopt(int argc, char *argv[], char *opstring); 
19 | 	#endif
20 | #endif
21 | 
22 | #ifdef __cplusplus
23 | }
24 | #endif
25 | 
26 | 


--------------------------------------------------------------------------------
/css/parser.c:
--------------------------------------------------------------------------------
 1 | #include <stdlib.h>
 2 | #include "parser.h"
 3 | 
 4 | void free_rulesets(struct selector_list_t *rules) {
 5 | 
 6 | 	struct selector_list_t *pos = rules;
 7 | 	while (pos != NULL) {
 8 | 		struct selector_list_t *tmp = pos;
 9 | 		struct selector_t *rule = pos->selector;
10 | 		struct property_t *property = rule->property;
11 | 		while (property != NULL) {
12 | 			struct property_t *tmp = property;
13 | 			property = property->next;
14 | 			tmp->count--;
15 | 			if (tmp->count == 0) {
16 | 				free(tmp->name);
17 | 				free(tmp->val);
18 | 				free(tmp);
19 | 			}
20 | 		}
21 | 		while (rule != NULL) {
22 | 			struct selector_t *tmp = rule;
23 | 			rule = rule->next;
24 | 			free(tmp->element_name);
25 | 			free(tmp->id);
26 | 			free(tmp->e_class);
27 | 			free(tmp);
28 | 		}
29 | 		pos = pos->next;
30 | 		free(tmp);
31 | 	}
32 | 
33 | }
34 | 


--------------------------------------------------------------------------------
/html/Extensions.cc:
--------------------------------------------------------------------------------
 1 | #include <cstring>
 2 | #include "Extensions.h"
 3 | 
 4 | using namespace std;
 5 | using namespace htmlcxx;
 6 | 
 7 | Extensions::Extensions(const string &exts)
 8 | {
 9 | 	const char *begin = exts.c_str();
10 | 	while (*begin)
11 | 	{
12 | 		while (*begin == ' ') ++begin;
13 | 		if (*begin == 0) break;
14 | 
15 | 		const char *end = begin + 1;
16 | 		while (*end && *end != ' ') ++end;
17 | 
18 | 		insert(ci_string(begin, end));
19 | 
20 | 		begin = end;
21 | 	}
22 | }
23 | 
24 | bool Extensions::check(const string &url)
25 | {
26 | 	const char *slash;
27 | 	const char *dot;
28 | 	const char *question;
29 | 
30 | 	question = strchr(url.c_str(), '?');
31 | 
32 | 	if (question) return false;
33 | 	
34 | 	slash = strrchr(url.c_str(), '/');
35 | 	dot = strrchr(url.c_str(), '.');
36 | 
37 | 	if (slash >= dot) return false;
38 | 	
39 | 	ci_string ext(dot);
40 | 	
41 | 	return mExts.find(ext) != mExts.end();
42 | }
43 | 


--------------------------------------------------------------------------------
/html/ParserDom.h:
--------------------------------------------------------------------------------
 1 | #ifndef __HTML_PARSER_DOM_H__
 2 | #define __HTML_PARSER_DOM_H__
 3 | 
 4 | #include "ParserSax.h"
 5 | #include "tree.h"
 6 | 
 7 | namespace htmlcxx
 8 | {
 9 | 	namespace HTML
10 | 	{
		/**
		 * Parser that specialises the ParserSax callbacks and keeps a
		 * tree<Node> (mHtmlTree) that callers can read back through
		 * parseTree() / getTree().
		 */
		class ParserDom : public ParserSax
		{
			public:
				ParserDom() {}
				~ParserDom() {}

				/** Takes an html string and returns a reference to a tree;
				 *  presumably parses html and returns mHtmlTree -- the
				 *  definition lives in ParserDom.cc. */
				const tree<Node> &parseTree(const std::string &html);
				/** @return the tree currently held by this parser. */
				const tree<Node> &getTree()
				{ return mHtmlTree; }

			protected:
				// ParserSax hooks overridden by this class.
				virtual void beginParsing();

				virtual void foundTag(Node node, bool isEnd);
				virtual void foundText(Node node);
				virtual void foundComment(Node node);

				virtual void endParsing();
				
				// Tree exposed through getTree().
				tree<Node> mHtmlTree;
				// Iterator into mHtmlTree; presumably the current
				// insertion point while parsing -- confirm in ParserDom.cc.
				tree<Node>::iterator mCurrentState;
		};
33 | 
34 | 		std::ostream &operator<<(std::ostream &stream, const tree<HTML::Node> &tr);
35 | 	} //namespace HTML
36 | } //namespace htmlcxx
37 | 
38 | #endif
39 | 


--------------------------------------------------------------------------------
/html/Makefile.am:
--------------------------------------------------------------------------------
 1 | lib_LTLIBRARIES = libhtmlcxx.la
 2 | libhtmlcxx_la_SOURCES = ParserSax.h ParserSax.tcc ParserSax.cc \
 3 | 	ParserDom.h ParserDom.cc \
 4 | 	Extensions.h Extensions.cc \
 5 | 	CharsetConverter.h CharsetConverter.cc \
 6 | 	Uri.h Uri.cc \
 7 | 	Node.h Node.cc \
 8 | 	utils.h utils.cc \
 9 | 	ci_string.h \
10 | 	tree.h \
11 | 	wincstring.h\
12 | 	debug.h
13 | libhtmlcxx_la_CXXFLAGS =
14 | libhtmlcxx_la_LDFLAGS = -version-info 4:0:1
15 | 
16 | # bin_PROGRAMS = htmlcxx
17 | # htmlcxx_LDADD = libhtmlcxx.la
18 | # htmlcxx_SOURCES = htmlcxx.cc
19 | 						
20 | noinst_PROGRAMS = tests uriTests
21 | tests_SOURCES = tests.cc 
22 | tests_LDADD = libhtmlcxx.la
23 | uriTests_SOURCES = uriTests.cc
24 | uriTests_LDADD = libhtmlcxx.la
25 | 
26 | TESTS = tests uriTests
27 | 
28 | includedir = $(prefix)/include/htmlcxx/html
29 | include_HEADERS = ParserSax.h ParserSax.tcc ParserDom.h utils.h tree.h Node.h Uri.h CharsetConverter.h Extensions.h ci_string.h tld.h
30 | 
31 | EXTRA_DIST = gen_tld.pl tld.list
32 | 


--------------------------------------------------------------------------------
/html/gen_tld.pl:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/perl -w
 2 | 
 3 | print "/** Do not modify. This file is automatically generated\n";
 4 | print "  * using gen_tld.pl and tld.list\n";
 5 | print "  */\n\n";
 6 | 		
 7 | my @tld;
 8 | while(<STDIN>) {
 9 | 	chomp;
10 | 	if(/^([\w\.]+)\s+/) {
11 | 		push @tld, uc $1;
12 | 	}
13 | }
14 | @tld = sort { length($a) <=> length($b) || $a cmp $b } @tld;
15 | @tld = reverse @tld;
16 | 
17 | my $i;
18 | print "static const char *tld[] = {\n";
19 | for($i = 0; $i < scalar (@tld) - 1; ++$i) { print "\t\".$tld[$i]\",\n"; }
20 | print "\t\"." . $tld[scalar(@tld) - 1] . "\"\n";
21 | print "};\n";
22 | print "static uint tldOffset(const char *domain) {\n";
23 | print "	const char *end = domain + strlen(domain);\n";
24 | print "	for(unsigned int i = 0; i < " . scalar @tld . "; ++i) {\n";
25 | print " 	unsigned int len = strlen(tld[i]);\n";
26 | print "		if(strcasecmp(end - len, tld[i]) == 0) {\n";
27 | print "			return len;\n";
28 | print "		}\n";
29 | print "	}\n";
30 | print " return 0;";
31 | print "}\n";
32 | 
33 | 	
34 | 


--------------------------------------------------------------------------------
/css/parser.h:
--------------------------------------------------------------------------------
 1 | #ifndef __CSS_PARSER_H__
 2 | #define __CSS_PARSER_H__
 3 | 
 4 | #define PS_CLASS_NONE 0
 5 | #define PS_CLASS 1
 6 | #define PS_CLASS_LINK 1
 7 | #define PS_CLASS_VISITED 2
 8 | #define PS_CLASS_ACTIVE 3
 9 | 
10 | #define PS_ELEMENT_NONE 0
11 | #define PS_ELEMENT_FIRST_LETTER 1
12 | #define PS_ELEMENT_FIRST_LINE 2
13 | 
14 | #ifdef __cplusplus
15 | extern "C" {
16 | #endif
17 | 
/*
 * One CSS property, kept as a node of a singly linked list.
 * free_rulesets() decrements `count` and releases the node, together with
 * `name` and `val`, once it reaches zero.
 */
struct property_t {
	char *name;               /* property name; released with free() */
	char *val;                /* property value; released with free() */
	int important;            /* presumably non-zero for "!important" -- confirm in the grammar */
	int count;                /* reference count consulted by free_rulesets() */
	struct property_t *next;  /* next property in the chain, or NULL */
};
25 | 
/*
 * One selector, kept as a node of a singly linked selector chain.
 * The three strings are released with free() by free_rulesets().
 */
struct selector_t {
	char *element_name;          /* element name part of the selector */
	char *id;                    /* id part of the selector */
	char *e_class;               /* class part of the selector */
	int pseudo_class;            /* presumably one of the PS_CLASS_* values -- confirm */
	int pseudo_element;          /* presumably one of the PS_ELEMENT_* values -- confirm */
	struct property_t *property; /* head of the property chain */
	struct selector_t *next;     /* next selector in the chain, or NULL */
};
35 | 
/*
 * Node of the list handled by css_parse() / free_rulesets(); each node
 * points at the head of one selector chain.
 */
struct selector_list_t {
	struct selector_t *selector;   /* head of this entry's selector chain */
	struct selector_list_t *next;  /* next entry, or NULL */
};
40 | 
41 | struct selector_list_t* css_parse(const char *buffer, int buf_len);
42 | void free_rulesets(struct selector_list_t *rules);
43 | 
44 | #ifdef __cplusplus
45 | }
46 | #endif
47 | 
48 | #endif
49 | 


--------------------------------------------------------------------------------
/configure.ac:
--------------------------------------------------------------------------------
 1 | #                                               -*- Autoconf -*-
 2 | # Process this file with autoconf to produce a configure script.
 3 | 
 4 | #AC_PREREQ(2.57)
 5 | AC_INIT(htmlcxx.cc)
 6 | AM_INIT_AUTOMAKE(htmlcxx, 0.86)
 7 | AM_CONFIG_HEADER([config.h])
 8 | AM_DISABLE_STATIC
 9 | 
10 | # Checks for programs.
11 | AC_PROG_CXX
12 | AC_PROG_CC
13 | AM_PROG_LEX
14 | AC_PROG_YACC
15 | AC_PROG_LIBTOOL
16 | 
17 | # Checks for libraries.
18 | # Replace `main' with a function in -lfl:
19 | AC_CHECK_LIB(fl, main)
20 | AC_CHECK_LIB(iconv, iconv_open)
21 | 
22 | 
23 | # Checks for header files.
24 | AC_HEADER_STDC
25 | # AC_CHECK_HEADERS([])
26 | 
27 | # Checks for typedefs, structures, and compiler characteristics.
28 | AC_HEADER_STDBOOL
29 | AC_C_CONST
30 | AC_C_INLINE
31 | AC_TYPE_SIZE_T
32 | AC_HEADER_TIME
33 | 
34 | # Checks for library functions.
35 | 
36 | AC_LANG_CPLUSPLUS
37 | CXXFLAGS="$CXXFLAGS -DDEFAULT_CSS=\"\\\"$datadir/htmlcxx/css/default.css\\\"\""
38 | 
39 | AC_CONFIG_FILES([Makefile html/Makefile css/Makefile])
40 | AC_OUTPUT(htmlcxx.pc)
41 | 


--------------------------------------------------------------------------------
/html/ci_string.h:
--------------------------------------------------------------------------------
 1 | #ifndef __CI_STRING__
 2 | #define __CI_STRING__
 3 | 
 4 | #include <cctype>
 5 | #include <string>
 6 | 
 7 | struct ci_char_traits : public std::char_traits<char>
 8 | 
 9 | // just inherit all the other functions
10 | //  that we don't need to override
11 | {
12 | 	static bool eq( char c1, char c2 ) {
13 | 		return tolower(c1) == tolower(c2);
14 | 	}
15 | 
16 | 	static bool ne( char c1, char c2 ) {
17 | 		return tolower(c1) != tolower(c2);
18 | 	}
19 | 
20 | 	static bool lt( char c1, char c2 ) {
21 | 		return tolower(c1) < tolower(c2);
22 | 	}
23 | 
24 | 	static int compare( const char* s1,
25 | 			const char* s2,
26 | 			size_t n ) {
27 | 		#ifdef WIN32
28 | 		return _strnicmp(s1, s2, n);
29 | 		#else
30 | 		return strncasecmp( s1, s2, n );
31 | 		#endif
32 | 		// if available on your compiler,
33 | 		//  otherwise you can roll your own
34 | 	}
35 | 
36 | 	static const char*
37 | 		find( const char* s, int n, char a ) {
38 | 			while( n-- > 0 && tolower(*s) != tolower(a) ) {
39 | 				++s;
40 | 			}
41 | 			return s;
42 | 		}
43 | };
44 | 
45 | typedef std::basic_string<char, ci_char_traits> ci_string;
46 | 
47 | #endif
48 | 


--------------------------------------------------------------------------------
/html/utils.h:
--------------------------------------------------------------------------------
 1 | #ifndef __HTML_UTILS_H__
 2 | #define __HTML_UTILS_H__
 3 | 
 4 | #include "tree.h"
 5 | #include "Node.h"
 6 | 
 7 | namespace htmlcxx {
 8 | 
 9 | 	class Uri;
10 | 	
11 | 	namespace HTML {
12 | 
13 | 		/** Tries to detect if the buffer is encoded as utf8 */
14 | 		bool detect_utf8(const char *buf, int size);
15 | 
16 | 		/** Trim the argument string at both end and convert sibling blanks
17 | 		 * into a single space */
18 | 		std::string single_blank(const std::string& str);
19 | 
20 | 		/** Remove html comments from the string */
21 | 		std::string strip_comments(const std::string& str);
22 | 		
23 | 		/** Convert html entities into their ISO8859-1 equivalents */
24 | 		std::string decode_entities(const std::string& str);
25 | 		
26 | 		/** Get the value of an attribute inside the text of a tag 
27 | 		  * @return empty string if the attribute is not found, or the value of 
28 |  		  * the attribute
29 | 		  */
30 | 		std::string get_attribute(const std::string& tag, const std::string& attr);
31 | 
32 | 		/** Create absolute url for a link */
33 | 		std::string convert_link(const std::string &relative, const Uri &root);
34 | 
35 | 		/** Create a gml representation of the tree for input of tools like graphviz */
36 | 		std::string serialize_gml(const tree<HTML::Node> &tr); 
37 | 	}
38 | }
39 | 
40 | #endif
41 | 


--------------------------------------------------------------------------------
/html/debug.h:
--------------------------------------------------------------------------------
 1 | #ifdef DEBUGP
 2 | #undef DEBUGP
 3 | #endif
 4 | 
 5 | #ifdef __cplusplus
 6 | #include <cstdio>
 7 | #if defined(WIN32) && !defined(__MINGW32__)
 8 | #include <cstring>
 9 | #endif
10 | #else
11 | #include <stdio.h>
12 | #if defined(WIN32) && !defined(__MINGW32__)
13 | #include <string.h>
14 | #endif
15 | #endif
16 | 
17 | #if defined(WIN32) && !defined(__MINGW32__)
18 | #ifndef __DEBUG_H__
19 | #define __DEBUG_H__
20 | #include <stdarg.h>
/*
 * printf-style debug output to stderr, used as DEBUGP on this code path.
 *
 * The message is printed exactly as formatted. No "file:line" prefix is
 * added: a function cannot see the caller's __FILE__/__LINE__, and adding
 * "%s:%d " to the format without the matching arguments would make
 * vfprintf() read arguments that were never passed.
 */
inline void debugprintf(const char *format, ...)
{
    va_list ap;
    va_start(ap, format);
    vfprintf(stderr, format, ap);
    va_end(ap);
}
/*
 * Stand-in for DEBUGP when debugging is off: accepts the same arguments and
 * does nothing.
 */
inline void dummyprintf(const char *format, ...)
{
    (void)format;
}
38 | #endif
39 | #endif
40 | 
41 | #ifdef DEBUG
42 | #if defined(WIN32) && !defined(__MINGW32__)
43 | #define DEBUGP debugprintf
44 | #else
45 | #define DEBUGP(args...) do { fprintf(stderr, "%s:%d ", __FILE__, __LINE__); fprintf(stderr, ## args); } while(0)
46 | #endif
47 | #else
48 | #if defined(WIN32) && !defined(__MINGW32__)
49 | #define DEBUGP dummyprintf
50 | #else
51 | #define DEBUGP(args...)
52 | #endif
53 | #endif
54 | 


--------------------------------------------------------------------------------
/ChangeLog:
--------------------------------------------------------------------------------
 1 | 2008-10-12 18:41  davi
 2 | 
 3 | 	* Applied patch by Luca Bruno fixing gcc 4.3 compilation problems.
 4 | 
 5 | 2007-08-11 20:23  davi
 6 | 
 7 | 	* Bumped release again because sf.net problems.
 8 | 
 9 | 2007-08-11 20:14  davi
10 | 
11 | 	* Updated version.
12 | 
13 | 2007-08-11 20:06  davi
14 | 
15 | 	*
16 | 	  Fixed long standing typename bug.
17 | 
18 | 2006-06-16 12:45  davi
19 | 
20 | 	* Added missing files.
21 | 
22 | 2006-06-16 12:23  davi
23 | 
24 | 	* Added missing files and incorporate some pending fixes.
25 | 
26 | 2005-03-24 00:59  davi
27 | 
28 | 	* Removed useless files.
29 | 
30 | 2005-03-24 00:58  davi
31 | 
32 | 	* Small fixes for distribution.
33 | 
34 | 2005-02-22 01:47  davi
35 | 
36 | 	* Lots of changes. Bumped version to 0.7.3.
37 | 
38 | 2004-06-17 16:17  braga
39 | 
40 | 	* Updated some documentation.
41 | 
42 | 2004-06-16 13:38  braga
43 | 
44 | 	* Committing all changes from local cvs.
45 | 
46 | 2004-03-27 10:56  davi
47 | 
48 | 	* Added css parser.
49 | 
50 | 2004-03-25 16:33  davi
51 | 
52 | 	* Added sample htmlcxx application.
53 | 
54 | 2004-03-25 16:24  davi
55 | 
56 | 	* Initial revision
57 | 
58 | 2004-03-25 16:24  davi
59 | 
60 | 	* htmlcxx is now in CVS
61 | 
62 | 2003-12-13 23:27  davi
63 | 
64 | 	* Initial revision
65 | 
66 | 2003-12-13 23:27  davi
67 | 
68 | 	* htmlcxx - html and css APIs for C++
69 | 
70 | 


--------------------------------------------------------------------------------
/htmlcxx.spec:
--------------------------------------------------------------------------------
 1 | %define name htmlcxx
 2 | %define version 0.80
 3 | %define release 1
 4 | 
 5 | Summary: A html/css1 parser in C++
 6 | Name: %{name}
 7 | Version: %{version}
 8 | Release: %{release}
 9 | Source: %{name}-%{version}.tar.gz
10 | License: LGPL
11 | URL: http://htmlcxx.sf.net
12 | BuildArch: i386
13 | BuildRoot: %{_tmppath}/%{name}-%{version}-root
14 | 
15 | %description
16 | This is an HTML/CSS1 parser whose policies are designed to mimic Mozilla Firefox (http://www.mozilla.org) behavior, so you should expect parse trees similar to those created by Firefox. However, unlike Firefox, htmlcxx does not insert non-existent stuff in your html. Therefore, serializing the DOM tree gives exactly the same bytes contained in the original HTML document.
17 | 
18 | %prep
19 | rm -Rf $RPM_BUILD_ROOT
20 | 
21 | %setup
22 | mkdir $RPM_BUILD_ROOT
23 | mkdir $RPM_BUILD_ROOT/usr
24 | %configure --prefix=/usr
25 | 
26 | %build
27 | make
28 | 
29 | %install
30 | DESTDIR=$RPM_BUILD_ROOT/ make install
31 | 
32 | %files
33 | %defattr(755,root,root)
34 | %_libdir
35 | %_bindir
36 | %_datadir
37 | 
38 | %package dev
39 | Summary: Development headers
40 | 
41 | %description dev
42 | Development headers
43 | %files dev
44 | %_includedir
45 | 
46 | %changelog
47 | * Thu Jun 16 2006 Davi de Castro Reis <davi@akwan.com.br>
48 | + Version 0.80 released
49 | * Thu Jun 9 2005 Davi de Castro Reis <davi@akwan.com.br>
50 | + Created spec
51 | 


--------------------------------------------------------------------------------
/html/CharsetConverter.cc:
--------------------------------------------------------------------------------
 1 | #include <iconv.h>
 2 | #include <cstdio>
 3 | #include <cstring>
 4 | #include <cerrno>
 5 | #include "CharsetConverter.h"
 6 | 
 7 | using namespace std;
 8 | using namespace htmlcxx;
 9 | 
10 | CharsetConverter::CharsetConverter(const string &from, const string &to) throw (Exception)
11 | {
12 | 	mIconvDescriptor = iconv_open(to.c_str(), from.c_str());
13 | 	if (mIconvDescriptor == (iconv_t)(-1))
14 | 	{
15 | 		const char *error_str = strerror(errno);
16 | 		int size = strlen(error_str) + from.length() + to.length() + 26;
17 | 		char error[size];
18 | 		snprintf(error, size, "Can't convert from %s to %s: %s", from.c_str(), to.c_str(), error_str);
19 | 		throw Exception(error);
20 | 	}
21 | }
22 | 
// Close the iconv descriptor opened by the constructor.
CharsetConverter::~CharsetConverter()
{
	iconv_close(mIconvDescriptor);
}
27 | 
28 | string CharsetConverter::convert(const string &input)
29 | {
30 | 	const char *inbuf = input.c_str();
31 | 	size_t inbytesleft = input.length();
32 | 
33 | 	size_t outbuf_len = 2 * input.length();
34 | 	char *outbuf_start = new char[outbuf_len];
35 | 	char *outbuf = outbuf_start;
36 | 	size_t outbytesleft = outbuf_len;
37 | 
38 | 	size_t ret;
39 | 	while (1) {
40 | 		ret = iconv(mIconvDescriptor, const_cast<char**>(&inbuf), &inbytesleft, &outbuf, &outbytesleft);
41 | 		if (ret == 0) break;
42 | 		if (ret == (size_t)-1 && errno == E2BIG) return string();
43 | 
44 | 		//				fprintf(stderr, "invalid byte: %d\n", inbuf - input.c_str());
45 | 
46 | 		inbuf++; inbytesleft--;
47 | 	}
48 | 
49 | 	string out(outbuf_start, outbuf_len - outbytesleft);
50 | 
51 | 	delete [] outbuf_start;
52 | 
53 | 	return out;
54 | }
55 | 


--------------------------------------------------------------------------------
/html/ParserSax.h:
--------------------------------------------------------------------------------
 1 | #ifndef __HTML_PARSER_SAX_H__
 2 | #define __HTML_PARSER_SAX_H__
 3 | 
 4 | #include <string>
 5 | 
 6 | #include "Node.h"
 7 | 
 8 | namespace htmlcxx
 9 | {
10 | 	namespace HTML
11 | 	{
12 | 		class ParserSax
13 | 		{
14 | 			public:
15 | 				ParserSax() : mpLiteral(0), mCdata(false) {}
16 | 				virtual ~ParserSax() {}
17 | 
18 | 				/** Parse the html code */
19 | 				void parse(const std::string &html);
20 | 
21 | 				template <typename _Iterator>
22 | 				void parse(_Iterator begin, _Iterator end);
23 | 
24 | 			protected:
25 | 				// Redefine this if you want to do some initialization before
26 | 				// the parsing
27 | 				virtual void beginParsing() {}
28 | 
29 | 				virtual void foundTag(Node node, bool isEnd) {}
30 | 				virtual void foundText(Node node) {}
31 | 				virtual void foundComment(Node node) {}
32 | 
33 | 				virtual void endParsing() {}
34 | 
35 | 
36 | 				template <typename _Iterator>
37 | 				void parse(_Iterator &begin, _Iterator &end,
38 | 						std::forward_iterator_tag);
39 | 
40 | 				template <typename _Iterator>
41 | 				void parseHtmlTag(_Iterator b, _Iterator c);
42 | 
43 | 				template <typename _Iterator>
44 | 				void parseContent(_Iterator b, _Iterator c);
45 | 
46 | 				template <typename _Iterator>
47 | 				void parseComment(_Iterator b, _Iterator c);
48 | 
49 | 				template <typename _Iterator>
50 | 				_Iterator skipHtmlTag(_Iterator ptr, _Iterator end);
51 | 				
52 | 				template <typename _Iterator>
53 | 				_Iterator skipHtmlComment(_Iterator ptr, _Iterator end);
54 | 
55 | 				unsigned long mCurrentOffset;
56 | 				const char *mpLiteral;
57 | 				bool mCdata;
58 | 		};
59 | 
60 | 	}//namespace HTML
61 | }//namespace htmlcxx
62 | 
63 | #include "ParserSax.tcc"
64 | 
65 | #endif
66 | 


--------------------------------------------------------------------------------
/html/Node.h:
--------------------------------------------------------------------------------
 1 | #ifndef __HTML_PARSER_NODE_H
 2 | #define __HTML_PARSER_NODE_H
 3 | 
 4 | #include <map>
 5 | #include <string>
 6 | #include <utility>
 7 | 
 8 | namespace htmlcxx {
 9 | 	namespace HTML {
10 | 		class Node {
11 | 
12 | 			public:
13 | 				Node() {}
14 | 				//Node(const Node &rhs); //uses default
15 | 				~Node() {}
16 | 
17 | 				inline void text(const std::string& text) { this->mText = text; }
18 | 				inline const std::string& text() const { return this->mText; }
19 | 
20 | 				inline void closingText(const std::string &text) { this->mClosingText = text; }
21 | 				inline const std::string& closingText() const { return mClosingText; }
22 | 
23 | 				inline void offset(unsigned int offset) { this->mOffset = offset; }
24 | 				inline unsigned int offset() const { return this->mOffset; }
25 | 
26 | 				inline void length(unsigned int length) { this->mLength = length; }
27 | 				inline unsigned int length() const { return this->mLength; }
28 | 
29 | 				inline void tagName(const std::string& tagname) { this->mTagName = tagname; }
30 | 				inline const std::string& tagName() const { return this->mTagName; }
31 | 
32 | 				bool isTag() const { return this->mIsHtmlTag; }
33 | 				void isTag(bool is_html_tag){ this->mIsHtmlTag = is_html_tag; }
34 | 
35 | 				bool isComment() const { return this->mComment; }
36 | 				void isComment(bool comment){ this->mComment = comment; }
37 | 
38 | 				std::pair<bool, std::string> attribute(const std::string &attr) const
39 | 				{ 
40 | 					std::map<std::string, std::string>::const_iterator i = this->mAttributes.find(attr);
41 | 					if (i != this->mAttributes.end()) {
42 | 						return make_pair(true, i->second);
43 | 					} else {
44 | 						return make_pair(false, std::string());
45 | 					}
46 | 				}
47 | 
48 | 				operator std::string() const;
49 | 				std::ostream &operator<<(std::ostream &stream) const;
50 | 
51 | 				const std::map<std::string, std::string>& attributes() const { return this->mAttributes; }
52 | 				void parseAttributes();
53 | 
54 | 				bool operator==(const Node &rhs) const;
55 | 
56 | 			protected:
57 | 
58 | 				std::string mText;
59 | 				std::string mClosingText;
60 | 				unsigned int mOffset;
61 | 				unsigned int mLength;
62 | 				std::string mTagName;
63 | 				std::map<std::string, std::string> mAttributes;
64 | 				bool mIsHtmlTag;
65 | 				bool mComment;
66 | 		};
67 | 	}
68 | }
69 | 
70 | #endif
71 | 


--------------------------------------------------------------------------------
/css/parser_pp.h:
--------------------------------------------------------------------------------
 1 | #ifndef __CSS_PARSER_PP_H__
 2 | #define __CSS_PARSER_PP_H__
 3 | 
 4 | #include <string>
 5 | #include <vector>
 6 | #include <map>
 7 | #include <iostream>
 8 | 
 9 | namespace htmlcxx {
10 | namespace CSS {
11 | 
12 | extern const char *IE_CSS;
13 | class Parser 
14 | {
15 | 
16 | 	public:
17 | 		friend class Attribute;
18 | 
19 | 		enum PseudoClass { NONE_CLASS, LINK, VISITED, ACTIVE };
20 | 		enum PseudoElement { NONE_ELEMENT, FIRST_LETTER, FIRST_LINE };
21 | 
22 | 		class Selector 
23 | 		{
24 | 			private:
25 | 				std::string mElement;
26 | 				std::string mId;
27 | 				std::string mEClass;
28 | 				PseudoClass mPsClass;
29 | 				PseudoElement mPsElement;
30 | 
31 | 			public:
32 | 				Selector();
33 | 				Selector(const std::string& e, const std::string& i, const std::string& c, const PseudoClass& pc, const PseudoElement &pe);
34 | 				void setElement(const std::string &str);
35 | 				void setId(const std::string &str);
36 | 				void setClass(const std::string &str);
37 | 				void setPseudoClass(enum PseudoClass p);
38 | 				void setPseudoElement(enum PseudoElement p);
39 | 				bool match(const Selector& s) const;
40 | 				bool operator==(const Selector& s) const;
41 | 				bool operator<(const Selector& s) const;
42 | 				friend std::ostream& operator<<(std::ostream& out, const Selector& s);
43 | 		};
44 | 
45 | 	private:
46 | 		static bool match(const std::vector<Selector>& selector, const std::vector<Selector>& path);
47 | 
48 | 		class Attribute 
49 | 		{
50 | 			public:
51 | 				Attribute() {}
52 | 				Attribute(const std::string& v, bool i) : mVal(v), mImportant(i) {}
53 | 				std::string mVal;
54 | 				bool mImportant;
55 | 		};
56 | 
57 | 	public:
58 | 		Parser() {}
59 | 		friend std::ostream& operator<<(std::ostream& out, const std::map<std::string, Parser::Attribute>& s);
60 | 		bool parse(const std::string& css);
61 | 		bool parse(const char *buf, int buf_len);
62 | 		void merge(const Parser& p);
63 | 		std::map<std::string, std::string> getAttributes(const std::vector<Selector>& sv) const;
64 | 
65 | 		friend std::ostream& operator<<(std::ostream& out, const CSS::Parser& p);
66 | 
67 | 	private:
68 | 		typedef std::map<std::vector<Selector>, std::map<std::string, Attribute> > RuleSet;
69 | 		RuleSet mRuleSets;
70 | };
71 | 
72 | std::string pse2str(const enum Parser::PseudoElement& s);
73 | std::string psc2str(const enum Parser::PseudoClass& s);
74 | std::ostream& operator<<(std::ostream& out, const std::map<std::string, Parser::Attribute>& s);
75 | 
76 | }//namespace CSS
77 | }//namespace htmlcxx
78 | #endif
79 | 


--------------------------------------------------------------------------------
/html/Node.cc:
--------------------------------------------------------------------------------
  1 | #include <iostream>
  2 | #include <cctype>
  3 | #include <algorithm>
  4 | #include "wincstring.h"
  5 | #include "Node.h"
  6 | 
  7 | //#define DEBUG
  8 | #include "debug.h"
  9 | 
 10 | using namespace std;
 11 | using namespace htmlcxx;
 12 | using namespace HTML;
 13 | 
 14 | void Node::parseAttributes() 
 15 | {
 16 | 	if (!(this->isTag())) return;
 17 | 
 18 | 	const char *end;
 19 | 	const char *ptr = mText.c_str();
 20 | 	if ((ptr = strchr(ptr, '<')) == 0) return;
 21 | 	++ptr;
 22 | 
 23 | 	// Skip initial blankspace
 24 | 	while (isspace(*ptr)) ++ptr;
 25 | 
 26 | 	// Skip tagname
 27 | 	if (!isalpha(*ptr)) return;
 28 | 	while (!isspace(*ptr) && *ptr != '>') ++ptr;
 29 | 
 30 | 	// Skip blankspace after tagname
 31 | 	while (isspace(*ptr)) ++ptr;
 32 | 
 33 | 	while (*ptr && *ptr != '>') 
 34 | 	{
 35 | 		string key, val;
 36 | 
 37 | 		// skip unrecognized
 38 | 		while (*ptr && !isalnum(*ptr) && !isspace(*ptr)) ++ptr;
 39 | 
 40 | 		// skip blankspace
 41 | 		while (isspace(*ptr)) ++ptr;
 42 | 
 43 | 		end = ptr;
 44 | 		while (isalnum(*end) || *end == '-') ++end;
 45 | 		key.assign(end - ptr, '\0');
 46 | 		transform(ptr, end, key.begin(), ::tolower);
 47 | 		ptr = end;
 48 | 
 49 | 		// skip blankspace
 50 | 		while (isspace(*ptr)) ++ptr;
 51 | 
 52 | 		if (*ptr == '=') 
 53 | 		{
 54 | 			++ptr;
 55 | 			while (isspace(*ptr)) ++ptr;
 56 | 			if (*ptr == '"' || *ptr == '\'') 
 57 | 			{
 58 | 				char quote = *ptr;
 59 | //				fprintf(stderr, "Trying to find quote: %c\n", quote);
 60 | 				const char *end = strchr(ptr + 1, quote);
 61 | 				if (end == 0)
 62 | 				{
 63 | 					//b = mText.find_first_of(" >", a+1);
 64 | 					const char *end1, *end2;
 65 | 					end1 = strchr(ptr + 1, ' ');
 66 | 					end2 = strchr(ptr + 1, '>');
 67 | 					if (end1 && end1 < end2) end = end1;
 68 | 					else end = end2;
 69 | 					if (end == 0) return;
 70 | 				}
 71 | 				const char *begin = ptr + 1;
 72 | 				while (isspace(*begin) && begin < end) ++begin;
 73 | 				const char *trimmed_end = end - 1;
 74 | 				while (isspace(*trimmed_end) && trimmed_end >= begin) --trimmed_end;
 75 | 				val.assign(begin, trimmed_end + 1);
 76 | 				ptr = end + 1;
 77 | 			}
 78 | 			else 
 79 | 			{
 80 | 				end = ptr;
 81 | 				while (*end && !isspace(*end) && *end != '>') end++;
 82 | 				val.assign(ptr, end);
 83 | 				ptr = end;
 84 | 			}
 85 | 
 86 | //			fprintf(stderr, "%s = %s\n", key.c_str(), val.c_str());
 87 | 			mAttributes.insert(make_pair(key, val));
 88 | 		}
 89 | 		else
 90 | 		{
 91 | //			fprintf(stderr, "D: %s\n", key.c_str());
 92 | 			if (!key.empty())
 93 | 			{
 94 | 				mAttributes.insert(make_pair(key, string()));
 95 | 			}
 96 | 		}
 97 | 	}
 98 | }
 99 | 
100 | bool Node::operator==(const Node &n) const 
101 | {
102 | 	if (!isTag() || !n.isTag()) return false;
103 | 	return !(strcasecmp(tagName().c_str(), n.tagName().c_str()));
104 | }
105 | 
106 | Node::operator string() const {
107 | 	if (isTag()) return this->tagName();
108 | 	return this->text();
109 | }
110 | 
111 | ostream &Node::operator<<(ostream &stream) const {
112 | 	stream << (string)(*this);
113 | 	return stream;
114 | }
115 | 


--------------------------------------------------------------------------------
/css/css_syntax.h:
--------------------------------------------------------------------------------
  1 | /* A Bison parser, made by GNU Bison 3.0.4.  */
  2 | 
  3 | /* Bison interface for Yacc-like parsers in C
  4 | 
  5 |    Copyright (C) 1984, 1989-1990, 2000-2015 Free Software Foundation, Inc.
  6 | 
  7 |    This program is free software: you can redistribute it and/or modify
  8 |    it under the terms of the GNU General Public License as published by
  9 |    the Free Software Foundation, either version 3 of the License, or
 10 |    (at your option) any later version.
 11 | 
 12 |    This program is distributed in the hope that it will be useful,
 13 |    but WITHOUT ANY WARRANTY; without even the implied warranty of
 14 |    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 15 |    GNU General Public License for more details.
 16 | 
 17 |    You should have received a copy of the GNU General Public License
 18 |    along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
 19 | 
 20 | /* As a special exception, you may create a larger work that contains
 21 |    part or all of the Bison parser skeleton and distribute that work
 22 |    under terms of your choice, so long as that work isn't itself a
 23 |    parser generator using the skeleton or a modified version thereof
 24 |    as a parser skeleton.  Alternatively, if you modify or redistribute
 25 |    the parser skeleton itself, you may (at your option) remove this
 26 |    special exception, which will cause the skeleton and the resulting
 27 |    Bison output files to be licensed under the GNU General Public
 28 |    License without this special exception.
 29 | 
 30 |    This special exception was added by the Free Software Foundation in
 31 |    version 2.2 of Bison.  */
 32 | 
 33 | #ifndef YY_YY_CSS_SYNTAX_H_INCLUDED
 34 | # define YY_YY_CSS_SYNTAX_H_INCLUDED
 35 | /* Debug traces.  */
 36 | #ifndef YYDEBUG
 37 | # define YYDEBUG 0
 38 | #endif
 39 | #if YYDEBUG
 40 | extern int yydebug;
 41 | #endif
 42 | 
 43 | /* Token type.  */
 44 | #ifndef YYTOKENTYPE
 45 | # define YYTOKENTYPE
  /* Token codes shared by the grammar and the scanner: the rules in
     css_lex.l return these values.  This header is Bison output, so
     changes belong in css_syntax.y and will be lost on regeneration.  */
  enum yytokentype
  {
    IMPORT_SYM = 258,
    IMPORTANT_SYM = 259,
    IDENT = 260,
    STRING = 261,
    NUMBER = 262,
    PERCENTAGE = 263,
    LENGTH = 264,
    EMS = 265,
    EXS = 266,
    PSCLASS_AFTER_IDENT = 267,
    HASH_AFTER_IDENT = 268,
    CLASS_AFTER_IDENT = 269,
    PSCLASS = 270,
    VISITED_PSCLASS = 271,
    ACTIVE_PSCLASS = 272,
    FIRST_LINE = 273,
    FIRST_LETTER = 274,
    HASH = 275,
    CLASS = 276,
    URL = 277,
    RGB = 278,
    CDO = 279,
    CDC = 280,
    CSL = 281
  };
 73 | #endif
 74 | /* Tokens.  */
 75 | #define IMPORT_SYM 258
 76 | #define IMPORTANT_SYM 259
 77 | #define IDENT 260
 78 | #define STRING 261
 79 | #define NUMBER 262
 80 | #define PERCENTAGE 263
 81 | #define LENGTH 264
 82 | #define EMS 265
 83 | #define EXS 266
 84 | #define PSCLASS_AFTER_IDENT 267
 85 | #define HASH_AFTER_IDENT 268
 86 | #define CLASS_AFTER_IDENT 269
 87 | #define PSCLASS 270
 88 | #define VISITED_PSCLASS 271
 89 | #define ACTIVE_PSCLASS 272
 90 | #define FIRST_LINE 273
 91 | #define FIRST_LETTER 274
 92 | #define HASH 275
 93 | #define CLASS 276
 94 | #define URL 277
 95 | #define RGB 278
 96 | #define CDO 279
 97 | #define CDC 280
 98 | #define CSL 281
 99 | 
100 | /* Value type.  */
101 | #if ! defined YYSTYPE && ! defined YYSTYPE_IS_DECLARED
102 | 
/* Semantic value attached to tokens and grammar symbols (generated from the
   %union in css_syntax.y).  */
union YYSTYPE
{
#line 15 "css_syntax.y" /* yacc.c:1909  */

	char *lexeme; /* token text; the scanner fills it with strdup(yytext) */
	char letter;
	struct property_t *property;
	struct selector_t *selector;
	struct selector_list_t *selector_list;
	int pseudo_class;
	int pseudo_element;

#line 116 "css_syntax.h" /* yacc.c:1909  */
};
117 | 
118 | typedef union YYSTYPE YYSTYPE;
119 | # define YYSTYPE_IS_TRIVIAL 1
120 | # define YYSTYPE_IS_DECLARED 1
121 | #endif
122 | 
123 | 
124 | 
125 | int yyparse (void *yyparam);
126 | 
127 | #endif /* !YY_YY_CSS_SYNTAX_H_INCLUDED  */
128 | 


--------------------------------------------------------------------------------
/html/Uri.h:
--------------------------------------------------------------------------------
 1 | #ifndef __HTMLCXX__URI_H__
 2 | #define __HTMLCXX__URI_H__
 3 | 
 4 | #include <string>
 5 | #include <climits>
 6 | #include <stdexcept>
 7 | 
 8 | namespace htmlcxx
 9 | {
10 | 	class Uri
11 | 	{
12 | 		public:
13 | 
14 | 			class Exception : public std::runtime_error
15 | 			{
16 | 				public:
17 | 					Exception(const std::string &arg)
18 | 						: std::runtime_error(arg) {}
19 | 			};
20 | 			
21 | 			Uri();
22 | 			Uri(const std::string &uri);
23 | 			~Uri();
24 | 
25 | 			static std::string encode(const std::string &uri);
26 | 			static std::string decode(const std::string &uri);
27 | 
28 | 			std::string unparse(int flags = 0) const;
29 | 			Uri absolute(const Uri &base) const;
30 | 
31 | 			std::string canonicalHostname(unsigned int maxDepth = UINT_MAX) const;
32 | 			unsigned int hostnameDepth() const;
33 | 
34 | 			static const unsigned int URI_FTP_DEFAULT_PORT =          21 ; /**< default FTP port */
35 | 			static const unsigned int URI_SSH_DEFAULT_PORT =          22 ; /**< default SSH port */
36 | 			static const unsigned int URI_TELNET_DEFAULT_PORT =       23 ; /**< default telnet port */
37 | 			static const unsigned int URI_GOPHER_DEFAULT_PORT =       70 ; /**< default Gopher port */
38 | 			static const unsigned int URI_HTTP_DEFAULT_PORT =         80 ; /**< default HTTP port */
39 | 			static const unsigned int URI_POP_DEFAULT_PORT =         110 ; /**< default POP port */
40 | 			static const unsigned int URI_NNTP_DEFAULT_PORT =        119 ; /**< default NNTP port */
41 | 			static const unsigned int URI_IMAP_DEFAULT_PORT =        143 ; /**< default IMAP port */
42 | 			static const unsigned int URI_PROSPERO_DEFAULT_PORT =    191 ; /**< default Prospero port */
43 | 			static const unsigned int URI_WAIS_DEFAULT_PORT =        210 ; /**< default WAIS port */
44 | 			static const unsigned int URI_LDAP_DEFAULT_PORT =        389 ; /**< default LDAP port */
45 | 			static const unsigned int URI_HTTPS_DEFAULT_PORT =       443 ; /**< default HTTPS port */
46 | 			static const unsigned int URI_RTSP_DEFAULT_PORT =        554 ; /**< default RTSP port */
47 | 			static const unsigned int URI_SNEWS_DEFAULT_PORT =       563 ; /**< default SNEWS port */
48 | 			static const unsigned int URI_ACAP_DEFAULT_PORT =        674 ; /**< default ACAP port */
49 | 			static const unsigned int URI_NFS_DEFAULT_PORT =        2049 ; /**< default NFS port */
50 | 			static const unsigned int URI_TIP_DEFAULT_PORT =        3372 ; /**< default TIP port */
51 | 			static const unsigned int URI_SIP_DEFAULT_PORT =        5060 ; /**< default SIP port */
52 | 
53 | 			const static int REMOVE_WWW_PREFIX = 1;
54 | 			const static int REMOVE_TRAILING_BAR = 2;
55 | 			const static int REMOVE_SCHEME = 8;
56 | 			const static int REMOVE_QUERY_VALUES = 16;
57 | 			const static int REMOVE_QUERY = 32;
58 | 			const static int REMOVE_DEFAULT_FILENAMES = 64;
59 | 			const static int REMOVE_FRAGMENT = 128;
60 | 
61 | 			std::string scheme() const;
62 | 			void scheme(std::string scheme);
63 | 			std::string user() const;
64 | 			void user(std::string user);
65 | 			std::string password() const;
66 | 			void password(std::string password);
67 | 			std::string hostname() const;
68 | 			void hostname(std::string hostname);
69 | 			std::string path() const;
70 | 			void path(std::string path);
71 | 			std::string query() const;
72 | 			void query(std::string query);
73 | 			std::string fragment() const;
74 | 			void fragment(std::string fragment);
75 | 			unsigned int port() const;
76 | 			void port(unsigned int port);
77 | 			bool existsFragment() const;
78 | 			void existsFragment(bool existsFragment);
79 | 			bool existsQuery() const;
80 | 			void existsQuery(bool existsQuery);
81 | 		protected:
82 | 			void init(const std::string &uri_str);
83 | 			
84 | 			std::string mScheme;
85 | 			std::string mUser;
86 | 			std::string mPassword;
87 | 			std::string mHostname;
88 | 			std::string mPath;
89 | 			std::string mQuery;
90 | 			std::string mFragment;
91 | 			std::string mPortStr;;
92 | 			bool mExistsQuery;
93 | 			bool mExistsFragment;
94 | 			unsigned int mPort;
95 | 	};
96 | }
97 | 
98 | #endif
99 | 


--------------------------------------------------------------------------------
/css/css_lex.l:
--------------------------------------------------------------------------------
  1 | %{
  2 | 
  3 | #include <string.h>
  4 | #include "css_syntax.h"
  5 | 
  6 | #define YY_DECL int yylex(YYSTYPE *lvalp)
  7 | 
  8 | %}
  9 | 
 10 | %option noyywrap
 11 | 
/* Named patterns used by the rules below: escapes and character classes
   first, then identifiers/names, numbers and quoted strings. */
unicode		\\[0-9a-f]{1,4}
latin1		[¡-ÿ]
escape		{unicode}|\\[ -~¡-ÿ]
stringchar	{escape}|{latin1}|[ !#$%&(-~]
nmstrt		[a-z_]|{latin1}|{escape}
nmchar		[-a-z0-9_]|{latin1}|{escape}
ident		{nmstrt}{nmchar}*
name		{nmchar}+
d		[0-9]
notnm		[^-a-z0-9\\]|{latin1}
w		[ \t\r\n]*
num		{d}+|{d}*\.{d}+
string		\"({stringchar}|\')*\"|\'({stringchar}|\")*\'

/* COMMENT is exclusive: only the <COMMENT> rules are active inside a
   comment. AFTER_IDENT is inclusive: it is entered after IDENT, HASH and
   CLASS tokens and additionally enables the *_AFTER_IDENT rules. */
%x COMMENT
%s AFTER_IDENT
 28 | 
 29 | %%
 30 | 
 31 | "/*"				{BEGIN(COMMENT);}
 32 | <COMMENT>[^*]*		/* eat anything that's not a '*' */
 33 | <COMMENT>"*"+[^*/]*	/* eat up '*'s not followed by '/'s */
 34 | <COMMENT>"*"+"/"        BEGIN(0);
 35 | @import				{BEGIN(0); return IMPORT_SYM;}
 36 | "!"{w}important			{BEGIN(0); return IMPORTANT_SYM;}
 37 | {ident}				{
 38 | 						BEGIN(AFTER_IDENT);
 39 | 						lvalp->lexeme = strdup(yytext);
 40 | 						return IDENT;
 41 | 					}
 42 | {string}			{
 43 | 						BEGIN(0); 
 44 | 						lvalp->lexeme = strdup(yytext);
 45 | 						return STRING;
 46 | 					}
 47 | 
 48 | {num}				{
 49 | 						BEGIN(0); 
 50 | 						lvalp->lexeme = strdup(yytext);
 51 | 						return NUMBER;
 52 | 					}
 53 | {num}"%"			{
 54 | 						BEGIN(0);
 55 | 						lvalp->lexeme = strdup(yytext);
 56 | 						return PERCENTAGE;
 57 | 					}
 58 | {num}pt/{notnm}			{
 59 | 							BEGIN(0); 
 60 | 							lvalp->lexeme = strdup(yytext);
 61 | 							return LENGTH;
 62 | 						}
 63 | {num}mm/{notnm}			{
 64 | 							BEGIN(0); 
 65 | 							lvalp->lexeme = strdup(yytext);
 66 | 							return LENGTH;
 67 | 						}
 68 | {num}cm/{notnm}			{
 69 | 							BEGIN(0); 
 70 | 							lvalp->lexeme = strdup(yytext);
 71 | 							return LENGTH;
 72 | 						}
 73 | {num}pc/{notnm}			{
 74 | 							BEGIN(0); 
 75 | 							lvalp->lexeme = strdup(yytext);
 76 | 							return LENGTH;
 77 | 						}
 78 | {num}in/{notnm}			{
 79 | 							BEGIN(0); 
 80 | 							lvalp->lexeme = strdup(yytext);
 81 | 							return LENGTH;
 82 | 						}
 83 | {num}px/{notnm}			{
 84 | 							BEGIN(0); 
 85 | 							lvalp->lexeme = strdup(yytext);
 86 | 							return LENGTH;
 87 | 						}
 88 | {num}em/{notnm}			{
 89 | 							BEGIN(0); 
 90 | 							lvalp->lexeme = strdup(yytext);
 91 | 							return EMS;
 92 | 						}
 93 | {num}ex/{notnm}			{
 94 | 							BEGIN(0); 
 95 | 							lvalp->lexeme = strdup(yytext);
 96 | 							return EXS;
 97 | 						}
 98 | 
 99 | <AFTER_IDENT>"#"{name}		{
100 | 								lvalp->lexeme = strdup(yytext+1);
101 | 								return HASH_AFTER_IDENT;
102 | 							}
103 | <AFTER_IDENT>"."{name}		{
104 | 								lvalp->lexeme = strdup(yytext+1);
105 | 								return CLASS_AFTER_IDENT;
106 | 							}
107 | 
108 | "#"{name}			{
109 | 						BEGIN(AFTER_IDENT);
110 | 						lvalp->lexeme = strdup(yytext+1);
111 | 						return HASH;
112 | 					}
113 | "."{name}			{
114 | 						BEGIN(AFTER_IDENT);
115 | 						lvalp->lexeme = strdup(yytext+1);
116 | 						return CLASS;
117 | 					}
118 | 
119 | url\({w}{string}{w}\)					|
120 | url\({w}([^ \r\n\'\")]|\\\ |\\\'|\\\"|\\\))+{w}\)		{
121 | 														BEGIN(0); 
122 | 														lvalp->lexeme = 
123 | 															strdup(yytext);
124 | 														return URL;
125 | 													}
126 | rgb\({w}{num}%?{w}\,{w}{num}%?{w}\,{w}{num}%?{w}\)	{
127 | 														BEGIN(0);
128 | 														lvalp->lexeme = 
129 | 															strdup(yytext);
130 | 														return RGB;
131 | 													}
132 | 
133 | [-/+{};,#:]			{BEGIN(0); return *yytext;}
134 | [ \t\r]+				{BEGIN(0); /* ignore whitespace */}
135 | \n				{BEGIN(0); /* ignore whitespace */}
136 | \<\!\-\-			{BEGIN(0); return CDO;}
137 | \-\-\>				{BEGIN(0); return CDC;}
138 | \/\/				{BEGIN(0); return CSL;}
139 | .				{/*fprintf(stderr, "Illegal character (%d)\n", *yytext);*/}
140 | 
141 | %%
142 | 
/*
 * Points the scanner at an in-memory buffer of buf_len bytes.
 * yy_scan_bytes() works on its own copy of the data, so the caller keeps
 * ownership of `buffer`. Always returns 0.
 */
int init_yylex(const char *buffer, int buf_len) {
	/* Start from the initial condition: a previous input that ended inside
	   an unterminated comment would otherwise leave the scanner in COMMENT. */
	BEGIN(0);
	yy_scan_bytes(buffer, buf_len);
	return 0;
}
146 | 
147 | int end_yylex() {
148 | 	yy_delete_buffer(YY_CURRENT_BUFFER);
149 | }
150 | 


--------------------------------------------------------------------------------
/htmlcxxapp.vcproj:
--------------------------------------------------------------------------------
  1 | <?xml version="1.0" encoding="Windows-1252"?>
  2 | <VisualStudioProject
  3 | 	ProjectType="Visual C++"
  4 | 	Version="7.10"
  5 | 	Name="htmlcxxapp"
  6 | 	ProjectGUID="{BA63B620-80D6-4158-99BE-8CC89A83F1ED}"
  7 | 	Keyword="Win32Proj">
  8 | 	<Platforms>
  9 | 		<Platform
 10 | 			Name="Win32"/>
 11 | 	</Platforms>
 12 | 	<Configurations>
 13 | 		<Configuration
 14 | 			Name="Debug|Win32"
 15 | 			OutputDirectory="Debug"
 16 | 			IntermediateDirectory="Debug"
 17 | 			ConfigurationType="1"
 18 | 			CharacterSet="2">
 19 | 			<Tool
 20 | 				Name="VCCLCompilerTool"
 21 | 				Optimization="0"
 22 | 				PreprocessorDefinitions="WIN32;_DEBUG;_CONSOLE;_USE_DEBUG_HEAP"
 23 | 				MinimalRebuild="TRUE"
 24 | 				BasicRuntimeChecks="3"
 25 | 				RuntimeLibrary="1"
 26 | 				RuntimeTypeInfo="TRUE"
 27 | 				UsePrecompiledHeader="0"
 28 | 				WarningLevel="3"
 29 | 				Detect64BitPortabilityProblems="TRUE"
 30 | 				DebugInformationFormat="4"/>
 31 | 			<Tool
 32 | 				Name="VCCustomBuildTool"/>
 33 | 			<Tool
 34 | 				Name="VCLinkerTool"
 35 | 				OutputFile="$(OutDir)/htmlcxx.exe"
 36 | 				LinkIncremental="2"
 37 | 				GenerateDebugInformation="TRUE"
 38 | 				ProgramDatabaseFile="$(OutDir)/htmlcxxapp.pdb"
 39 | 				SubSystem="1"
 40 | 				TargetMachine="1"/>
 41 | 			<Tool
 42 | 				Name="VCMIDLTool"/>
 43 | 			<Tool
 44 | 				Name="VCPostBuildEventTool"/>
 45 | 			<Tool
 46 | 				Name="VCPreBuildEventTool"/>
 47 | 			<Tool
 48 | 				Name="VCPreLinkEventTool"/>
 49 | 			<Tool
 50 | 				Name="VCResourceCompilerTool"/>
 51 | 			<Tool
 52 | 				Name="VCWebServiceProxyGeneratorTool"/>
 53 | 			<Tool
 54 | 				Name="VCXMLDataGeneratorTool"/>
 55 | 			<Tool
 56 | 				Name="VCWebDeploymentTool"/>
 57 | 			<Tool
 58 | 				Name="VCManagedWrapperGeneratorTool"/>
 59 | 			<Tool
 60 | 				Name="VCAuxiliaryManagedWrapperGeneratorTool"/>
 61 | 		</Configuration>
 62 | 		<Configuration
 63 | 			Name="Release|Win32"
 64 | 			OutputDirectory="Release"
 65 | 			IntermediateDirectory="Release"
 66 | 			ConfigurationType="1"
 67 | 			CharacterSet="2">
 68 | 			<Tool
 69 | 				Name="VCCLCompilerTool"
 70 | 				PreprocessorDefinitions="WIN32;NDEBUG;_CONSOLE"
 71 | 				RuntimeLibrary="0"
 72 | 				RuntimeTypeInfo="TRUE"
 73 | 				UsePrecompiledHeader="0"
 74 | 				WarningLevel="3"
 75 | 				Detect64BitPortabilityProblems="TRUE"
 76 | 				DebugInformationFormat="3"/>
 77 | 			<Tool
 78 | 				Name="VCCustomBuildTool"/>
 79 | 			<Tool
 80 | 				Name="VCLinkerTool"
 81 | 				OutputFile="$(OutDir)/htmlcxx.exe"
 82 | 				LinkIncremental="1"
 83 | 				GenerateDebugInformation="TRUE"
 84 | 				SubSystem="1"
 85 | 				OptimizeReferences="2"
 86 | 				EnableCOMDATFolding="2"
 87 | 				TargetMachine="1"/>
 88 | 			<Tool
 89 | 				Name="VCMIDLTool"/>
 90 | 			<Tool
 91 | 				Name="VCPostBuildEventTool"/>
 92 | 			<Tool
 93 | 				Name="VCPreBuildEventTool"/>
 94 | 			<Tool
 95 | 				Name="VCPreLinkEventTool"/>
 96 | 			<Tool
 97 | 				Name="VCResourceCompilerTool"/>
 98 | 			<Tool
 99 | 				Name="VCWebServiceProxyGeneratorTool"/>
100 | 			<Tool
101 | 				Name="VCXMLDataGeneratorTool"/>
102 | 			<Tool
103 | 				Name="VCWebDeploymentTool"/>
104 | 			<Tool
105 | 				Name="VCManagedWrapperGeneratorTool"/>
106 | 			<Tool
107 | 				Name="VCAuxiliaryManagedWrapperGeneratorTool"/>
108 | 		</Configuration>
109 | 	</Configurations>
110 | 	<References>
111 | 	</References>
112 | 	<Files>
113 | 		<Filter
114 | 			Name="Source Files"
115 | 			Filter="cpp;c;cxx;def;odl;idl;hpj;bat;asm;asmx"
116 | 			UniqueIdentifier="{4FC737F1-C7A5-4376-A066-2A32D752A2FF}">
117 | 			<File
118 | 				RelativePath=".\htmlcxx.cc"
119 | 				FileType="0">
120 | 			</File>
121 | 			<File
122 | 				RelativePath=".\wingetopt.c">
123 | 			</File>
124 | 		</Filter>
125 | 		<Filter
126 | 			Name="Header Files"
127 | 			Filter="h;hpp;hxx;hm;inl;inc;xsd"
128 | 			UniqueIdentifier="{93995380-89BD-4b04-88EB-625FBE52EBFB}">
129 | 			<File
130 | 				RelativePath=".\html\wincstring.h">
131 | 			</File>
132 | 			<File
133 | 				RelativePath=".\wingetopt.h">
134 | 			</File>
135 | 		</Filter>
136 | 		<Filter
137 | 			Name="Resource Files"
138 | 			Filter="rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx"
139 | 			UniqueIdentifier="{67DA6AB6-F800-4c08-8B7A-83BB121AAD01}">
140 | 		</Filter>
141 | 	</Files>
142 | 	<Globals>
143 | 	</Globals>
144 | </VisualStudioProject>
145 | 


--------------------------------------------------------------------------------
/html/ParserDom.cc:
--------------------------------------------------------------------------------
  1 | #include "ParserDom.h"
  2 | #include "wincstring.h"
  3 | 
  4 | #include <iostream>
  5 | #include <vector>
  6 | 
  7 | //#define DEBUG
  8 | #include "debug.h"
  9 | 
 10 | #define TAG_NAME_MAX 10
 11 | 
 12 | using namespace std;
 13 | using namespace htmlcxx; 
 14 | using namespace HTML; 
 15 | using namespace kp; 
 16 | 
 17 | const tree<HTML::Node>& ParserDom::parseTree(const std::string &html)
 18 | {
 19 | 	this->parse(html);
 20 | 	return this->getTree();
 21 | }
 22 | void ParserDom::beginParsing()
 23 | {
 24 | 	mHtmlTree.clear();
 25 | 	tree<HTML::Node>::iterator top = mHtmlTree.begin();
 26 | 	HTML::Node lambda_node;
 27 | 	lambda_node.offset(0);
 28 | 	lambda_node.length(0);
 29 | 	lambda_node.isTag(true);
 30 | 	lambda_node.isComment(false);
 31 | 	mCurrentState = mHtmlTree.insert(top,lambda_node);
 32 | }
 33 | 
 34 | void ParserDom::endParsing()
 35 | {
 36 | 	tree<HTML::Node>::iterator top = mHtmlTree.begin();
 37 | 	top->length(mCurrentOffset);
 38 | }
 39 | 
 40 | void ParserDom::foundComment(Node node)
 41 | {
 42 | 	//Add child content node, but do not update current state
 43 | 	mHtmlTree.append_child(mCurrentState, node);
 44 | }
 45 | 
 46 | void ParserDom::foundText(Node node)
 47 | {
 48 | 	//Add child content node, but do not update current state
 49 | 	mHtmlTree.append_child(mCurrentState, node);
 50 | }
 51 | 
 52 | void ParserDom::foundTag(Node node, bool isEnd)
 53 | {
 54 | 	if (!isEnd) 
 55 | 	{
 56 | 		//append to current tree node
 57 | 		tree<HTML::Node>::iterator next_state;
 58 | 		next_state = mHtmlTree.append_child(mCurrentState, node);
 59 | 		mCurrentState = next_state;
 60 | 	} 
 61 | 	else 
 62 | 	{
 63 | 		//Look if there is a pending open tag with that same name upwards
 64 | 		//If mCurrentState tag isn't matching tag, maybe a some of its parents
 65 | 		// matches
 66 | 		vector< tree<HTML::Node>::iterator > path;
 67 | 		tree<HTML::Node>::iterator i = mCurrentState;
 68 | 		bool found_open = false;
 69 | 		while (i != mHtmlTree.begin())
 70 | 		{
 71 | #ifdef DEBUG
 72 | 			cerr << "comparing " << node.tagName() << " with " << i->tagName()<<endl<<":";
 73 | 			if (!i->tagName().length()) cerr << "Tag with no name at" << i->offset()<<";"<<i->offset()+i->length();
 74 | #endif
 75 | 			assert(i->isTag());
 76 | 			assert(i->tagName().length());
 77 | 
 78 | 			bool equal;
 79 | 			const char *open = i->tagName().c_str();
 80 | 			const char *close = node.tagName().c_str();
 81 | 			equal = !(strcasecmp(open,close));
 82 | 
 83 | 
 84 | 			if (equal) 
 85 | 			{
 86 | 				DEBUGP("Found matching tag %s\n", i->tagName().c_str());
 87 | 				//Closing tag closes this tag
 88 | 				//Set length to full range between the opening tag and
 89 | 				//closing tag
 90 | 				i->length(node.offset() + node.length() - i->offset());
 91 | 				i->closingText(node.text());
 92 | 
 93 | 				mCurrentState = mHtmlTree.parent(i);
 94 | 				found_open = true;
 95 | 				break;
 96 | 			} 
 97 | 			else 
 98 | 			{
 99 | 				path.push_back(i);
100 | 			}
101 | 
102 | 			i = mHtmlTree.parent(i);
103 | 		}
104 | 
105 | 		if (found_open)
106 | 		{
107 | 			//If match was upper in the tree, so we need to invalidate child
108 | 			//nodes that were waiting for a close
109 | 			for (unsigned int j = 0; j < path.size(); ++j)
110 | 			{
111 | //				path[j]->length(node.offset() - path[j]->offset());
112 | 				mHtmlTree.flatten(path[j]);
113 | 			}
114 | 		} 
115 | 		else 
116 | 		{
117 | 			DEBUGP("Unmatched tag %s\n", node.text().c_str());
118 | 
119 | 			// Treat as comment
120 | 			node.isTag(false);
121 | 			node.isComment(true);
122 | 			mHtmlTree.append_child(mCurrentState, node);
123 | 		}
124 | 	}
125 | }
126 | 
127 | ostream &HTML::operator<<(ostream &stream, const tree<HTML::Node> &tr) 
128 | {
129 | 
130 | 	tree<HTML::Node>::pre_order_iterator it = tr.begin();
131 | 	tree<HTML::Node>::pre_order_iterator end = tr.end();
132 | 
133 | 	int rootdepth = tr.depth(it);
134 | 	stream << "-----" << endl;
135 | 
136 | 	unsigned int n = 0;
137 | 	while ( it != end ) 
138 | 	{
139 | 
140 | 		int cur_depth = tr.depth(it);
141 | 		for(int i=0; i < cur_depth - rootdepth; ++i) stream << "  ";
142 | 		stream << n << "@";
143 | 		stream << "[" << it->offset() << ";";
144 | 		stream << it->offset() + it->length() << ") ";
145 | 		stream << (string)(*it) << endl;
146 | 		++it, ++n;
147 | 	}
148 | 	stream << "-----" << endl;
149 | 	return stream;
150 | }
151 | 


--------------------------------------------------------------------------------
/README:
--------------------------------------------------------------------------------
  1 | htmlcxx - html and css APIs for C++
  2 | 
  3 | ---------------------------------------------
  4 | 
  5 | 
  6 | 	Description
  7 | 	===========
  8 | 
  9 | htmlcxx is a simple non-validating css1 and html parser for C++.
 10 | Although there are several other html parsers available, htmlcxx has some
 11 | characteristics that make it unique:
 12 | 
 13 | - STL-like navigation of the DOM tree, using the excellent tree.hh library from
 14 |   Kasper Peeters
 15 | - It is possible to reproduce exactly, character by character, the
 16 |   original document from the parse tree
 17 | - Bundled css parser
 18 | - Optional parsing of attributes	
 19 | - C++ code that looks like C++ (not so true anymore)
 20 | - Offsets of tags/elements in the original document are stored in the
 21 |   nodes of the DOM tree
 22 | 
 23 | The parsing policies of htmlcxx were designed to mimic the behavior of Mozilla
 24 | Firefox (http://www.mozilla.org). So you should expect parse
 25 | trees similar to those created by Firefox. However, unlike Firefox,
 26 | htmlcxx does not insert non-existent elements into your HTML. Therefore, serializing
 27 | the DOM tree gives exactly the same bytes contained in the original HTML
 28 | document.
 29 | 
 30 |         News for version 0.86
 31 |         =====================
 32 | 
 33 | Fixed a few compilation problems.
 34 | 
 35 |         News for version 0.85
 36 |         =====================
 37 | 
 38 | Fixed gcc 4.3 compiler errors, several minor bug fixes, improved distribution
 39 | of the css library.
 40 | 
 41 | 
 42 | 	News for version 0.7.3
 43 | 	======================
 44 | 
 45 | Added utility code to escape/decode urls as defined by RFC 2396.
 46 | Added new SAX interface. The API was slightly broken to support the new
 47 | SAX interface :-(.
 48 | Added Visual Studio 2003 projects for the WIN32 port.
 49 | 
 50 | 
 51 | 	Examples
 52 | 	========
 53 | 
 54 | Using htmlcxx is quite simple. Take a look
 55 | at this example.
 56 | 
 57 | -----------------------------------------------------------------------
 58 | 
 59 |   #include <htmlcxx/html/ParserDom.h>
 60 |   ...
 61 |   using namespace std;
 62 |   using namespace htmlcxx;
 63 |   
 64 |   //Parse some html code
 65 |   string html = "<html><body>hey</body></html>";
 66 |   HTML::ParserDom parser;
 67 |   tree<HTML::Node> dom = parser.parseTree(html);
 68 |   
 69 |   //Print whole DOM tree
 70 |   cout << dom << endl;
 71 |   
 72 |   //Dump all links in the tree
 73 |   tree<HTML::Node>::iterator it = dom.begin();
 74 |   tree<HTML::Node>::iterator end = dom.end();
 75 |   for (; it != end; ++it)
 76 |   {
 77 |      if (strcasecmp(it->tagName().c_str(), "A") == 0)
 78 |      {
 79 |        it->parseAttributes();
 80 |        cout << it->attribute("href").second << endl;
 81 |      }
 82 |   }
 83 |   
 84 |   //Dump all text of the document
 85 |   it = dom.begin();
 86 |   end = dom.end();
 87 |   for (; it != end; ++it)
 88 |   {
 89 |     if ((!it->isTag()) && (!it->isComment()))
 90 |     {
 91 |       cout << it->text();
 92 |     }
 93 |   }
 94 |   cout << endl;
 95 | 
 96 | -------------------------------------------------
 97 | 
 98 | 
 99 | 	The htmlcxx application
100 | 	=======================
101 | 
102 | htmlcxx is the name of both the library and the utility
103 | application that comes with this package. Although
104 | htmlcxx (the application) is mostly useless for programming, you can use it
105 | to easily see how htmlcxx (the library) would parse your html code.
106 | Just install and try htmlcxx -h.
107 | 
108 | 
109 | 	Downloads
110 | 	=========
111 | 
112 | Use the project page at sourceforge: http://sf.net/projects/htmlcxx
113 | 
114 | 
115 | 	License Stuff
116 | 	=============
117 | 
118 | Code is now under the LGPL. This was our initial intention, and is
119 | now possible thanks to the author of tree.hh, who allowed us to use it
120 | under LGPL only for HTML::Node template instances. Check 
121 | http://www.fsf.org or the COPYING file in the distribution for details
122 | about the LGPL license. The uri parsing code is a derivative work of
123 | Apache web server uri parsing routines. Check 
124 | www.apache.org/licenses/LICENSE-2.0 or the ASF-2.0 file in the
125 | distribution for details.
126 | 
127 | ----------------------------------------
128 | 
129 | Enjoy!
130 | 
131 | Davi de Castro Reis - <davi (a) users sf net>
132 | 
Robson Braga Araújo - <braga (a) users sf net>
134 | 
135 | Last Updated: Tue Dec  8 20:37:41 BRST 2015
136 | 


--------------------------------------------------------------------------------
/html/profile.h:
--------------------------------------------------------------------------------
  1 | #ifndef __MY_PROFILE__
  2 | #define __MY_PROFILE__
  3 | 
  4 | #ifdef PROFILE
  5 | 
  6 | #include <iostream>
  7 | #include <iomanip>
  8 | #include <sys/time.h>
  9 | 
 10 | #define USEC_PER_SEC 1000000
 11 | static void __time_val_add(struct timeval *time_, long microseconds)
 12 | {
 13 | 	if (microseconds >= 0)
 14 | 	{
 15 | 		time_->tv_usec += microseconds % USEC_PER_SEC;
 16 | 		time_->tv_sec += microseconds / USEC_PER_SEC;
 17 | 		if (time_->tv_usec >= USEC_PER_SEC)
 18 | 		{
 19 | 			time_->tv_usec -= USEC_PER_SEC;
 20 | 			time_->tv_sec++;
 21 | 		}
 22 | 	}
 23 | 	else
 24 | 	{
 25 | 		microseconds *= -1;
 26 | 		time_->tv_usec -= microseconds % USEC_PER_SEC;
 27 | 		time_->tv_sec -= microseconds / USEC_PER_SEC;
 28 | 		if (time_->tv_usec < 0)
 29 | 		{
 30 | 			time_->tv_usec += USEC_PER_SEC;
 31 | 			time_->tv_sec--;
 32 | 		}
 33 | 	}
 34 | }
 35 | 
 36 | 	
 37 | 
/*
 * Profiling timers. Every timer is identified by a token `x` that is pasted
 * onto three struct timeval variables:
 *   __profile_start##x  - time recorded by PROFILE_START
 *   __profile_end##x    - time recorded by PROFILE_END
 *   __profile_total##x  - accumulated elapsed time (see PROFILE_ADD)
 *
 * NOTE: the statement-like macros below already end in ';' after
 * `while(0)`. Writing `PROFILE_START(x);` therefore yields an extra empty
 * statement, which means these macros cannot be the unbraced body of an
 * `if` that is followed by `else`.
 */

/* Declare the three zero-initialised timer variables for `x` (static / non-static flavour). */
#define PROFILE_DECLARE_STATIC(x) static struct timeval __profile_start##x = {0,0}, __profile_total##x = {0,0}, __profile_end##x = {0,0};
#define PROFILE_DECLARE(x) struct timeval __profile_start##x = {0,0}, __profile_total##x = {0,0}, __profile_end##x = {0,0};

/* Record the current time of day as the start / end of a measurement. */
#define PROFILE_START(x) do { gettimeofday(&__profile_start##x, NULL); } while(0);
#define PROFILE_END(x) do { gettimeofday(&__profile_end##x, NULL); } while(0);
/*
 * Add (end - start) to the running total of `x`. When end is earlier than
 * start the `break` leaves the do/while and nothing is accumulated.
 * The difference is applied in three steps: whole seconds converted to
 * microseconds, plus end.tv_usec, minus start.tv_usec.
 */
#define PROFILE_ADD(x) do { if(!(__profile_end##x.tv_sec > __profile_start##x.tv_sec || (__profile_end##x.tv_sec == __profile_start##x.tv_sec && __profile_end##x.tv_usec >= __profile_start##x.tv_usec))) {\
								break;\
							}\
							__time_val_add(&(__profile_total##x), (__profile_end##x.tv_sec - __profile_start##x.tv_sec)*USEC_PER_SEC);\
							__time_val_add(&(__profile_total##x), __profile_end##x.tv_usec);\
							__time_val_add(&(__profile_total##x), __profile_start##x.tv_usec * -1);\
						   } while(0);
/* Shorthand for PROFILE_END followed by PROFILE_ADD. */
#define PROFILE_END_ADD(x) do { PROFILE_END(x); PROFILE_ADD(x); } while(0);
/* Reset start, end and total of `x` back to zero. */
#define PROFILE_CLEAR(x) do { __profile_start##x.tv_sec = 0, __profile_start##x.tv_usec = 0, __profile_total##x.tv_sec = 0, __profile_total##x.tv_usec = 0, __profile_end##x.tv_sec = 0, __profile_end##x.tv_usec = 0; } while(0);

/*
 * Internal helper: write `prefix` followed by the value of `timeval`
 * formatted as "<h>h<m>m<s>s <usec> microseconds" to `stream`.
 * The components are stored in plain ints before printing.
 */
#define __PROFILE_PRINT(prefix, stream, timeval) do {\
	int microsec = 0;\
	int sec = 0;\
	int min = 0;\
	int hour = 0;\
	/*assert(timeval.tv_sec >= 0);*/\
	hour = timeval.tv_sec / 3600;\
	min = (timeval.tv_sec  % 3600)/60;\
	sec = timeval.tv_sec % 60;\
	microsec = timeval.tv_usec;\
	stream << prefix;\
	stream << hour << "h" << min << "m" << sec << "s " << microsec << " microseconds";\
	} while (0);

/* Print the accumulated total of `x` (no trailing newline). */
#define PROFILE_PRINT(x, prefix, stream) __PROFILE_PRINT(prefix, stream, __profile_total##x)

/*
 * Print the time between the last recorded start and end of `x` without
 * touching the accumulated total. If end is earlier than start a message is
 * written to stderr and nothing is printed to `stream`.
 * NOTE(review): this uses fprintf(), but the header itself only includes
 * <iostream>, <iomanip> and <sys/time.h>; the including file presumably has
 * to provide <cstdio> - confirm.
 */
#define PROFILE_PRINT_CURRENT(x, prefix, stream) do {\
	struct timeval sum;\
	sum.tv_sec = 0;\
	sum.tv_usec = 0;\
	if(!(__profile_end##x.tv_sec > __profile_start##x.tv_sec || (__profile_end##x.tv_sec == __profile_start##x.tv_sec && __profile_end##x.tv_usec >= __profile_start##x.tv_usec))) {\
		fprintf(stderr, "Refusing to print invalid profile measure\n");\
		break;\
	}\
	__time_val_add(&sum, (__profile_end##x.tv_sec - __profile_start##x.tv_sec)*1000000);\
	__time_val_add(&sum, __profile_end##x.tv_usec);\
	__time_val_add(&sum, __profile_start##x.tv_usec * -1);\
	__PROFILE_PRINT(prefix, stream, sum);\
	} while(0);

/*
 * Newline-terminated variants of the two print macros. The `break` inside
 * PROFILE_PRINT_CURRENT only leaves its own do/while, so the std::endl is
 * still written when the measurement was refused.
 */
#define PROFILE_PRINT_CURRENT_NL(x, prefix, stream) do {\
	PROFILE_PRINT_CURRENT(x, prefix, stream);\
	stream << std::endl;\
	} while(0);
#define PROFILE_PRINT_NL(x, prefix, stream) do {\
	PROFILE_PRINT(x, prefix, stream);\
	stream << std::endl;\
	} while(0);
 91 | 
 92 | #else
/*
 * PROFILE is not defined: every profiling macro expands to nothing, so
 * instrumented code compiles with no timing variables and no timing calls.
 */
#define PROFILE_DECLARE(x)
#define PROFILE_DECLARE_STATIC(x)
#define PROFILE_START(x)
#define PROFILE_END(x)
#define PROFILE_ADD(x)
#define PROFILE_END_ADD(x)
#define PROFILE_PRINT(x, prefix, stream) 
#define PROFILE_PRINT_NL(x, prefix, stream)
#define PROFILE_PRINT_CURRENT(x, prefix, stream) 
#define PROFILE_PRINT_CURRENT_NL(x, prefix, stream)
#define PROFILE_CLEAR(x)
104 | #endif
105 | 
106 | #endif //__MY_PROFILE__
107 | 
108 | 
109 | /*
110 | fprintf(stderr, "Profiler ignoring invalid time measure from %d@%s\n", __LINE__, __FILE__);\
111 | 		fprintf(stderr, "End: sec %u usec %u Start: sec %u usec %u\n", __profile_end##x.tv_sec, __profile_end##x.tv_usec, __profile_start##x.tv_sec, __profile_start##x.tv_usec);\
112 | */
113 | 


--------------------------------------------------------------------------------
/htmlcxx.vcproj:
--------------------------------------------------------------------------------
  1 | <?xml version="1.0" encoding="Windows-1252"?>
  2 | <VisualStudioProject
  3 | 	ProjectType="Visual C++"
  4 | 	Version="7.10"
  5 | 	Name="htmlcxx"
  6 | 	ProjectGUID="{63D82F56-4B5F-4616-A3D7-2E6A4A752316}"
  7 | 	Keyword="Win32Proj">
  8 | 	<Platforms>
  9 | 		<Platform
 10 | 			Name="Win32"/>
 11 | 	</Platforms>
 12 | 	<Configurations>
 13 | 		<Configuration
 14 | 			Name="Debug|Win32"
 15 | 			OutputDirectory="Debug"
 16 | 			IntermediateDirectory="Debug"
 17 | 			ConfigurationType="4"
 18 | 			CharacterSet="2">
 19 | 			<Tool
 20 | 				Name="VCCLCompilerTool"
 21 | 				Optimization="0"
 22 | 				PreprocessorDefinitions="WIN32;_DEBUG;_LIB;_USE_DEBUG_HEAP"
 23 | 				MinimalRebuild="TRUE"
 24 | 				BasicRuntimeChecks="3"
 25 | 				RuntimeLibrary="1"
 26 | 				RuntimeTypeInfo="TRUE"
 27 | 				UsePrecompiledHeader="0"
 28 | 				WarningLevel="3"
 29 | 				Detect64BitPortabilityProblems="TRUE"
 30 | 				DebugInformationFormat="4"/>
 31 | 			<Tool
 32 | 				Name="VCCustomBuildTool"/>
 33 | 			<Tool
 34 | 				Name="VCLibrarianTool"
 35 | 				OutputFile="$(OutDir)/htmlcxx.lib"/>
 36 | 			<Tool
 37 | 				Name="VCMIDLTool"/>
 38 | 			<Tool
 39 | 				Name="VCPostBuildEventTool"/>
 40 | 			<Tool
 41 | 				Name="VCPreBuildEventTool"/>
 42 | 			<Tool
 43 | 				Name="VCPreLinkEventTool"/>
 44 | 			<Tool
 45 | 				Name="VCResourceCompilerTool"/>
 46 | 			<Tool
 47 | 				Name="VCWebServiceProxyGeneratorTool"/>
 48 | 			<Tool
 49 | 				Name="VCXMLDataGeneratorTool"/>
 50 | 			<Tool
 51 | 				Name="VCManagedWrapperGeneratorTool"/>
 52 | 			<Tool
 53 | 				Name="VCAuxiliaryManagedWrapperGeneratorTool"/>
 54 | 		</Configuration>
 55 | 		<Configuration
 56 | 			Name="Release|Win32"
 57 | 			OutputDirectory="Release"
 58 | 			IntermediateDirectory="Release"
 59 | 			ConfigurationType="4"
 60 | 			CharacterSet="2">
 61 | 			<Tool
 62 | 				Name="VCCLCompilerTool"
 63 | 				PreprocessorDefinitions="WIN32;NDEBUG;_LIB"
 64 | 				RuntimeLibrary="0"
 65 | 				RuntimeTypeInfo="TRUE"
 66 | 				UsePrecompiledHeader="0"
 67 | 				WarningLevel="3"
 68 | 				Detect64BitPortabilityProblems="TRUE"
 69 | 				DebugInformationFormat="3"/>
 70 | 			<Tool
 71 | 				Name="VCCustomBuildTool"/>
 72 | 			<Tool
 73 | 				Name="VCLibrarianTool"
 74 | 				OutputFile="$(OutDir)/htmlcxx.lib"/>
 75 | 			<Tool
 76 | 				Name="VCMIDLTool"/>
 77 | 			<Tool
 78 | 				Name="VCPostBuildEventTool"/>
 79 | 			<Tool
 80 | 				Name="VCPreBuildEventTool"/>
 81 | 			<Tool
 82 | 				Name="VCPreLinkEventTool"/>
 83 | 			<Tool
 84 | 				Name="VCResourceCompilerTool"/>
 85 | 			<Tool
 86 | 				Name="VCWebServiceProxyGeneratorTool"/>
 87 | 			<Tool
 88 | 				Name="VCXMLDataGeneratorTool"/>
 89 | 			<Tool
 90 | 				Name="VCManagedWrapperGeneratorTool"/>
 91 | 			<Tool
 92 | 				Name="VCAuxiliaryManagedWrapperGeneratorTool"/>
 93 | 		</Configuration>
 94 | 	</Configurations>
 95 | 	<References>
 96 | 	</References>
 97 | 	<Files>
 98 | 		<Filter
 99 | 			Name="Source Files"
100 | 			Filter="cpp;c;cxx;def;odl;idl;hpj;bat;asm;asmx"
101 | 			UniqueIdentifier="{4FC737F1-C7A5-4376-A066-2A32D752A2FF}">
102 | 			<File
103 | 				RelativePath=".\html\Extensions.cc"
104 | 				FileType="0">
105 | 			</File>
106 | 			<File
107 | 				RelativePath=".\html\Node.cc"
108 | 				FileType="0">
109 | 			</File>
110 | 			<File
111 | 				RelativePath=".\html\ParserDom.cc"
112 | 				FileType="0">
113 | 			</File>
114 | 			<File
115 | 				RelativePath=".\html\ParserSax.cc"
116 | 				FileType="0">
117 | 			</File>
118 | 			<File
119 | 				RelativePath=".\html\Uri.cc"
120 | 				FileType="0">
121 | 			</File>
122 | 			<File
123 | 				RelativePath=".\html\utils.cc"
124 | 				FileType="0">
125 | 			</File>
126 | 		</Filter>
127 | 		<Filter
128 | 			Name="Header Files"
129 | 			Filter="h;hpp;hxx;hm;inl;inc;xsd"
130 | 			UniqueIdentifier="{93995380-89BD-4b04-88EB-625FBE52EBFB}">
131 | 			<File
132 | 				RelativePath=".\html\ci_string.h">
133 | 			</File>
134 | 			<File
135 | 				RelativePath=".\html\debug.h">
136 | 			</File>
137 | 			<File
138 | 				RelativePath=".\html\Extensions.h">
139 | 			</File>
140 | 			<File
141 | 				RelativePath=".\html\Node.h">
142 | 			</File>
143 | 			<File
144 | 				RelativePath=".\html\ParserSax.h">
145 | 			</File>
146 | 			<File
147 | 				RelativePath=".\html\tld.h">
148 | 			</File>
149 | 			<File
150 | 				RelativePath=".\html\tree.h">
151 | 			</File>
152 | 			<File
153 | 				RelativePath=".\html\Uri.h">
154 | 			</File>
155 | 			<File
156 | 				RelativePath=".\html\utils.h">
157 | 			</File>
158 | 		</Filter>
159 | 		<Filter
160 | 			Name="Resource Files"
161 | 			Filter="rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx"
162 | 			UniqueIdentifier="{67DA6AB6-F800-4c08-8B7A-83BB121AAD01}">
163 | 		</Filter>
164 | 		<File
165 | 			RelativePath=".\ReadMe.txt">
166 | 		</File>
167 | 	</Files>
168 | 	<Globals>
169 | 	</Globals>
170 | </VisualStudioProject>
171 | 


--------------------------------------------------------------------------------
/html/tld.h:
--------------------------------------------------------------------------------
  1 | /** Do not modify. This file is automatically generated
  2 |   * using gen_tld.pl and tld.list
  3 |   */
  4 | 
  5 | static const char *tld[] = {
  6 | 	".LOCALHOST",
  7 | 	".INVALID",
  8 | 	".EXAMPLE",
  9 | 	".COOP.BR",
 10 | 	".ZLG.BR",
 11 | 	".VET.BR",
 12 | 	".TUR.BR",
 13 | 	".TRD.BR",
 14 | 	".TMP.BR",
 15 | 	".SRV.BR",
 16 | 	".SLG.BR",
 17 | 	".REC.BR",
 18 | 	".QSL.BR",
 19 | 	".PSI.BR",
 20 | 	".PSC.BR",
 21 | 	".PRO.BR",
 22 | 	".PPG.BR",
 23 | 	".ORG.BR",
 24 | 	".ODO.BR",
 25 | 	".NTR.BR",
 26 | 	".NOT.BR",
 27 | 	".NOM.BR",
 28 | 	".NET.BR",
 29 | 	".MUSEUM",
 30 | 	".MUS.BR",
 31 | 	".MIL.BR",
 32 | 	".MED.BR",
 33 | 	".MAT.BR",
 34 | 	".LEL.BR",
 35 | 	".JOR.BR",
 36 | 	".INF.BR",
 37 | 	".IND.BR",
 38 | 	".IMB.BR",
 39 | 	".GOV.BR",
 40 | 	".GGF.BR",
 41 | 	".G12.BR",
 42 | 	".FST.BR",
 43 | 	".FOT.BR",
 44 | 	".FND.BR",
 45 | 	".FAR.BR",
 46 | 	".ETI.BR",
 47 | 	".ETC.BR",
 48 | 	".ESP.BR",
 49 | 	".ENG.BR",
 50 | 	".EDU.BR",
 51 | 	".ECN.BR",
 52 | 	".COM.BR",
 53 | 	".CNT.BR",
 54 | 	".CNG.BR",
 55 | 	".CIM.BR",
 56 | 	".BMD.BR",
 57 | 	".BIO.BR",
 58 | 	".ATO.BR",
 59 | 	".ART.BR",
 60 | 	".ARQ.BR",
 61 | 	".AGR.BR",
 62 | 	".ADV.BR",
 63 | 	".ADM.BR",
 64 | 	".TV.BR",
 65 | 	".FM.BR",
 66 | 	".AM.BR",
 67 | 	".TEST",
 68 | 	".NAME",
 69 | 	".INFO",
 70 | 	".COOP",
 71 | 	".ARPA",
 72 | 	".AERO",
 73 | 	".PRO",
 74 | 	".ORG",
 75 | 	".NET",
 76 | 	".MIL",
 77 | 	".INT",
 78 | 	".GOV",
 79 | 	".EDU",
 80 | 	".COM",
 81 | 	".BIZ",
 82 | 	".ZW",
 83 | 	".ZM",
 84 | 	".ZA",
 85 | 	".YU",
 86 | 	".YT",
 87 | 	".YE",
 88 | 	".WS",
 89 | 	".WF",
 90 | 	".VU",
 91 | 	".VN",
 92 | 	".VI",
 93 | 	".VG",
 94 | 	".VE",
 95 | 	".VC",
 96 | 	".VA",
 97 | 	".UZ",
 98 | 	".UY",
 99 | 	".US",
100 | 	".UM",
101 | 	".UK",
102 | 	".UG",
103 | 	".UA",
104 | 	".TZ",
105 | 	".TW",
106 | 	".TV",
107 | 	".TT",
108 | 	".TR",
109 | 	".TP",
110 | 	".TO",
111 | 	".TN",
112 | 	".TM",
113 | 	".TK",
114 | 	".TJ",
115 | 	".TH",
116 | 	".TG",
117 | 	".TF",
118 | 	".TD",
119 | 	".TC",
120 | 	".SZ",
121 | 	".SY",
122 | 	".SV",
123 | 	".SU",
124 | 	".ST",
125 | 	".SR",
126 | 	".SO",
127 | 	".SN",
128 | 	".SM",
129 | 	".SL",
130 | 	".SK",
131 | 	".SJ",
132 | 	".SI",
133 | 	".SH",
134 | 	".SG",
135 | 	".SE",
136 | 	".SD",
137 | 	".SC",
138 | 	".SB",
139 | 	".SA",
140 | 	".RW",
141 | 	".RU",
142 | 	".RO",
143 | 	".RE",
144 | 	".QA",
145 | 	".PY",
146 | 	".PW",
147 | 	".PT",
148 | 	".PS",
149 | 	".PR",
150 | 	".PN",
151 | 	".PM",
152 | 	".PL",
153 | 	".PK",
154 | 	".PH",
155 | 	".PG",
156 | 	".PF",
157 | 	".PE",
158 | 	".PA",
159 | 	".OM",
160 | 	".NZ",
161 | 	".NU",
162 | 	".NR",
163 | 	".NP",
164 | 	".NO",
165 | 	".NL",
166 | 	".NI",
167 | 	".NG",
168 | 	".NF",
169 | 	".NE",
170 | 	".NC",
171 | 	".NA",
172 | 	".MZ",
173 | 	".MY",
174 | 	".MX",
175 | 	".MW",
176 | 	".MV",
177 | 	".MU",
178 | 	".MT",
179 | 	".MS",
180 | 	".MR",
181 | 	".MQ",
182 | 	".MP",
183 | 	".MO",
184 | 	".MN",
185 | 	".MM",
186 | 	".ML",
187 | 	".MK",
188 | 	".MH",
189 | 	".MG",
190 | 	".MD",
191 | 	".MC",
192 | 	".MA",
193 | 	".LY",
194 | 	".LV",
195 | 	".LU",
196 | 	".LT",
197 | 	".LS",
198 | 	".LR",
199 | 	".LK",
200 | 	".LI",
201 | 	".LC",
202 | 	".LB",
203 | 	".LA",
204 | 	".KZ",
205 | 	".KY",
206 | 	".KW",
207 | 	".KR",
208 | 	".KP",
209 | 	".KN",
210 | 	".KM",
211 | 	".KI",
212 | 	".KH",
213 | 	".KG",
214 | 	".KE",
215 | 	".JP",
216 | 	".JO",
217 | 	".JM",
218 | 	".JE",
219 | 	".IT",
220 | 	".IS",
221 | 	".IR",
222 | 	".IQ",
223 | 	".IO",
224 | 	".IN",
225 | 	".IM",
226 | 	".IL",
227 | 	".IE",
228 | 	".ID",
229 | 	".HU",
230 | 	".HT",
231 | 	".HR",
232 | 	".HN",
233 | 	".HM",
234 | 	".HK",
235 | 	".GY",
236 | 	".GW",
237 | 	".GU",
238 | 	".GT",
239 | 	".GS",
240 | 	".GR",
241 | 	".GQ",
242 | 	".GP",
243 | 	".GN",
244 | 	".GM",
245 | 	".GL",
246 | 	".GI",
247 | 	".GH",
248 | 	".GG",
249 | 	".GF",
250 | 	".GE",
251 | 	".GD",
252 | 	".GB",
253 | 	".GA",
254 | 	".FR",
255 | 	".FO",
256 | 	".FM",
257 | 	".FK",
258 | 	".FJ",
259 | 	".FI",
260 | 	".ET",
261 | 	".ES",
262 | 	".ER",
263 | 	".EH",
264 | 	".EG",
265 | 	".EE",
266 | 	".EC",
267 | 	".DZ",
268 | 	".DO",
269 | 	".DM",
270 | 	".DK",
271 | 	".DJ",
272 | 	".DE",
273 | 	".CZ",
274 | 	".CY",
275 | 	".CX",
276 | 	".CV",
277 | 	".CU",
278 | 	".CR",
279 | 	".CO",
280 | 	".CN",
281 | 	".CM",
282 | 	".CL",
283 | 	".CK",
284 | 	".CI",
285 | 	".CH",
286 | 	".CG",
287 | 	".CF",
288 | 	".CD",
289 | 	".CC",
290 | 	".CA",
291 | 	".BZ",
292 | 	".BY",
293 | 	".BW",
294 | 	".BV",
295 | 	".BT",
296 | 	".BS",
297 | 	".BR",
298 | 	".BO",
299 | 	".BN",
300 | 	".BM",
301 | 	".BJ",
302 | 	".BI",
303 | 	".BH",
304 | 	".BG",
305 | 	".BF",
306 | 	".BE",
307 | 	".BD",
308 | 	".BB",
309 | 	".BA",
310 | 	".AZ",
311 | 	".AW",
312 | 	".AU",
313 | 	".AT",
314 | 	".AS",
315 | 	".AR",
316 | 	".AQ",
317 | 	".AO",
318 | 	".AN",
319 | 	".AM",
320 | 	".AL",
321 | 	".AI",
322 | 	".AG",
323 | 	".AF",
324 | 	".AE",
325 | 	".AD",
326 | 	".AC"
327 | };
328 | static size_t tldOffset(const char *domain) {
329 | 	const char *end = domain + strlen(domain);
330 | 	for(unsigned int i = 0; i < 321; ++i) 
331 | 	{
332 | 		size_t len = strlen(tld[i]);
333 | 		if(strcasecmp(end - len, tld[i]) == 0) 
334 | 		{
335 | 			return len;
336 | 		}
337 | 	}
338 |  return 0;
339 | }
340 | 


--------------------------------------------------------------------------------
/css/default.css:
--------------------------------------------------------------------------------
  1 | /* rendered CSS1-addressable elements and all applicable non-inherited
  2 | properties set to initial values and default display types */
  3 | 
  4 | A, ABBR, ACRONYM, ADDRESS, BDO, BLOCKQUOTE, BODY, BUTTON, CITE, CODE, DD, DEL,
  5 | DFN, DIV, DL, DT, EM, FIELDSET, FORM, H1, H2, H3, H4, H5, H6, HTML, IFRAME, IMG, INS,
  6 | KBD, LABEL, LI, OBJECT, OL, P, Q, SAMP, SPAN, STRONG, SUB, SUP, UL, VAR, 
  7 | APPLET, B, BIG, CENTER, DIR, FONT, HR, I, MENU, PRE, S, SMALL, STRIKE, TT, U	{
  8 | 	background: transparent;
  9 | 	width: auto;
 10 | 	height: auto;
 11 | 	text-decoration: none;
 12 | 	margin: 0;
 13 | 	padding: 0;
 14 | 	border: 0;
 15 | 	float: none;
 16 | 	clear: none;
 17 | 	vertical-align: baseline;
 18 | 	list-style-image: none;
 19 | 	list-style-type: disc;
 20 | 	list-style-position: outside;
 21 | 	}	 
 22 | 
 23 | ADDRESS, BLOCKQUOTE, BODY, DD, DIV, DL, DT, FIELDSET, FORM, H1, H2, H3, H4, H5,
 24 | H6, OL, P, UL, CENTER, DIR, HR, MENU, PRE	{
 25 | 	display: block;
 26 | 	}
 27 | 
 28 | A, ABBR, ACRONYM, APPLET, BDO, BUTTON, CITE, CODE, DEL, DFN, EM, IFRAME, IMG,
 29 | INS, KBD, LABEL, OBJECT, Q,
 30 | SAMP, SPAN, STRONG, SUB, SUP, VAR, B, BIG, FONT, I, S, SMALL, STRIKE, TT, U	{
 31 | 
 32 | 	display: inline;
 33 | 	}
 34 | 
 35 | LI	{
 36 | 	display: list-item;
 37 | 	}
 38 | 
 39 | /* Begin tree of inherited properties and cascades. */
 40 | 
 41 | /* Describes the default type, color, and link decoration specs of
 42 | Mosaic-derivative browsers to the extent and degree of granularity that users
 43 | may typically override. Uncomment for "factory settings." */
 44 | 
 45 | HTML	{
 46 | 	font-family: "Times New Roman", Times;
 47 | 	font-size: medium; 
 48 | 	font-weight: normal;
 49 | 	color: black;
 50 | 	background-color: #BFBFBF;
 51 | 	}
 52 | 
 53 | PRE, TT, CODE, KBD, SAMP	{
 54 | 	font-family: "Courier New", Courier;
 55 | 	}
 56 | 
 57 | A:link, A:visited, A:active	{
 58 | 	text-decoration: underline;
 59 | 	}
 60 | 
 61 | A:link	{
 62 | 	color: #0000FF;
 63 | 	}
 64 | 	
 65 | A:visited	{
 66 | 	color: #7F007F;
 67 | 	}
 68 | 
 69 | A:active	{
 70 | 	color: #0000FF;
 71 | 	}
 72 | 	
 73 | /* end pre-CSS user settings */
 74 | 
 75 | HTML	{
 76 | 	line-height: 1.12;
 77 | 	word-spacing: normal;
 78 | 	letter-spacing: normal;
 79 | 	text-transform: none;
 80 | 	text-align: left;
 81 | 	text-indent: 0;
 82 | 	white-space: normal;
 83 | 	}
 84 | 
 85 | BODY	{
 86 | 	padding: 8px;
 87 | 	}
 88 | 
 89 | H1	{
 90 | 	font-size: 2em;
 91 | 	margin: .67em 0; 
 92 | 	}
 93 | 
 94 | H2	{ 
 95 | 	font-size: 1.5em;
 96 | 	margin: .75em 0; 
 97 | 	}
 98 | 
 99 | H3	{ 
100 | 	font-size: 1.17em;
101 | 	margin: .83em 0; 
102 | 	}
103 | 
104 | H4, P, BLOCKQUOTE, FIELDSET, FORM, UL, OL, DL, DIR, MENU	{ 
105 | 	margin: 1.12em 0; 
106 | 	}
107 | 
108 | H5	{
109 | 	font-size: .83em; /* varies with pixels-per-em at document root */
110 | 	margin: 1.5em 0; 
111 | 	}
112 | 
113 | H6	{
114 | 	font-size: .6em; /* varies with pixels-per-em at document root */
115 | 	margin: 1.67em 0; 
116 | 	}
117 | 
118 | H1, H2, H3, H4, H5, H6, B, STRONG	{ 
119 | 	font-weight: bolder;
120 | 	}
121 | 	
122 | BLOCKQUOTE	{ 
123 | 	margin-left: 40px;
124 | 	margin-right: 40px;
125 | 	}
126 | 
127 | I, CITE, EM, VAR, ADDRESS	{ 
128 | 	font-style: italic;
129 | 	}
130 | 
131 | PRE, TT, CODE, KBD, SAMP	{ 
132 | 	font-family: monospace;
133 | 	}
134 | 
135 | PRE	{
136 | 	white-space: pre;
137 | 	}
138 | 
139 | BIG	{ 
140 | 	font-size: larger;
141 | 	}
142 | 	
143 | SMALL, SUB, SUP	{
144 | 	font-size: smaller;
145 | 	}
146 | 
147 | SUB	{
148 | 	vertical-align: sub;
149 | 	}
150 | 
151 | SUP	{
152 | 	vertical-align: super;
153 | 	}
154 | 
155 | S, STRIKE, DEL	{
156 | 	text-decoration: line-through;
157 | 	}
158 | 
159 | HR	{
160 | 	border: 1px inset; /* questionable */
161 | 	}
162 | 
163 | OL, UL, DIR, MENU, DD	{
164 | 	padding-left: 40px; 
165 | 	}
166 | 	
167 | OL LI	{
168 | 	list-style-type: decimal;
169 | 	}
170 | 	
171 | UL LI	{
172 | 	list-style-type: disc;
173 | 	}
174 | 
175 | UL UL, UL OL, UL MENU, UL DIR, MENU UL, MENU OL, MENU MENU, MENU DIR, DIR UL,
176 | DIR OL, DIR MENU, DIR DIR, OL UL, OL OL, OL MENU, OL DIR	{
177 | 	margin-top: 0;
178 | 	margin-bottom: 0;
179 | 	}
180 | 
181 | OL UL, UL UL, MENU UL, DIR UL, OL MENU, UL MENU, MENU MENU, DIR MENU, OL DIR, UL
182 | DIR, MENU DIR, DIR DIR 	{
183 |    list-style-type: circle;
184 | 	}
185 | 
186 | OL OL UL, OL UL UL, OL MENU UL, OL DIR UL, OL OL MENU, OL UL MENU, OL MENU MENU,
187 | OL DIR MENU, OL OL DIR, OL UL DIR, OL MENU DIR, OL DIR DIR, UL OL UL, UL UL UL,
188 | UL MENU UL, UL DIR UL, UL OL MENU, UL UL MENU, UL MENU MENU, UL DIR MENU, UL OL
189 | DIR, UL UL DIR, UL MENU DIR, UL DIR DIR, MENU OL UL, MENU UL UL, MENU MENU UL,
190 | MENU DIR UL, MENU OL MENU, MENU UL MENU, MENU MENU MENU, MENU DIR MENU, MENU OL
191 | DIR, MENU UL DIR, MENU MENU DIR, MENU DIR DIR, DIR OL UL, DIR UL UL, DIR MENU
192 | UL, DIR DIR UL, DIR OL MENU, DIR UL MENU, DIR MENU MENU, DIR DIR MENU, DIR OL
193 | DIR, DIR UL DIR, DIR MENU DIR, DIR DIR DIR 	{
194 | 	list-style-type: square;
195 | 	}
196 | 
197 | U, INS	{
198 | 	text-decoration: underline;
199 | 	}
200 | 
201 | CENTER	{
202 | 	text-align: center;
203 | 	}
204 | 
205 | CAPTION, COL, COLGROUP, LEGEND, TABLE, TBODY, TD, TFOOT, TH, THEAD, TR	{
206 | 	background: transparent;
207 | 	text-decoration: none;
208 | 	margin: 1px;
209 | 	padding: 1px;
210 | 	border: none;
211 | 	float: none;
212 | 	clear: none;
213 | 	}
214 | 
215 | TABLE, TBODY, TFOOT, THEAD, TR	{
216 | 	display: block;
217 | 	background-position: top left;
218 | 	width: auto;
219 | 	height: auto;
220 | 	}
221 | 
222 | CAPTION, LEGEND, TD, TH	{ 
223 | 	display: inline;
224 | 	vertical-align: baseline;
225 | 	font-size: 1em;
226 | 	line-height: 1.33em;
227 | 	color: black;
228 | 	word-spacing: normal;
229 | 	letter-spacing: normal;
230 | 	text-transform: none;
231 | 	text-align: left;
232 | 	text-indent: 0;
233 | 	white-space: normal;
234 | 	}
235 | 
236 | TH	{
237 | 	font-weight: bolder;
238 | 	text-align: center;
239 | 	}
240 | 
241 | CAPTION	{
242 | 	text-align: center;
243 | 	}
244 | 
245 | /* not part of the legacy browser default sheet, but an obvious enhancement */
246 | 
247 | OL OL LI	{
248 | 	list-style-type: lower-alpha;
249 | 	}
250 | 	
251 | OL OL OL LI	{
252 | 	list-style-type: lower-roman
253 | 	}
254 | 


--------------------------------------------------------------------------------
/htmlcxx.cc:
--------------------------------------------------------------------------------
  1 | #include "html/ParserDom.h"
  2 | #include "html/utils.h"
  3 | #include "html/wincstring.h"
  4 | #include "css/parser_pp.h"
  5 | #ifndef WIN32
  6 | #include "config.h"
  7 | #else
  8 | #define VERSION "0.6"
  9 | #endif
 10 | 
 11 | #include <cstdlib>
 12 | #include <fstream>
 13 | #include <stdexcept>
 14 | #include <iostream>
 15 | #include <cstdio>
 16 | #include <algorithm>
 17 | 
 18 | #include "wingetopt.h"
 19 | 
 20 | using namespace std;
 21 | using namespace htmlcxx;
 22 | 
 23 | void usage(string prg) {
 24 | 	cerr << "usage:\t" << prg << " [-h] [-V] file.html [file.css]" << endl;
 25 | 	return;
 26 | }
 27 | 
 28 | void usage_long(string prg) {
 29 | 	usage(prg);
 30 | 	cerr << "Html and css parser" << endl << endl;
 31 | 	cerr << "  -V\t print version number and exit" << endl;
 32 | 	cerr << "  -h\t print this help text" << endl;
 33 | 	cerr << "  -C\t disable css parsing" << endl;
 34 | 	return;
 35 | }
 36 | 
 37 | int main(int argc, char **argv) 
 38 | {
 39 | 	tree<HTML::Node> tr;
 40 | 	bool parse_css = true;
 41 | 	bool detail_print = false;
 42 | 	string css_code;
 43 | 	try 
 44 | 	{
 45 | 		while (1) 
 46 | 		{
 47 | 			signed char c = getopt(argc, argv, "hVCD");
 48 | 			if(c == -1) break;
 49 | 			switch(c) {
 50 | 				case 'h':
 51 | 					usage_long(argv[0]);
 52 | 					exit(0);
 53 | 					break;
 54 | 				case 'V':
 55 | 					cerr << VERSION << endl;
 56 | 					exit(0);
 57 | 				case 'C':
 58 | 					parse_css = false;
 59 | 					break;
 60 | 				case 'D':
 61 | 				    detail_print = true;
 62 | 				    break;
 63 | 				default:
 64 | 					usage(argv[0]);
 65 | 					exit(1);
 66 | 					break;
 67 | 			}
 68 | 		}
 69 | 
 70 | 		if (argc != optind + 1 && argc != optind + 2) 
 71 | 		{
 72 | 			usage(argv[0]);
 73 | 			exit(1);
 74 | 		}
 75 | 
 76 | 		ifstream file(argv[optind]);
 77 | 		if (!file.is_open()) 
 78 | 		{
 79 | 			cerr << "Unable to open file " << argv[optind] << endl;
 80 | 			exit(1);
 81 | 		}
 82 | 		string html;
 83 | 
 84 | 		while (1)
 85 | 		{
 86 | 			char buf[BUFSIZ];
 87 | 			file.read(buf, BUFSIZ);
 88 | 			if(file.gcount() == 0) {
 89 | 				break;
 90 | 			}
 91 | 			html.append(buf, file.gcount());
 92 | 		}
 93 | 		file.close();
 94 | 
 95 | 		if(argc == optind + 2) //we have a separate css file
 96 | 		{
 97 | 			ifstream fcss(argv[optind + 1]);
 98 | 			if(!fcss.is_open()) 
 99 | 			{
100 | 				cerr << "Unable to open file " << argv[optind] << endl;
101 | 				exit(1);
102 | 			}
103 | 			while (1)
104 | 			{
105 | 				char buf[BUFSIZ];
106 | 				fcss.read(buf, BUFSIZ);
107 | 				if(fcss.gcount() == 0) {
108 | 					break;
109 | 				}
110 | 				css_code.append(buf, fcss.gcount());
111 | 			}
112 | 			fcss.close();
113 | 		}
114 | 
115 | 
116 | 
117 | 		HTML::ParserDom parser;
118 | 		parser.parse(html);
119 | 		tr = parser.getTree();
120 | 		if (detail_print)
121 | 		{
122 | 		    cout << "-- HTML:start ------------" << endl;
123 | 		    cout << tr << endl;
124 | 		    cout << "-- HTML:end --------------" << endl;
125 | 		}
126 | 
127 |         map<string, int> count_tag;
128 |         tree<HTML::Node>::pre_order_iterator it = tr.begin();
129 |         tree<HTML::Node>::pre_order_iterator end = tr.end();
130 | 
131 |         while ( it != end )
132 |         {
133 |             string tag = (string)(*it);
134 |             if ( it->isTag() && tag.length() > 0 )
135 |             {
136 |                 transform(tag.begin(), tag.end(), tag.begin(), ::toupper);
137 |                 count_tag[tag] += 1;
138 |             }
139 |             ++it;
140 |         }
141 |         for (auto &p: count_tag)
142 |         {
143 |             // print count of html tags
144 |             cout << "html:" << p.first << ":" << p.second << endl;
145 |         }
146 |         cout << "html-count:" << count_tag.size() << endl;
147 | 
148 | 
149 | 	} catch (exception &e) {
150 | 		cerr << "Exception " << e.what() << " caught" << endl;
151 | 		exit(1);
152 | 	} catch (...) {
153 | 		cerr << "Unknow exception caught " << endl;
154 | 	}
155 | 
156 | #ifdef WIN32
157 | 		if(parse_css)
158 | 		{
159 | 			cerr << "Css parsing not supported in win32" << endl;
160 | 			return 1;
161 | 		}
162 | 		return 0;
163 | #else
164 | 	if (parse_css) try
165 | 	{
166 | 		if(!parse_css) exit(0);
167 | 
168 | 		CSS::Parser css_parser;
169 | 		tree<HTML::Node>::iterator it = tr.begin();
170 | 		tree<HTML::Node>::iterator end = tr.end();
171 | 		if(css_code.length()) {
172 | 			css_parser.parse(css_code);
173 | 		}
174 | 		cout << "----------css-------------" << endl;
175 |         cout << css_parser << endl;
176 | 		cout << "----------(css)-------------" << endl;
177 | 
178 |         map<string, int> count_css;
179 | 		//cout << "CSS attributes:" << endl;
180 | 		//cout << endl;
181 | 		while (it != end) 
182 | 		{
183 | 
184 | 			if (it->isTag()) 
185 | 			{
186 | 
187 | 				it->parseAttributes();
188 | 				vector<CSS::Parser::Selector> v;
189 | 				tree<HTML::Node>::iterator k = it;
190 | 				while (k != tr.begin()) 
191 | 				{
192 | 					CSS::Parser::Selector s;
193 | 					s.setElement(k->tagName());
194 | 					s.setId(k->attribute("id").second);
195 | 					s.setClass(k->attribute("class").second);
196 | 					s.setPseudoClass(CSS::Parser::NONE_CLASS);
197 | 					s.setPseudoElement(CSS::Parser::NONE_ELEMENT);
198 | 					v.push_back(s);
199 | 					k = tr.parent(k);
200 | 					//cout << "selector:" << k->tagName() << "," << k->attribute("id").second << "," << k->attribute("class").second << endl;
201 | 					//cout << s << endl;
202 | 				}
203 | 
204 | 				map<string, string> attributes = css_parser.getAttributes(v);
205 | 				map<string, string>::const_iterator mit = attributes.begin();
206 | 				map<string, string>::const_iterator mend = attributes.end();
207 | 
208 | 				string tag = it->tagName();
209 | 				if (detail_print)
210 | 				{
211 | 				    for(unsigned int i = 0; i < tag.size(); ++i) tag[i] = ::toupper(tag[i]);
212 | 				    cout << tag << "@[" << it->offset() << ":" << it->offset() + it->length() << ")" << endl;
213 | 				    for(; mit != mend; ++mit)
214 | 				    {
215 | 				        cout << mit->first << ": " << mit->second << endl;
216 | 				        count_css[mit->first] += 1;
217 | 				    }
218 | 				    cout << endl;
219 | 				}
220 | 				else
221 | 				{
222 |                     for(; mit != mend; ++mit)
223 |                     {
224 |                         count_css[mit->first] += 1;
225 |                     }
226 |                 }
227 | 
228 | 
229 | 				if (strcasecmp(it->tagName().c_str(), "STYLE") == 0) 
230 | 				{
231 | 					tree<HTML::Node>::iterator begin, end;
232 | 					begin = it;
233 | 					end = it;
234 | 					end.skip_children();
235 | 					++end;
236 | 					string css_snippet;
237 | 
238 | 					for (; begin != end; ++begin)
239 | 					{
240 | 						if (!(begin->isTag())) css_snippet.append(begin->text());
241 | 					}
242 | 
243 | 					css_parser.parse(css_snippet);
244 |                     cout << "----------css_snippet-------------" << endl;
245 |                     cout << css_parser << endl;
246 |                     cout << "----------(css_snippet)-------------" << endl;
247 | 				}
248 | 			}
249 | 			++it;
250 | 		}
251 | 
252 |         for (auto &p: count_css)
253 |         {
254 |             // print count of css attributes
255 |             cout << "css:" << p.first << ":" << p.second << endl;
256 |         }
257 |         cout << "css-count:" << count_css.size() << endl;
258 | 	} catch (exception &e) {
259 | 		cerr << "Exception " << e.what() << " caught" << endl;
260 | 		exit(1);
261 | 	} catch (...) {
262 | 		cerr << "Unknow exception caught " << endl;
263 | 	}
264 | 
265 | 	exit(0);
266 | #endif
267 | }
268 | 


--------------------------------------------------------------------------------
/html/tests.cc:
--------------------------------------------------------------------------------
  1 | #include <string>
  2 | #include <cstdio>
  3 | #include <cstdlib>
  4 | #include <iostream>
  5 | #include <fstream>
  6 | #include "CharsetConverter.h"
  7 | #include "Uri.h"
  8 | #include "ParserDom.h"
  9 | #include "utils.h"
 10 | 
 11 | using namespace std;
 12 | using namespace htmlcxx;
 13 | 
 14 | #define myassert(x) \
 15 | 	do {\
 16 | 		if(!(x)) {\
 17 | 			fprintf(stderr, "Test at %s:%d failed!\n", __FILE__, __LINE__);\
 18 | 			exit(1);\
 19 | 		}\
 20 | 	} while(0)
 21 | 
 22 | bool my_tree_compare(tree<HTML::Node>::iterator begin, tree<HTML::Node>::iterator end, tree<HTML::Node>::iterator ref)
 23 | {
 24 | 	tree<HTML::Node>::iterator it(begin);
 25 | 	while (it != end && ref != end)
 26 | 	{
 27 | 		if (it.number_of_children() != ref.number_of_children())
 28 | 			return false;
 29 | 		if (it->text() != ref->text())
 30 | 			return false;
 31 | 		++it;
 32 | 		++ref;
 33 | 	}
 34 | 	return true;
 35 | }
 36 | 
 37 | class HtmlTest {
 38 | 	public:
 39 | 
 40 | 	bool parse() {
 41 | 		cerr << "Parsing some html... ";
 42 | 		tree<HTML::Node> tr;
 43 | 		string html = "<head></head><body>\n\n\n\n<center>\n<table width=\"600\">\n<tbody><tr>\n<td width=\"120\"><a href=\"/index.html\"><img src=\"/adt-SUA/images/ADT_LOGO.gif\" alt=\"adt logo\" align=\"middle\" border=\"0\"></a></td>\n<td width=\"480\"><font size=\"+2\" face=\"helvetica,arial\"><b>Australian Digital Theses Program<br></b></font></td>\n</tr>\n</tbody></table>\n</center>\n<center>\n</center>\n</body>";
 44 | 
 45 | 		HTML::ParserDom parser;
 46 | 		parser.parse(html);
 47 | 		tr = parser.getTree();
 48 | 		cerr << tr << endl;
 49 | 		cerr << " ok" << endl;
 50 | 		return true;
 51 | 	}
 52 | 
 53 | 	bool string_manip() {
 54 | 
 55 | 		string root_link = "http://www.akwan.com.br/teste/acton.asp?q=atletico";
 56 | 		string root_link2 = "http://answerbook.ime.usp.br:8888/ab2";
 57 | 		string link1 = "../a.html";
 58 | 		string link2 = "//b.html";
 59 | 		string link3 = "servi&#231;o.html";
 60 | 		string link4 = "./d/c.html";
 61 | 		string link5 = "http://www.fadazan.com.br/../../../../../Download/teste/../jacobmacanhan,%203276.jpg";
 62 | 		string link6 = "search?q=galo";
 63 | 		string link7 = "http://casadebruxa.com.br/anuncio/../banner/vai.asp?id=21&url=http://www.clickdirect.com.br";
 64 | 		string link8 = "/ab2/Help_C/ONLINEACCESS/@Ab2HelpView/idmatch(help-library-info)";
 65 | 		string link9 = "/ab2/coll.67.3/@Ab2CollView?";
 66 | 		string link10 = "http://www.a.com.br";
 67 | 		string link11 = "'http://www.b.com.br";
 68 | 		string link12 = "?q=mineiro";
 69 | 		string entities = "nos somos do clube atletico mineiro &#225; &aacute; brasil &nbsp; &teste; &atilde;&auml; &aacute &acirc; &end   ";
 70 | 		string comments = "hello <!-- world --> brazil";
 71 | 		string multiblank = "  1 2  3\r\n   4    5  \r\n  6  \n";
 72 | 		string justblank = "     \r\n         \r\n    \n";
 73 | 		string nonblank = "dsadasdada";
 74 | 		myassert(HTML::strip_comments(comments) == "hello  brazil");
 75 | 		myassert(HTML::single_blank(multiblank) == "1 2 3 4 5 6");
 76 | 		myassert(HTML::single_blank(justblank) == "");
 77 | 		myassert(HTML::single_blank(nonblank) == nonblank);
 78 | 		myassert(HTML::decode_entities(entities) == "nos somos do clube atletico mineiro � � brasil   &teste; �� &aacute � &end   ");
 79 | 		myassert(HTML::convert_link(link1, root_link) == "http://www.akwan.com.br/a.html");
 80 | 		myassert(HTML::convert_link(link2, root_link) == "http://www.akwan.com.br/b.html");
 81 | 		myassert(HTML::convert_link(link3, root_link) == "http://www.akwan.com.br/teste/servi�o.html");
 82 | 		myassert(HTML::convert_link(link4, root_link) == "http://www.akwan.com.br/teste/d/c.html");
 83 | 		myassert(HTML::convert_link(link5, root_link) == "http://www.fadazan.com.br/Download/jacobmacanhan,%203276.jpg");
 84 | 		myassert(HTML::convert_link(link6, root_link) == "http://www.akwan.com.br/teste/search?q=galo");
 85 | 		myassert(HTML::convert_link(link7, root_link) == "http://casadebruxa.com.br/banner/vai.asp?id=21&url=http://www.clickdirect.com.br");
 86 | 		myassert(HTML::convert_link(link8, root_link2) == "http://answerbook.ime.usp.br:8888/ab2/Help_C/ONLINEACCESS/@Ab2HelpView/idmatch(help-library-info)");
 87 | 		myassert(HTML::convert_link(link9, root_link2) == "http://answerbook.ime.usp.br:8888/ab2/coll.67.3/@Ab2CollView?");
 88 | 		myassert(HTML::convert_link(link10, root_link2) == "http://www.a.com.br/");
 89 | 		myassert(HTML::convert_link(link11, root_link) == "http://www.akwan.com.br/teste/'http:/www.b.com.br");
 90 | 		myassert(HTML::convert_link(link12, root_link) == "http://www.akwan.com.br/teste/acton.asp?q=mineiro");
 91 | 
 92 | 		Uri root(root_link);
 93 | 		Uri fragment("#top");
 94 | 		Uri result(fragment.absolute(root));
 95 | 		myassert(result.unparse() == "http://www.akwan.com.br/teste/acton.asp?q=atletico#top");
 96 | 
 97 | 		return true;
 98 | 	}
 99 | 
100 | };
101 | 
102 | class TagInitTest
103 | {
104 | 	public:
105 | 		void test(void)
106 | 		{
107 | 			string html("<html><head><script language=javascript>if (0 < 2) saida = 1;</script></head></html>");
108 | 
109 | 			tree<HTML::Node> reference;
110 | 			HTML::Node n;
111 | 
112 | 			reference.clear();
113 | 			tree<HTML::Node>::iterator current = reference.begin();
114 | 
115 | 			n.offset(0);
116 | 			n.length(html.size());
117 | 			n.isTag(true);
118 | 			n.isComment(false);
119 | 			current = reference.insert(current,n);
120 | 
121 | 			n.offset(0);
122 | 			n.length(html.size());
123 | 			n.isTag(true);
124 | 			n.isComment(false);
125 | 			n.text("<html>");
126 | 			n.tagName("html");
127 | 			n.closingText("</html>");
128 | 			current = reference.append_child(current,n);
129 | 			
130 | 			n.offset(6);
131 | 			n.length(71);
132 | 			n.isTag(true);
133 | 			n.isComment(false);
134 | 			n.text("<head>");
135 | 			n.tagName("head");
136 | 			n.closingText("</head>");
137 | 			current = reference.append_child(current,n);
138 | 
139 | 			n.offset(12);
140 | 			n.length(58);
141 | 			n.isTag(true);
142 | 			n.isComment(false);
143 | 			n.text("<script language=javascript>");
144 | 			n.tagName("script");
145 | 			n.closingText("</script>");
146 | 			current = reference.append_child(current,n);
147 | 
148 | 			n.offset(40);
149 | 			n.length(21);
150 | 			n.isTag(false);
151 | 			n.isComment(false);
152 | 			n.text("if (0 < 2) saida = 1;");
153 | 			n.tagName("if (0 < 2) saida = 1;");
154 | 			n.closingText("");
155 | 			current = reference.append_child(current,n);
156 | 
157 | 			tree<HTML::Node> tr;
158 | 			HTML::ParserDom parser;
159 | 			parser.parse(html);
160 | 			tr = parser.getTree();
161 | //			cerr << reference << endl;
162 | //			cerr << tr << endl;
163 | 			myassert(my_tree_compare(tr.begin(), tr.end(), reference.begin()));
164 | 		}
165 | };
166 | 
167 | class ParseAttrTest
168 | {
169 | 	public:
170 | 		bool test()
171 | 		{
172 | 			string html("<a hRef=\"teste.htm\" attr3=\"http://www.caveofpain.kit.net/pati_10_(piercing).jpg \"target=\"_blank\" attr2=\" none \"centeR attr1='ComiDa>");
173 | 
174 | 			tree<HTML::Node> t;
175 | 			HTML::ParserDom parser;
176 | 			parser.parse(html);
177 | 			t = parser.getTree();
178 | 
179 | 			tree<HTML::Node>::iterator it = t.begin();
180 | 			++it;
181 | 			it->parseAttributes();
182 | 
183 | 			myassert(it->attribute("href").second == "teste.htm");
184 | 			myassert(it->attribute("center").second == "");
185 | 			myassert(it->attribute("attr1").second == "ComiDa");
186 | 			myassert(it->attribute("attr2").second == "none");
187 | 			myassert(it->attribute("attr3").second == "http://www.caveofpain.kit.net/pati_10_(piercing).jpg");
188 | 			return true;
189 | 		}
190 | };
191 | 
192 | class CharsetTest
193 | {
194 | 	public:
195 | 		void test()
196 | 		{
197 | 			CharsetConverter cc("UTF8", "ISO-8859-1");
198 | 			myassert(cc.convert("Você é o meu visitante número") == "Voc� � o meu visitante n�mero");
199 | 
200 | 			CharsetConverter cc2("ISO-8859-1", "UTF8");
201 | 			myassert(cc2.convert("Voc� � o meu visitante n�mero") == "Você é o meu visitante número");
202 | 		}
203 | };
204 | 
205 | class ParserTest : public HTML::ParserSax
206 | {
207 | 	public:
208 | 		ParserTest() {}
209 | 		~ParserTest() {}
210 | 
211 | 	protected:
212 | 		virtual void foundTag(HTML::Node node, bool isEnd)
213 | 		{
214 | //			cerr << "foundTag: " << node << endl;
215 | 		}
216 | 		virtual void foundText(HTML::Node node)
217 | 		{
218 | //			cerr << "foundText: " << node << endl;
219 | 		}
220 | 		virtual void foundComment(HTML::Node node)
221 | 		{
222 | //			cerr << "foundComment: " << node << endl;
223 | 		}
224 | };
225 | 
226 | int main(int argc, char **argv) {
227 | 
228 | 	HtmlTest ht;
229 | 	myassert(ht.parse());
230 | 	myassert(ht.string_manip());
231 | 
232 | 	if (argc > 1)
233 | 	{
234 | 		ifstream f(argv[1]);
235 | 		HTML::ParserSax parser;
236 | //		parser.parse(istreambuf_iterator<char>(f), istreambuf_iterator<char>());
237 | //		tree<HTML::Node> t = parser.getTree();
238 | //		cerr << t << endl;
239 | 	}
240 | 	
241 | 	TagInitTest test2;
242 | 	test2.test();
243 | 
244 | 	ParseAttrTest test3;
245 | 	test3.test();
246 | 
247 |     CharsetTest test4;
248 |     test4.test();
249 | 	
250 | 	return 0;
251 | }
252 | 


--------------------------------------------------------------------------------
/css/parser_pp.cc:
--------------------------------------------------------------------------------
  1 | #include <algorithm>
  2 | #include <iterator>
  3 | #include <cctype>
  4 | #include <iostream>
  5 | #include <sstream>
  6 | #include <set>
  7 | #include <string>
  8 | #include "parser.h"
  9 | #include "parser_pp.h"
 10 | 
 11 | //#define DEBUG_PRINT
 12 | 
 13 | using namespace std;
 14 | 
 15 | const char *htmlcxx::CSS::IE_CSS = DEFAULT_CSS;
 16 | 
 17 | namespace htmlcxx {
 18 | namespace CSS {
 19 | 
 20 | 	
 21 | Parser::Selector::Selector() : mPsClass(NONE_CLASS), mPsElement(NONE_ELEMENT) {}
 22 | 
 23 | Parser::Selector::Selector(const string& e, const string& i, const string& c, const PseudoClass& pc, const PseudoElement& pe)
 24 | {
 25 | 	setElement(e);
 26 | 	setId(i);
 27 | 	setClass(c);
 28 | 	setPseudoClass(pc);
 29 | 	setPseudoElement(pe);
 30 | }
 31 | 
 32 | void Parser::Selector::setElement(const string& str) 
 33 | {
 34 | 	mElement = str;
 35 | 	transform(mElement.begin(), mElement.end(), mElement.begin(), ::tolower);
 36 | }
 37 | 
 38 | void Parser::Selector::setId(const string& str) 
 39 | {
 40 | 	mId = str;
 41 | 	transform(mId.begin(), mId.end(), mId.begin(), ::tolower);
 42 | }
 43 | 
 44 | void Parser::Selector::setClass(const string& str) 
 45 | {
 46 | 	mEClass = str;
 47 | 	transform(mEClass.begin(), mEClass.end(), mEClass.begin(), ::tolower);
 48 | }
 49 | 
 50 | void Parser::Selector::setPseudoClass(enum PseudoClass p) 
 51 | {
 52 | 	mPsClass = p;
 53 | }
 54 | 
 55 | void Parser::Selector::setPseudoElement(enum PseudoElement p) 
 56 | {
 57 | 	mPsElement = p;
 58 | }
 59 | 
 60 | bool Parser::Selector::match(const Selector& s) const 
 61 | {
 62 | 	if (mElement.empty()) return false;
 63 | 
 64 | 	set<string> cls;
 65 | 	istringstream s1(mEClass);
 66 | 	string str;
 67 | 	while (s1 >> str)
 68 | 	{
 69 | 	    cls.insert(str);
 70 | 	}
 71 | 
 72 | 	return ((!s.mElement.empty() && mElement == s.mElement) ||
 73 |             (!s.mId.empty() && mId == s.mId) ||
 74 |             (!s.mEClass.empty() && cls.find(s.mEClass) != cls.end()));
 75 | }
 76 | 
 77 | bool Parser::Selector::operator==(const Selector& s) const 
 78 | {
 79 | 	if (mElement == s.mElement &&
 80 | 			mId == s.mId &&
 81 | 			mEClass == s.mEClass &&
 82 | 			mPsClass == s.mPsClass &&
 83 | 			mPsElement == s.mPsElement)
 84 | 	{
 85 | 		return true;
 86 | 	} 
 87 | 	else
 88 | 	{
 89 | 		return false;
 90 | 	}
 91 | }
 92 | 
 93 | bool Parser::Selector::operator<(const Selector& s) const 
 94 | {
 95 | 	int my_count = 0, count = 0;
 96 | 
 97 | 	if (!mElement.empty()) ++my_count;
 98 | 	if (!mId.empty()) ++my_count;
 99 | 	if (!mEClass.empty()) ++my_count;
100 | 	if (mPsClass != NONE_CLASS) ++my_count;
101 | 	if (mPsElement != NONE_ELEMENT) ++my_count;
102 | 
103 | 	if (!s.mElement.empty()) ++count;
104 | 	if (!s.mId.empty()) ++count;
105 | 	if (!s.mEClass.empty()) ++count;
106 | 	if (s.mPsClass != NONE_CLASS) ++count;
107 | 	if (s.mPsElement != NONE_ELEMENT) ++count;
108 | 
109 | 	if (my_count == count) {
110 | 		if (mElement == s.mElement) {
111 | 			if (mId == s.mId) {
112 | 				if (mEClass == s.mEClass) {
113 | 					if (mPsClass == s.mPsClass) {
114 | 						if (mPsElement == s.mPsElement) {
115 | 							return false;
116 | 						} else {
117 | 							return mPsElement < s.mPsElement;
118 | 						}
119 | 					} else {
120 | 						return mPsClass < s.mPsClass;
121 | 					}
122 | 				} else {
123 | 					return mEClass < s.mEClass;
124 | 				}
125 | 			} else {
126 | 				return mId < s.mId;
127 | 			}
128 | 		} else {
129 | 			return mElement > s.mElement;
130 | 		}
131 | 	} else {
132 | 		return count > my_count;
133 | 	}
134 | }
135 | 
136 | bool Parser::match(const vector<Selector>& selector, const vector<Selector>& path)
137 | {
138 | 	if (path.empty()) return false;
139 | 	if (selector.empty()) return false;
140 | 
141 | 	const Selector& element = path[0];
142 | #ifdef DEBUG_PRINT
143 | 	cout << "Trying: " << path[0] << " against " << selector[0] << endl;
144 | #endif
145 | 	if (element.match(selector[0]))
146 | 	{
147 | #ifdef DEBUG_PRINT
148 | 		cout << "** Matched" << endl;
149 | #endif
150 | 		vector<Selector>::const_iterator m, n;
151 | 		m = path.begin() + 1;
152 | 		n = selector.begin() + 1;
153 | 		while (n != selector.end()) 
154 | 		{
155 | #ifdef DEBUG_PRINT
156 | 			cout << "Trying: " << *m << " against " << *n << endl;
157 | #endif
158 | 			while (m != path.end() && !m->match(*n)) ++m;
159 | 			if (m == path.end()) break;
160 | 			else ++n, ++m;
161 | 		}
162 | 		if (n == selector.end())
163 | 		{
164 | #ifdef DEBUG_PRINT
165 | 		    cout << "** true **" << endl;
166 | #endif
167 | 		    return true;
168 | 		}
169 | 	}
170 | 
171 | 	return false;
172 | }
173 | 
174 | map<string, string>
175 | Parser::getAttributes(const vector<Selector>& path) const
176 | {
177 | 	map<string, string> ret;
178 | 	
179 | 	for (RuleSet::const_iterator i = mRuleSets.begin(); i != mRuleSets.end(); ++i) 
180 | 	{
181 | 		if (match(i->first, path))
182 | 		{
183 | 			map<string, Attribute>::const_iterator j;
184 | 			for (j = i->second.begin(); j != i->second.end(); ++j) 
185 | 			{
186 | 				ret[j->first] = j->second.mVal;
187 | 			}
188 | 		}
189 | 	}
190 | 
191 | 	return ret;
192 | }
193 | 
194 | void Parser::merge(const Parser& p) 
195 | {
196 | 	RuleSet::const_iterator i;
197 | 	for (i = p.mRuleSets.begin(); i != p.mRuleSets.end(); ++i)
198 | 	{
199 | 		map<string, Attribute>& mine = mRuleSets[i->first];
200 | 		const map<string, Attribute>& their = i->second;
201 | 		map<std::string, Attribute>::const_iterator j;
202 | 		for (j = their.begin(); j != their.end(); ++j) 
203 | 		{
204 | 			mine[j->first] = j->second;
205 | 		}
206 | 	}
207 | }
208 | 
209 | bool Parser::parse(const string& css) 
210 | {
211 | 	return parse(css.c_str(), css.length());
212 | }
213 | 
214 | bool Parser::parse(const char *buf, int buf_len) 
215 | {
216 | 
217 | 	struct selector_list_t *css, *pos;
218 | 
219 | 	css = css_parse(buf, buf_len);
220 | 	pos = css;
221 | 
222 | 	while (pos != NULL) {
223 | 		struct selector_t *sel = pos->selector;
224 | 
225 | 		vector<Selector> p;
226 | 		while (sel != NULL) {
227 | 			Selector s;
228 | 			if (sel->element_name) {
229 | 				s.setElement(sel->element_name);
230 | 			}
231 | 			if (sel->id) {
232 | 				s.setId(sel->id);
233 | 			}
234 | 			if (sel->e_class) {
235 | 				s.setClass(sel->e_class);
236 | 			}
237 | 			switch (sel->pseudo_class) {
238 | 				case PS_CLASS_NONE:
239 | 					s.setPseudoClass(NONE_CLASS);
240 | 					break;
241 | 				case PS_CLASS_LINK:
242 | 					s.setPseudoClass(LINK);
243 | 					break;
244 | 				case PS_CLASS_VISITED:
245 | 					s.setPseudoClass(VISITED);
246 | 					break;
247 | 				case PS_CLASS_ACTIVE:
248 | 					s.setPseudoClass(ACTIVE);
249 | 					break;
250 | 			}
251 | 			switch (sel->pseudo_element) {
252 | 				case PS_ELEMENT_NONE:
253 | 					s.setPseudoElement(NONE_ELEMENT);
254 | 					break;
255 | 				case PS_ELEMENT_FIRST_LETTER:
256 | 					s.setPseudoElement(FIRST_LETTER);
257 | 					break;
258 | 				case PS_ELEMENT_FIRST_LINE:
259 | 					s.setPseudoElement(FIRST_LINE);
260 | 					break;
261 | 			}
262 | 			p.push_back(s);
263 | 			sel = sel->next;
264 | 		}
265 | 
266 | 		reverse(p.begin(), p.end());
267 | 
268 | 		map<string, Attribute>& m = mRuleSets[p];
269 | 		struct property_t *prop = pos->selector->property;
270 | 		while (prop != NULL) {
271 | 			m[prop->name] = Attribute(prop->val, prop->important);
272 | 			prop = prop->next;
273 | 		}
274 | 
275 | 		pos = pos->next;
276 | 	}
277 | 
278 | 	free_rulesets(css);
279 | 
280 | 	return true;
281 | }
282 | 
283 | ostream& operator<<(ostream& out, const map<string, Parser::Attribute>& s) 
284 | {
285 | 	map<string, Parser::Attribute>::const_iterator i;
286 | 	for (i = s.begin(); i != s.end(); ++i) 
287 | 	{
288 | 		if (i != s.begin()) out << " ";
289 | 		out << i->first << ": " << i->second.mVal;
290 | 		if (i->second.mImportant)  out << " !important";
291 | 		out << ";";
292 | 	}
293 | 	return out;
294 | }
295 | 
296 | string psc2str(const enum Parser::PseudoClass& s) 
297 | {
298 | 	switch (s) 
299 | 	{
300 | 		case Parser::LINK:
301 | 			return ":link";
302 | 			break;
303 | 		case Parser::VISITED:
304 | 			return ":visited";
305 | 			break;
306 | 		case Parser::ACTIVE:
307 | 			return ":active";
308 | 			break;
309 | 		default:
310 | 			return "";
311 | 	}
312 | }
313 | 
314 | string pse2str(const enum Parser::PseudoElement& s) 
315 | {
316 | 	switch (s) 
317 | 	{
318 | 		case Parser::FIRST_LETTER:
319 | 			return ":first_letter";
320 | 			break;
321 | 		case Parser::FIRST_LINE:
322 | 			return ":first_line";
323 | 			break;
324 | 		default:
325 | 			return "";
326 | 			break;
327 | 	}
328 | }
329 | 
330 | ostream& operator<<(ostream& out, const Parser::Selector& s) 
331 | {
332 | 	out << s.mElement;
333 | 	if (!s.mId.empty()) out << "#" << s.mId;
334 | 	if (!s.mEClass.empty()) out << "." <<  s.mEClass;
335 | 	out << psc2str(s.mPsClass) << pse2str(s.mPsElement);
336 | 	return out;
337 | }
338 | 
339 | ostream& operator<<(ostream &out, const CSS::Parser& p) 
340 | {
341 |     cout << "---------- css out begin" << endl;
342 | 	for (CSS::Parser::RuleSet::const_iterator i = p.mRuleSets.begin(); i != p.mRuleSets.end(); ++i)
343 | 	{
344 | 		if (i != p.mRuleSets.begin()) out << endl;
345 | 		copy(i->first.rbegin(),
346 | 				i->first.rend(),
347 | 				ostream_iterator<CSS::Parser::Selector>(out," "));
348 | 		out << "{ ";
349 | 		out << i->second << " }";
350 | 	}
351 | 	return out;
352 | }
353 | 
354 | }//namespace CSS
355 | }//namespace htmlcxx
356 | 


--------------------------------------------------------------------------------
/wingetopt.c:
--------------------------------------------------------------------------------
  1 | #if defined(WIN32) && !defined(__MINGW32__)
  2 | /***************************************************************************** 
  3 |  * 
  4 |  *  MODULE NAME : GETOPT.C 
  5 |  * 
  6 |  *  COPYRIGHTS: 
  7 |  *             This module contains code made available by IBM 
  8 |  *             Corporation on an AS IS basis.  Any one receiving the 
  9 |  *             module is considered to be licensed under IBM copyrights 
 10 |  *             to use the IBM-provided source code in any way he or she 
 11 |  *             deems fit, including copying it, compiling it, modifying 
 12 |  *             it, and redistributing it, with or without 
 13 |  *             modifications.  No license under any IBM patents or 
 14 |  *             patent applications is to be implied from this copyright 
 15 |  *             license. 
 16 |  * 
 17 |  *             A user of the module should understand that IBM cannot 
 18 |  *             provide technical support for the module and will not be 
 19 |  *             responsible for any consequences of use of the program. 
 20 |  * 
 21 |  *             Any notices, including this one, are not to be removed 
 22 |  *             from the module without the prior written consent of 
 23 |  *             IBM. 
 24 |  * 
 25 |  *  AUTHOR:   Original author: 
 26 |  *                 G. R. Blair (BOBBLAIR at AUSVM1) 
 27 |  *                 Internet: bobblair@bobblair.austin.ibm.com 
 28 |  * 
 29 |  *            Extensively revised by: 
 30 |  *                 John Q. Walker II, Ph.D. (JOHHQ at RALVM6) 
 31 |  *                 Internet: johnq@ralvm6.vnet.ibm.com 
 32 |  * 
 33 |  *****************************************************************************/ 
 34 |  
 35 | /****************************************************************************** 
 36 |  * getopt() 
 37 |  * 
 38 |  * The getopt() function is a command line parser.  It returns the next 
 39 |  * option character in argv that matches an option character in opstring. 
 40 |  * 
 41 |  * The argv argument points to an array of argc+1 elements containing argc 
 42 |  * pointers to character strings followed by a null pointer. 
 43 |  * 
 44 |  * The opstring argument points to a string of option characters; if an 
 45 |  * option character is followed by a colon, the option is expected to have 
 46 |  * an argument that may or may not be separated from it by white space. 
 47 |  * The external variable optarg is set to point to the start of the option 
 48 |  * argument on return from getopt(). 
 49 |  * 
 50 |  * The getopt() function places in optind the argv index of the next argument 
 51 |  * to be processed.  The system initializes the external variable optind to 
 52 |  * 1 before the first call to getopt(). 
 53 |  * 
 54 |  * When all options have been processed (that is, up to the first nonoption 
 55 |  * argument), getopt() returns EOF.  The special option "--" may be used to 
 56 |  * delimit the end of the options; EOF will be returned, and "--" will be 
 57 |  * skipped. 
 58 |  * 
 59 |  * The getopt() function returns a question mark (?) when it encounters an 
 60 |  * option character not included in opstring.  This error message can be 
 61 |  * disabled by setting opterr to zero.  Otherwise, it returns the option 
 62 |  * character that was detected. 
 63 |  * 
 64 |  * If the special option "--" is detected, or all options have been 
 65 |  * processed, EOF is returned. 
 66 |  * 
 67 |  * Options are marked by either a minus sign (-) or a slash (/). 
 68 |  * 
 69 |  * No errors are defined. 
 70 |  *****************************************************************************/ 
 71 | 
 72 | #include <stdio.h>                  /* for EOF */ 
 73 | #include <string.h>                 /* for strchr() */ 
 74 | 
 75 | /* static (global) variables that are specified as exported by getopt() */ 
 76 | extern char *optarg;    /* pointer to the start of the option argument  */ 
 77 | extern int   optind;       /* number of the next argv[] to be evaluated    */ 
 78 | extern int   opterr;       /* non-zero if a question mark should be returned 
 79 |                            when a non-valid option character is detected */ 
 80 |  
 81 | /* handle possible future character set concerns by putting this in a macro */ 
 82 | #define _next_char(string)  (char)(*(string+1)) 
 83 |  
 84 | int getopt(int argc, char *argv[], char *opstring) 
 85 | { 
 86 |     static char *pIndexPosition = NULL; /* place inside current argv string */ 
 87 |     char *pArgString = NULL;        /* where to start from next */ 
 88 |     char *pOptString;               /* the string in our program */ 
 89 |  
 90 |  
 91 |     if (pIndexPosition != NULL) { 
 92 |         /* we last left off inside an argv string */ 
 93 |         if (*(++pIndexPosition)) { 
 94 |             /* there is more to come in the most recent argv */ 
 95 |             pArgString = pIndexPosition; 
 96 |         } 
 97 |     } 
 98 |  
 99 |     if (pArgString == NULL) { 
100 |         /* we didn't leave off in the middle of an argv string */ 
101 |         if (optind >= argc) { 
102 |             /* more command-line arguments than the argument count */ 
103 |             pIndexPosition = NULL;  /* not in the middle of anything */ 
104 |             return EOF;             /* used up all command-line arguments */ 
105 |         } 
106 |  
107 |         /*--------------------------------------------------------------------- 
108 |          * If the next argv[] is not an option, there can be no more options. 
109 |          *-------------------------------------------------------------------*/ 
110 |         pArgString = argv[optind++]; /* set this to the next argument ptr */ 
111 |  
112 |         if (('/' != *pArgString) && /* doesn't start with a slash or a dash? */ 
113 |             ('-' != *pArgString)) { 
114 |             --optind;               /* point to current arg once we're done */ 
115 |             optarg = NULL;          /* no argument follows the option */ 
116 |             pIndexPosition = NULL;  /* not in the middle of anything */ 
117 |             return EOF;             /* used up all the command-line flags */ 
118 |         } 
119 |  
120 |         /* check for special end-of-flags markers */ 
121 |         if ((strcmp(pArgString, "-") == 0) || 
122 |             (strcmp(pArgString, "--") == 0)) { 
123 |             optarg = NULL;          /* no argument follows the option */ 
124 |             pIndexPosition = NULL;  /* not in the middle of anything */ 
125 |             return EOF;             /* encountered the special flag */ 
126 |         } 
127 |  
128 |         pArgString++;               /* look past the / or - */ 
129 |     } 
130 |  
131 |     if (':' == *pArgString) {       /* is it a colon? */ 
132 |         /*--------------------------------------------------------------------- 
133 |          * Rare case: if opterr is non-zero, return a question mark; 
134 |          * otherwise, just return the colon we're on. 
135 |          *-------------------------------------------------------------------*/ 
136 |         return (opterr ? (int)'?' : (int)':'); 
137 |     } 
138 |     else if ((pOptString = strchr(opstring, *pArgString)) == 0) { 
139 |         /*--------------------------------------------------------------------- 
140 |          * The letter on the command-line wasn't any good. 
141 |          *-------------------------------------------------------------------*/ 
142 |         optarg = NULL;              /* no argument follows the option */ 
143 |         pIndexPosition = NULL;      /* not in the middle of anything */ 
144 |         return (opterr ? (int)'?' : (int)*pArgString); 
145 |     } 
146 |     else { 
147 |         /*--------------------------------------------------------------------- 
148 |          * The letter on the command-line matches one we expect to see 
149 |          *-------------------------------------------------------------------*/ 
150 |         if (':' == _next_char(pOptString)) { /* is the next letter a colon? */ 
151 |             /* It is a colon.  Look for an argument string. */ 
152 |             if ('\0' != _next_char(pArgString)) {  /* argument in this argv? */ 
153 |                 optarg = &pArgString[1];   /* Yes, it is */ 
154 |             } 
155 |             else { 
156 |                 /*------------------------------------------------------------- 
157 |                  * The argument string must be in the next argv. 
158 |                  * But, what if there is none (bad input from the user)? 
159 |                  * In that case, return the letter, and optarg as NULL. 
160 |                  *-----------------------------------------------------------*/ 
161 |                 if (optind < argc) 
162 |                     optarg = argv[optind++]; 
163 |                 else { 
164 |                     optarg = NULL; 
165 |                     return (opterr ? (int)'?' : (int)*pArgString); 
166 |                 } 
167 |             } 
168 |             pIndexPosition = NULL;  /* not in the middle of anything */ 
169 |         } 
170 |         else { 
171 |             /* it's not a colon, so just return the letter */ 
172 |             optarg = NULL;          /* no argument follows the option */ 
173 |             pIndexPosition = pArgString;    /* point to the letter we're on */ 
174 |         } 
175 |         return (int)*pArgString;    /* return the letter that matched */ 
176 |     } 
177 | } 
178 | #endif //WIN32
179 | 


--------------------------------------------------------------------------------
/html/ParserSax.tcc:
--------------------------------------------------------------------------------
  1 | #include <cctype>
  2 | #include <cstring>
  3 | #if !defined(WIN32) || defined(__MINGW32__)
  4 | #include <strings.h>
  5 | #endif
  6 | 
  7 | //#define DEBUG
  8 | //#include "debug.h"
  9 | 
 10 | static
 11 | struct literal_tag {
 12 | 	int len;
 13 | 	const char* str;
 14 | 	int is_cdata;
 15 | }   
 16 | literal_mode_elem[] =
 17 | {   
 18 | 	{6, "script", 1},
 19 | 	{5, "style", 1},
 20 | 	{3, "xmp", 1},
 21 | 	{9, "plaintext", 1},
 22 | 	{8, "textarea", 0},
 23 | 	{0, 0, 0}
 24 | };
 25 | 
 26 | template <typename _Iterator>
 27 | void htmlcxx::HTML::ParserSax::parse(_Iterator begin, _Iterator end)
 28 | {
 29 | //	std::cerr << "Parsing iterator" << std::endl;
 30 | 	parse(begin, end, typename std::iterator_traits<_Iterator>::iterator_category());
 31 | }
 32 | 
 33 | template <typename _Iterator>
 34 | void htmlcxx::HTML::ParserSax::parse(_Iterator &begin, _Iterator &end, std::forward_iterator_tag)
 35 | {
 36 | 	typedef _Iterator iterator;
 37 | //	std::cerr << "Parsing forward_iterator" << std::endl;
 38 | 	mCdata = false;
 39 | 	mpLiteral = 0;
 40 | 	mCurrentOffset = 0;
 41 | 	this->beginParsing();
 42 | 
 43 | //	DEBUGP("Parsed text\n");
 44 | 
 45 | 	while (begin != end)
 46 | 	{
 47 | 		*begin; // This is for the multi_pass to release the buffer
 48 | 		iterator c(begin);
 49 | 
 50 | 		while (c != end)
 51 | 		{
 52 | 			// For some tags, the text inside it is considered literal and is
 53 | 			// only closed for its </TAG> counterpart
 54 | 			while (mpLiteral)
 55 | 			{
 56 | //				DEBUGP("Treating literal %s\n", mpLiteral);
 57 | 				while (c != end && *c != '<') ++c;
 58 | 
 59 | 				if (c == end) {
 60 | 					if (c != begin) this->parseContent(begin, c);
 61 | 					goto DONE;
 62 | 				}
 63 | 
 64 | 				iterator end_text(c);
 65 | 				++c;
 66 | 
 67 | 				if (*c == '/')
 68 | 				{
 69 | 					++c;
 70 | 					const char *l = mpLiteral;
 71 | 					while (*l && ::tolower(*c) == *l)
 72 | 					{
 73 | 						++c;
 74 | 						++l;
 75 | 					}
 76 | 
 77 | 					// FIXME: Mozilla stops when it sees a /plaintext. Check
 78 | 					// other browsers and decide what to do
 79 | 					if (!*l && strcmp(mpLiteral, "plaintext"))
 80 | 					{
 81 | 						// matched all and is not tag plaintext
 82 | 						while (isspace(*c)) ++c;
 83 | 
 84 | 						if (*c == '>')
 85 | 						{
 86 | 							++c;
 87 | 							if (begin != end_text)
 88 | 								this->parseContent(begin, end_text);
 89 | 							mpLiteral = 0;
 90 | 							c = end_text;
 91 | 							begin = c;
 92 | 							break;
 93 | 						}
 94 | 					}
 95 | 				}
 96 | 				else if (*c == '!')
 97 | 				{
 98 | 					// we may find a comment and we should support it
 99 | 					iterator e(c);
100 | 					++e;
101 | 
102 | 					if (e != end && *e == '-' && ++e != end && *e == '-')
103 | 					{
104 | //						DEBUGP("Parsing comment\n");
105 | 						++e;
106 | 						c = this->skipHtmlComment(e, end);
107 | 					}
108 | 
109 | 					//if (begin != end_text)
110 | 					//this->parseContent(begin, end_text, end);
111 | 
112 | 					//this->parseComment(end_text, c, end);
113 | 
114 | 					// continue from the end of the comment
115 | 					//begin = c;
116 | 				}
117 | 			}
118 | 
119 | 			if (*c == '<')
120 | 			{
121 | 				iterator d(c);
122 | 				++d;
123 | 				if (d != end)
124 | 				{
125 | 					if (isalpha(*d))
126 | 					{
127 | 						// beginning of tag
128 | 						if (begin != c)
129 | 							this->parseContent(begin, c);
130 | 
131 | //						DEBUGP("Parsing beginning of tag\n");
132 | 						d = this->skipHtmlTag(d, end);
133 | 						this->parseHtmlTag(c, d);
134 | 
135 | 						// continue from the end of the tag
136 | 						c = d;
137 | 						begin = c;
138 | 						break;
139 | 					}
140 | 
141 | 					if (*d == '/')
142 | 					{
143 | 						if (begin != c)
144 | 							this->parseContent(begin, c);
145 | 
146 | 						iterator e(d);
147 | 						++e;
148 | 						if (e != end && isalpha(*e))
149 | 						{
150 | 							// end of tag
151 | //							DEBUGP("Parsing end of tag\n");
152 | 							d = this->skipHtmlTag(d, end);
153 | 							this->parseHtmlTag(c, d);
154 | 						}
155 | 						else
156 | 						{
157 | 							// not a conforming end of tag, treat as comment
158 | 							// as Mozilla does
159 | //							DEBUGP("Non conforming end of tag\n");
160 | 							d = this->skipHtmlTag(d, end);
161 | 							this->parseComment(c, d);
162 | 						}
163 | 
164 | 						// continue from the end of the tag
165 | 						c = d;
166 | 						begin = c;
167 | 						break;
168 | 					}
169 | 
170 | 					if (*d == '!')
171 | 					{
172 | 						// comment
173 | 						if (begin != c)
174 | 							this->parseContent(begin, c);
175 | 
176 | 						iterator e(d);
177 | 						++e;
178 | 
179 | 						if (e != end && *e == '-' && ++e != end && *e == '-')
180 | 						{
181 | //							DEBUGP("Parsing comment\n");
182 | 							++e;
183 | 							d = this->skipHtmlComment(e, end);
184 | 						}
185 | 						else
186 | 						{
187 | 							d = this->skipHtmlTag(d, end);
188 | 						}
189 | 
190 | 						this->parseComment(c, d);
191 | 
192 | 						// continue from the end of the comment
193 | 						c = d;
194 | 						begin = c;
195 | 						break;
196 | 					}
197 | 
198 | 					if (*d == '?' || *d == '%')
199 | 					{
200 | 						// something like <?xml or <%VBSCRIPT
201 | 						if (begin != c)
202 | 							this->parseContent(begin, c);
203 | 
204 | 						d = this->skipHtmlTag(d, end);
205 | 
206 | 						this->parseComment(c, d);
207 | 
208 | 						// continue from the end of the comment
209 | 						c = d;
210 | 						begin = c;
211 | 						break;
212 | 					}
213 | 				}
214 | 			}
215 | 			c++;
216 | 		}
217 | 
218 | 		// There may be some text in the end of the document
219 | 		if (begin != c)
220 | 		{
221 | 			this->parseContent(begin, c);
222 | 			begin = c;
223 | 		}
224 | 	}
225 | 
226 | DONE:
227 | 	this->endParsing();
228 | 	return;
229 | }
230 | 
231 | template <typename _Iterator>
232 | void htmlcxx::HTML::ParserSax::parseComment(_Iterator b, _Iterator c)
233 | {
234 | //	DEBUGP("Creating comment node %s\n", std::string(b, c).c_str());
235 | 	htmlcxx::HTML::Node com_node;
236 | 	//FIXME: set_tagname shouldn't be needed, but first I must check
237 | 	//legacy code
238 | 	std::string comment(b, c);
239 | 	com_node.tagName(comment);
240 | 	com_node.text(comment);
241 | 	com_node.offset(mCurrentOffset);
242 | 	com_node.length((unsigned int)comment.length());
243 | 	com_node.isTag(false);
244 | 	com_node.isComment(true);
245 | 
246 | 	mCurrentOffset += com_node.length();
247 | 
248 | 	// Call callback method
249 | 	this->foundComment(com_node);
250 | }
251 | 
252 | template <typename _Iterator>
253 | void htmlcxx::HTML::ParserSax::parseContent(_Iterator b, _Iterator c)
254 | {
255 | //	DEBUGP("Creating text node %s\n", (std::string(b, c)).c_str());
256 | 	htmlcxx::HTML::Node txt_node;
257 | 	//FIXME: set_tagname shouldn't be needed, but first I must check
258 | 	//legacy code
259 | 	std::string text(b, c);
260 | 	txt_node.tagName(text);
261 | 	txt_node.text(text);
262 | 	txt_node.offset(mCurrentOffset);
263 | 	txt_node.length((unsigned int)text.length());
264 | 	txt_node.isTag(false);
265 | 	txt_node.isComment(false);
266 | 
267 | 	mCurrentOffset += txt_node.length();
268 | 
269 | 	// Call callback method
270 | 	this->foundText(txt_node);
271 | }
272 | 
273 | template <typename _Iterator>
274 | void htmlcxx::HTML::ParserSax::parseHtmlTag(_Iterator b, _Iterator c)
275 | {
276 | 	_Iterator name_begin(b);
277 | 	++name_begin;
278 | 	bool is_end_tag = (*name_begin == '/');
279 | 	if (is_end_tag) ++name_begin;
280 | 
281 | 	_Iterator name_end(name_begin);
282 | 	while (name_end != c && isalnum(*name_end)) 
283 | 	{
284 | 		++name_end;
285 | 	}
286 | 
287 | 	std::string name(name_begin, name_end);
288 | //	DEBUGP("Found %s tag %s\n", is_end_tag ? "closing" : "opening", name.c_str());
289 | 
290 | 	if (!is_end_tag) 
291 | 	{
292 | 		std::string::size_type tag_len = name.length();
293 | 		for (int i = 0; literal_mode_elem[i].len; ++i)
294 | 		{
295 | 			if (tag_len == literal_mode_elem[i].len)
296 | 			{
297 |                                 #if defined(WIN32) && !defined(__MINGW32__)
298 | 				if (!_stricmp(name.c_str(), literal_mode_elem[i].str))
299 | 				#else
300 | 				if (!strcasecmp(name.c_str(), literal_mode_elem[i].str))
301 | 				#endif
302 | 				{
303 | 					mpLiteral = literal_mode_elem[i].str;
304 | 					break;
305 | 				}
306 | 			}
307 | 		}
308 | 	} 
309 | 
310 | 	htmlcxx::HTML::Node tag_node;
311 | 	//by now, length is just the size of the tag
312 | 	std::string text(b, c);
313 | 	tag_node.length(static_cast<unsigned int>(text.length()));
314 | 	tag_node.tagName(name);
315 | 	tag_node.text(text);
316 | 	tag_node.offset(mCurrentOffset);
317 | 	tag_node.isTag(true);
318 | 	tag_node.isComment(false);
319 | 
320 | 	mCurrentOffset += tag_node.length();
321 | 
322 | 	this->foundTag(tag_node, is_end_tag);
323 | }
324 | 
325 | template <typename _Iterator>
326 | _Iterator
327 | htmlcxx::HTML::ParserSax::skipHtmlComment(_Iterator c, _Iterator end)
328 | {
329 | 	while ( c != end ) {
330 | 		if (*c++ == '-' && c != end && *c == '-')
331 | 		{
332 | 			_Iterator d(c);
333 | 			while (++c != end && isspace(*c));
334 | 			if (c == end || *c++ == '>') break;
335 | 			c = d;
336 | 		}
337 | 	}
338 | 
339 | 	return c;
340 | }
341 | 
namespace htmlcxx { namespace HTML {

/**
 * Returns the first position in [c, end) whose character equals quote,
 * or end when there is none. Generic version: a plain linear scan that
 * works with any iterator type.
 */
template <typename _Iterator>
static inline
_Iterator find_next_quote(_Iterator c, _Iterator end, char quote)
{
	for (; c != end; ++c)
	{
		if (*c == quote) break;
	}
	return c;
}

/**
 * Specialization for raw character pointers: same contract as the generic
 * version, but delegates the search to memchr.
 */
template <>
inline
const char *find_next_quote(const char *c, const char *end, char quote)
{
	const void *hit = memchr(c, quote, end - c);
	return hit ? static_cast<const char *>(hit) : end;
}

}}
365 | 
366 | template <typename _Iterator>
367 | _Iterator htmlcxx::HTML::ParserSax::skipHtmlTag(_Iterator c, _Iterator end)
368 | {
369 | 	while (c != end && *c != '>')
370 | 	{
371 | 		if (*c != '=') 
372 | 		{
373 | 			++c;
374 | 		}
375 | 		else
376 | 		{ // found an attribute
377 | 			++c;
378 | 			while (c != end && isspace(*c)) ++c;
379 | 
380 | 			if (c == end) break;
381 | 
382 | 			if (*c == '\"' || *c == '\'') 
383 | 			{
384 | 				_Iterator save(c);
385 | 				char quote = *c++;
386 | 				c = find_next_quote(c, end, quote);
387 | //				while (c != end && *c != quote) ++c;
388 | //				c = static_cast<char*>(memchr(c, quote, end - c));
389 | 				if (c != end) 
390 | 				{
391 | 					++c;
392 | 				} 
393 | 				else 
394 | 				{
395 | 					c = save;
396 | 					++c;
397 | 				}
398 | //				DEBUGP("Quotes: %s\n", std::string(save, c).c_str());
399 | 			}
400 | 		}
401 | 	}
402 | 
403 | 	if (c != end) ++c;
404 | 	
405 | 	return c;
406 | }
407 | 


--------------------------------------------------------------------------------
/ASF-2.0:
--------------------------------------------------------------------------------
  1 | 
  2 |                                  Apache License
  3 |                            Version 2.0, January 2004
  4 |                         http://www.apache.org/licenses/
  5 | 
  6 |    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
  7 | 
  8 |    1. Definitions.
  9 | 
 10 |       "License" shall mean the terms and conditions for use, reproduction,
 11 |       and distribution as defined by Sections 1 through 9 of this document.
 12 | 
 13 |       "Licensor" shall mean the copyright owner or entity authorized by
 14 |       the copyright owner that is granting the License.
 15 | 
 16 |       "Legal Entity" shall mean the union of the acting entity and all
 17 |       other entities that control, are controlled by, or are under common
 18 |       control with that entity. For the purposes of this definition,
 19 |       "control" means (i) the power, direct or indirect, to cause the
 20 |       direction or management of such entity, whether by contract or
 21 |       otherwise, or (ii) ownership of fifty percent (50%) or more of the
 22 |       outstanding shares, or (iii) beneficial ownership of such entity.
 23 | 
 24 |       "You" (or "Your") shall mean an individual or Legal Entity
 25 |       exercising permissions granted by this License.
 26 | 
 27 |       "Source" form shall mean the preferred form for making modifications,
 28 |       including but not limited to software source code, documentation
 29 |       source, and configuration files.
 30 | 
 31 |       "Object" form shall mean any form resulting from mechanical
 32 |       transformation or translation of a Source form, including but
 33 |       not limited to compiled object code, generated documentation,
 34 |       and conversions to other media types.
 35 | 
 36 |       "Work" shall mean the work of authorship, whether in Source or
 37 |       Object form, made available under the License, as indicated by a
 38 |       copyright notice that is included in or attached to the work
 39 |       (an example is provided in the Appendix below).
 40 | 
 41 |       "Derivative Works" shall mean any work, whether in Source or Object
 42 |       form, that is based on (or derived from) the Work and for which the
 43 |       editorial revisions, annotations, elaborations, or other modifications
 44 |       represent, as a whole, an original work of authorship. For the purposes
 45 |       of this License, Derivative Works shall not include works that remain
 46 |       separable from, or merely link (or bind by name) to the interfaces of,
 47 |       the Work and Derivative Works thereof.
 48 | 
 49 |       "Contribution" shall mean any work of authorship, including
 50 |       the original version of the Work and any modifications or additions
 51 |       to that Work or Derivative Works thereof, that is intentionally
 52 |       submitted to Licensor for inclusion in the Work by the copyright owner
 53 |       or by an individual or Legal Entity authorized to submit on behalf of
 54 |       the copyright owner. For the purposes of this definition, "submitted"
 55 |       means any form of electronic, verbal, or written communication sent
 56 |       to the Licensor or its representatives, including but not limited to
 57 |       communication on electronic mailing lists, source code control systems,
 58 |       and issue tracking systems that are managed by, or on behalf of, the
 59 |       Licensor for the purpose of discussing and improving the Work, but
 60 |       excluding communication that is conspicuously marked or otherwise
 61 |       designated in writing by the copyright owner as "Not a Contribution."
 62 | 
 63 |       "Contributor" shall mean Licensor and any individual or Legal Entity
 64 |       on behalf of whom a Contribution has been received by Licensor and
 65 |       subsequently incorporated within the Work.
 66 | 
 67 |    2. Grant of Copyright License. Subject to the terms and conditions of
 68 |       this License, each Contributor hereby grants to You a perpetual,
 69 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 70 |       copyright license to reproduce, prepare Derivative Works of,
 71 |       publicly display, publicly perform, sublicense, and distribute the
 72 |       Work and such Derivative Works in Source or Object form.
 73 | 
 74 |    3. Grant of Patent License. Subject to the terms and conditions of
 75 |       this License, each Contributor hereby grants to You a perpetual,
 76 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 77 |       (except as stated in this section) patent license to make, have made,
 78 |       use, offer to sell, sell, import, and otherwise transfer the Work,
 79 |       where such license applies only to those patent claims licensable
 80 |       by such Contributor that are necessarily infringed by their
 81 |       Contribution(s) alone or by combination of their Contribution(s)
 82 |       with the Work to which such Contribution(s) was submitted. If You
 83 |       institute patent litigation against any entity (including a
 84 |       cross-claim or counterclaim in a lawsuit) alleging that the Work
 85 |       or a Contribution incorporated within the Work constitutes direct
 86 |       or contributory patent infringement, then any patent licenses
 87 |       granted to You under this License for that Work shall terminate
 88 |       as of the date such litigation is filed.
 89 | 
 90 |    4. Redistribution. You may reproduce and distribute copies of the
 91 |       Work or Derivative Works thereof in any medium, with or without
 92 |       modifications, and in Source or Object form, provided that You
 93 |       meet the following conditions:
 94 | 
 95 |       (a) You must give any other recipients of the Work or
 96 |           Derivative Works a copy of this License; and
 97 | 
 98 |       (b) You must cause any modified files to carry prominent notices
 99 |           stating that You changed the files; and
100 | 
101 |       (c) You must retain, in the Source form of any Derivative Works
102 |           that You distribute, all copyright, patent, trademark, and
103 |           attribution notices from the Source form of the Work,
104 |           excluding those notices that do not pertain to any part of
105 |           the Derivative Works; and
106 | 
107 |       (d) If the Work includes a "NOTICE" text file as part of its
108 |           distribution, then any Derivative Works that You distribute must
109 |           include a readable copy of the attribution notices contained
110 |           within such NOTICE file, excluding those notices that do not
111 |           pertain to any part of the Derivative Works, in at least one
112 |           of the following places: within a NOTICE text file distributed
113 |           as part of the Derivative Works; within the Source form or
114 |           documentation, if provided along with the Derivative Works; or,
115 |           within a display generated by the Derivative Works, if and
116 |           wherever such third-party notices normally appear. The contents
117 |           of the NOTICE file are for informational purposes only and
118 |           do not modify the License. You may add Your own attribution
119 |           notices within Derivative Works that You distribute, alongside
120 |           or as an addendum to the NOTICE text from the Work, provided
121 |           that such additional attribution notices cannot be construed
122 |           as modifying the License.
123 | 
124 |       You may add Your own copyright statement to Your modifications and
125 |       may provide additional or different license terms and conditions
126 |       for use, reproduction, or distribution of Your modifications, or
127 |       for any such Derivative Works as a whole, provided Your use,
128 |       reproduction, and distribution of the Work otherwise complies with
129 |       the conditions stated in this License.
130 | 
131 |    5. Submission of Contributions. Unless You explicitly state otherwise,
132 |       any Contribution intentionally submitted for inclusion in the Work
133 |       by You to the Licensor shall be under the terms and conditions of
134 |       this License, without any additional terms or conditions.
135 |       Notwithstanding the above, nothing herein shall supersede or modify
136 |       the terms of any separate license agreement you may have executed
137 |       with Licensor regarding such Contributions.
138 | 
139 |    6. Trademarks. This License does not grant permission to use the trade
140 |       names, trademarks, service marks, or product names of the Licensor,
141 |       except as required for reasonable and customary use in describing the
142 |       origin of the Work and reproducing the content of the NOTICE file.
143 | 
144 |    7. Disclaimer of Warranty. Unless required by applicable law or
145 |       agreed to in writing, Licensor provides the Work (and each
146 |       Contributor provides its Contributions) on an "AS IS" BASIS,
147 |       WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
148 |       implied, including, without limitation, any warranties or conditions
149 |       of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
150 |       PARTICULAR PURPOSE. You are solely responsible for determining the
151 |       appropriateness of using or redistributing the Work and assume any
152 |       risks associated with Your exercise of permissions under this License.
153 | 
154 |    8. Limitation of Liability. In no event and under no legal theory,
155 |       whether in tort (including negligence), contract, or otherwise,
156 |       unless required by applicable law (such as deliberate and grossly
157 |       negligent acts) or agreed to in writing, shall any Contributor be
158 |       liable to You for damages, including any direct, indirect, special,
159 |       incidental, or consequential damages of any character arising as a
160 |       result of this License or out of the use or inability to use the
161 |       Work (including but not limited to damages for loss of goodwill,
162 |       work stoppage, computer failure or malfunction, or any and all
163 |       other commercial damages or losses), even if such Contributor
164 |       has been advised of the possibility of such damages.
165 | 
166 |    9. Accepting Warranty or Additional Liability. While redistributing
167 |       the Work or Derivative Works thereof, You may choose to offer,
168 |       and charge a fee for, acceptance of support, warranty, indemnity,
169 |       or other liability obligations and/or rights consistent with this
170 |       License. However, in accepting such obligations, You may act only
171 |       on Your own behalf and on Your sole responsibility, not on behalf
172 |       of any other Contributor, and only if You agree to indemnify,
173 |       defend, and hold each Contributor harmless for any liability
174 |       incurred by, or claims asserted against, such Contributor by reason
175 |       of your accepting any such warranty or additional liability.
176 | 
177 |    END OF TERMS AND CONDITIONS
178 | 
179 |    APPENDIX: How to apply the Apache License to your work.
180 | 
181 |       To apply the Apache License to your work, attach the following
182 |       boilerplate notice, with the fields enclosed by brackets "[]"
183 |       replaced with your own identifying information. (Don't include
184 |       the brackets!)  The text should be enclosed in the appropriate
185 |       comment syntax for the file format. We also recommend that a
186 |       file or class name and description of purpose be included on the
187 |       same "printed page" as the copyright notice for easier
188 |       identification within third-party archives.
189 | 
190 |    Copyright [yyyy] [name of copyright owner]
191 | 
192 |    Licensed under the Apache License, Version 2.0 (the "License");
193 |    you may not use this file except in compliance with the License.
194 |    You may obtain a copy of the License at
195 | 
196 |        http://www.apache.org/licenses/LICENSE-2.0
197 | 
198 |    Unless required by applicable law or agreed to in writing, software
199 |    distributed under the License is distributed on an "AS IS" BASIS,
200 |    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
201 |    See the License for the specific language governing permissions and
202 |    limitations under the License.
203 | 


--------------------------------------------------------------------------------
/html/utils.cc:
--------------------------------------------------------------------------------
  1 | #include <algorithm>
  2 | #include <cctype>
  3 | #include <cstring>
  4 | #include <sstream>
  5 | #include "Uri.h"
  6 | 
  7 | #include "utils.h"
  8 | 
  9 | using namespace std;
 10 | namespace htmlcxx {
 11 | 	namespace HTML {
 12 | 
 13 | 		bool detect_utf8(const char *begin, int size)
 14 | 		{
 15 | 			const char *ptr;
 16 | 			const char *end = begin+size;
 17 | 			const char *signature = "﻿";
 18 | 			char previous_byte = 0;
 19 | 			unsigned count_bad_utf = 0;
 20 | 			unsigned count_good_utf = 0;
 21 | 
 22 | 			if (!strncmp(begin, signature, 3)) return true;
 23 | 			
 24 | 			for (ptr = begin; ptr != end; ++ptr)
 25 | 			{
 26 | 				if ((*ptr & 0xC0) == 0x80)
 27 | 				{
 28 | 					if ((previous_byte & 0xC0) == 0xC0)
 29 | 					{
 30 | 						count_good_utf ++;
 31 | 					}
 32 | 					else if ((previous_byte & 0x80) == 0x00)
 33 | 					{
 34 | 						count_bad_utf ++;
 35 | 					}
 36 | 				}
 37 | 				else if ((previous_byte & 0xC0) == 0xC0)
 38 | 				{
 39 | 					count_bad_utf ++;
 40 | 				}
 41 | 
 42 | 				previous_byte = *ptr;
 43 | 			}
 44 | 
 45 | 			return count_good_utf > count_bad_utf;
 46 | 		}
 47 | 
 48 | 		string single_blank(const string &str) {
 49 | 
 50 | 			unsigned int count = 0;
 51 | 			bool first_space = true;
 52 | 			const char *ptr = str.c_str();
 53 | 
 54 | 			string ret(str.length(), ' ');
 55 | 			
 56 | 			// Skip space at beginning
 57 | 			while (isspace(*ptr)) ++ptr;
 58 | 			
 59 | 			while (*ptr)
 60 | 			{
 61 | 				if (isspace(*ptr))
 62 | 				{
 63 | 					if (first_space)
 64 | 					{
 65 | 						first_space = false;
 66 | 						ret[count++] = ' ';
 67 | 					}
 68 | 				}
 69 | 				else
 70 | 				{
 71 | 					first_space = true;
 72 | 					ret[count++] = *ptr;
 73 | 				}
 74 | 				
 75 | 				++ptr;
 76 | 			}
 77 | 
 78 | 			// Trim space at the end
 79 | 			string::size_type a;
 80 | 			a = ret.find_last_not_of(' ', count);
 81 | 			if (a != string::npos)
 82 | 				ret.erase(a+1);
 83 | 			else
 84 | 			{
 85 | 				a = 0;
 86 | 				ret.erase(a);
 87 | 			}
 88 | 
 89 | 			return ret;
 90 | 		}
 91 | 
 92 | 		string strip_comments(const string &str) {
 93 | 
 94 | 			string ret;
 95 | 			ret.reserve(str.size());
 96 | 
 97 | 			const char *ptr = str.c_str();
 98 | 			const char *end = ptr + str.length();
 99 | 
100 | 			bool inside_comment = false;
101 | 			while(1) {
102 | 				if(!inside_comment) {
103 | 					if(ptr  + 4 < end) {
104 | 						if(*ptr == '<' && *(ptr+1) == '!' && *(ptr+2) =='-' && *(ptr + 3) == '-' && isspace(*(ptr + 4))) {
105 | 							inside_comment = true;
106 | 						}
107 | 					}
108 | 				} else {
109 | 					if(ptr + 2 < end) {
110 | 						if(*ptr == '-' && *(ptr+1) == '-' && *(ptr+2) == '>' ) {
111 | 							inside_comment = false;
112 | 							ptr += 3;
113 | 						}
114 | 					}
115 | 				}
116 | 				if(ptr == end) break;
117 | 				if(!inside_comment) ret += *ptr;
118 | 				ptr++;
119 | 			}
120 | 
121 | 			ret.resize(ret.size());
122 | 
123 | 			return ret;
124 | 
125 | 		}
126 | 
127 | 		static struct {
128 | 			const char *str;
129 | 			unsigned char chr;
130 | 		} entities[] = {
131 | 			/* 00 */
132 | 			{ "quot", 34 },
133 | 			{ "amp", 38 },
134 | 			{ "lt", 60 },
135 | 			{ "gt", 62 },
136 | 			{ "nbsp", ' ' },
137 | 			{ "iexcl", 161 },
138 | 			{ "cent", 162 },
139 | 			{ "pound", 163 },
140 | 			{ "curren", 164 },
141 | 			{ "yen", 165 },
142 | 			/* 10 */
143 | 			{ "brvbar", 166 },
144 | 			{ "sect", 167 },
145 | 			{ "uml", 168 },
146 | 			{ "copy", 169 },
147 | 			{ "ordf", 170 },
148 | 			{ "laquo", 171 },
149 | 			{ "not", 172 },
150 | 			{ "shy", 173 },
151 | 			{ "reg", 174 },
152 | 			{ "macr", 175 },
153 | 			/* 20 */
154 | 			{ "deg", 176 },
155 | 			{ "plusmn", 177 },
156 | 			{ "sup2", 178 },
157 | 			{ "sup3", 179 },
158 | 			{ "acute", 180 },
159 | 			{ "micro", 181 },
160 | 			{ "para", 182 },
161 | 			{ "middot", 183 },
162 | 			{ "cedil", 184 },
163 | 			{ "sup1", 185 },
164 | 			/* 30 */
165 | 			{ "ordm", 186 },
166 | 			{ "raquo", 187 },
167 | 			{ "frac14", 188 },
168 | 			{ "frac12", 189 },
169 | 			{ "frac34", 190 },
170 | 			{ "iquest", 191 },
171 | 			{ "Agrave", 192 },
172 | 			{ "Aacute", 193 },
173 | 			{ "Acirc", 194 },
174 | 			{ "Atilde", 195 },
175 | 			/* 40 */
176 | 			{ "Auml", 196 },
177 | 			{ "ring", 197 },
178 | 			{ "AElig", 198 },
179 | 			{ "Ccedil", 199 },
180 | 			{ "Egrave", 200 },
181 | 			{ "Eacute", 201 },
182 | 			{ "Ecirc", 202 },
183 | 			{ "Euml", 203 },
184 | 			{ "Igrave", 204 },
185 | 			{ "Iacute", 205 },
186 | 			/* 50 */
187 | 			{ "Icirc", 206 },
188 | 			{ "Iuml", 207 },
189 | 			{ "ETH", 208 },
190 | 			{ "Ntilde", 209 },
191 | 			{ "Ograve", 210 },
192 | 			{ "Oacute", 211 },
193 | 			{ "Ocirc", 212 },
194 | 			{ "Otilde", 213 },
195 | 			{ "Ouml", 214 },
196 | 			{ "times", 215 },
197 | 			/* 60 */
198 | 			{ "Oslash", 216 },
199 | 			{ "Ugrave", 217 },
200 | 			{ "Uacute", 218 },
201 | 			{ "Ucirc", 219 },
202 | 			{ "Uuml", 220 },
203 | 			{ "Yacute", 221 },
204 | 			{ "THORN", 222 },
205 | 			{ "szlig", 223 },
206 | 			{ "agrave", 224 },
207 | 			{ "aacute", 225 },
208 | 			/* 70 */
209 | 			{ "acirc", 226 },
210 | 			{ "atilde", 227 },
211 | 			{ "auml", 228 },
212 | 			{ "aring", 229 },
213 | 			{ "aelig", 230 },
214 | 			{ "ccedil", 231 },
215 | 			{ "egrave", 232 },
216 | 			{ "eacute", 233 },
217 | 			{ "ecirc", 234 },
218 | 			{ "euml", 235 },
219 | 			/* 80 */
220 | 			{ "igrave", 236 },
221 | 			{ "iacute", 237 },
222 | 			{ "icirc", 238 },
223 | 			{ "iuml", 239 },
224 | 			{ "ieth", 240 },
225 | 			{ "ntilde", 241 },
226 | 			{ "ograve", 242 },
227 | 			{ "oacute", 243 },
228 | 			{ "ocirc", 244 },
229 | 			{ "otilde", 245 },
230 | 			/* 90 */
231 | 			{ "ouml", 246 },
232 | 			{ "divide", 247 },
233 | 			{ "oslash", 248 },
234 | 			{ "ugrave", 249 },
235 | 			{ "uacute", 250 },
236 | 			{ "ucirc", 251 },
237 | 			{ "uuml", 252 },
238 | 			{ "yacute", 253 },
239 | 			{ "thorn", 254 },
240 | 			{ "yuml", 255 },
241 | 			/* 100 */
242 | 			{ NULL, 0 },
243 | 		};
244 | 
245 | 		string decode_entities(const string &str)
246 | 		{
247 | 			unsigned int count = 0;
248 | 			const char *ptr = str.c_str();
249 | 			const char *end;
250 | 
251 | 			string ret(str);
252 | 			string entity;
253 | 
254 | 			ptr = strchr(ptr, '&');
255 | 			if (ptr == NULL) return ret;
256 | 
257 | 			count += static_cast<unsigned int>(ptr - str.c_str());
258 | 
259 | //			printf("url_init: %s\n", str.c_str());
260 | 			while (*ptr)
261 | 			{
262 | 				if (*ptr == '&' && ((end = strchr(ptr, ';')) != NULL))
263 | 				{
264 | 					entity.assign(ptr + 1, end);
265 | //					printf("Entity: %d %s\n", entity.length(), entity.c_str());
266 | 					if (!entity.empty() && entity[0] == '#')
267 | 					{
268 | 						entity.erase(0, 1);
269 | 						int chr = atoi(entity.c_str());
270 | 						if (chr > 0 && chr <= UCHAR_MAX)
271 | 						{
272 | 							ret[count++] = chr;
273 | 						}
274 | 						ptr = end + 1;
275 | 					}
276 | 					else
277 | 					{
278 | 						bool found = false;
279 | 						for (int i = 0; entities[i].str != NULL; i++)
280 | 						{
281 | 							if (entity == entities[i].str)
282 | 							{
283 | 								found = true;
284 | 								ret[count++] = entities[i].chr;
285 | 								ptr = end + 1;
286 | 								break;
287 | 							}
288 | 						}
289 | 
290 | 						if (!found)
291 | 						{
292 | 							ret[count++] = *ptr++;
293 | 						}
294 | 					}
295 | 				}
296 | 				else
297 | 				{
298 | 					ret[count++] = *ptr++;
299 | 				}
300 | 			}
301 | 
302 | 			ret.erase(count);
303 | 
304 | //			printf("url_end: %s\n", ret.c_str());
305 | 			return ret;
306 | 		}
307 | 
308 | 		string get_attribute(const string& tag, const string& attr) {
309 | 			string val;
310 | 			string low_tag(tag);
311 | 			string low_attr(attr);
312 | 
313 | 			transform(low_attr.begin(), low_attr.end(), low_attr.begin(), ::tolower);
314 | 			transform(low_tag.begin(), low_tag.end(), low_tag.begin(), ::tolower);
315 | 
316 | 			string::size_type a;
317 | 			a = low_tag.find(low_attr);
318 | 			if (a == string::npos)
319 | 				return val;
320 | 
321 | 			a += attr.length();
322 | 			while (a < tag.length() && isspace(tag[a])) a++;
323 | 			if (a == tag.length() || tag[a] != '=')
324 | 				return val;
325 | 			a++;
326 | 			while (a < tag.length() && isspace(tag[a])) a++;
327 | 			if (a == tag.length())
328 | 				return val;
329 | 
330 | 			if (tag[a] == '"') {
331 | 				string::size_type b = tag.find('"', a+1);
332 | 				if (b == string::npos) return val;
333 | 				val = tag.substr(a+1, b-a-1);
334 | 			} else if (tag[a] == '\'') {
335 | 				string::size_type b = tag.find('\'', a+1);
336 | 				if (b == string::npos) return val;
337 | 				val = tag.substr(a+1, b-a-1);
338 | 			} else {
339 | 				while (a < tag.length() && !isspace(tag[a]) && tag[a] != '>') {
340 | 					val += tag[a++];
341 | 				}
342 | 			}
343 | 
344 | 			return val;
345 | 		}
346 | 
		// Normalizes the path part of a URL string.
		//
		// Only the portion before the first '?' or '#' is rewritten; the
		// rest is copied through unchanged. Within that portion:
		//   - repeated slashes ("//") collapse into one;
		//   - "/./" segments are dropped;
		//   - "/../" removes the preceding path segment already written,
		//     when one exists.
		// If neither "//" nor "/." occurs before the query/fragment, the
		// input is returned unchanged.
		//
		// Note: dots pending when the path portion ends (it ends in "/."
		// or "/..") are not written to the output, since nothing emits them
		// once the loop finishes.
		string normalize_slashs(const string &url)
		{
			// Scanner states: what the most recently consumed path
			// characters were.
			const int NONE = 0;            // anything else
			const int LASTSLASH = 1;       // "/"
			const int LASTDOTSLASH = 2;    // "/."
			const int LASTDOTDOTSLASH = 3; // "/.."
			int state = NONE;
			const char *question_dash;
			const char *question;
			const char *dash;
			// Write index into ret; ret is rewritten in place and truncated
			// to count at the end.
			unsigned int count = 0;
			const char *ptr = url.c_str();
			string ret(url);

			// question_dash: where the query ('?') or fragment ('#') starts,
			// whichever comes first, or the end of the string if neither
			// is present.
			question = strchr(ptr, '?');
			dash = strchr(ptr, '#');
			if (question &&(!dash || question < dash)) question_dash = question;
			else question_dash = dash;
			if (question_dash == 0) question_dash = url.c_str() + url.length();

			// problem: the first place that may need rewriting, i.e. the
			// earliest occurrence of "//" or "/.".
			const char *problem;
			const char *problem1 = strstr(ptr, "//");
			const char *problem2 = strstr(ptr, "/.");

			if (problem1 && (!problem2 || problem1 < problem2)) problem = problem1;
			else problem = problem2;

			if (problem && problem < question_dash)
			{
				// Everything before problem is already correct in ret, so
				// both the read and the write position start there.
				ptr = problem;
				count = static_cast<unsigned int>(ptr - url.c_str());
				while (*ptr && ptr < question_dash)
				{
					switch (state)
					{
						case LASTSLASH:
							if (*ptr == '/')
							{
								// "//": drop the duplicate slash.
								++ptr;
								state = LASTSLASH;
							}
							else if (*ptr == '.')
							{
								// Hold the dot back until it is clear
								// whether it starts "./" or "../".
								++ptr;
								state = LASTDOTSLASH;
							}
							else
							{
								ret[count++] = *ptr;
								++ptr;
								state = NONE;
							}
							break;
						case LASTDOTSLASH:
							if (*ptr == '/')
							{
								// "/./": drop the segment entirely.
								++ptr;
								state = LASTSLASH;
							}
							else if (*ptr == '.')
							{
								++ptr;
								state = LASTDOTDOTSLASH;
							}
							else
							{
								// The dot starts an ordinary name (e.g.
								// "/.foo"): emit the held-back dot first.
								ret[count++] = '.';
								ret[count++] = *ptr;
								++ptr;
								state = NONE;
							}
							break;
						case LASTDOTDOTSLASH:
							if (*ptr == '/')
							{
								// "/../": back up to just after the slash
								// that precedes the previous segment. The
								// search starts at count - 2 because
								// ret[count - 1] is the slash written
								// before the dots. If no earlier slash is
								// found, count is left as it is.
								const char *last_slash = ret.c_str() + count - 2;
								while (last_slash >= ret.c_str() && *last_slash != '/')
									--last_slash;
								if (last_slash >= ret.c_str())
									count = static_cast<unsigned int>(last_slash - ret.c_str() + 1);
								++ptr;
								state = LASTSLASH;
							}
							else
							{
								// The dots start an ordinary name (e.g.
								// "/..foo"): emit both held-back dots.
								ret[count++] = '.';
								ret[count++] = '.';
								ret[count++] = *ptr;
								++ptr;
								state = NONE;
							}
							break;
						default:
							if (*ptr == '/')
							{
								ret[count++] = *ptr;
								++ptr;
								state = LASTSLASH;
							}
							else
							{
								ret[count++] = *ptr;
								++ptr;
								state = NONE;
							}
					}
				}

				// Copy the query/fragment (if any) verbatim. question_dash
				// is never null here: it was pointed at the end of the
				// string when neither '?' nor '#' was found.
				if (question_dash)
				{
					while (*ptr)
					{
						ret[count++] = *ptr;
						++ptr;
					}
				}

				ret.erase(count);
			}

			return ret;
		}
469 | 
470 | 		string convert_link(const string& relative, const Uri& root)
471 | 		{
472 | 			string url(relative);
473 | 			
474 | 			url = HTML::decode_entities(url);
475 | 
476 | 			string::size_type a;
477 | 			a = 0;
478 | 			while ((a = url.find_first_of(" \r\n", a)) != string::npos)
479 | 			{
480 | 				switch (url[a])
481 | 				{
482 | 					case ' ':
483 | 						url.replace(a, 1, "%20");
484 | 						break;
485 | 					case '\r':
486 | 						url.erase(a, 1);
487 | 						break;
488 | 					case '\n':
489 | 						url.erase(a, 1);
490 | 						break;
491 | 				}
492 | 			}
493 | 
494 | 			Uri uri;
495 | 			try
496 | 			{
497 | 				Uri rel(url);
498 | 				uri = rel.absolute(root);
499 | 				uri.path(normalize_slashs(uri.path()));
500 | 			}
501 | 			catch (Uri::Exception)
502 | 			{
503 | 				return string();
504 | 			}
505 | 
506 | 			return uri.unparse(Uri::REMOVE_FRAGMENT);
507 | 		}
508 | 
509 | 		string __serialize_gml(const tree<HTML::Node> &tr, tree<HTML::Node>::iterator it, tree<HTML::Node>::iterator end, unsigned int parent_id, unsigned int& label) {
510 | 
511 | 			using namespace std;
512 | 			ostringstream ret;
513 | 			tree<HTML::Node>::sibling_iterator sib = tr.begin(it);
514 | 			while(sib != tr.end(it)) {
515 | 				ret << "node [ id " << ++label << "\n label \"" << label << "\"\n]\n";
516 | 				ret << "edge [ \n source " << parent_id << "\n target " << label << "\n]" << endl;
517 | 				ret << __serialize_gml(tr, sib, end, label, label);
518 | 				++sib;
519 | 			}	
520 | 			ret << ends;
521 | 			string str = ret.str();
522 | 			return str;
523 | 		}
524 | 
525 | 
526 | 		string serialize_gml(const tree<HTML::Node> &tr) {
527 | 
528 | 			using namespace std;
529 | 
530 | 			tree<HTML::Node>::pre_order_iterator it = tr.begin();
531 | 			tree<HTML::Node>::pre_order_iterator end = tr.end();
532 | 
533 | 			string ret;
534 | 			ret += "graph [";
535 | 			ret += "directed 1\n";
536 | 			ret += "node [ id 0\n label \"0\"\n ]\n";
537 | 			unsigned int label = 0;
538 | 			ret += __serialize_gml(tr, it, end, 0, label);
539 | 			ret += "]";
540 | 			return ret;
541 | 
542 | 		}
543 | 
544 | 	}//namespace html
545 | }//namespace htmlcxx
546 | 


--------------------------------------------------------------------------------
/css/css_syntax.y:
--------------------------------------------------------------------------------
  1 | %{
  2 | #include <stdio.h>
  3 | #include <string.h>
  4 | #include "css_lex.h"
  5 | #include "parser.h"
  6 | 
  7 | #define YYERROR_VERBOSE 1
  8 | #define YYDEBUG 1
  9 | 
 10 | %}
 11 | 
 12 | %parse-param {void *yyparam}
 13 | %pure-parser
 14 | 
/* Semantic value of tokens and non-terminals; the %type lines below
 ** select which member each symbol uses. */
%union {
	char *lexeme;	/* text of value-carrying tokens and of string-valued rules */
	char letter;	/* operator character (unary_operator, operator) */
	struct property_t *property;	/* declaration / declarations */
	struct selector_t *selector;	/* selector, simple_selector(s) */
	struct selector_list_t *selector_list;	/* selectors, ruleset, rulesets */
	int pseudo_class;	/* pseudo_class, solitary_pseudo_class */
	int pseudo_element;	/* not referenced by any %type line in this grammar */
}
 24 | 
 25 | %{
 26 | 
 27 | int yylex(YYSTYPE *lvalp);
 28 | 
/* Error callback invoked by the generated parser.
 ** yyparam is the %parse-param pointer and is not used here; s is the
 ** message.  The message goes to stderr whenever YYDEBUG is non-zero
 ** (it is defined to 1 in the prologue above).  Always returns 0. */
int yyerror(void *yyparam, const char *s) {
#if YYDEBUG
	fprintf(stderr, "Error: %s\n", s);
#endif
	return 0;
}
 35 | 
 36 | %}
 37 | 
 38 | %token IMPORT_SYM
 39 | %token IMPORTANT_SYM
 40 | %token IDENT
 41 | %token STRING
 42 | %token NUMBER
 43 | %token PERCENTAGE
 44 | %token LENGTH
 45 | %token EMS
 46 | %token EXS
 47 | %token PSCLASS_AFTER_IDENT
 48 | %token HASH_AFTER_IDENT
 49 | %token CLASS_AFTER_IDENT
 50 | %token PSCLASS
 51 | %token VISITED_PSCLASS
 52 | %token ACTIVE_PSCLASS
 53 | %token FIRST_LINE
 54 | %token FIRST_LETTER
 55 | %token HASH
 56 | %token CLASS
 57 | %token URL
 58 | %token RGB
 59 | %token CDO
 60 | %token CDC
 61 | %token CSL
 62 | 
 63 | %type <pseudo_class> solitary_pseudo_class
 64 | %type <pseudo_class> pseudo_class
 65 | %type <selector> selector
 66 | %type <selector> simple_selector
 67 | %type <selector> simple_selectors
 68 | %type <selector_list> selectors
 69 | %type <selector_list> ruleset
 70 | %type <selector_list> rulesets
 71 | %type <letter> unary_operator
 72 | %type <letter> operator
 73 | %type <property> declaration
 74 | %type <property> declarations
 75 | %type <lexeme> NUMBER
 76 | %type <lexeme> STRING
 77 | %type <lexeme> PERCENTAGE
 78 | %type <lexeme> LENGTH
 79 | %type <lexeme> EMS
 80 | %type <lexeme> EXS
 81 | %type <lexeme> URL
 82 | %type <lexeme> RGB
 83 | %type <lexeme> IDENT
 84 | %type <lexeme> HASH
 85 | %type <lexeme> HASH_AFTER_IDENT
 86 | %type <lexeme> CLASS_AFTER_IDENT
 87 | %type <lexeme> CLASS
 88 | %type <lexeme> hexcolor
 89 | %type <lexeme> value
 90 | %type <lexeme> id
 91 | %type <lexeme> solitary_id
 92 | %type <lexeme> term
 93 | %type <lexeme> property
 94 | %type <lexeme> expr
 95 | %type <lexeme> element_name
 96 | %type <lexeme> class
 97 | %type <lexeme> solitary_class
 98 | %type <lexeme> string_or_url
 99 | 
100 | %%
101 | 
/* Start symbol.  The list built by "rulesets" is handed back to the
 ** caller through yyparam, which is treated as a
 ** struct selector_list_t** (css_parse() passes the address of its
 ** result pointer). */
stylesheet
: comments imports rulesets	{
						*(struct selector_list_t**) yyparam = $3;
					}
;
107 | 
/* Concatenates the selector lists of consecutive rulesets into one list. */
rulesets
: ruleset comments rulesets	{
								if ($1 == NULL) {
									/* This ruleset produced nothing: keep only the rest. */
									$$ = $3;
								} else {
									/* Hook the rest ($3) behind the last entry of $1. */
									struct selector_list_t *tail = $1;
									while (tail->next != NULL) {
										tail = tail->next;
									}
									tail->next = $3;
									$$ = $1;
								}
							}
| /* empty */ { $$ = NULL; }
;
123 | 
/* Zero or more imports, each optionally followed by comment tokens.
 ** No action is attached, so nothing is recorded for them. */
imports
: import comments imports
|
;

/* Possibly empty run of comment tokens. */
comments
: comments comment
|
;

/* A single CDO, CDC or CSL token. */
comment
: CDO
| CDC
| CSL
;
139 | 
/* An import statement, e.g. @import url(fun.css);
 ** Its target is not recorded anywhere, so the string produced by
 ** string_or_url is released here instead of being leaked. */
import
: IMPORT_SYM string_or_url ';'	{
									free($2);
								}
;
143 | 
/* Yields the target of an import as a heap-allocated string: a STRING
 ** token is passed through unchanged; for a URL token the surrounding
 ** "url(" and ")" and the whitespace just inside them are removed.
 ** Assumes the lexer only returns URL for text of the form url(...)
 ** -- TODO confirm against css_lex.l. */
string_or_url
: STRING { $$ = $1; }
| URL	{
			char *begin = $1;
			char *end = $1 + strlen($1);

			/* Skip url( */
			begin += 4;
			/* skip leading whitespace of any kind, not only ' ' */
			while (*begin != 0 && strchr(" \t\r\n\f", *begin) != NULL)
				++begin;

			/* Step back from the terminator over ) */
			end -= 2;
			/* skip trailing whitespace, but never move in front of begin:
			 ** for an all-blank body such as "url(   )" the unbounded scan
			 ** ended before begin and the result was ")" instead of "" */
			while (end >= begin && strchr(" \t\r\n\f", *end) != NULL)
				--end;

			/* Cut the text right after the last kept character. */
			end[1] = 0;

			$$ = strdup(begin);
			free($1);
		}
;
168 | 
/* Sign in front of a value; yields the sign character itself. */
unary_operator
: '-' { $$ = '-'; }
| '+' { $$ = '+'; }
;

/* Separator between two terms of an expression.  When no separator
 ** token is present a single space is produced, which "expr" then
 ** places between the joined terms. */
operator
: '/' { $$ = '/'; }
| ',' { $$ = ','; }
| /* empty */ {$$ = ' '; }
;

/* Property name of a declaration: the IDENT text, passed through. */
property
: IDENT { $$ = $1; }
;
183 | 
/* One rule: a selector list and its declaration block.  Every selector
 ** of the list is pointed at the same property list ($3), and the count
 ** field of each property is incremented once per selector that
 ** receives the list. */
ruleset
: selectors '{' declarations '}'	{
										struct selector_list_t *entry;
										struct property_t *prop;

										for (entry = $1; entry != NULL; entry = entry->next) {
											for (prop = $3; prop != NULL; prop = prop->next) {
												prop->count++;
											}
											entry->selector->property = $3;
										}
										$$ = $1;
									}
;
199 | 
/* Comma-separated selectors, collected into a singly linked list.
 ** A selector that came out as NULL is left out of the list. */
selectors
: selector	{
				$$ = NULL;
				if ($1 != NULL) {
					$$ = (struct selector_list_t*)
						malloc (sizeof(struct selector_list_t));
					$$->selector = $1;
					$$->next = NULL;
				}
			}
| selector ',' selectors	{
								if ($1 == NULL) {
									/* Nothing usable on the left: keep the rest as is. */
									$$ = $3;
								} else {
									/* New head entry in front of the rest of the list. */
									struct selector_list_t *head;
									head = (struct selector_list_t*)
										malloc (sizeof(struct selector_list_t));
									head->selector = $1;
									head->next = $3;
									$$ = head;
								}
							}
;
224 | 
/* Semicolon-separated declarations, chained through their next field.
 ** A declaration that came out as NULL is left out of the chain. */
declarations
: declaration					{ $$ = $1; }
| declaration ';' declarations	{
									if ($1 == NULL) {
										$$ = $3;
									} else {
										$1->next = $3;
										$$ = $1;
									}
								}
;
238 | 
/* A selector is a chain of simple selectors.  Both error alternatives
 ** yield NULL, which the "selectors" rule then leaves out of its list.
 ** NOTE(review): "selector error" drops $1 without releasing it -- it
 ** looks like the chain built so far is leaked; confirm. */
selector
: simple_selectors	{ $$ = $1; }
| selector error { $$ = NULL; }
| error { $$ = NULL; }
;
244 | 
/* One or more simple selectors in sequence, linked left to right
 ** through their next field; the first one is the result. */
simple_selectors
: simple_selector { $$ = $1; }
| simple_selector simple_selectors	{
										$1->next = $2;
										$$ = $1;
									}
;
252 | 
/* An "id" is an ID that is attached to an element type
 ** on its left, as in: P#p007
 ** A "solitary_id" is an ID that is not so attached,
 ** as in: #p007
 ** Analogously for classes and pseudo-classes.
 **
 ** Every alternative allocates one selector_t and fills in the parts
 ** that are present; absent string parts are NULL, an absent
 ** pseudo-class is 0.  pseudo_element is always 0 and next is always
 ** NULL at this point.
 */
simple_selector
: element_name id class pseudo_class	{
		struct selector_t *sel = (struct selector_t*) malloc(sizeof(struct selector_t));
		sel->element_name = $1;
		sel->id = $2;
		sel->e_class = $3;
		sel->pseudo_class = $4;
		sel->pseudo_element = 0;
		sel->next = NULL;
		$$ = sel;
	} /* eg: H1.subject */
| element_name id pseudo_class	{
		struct selector_t *sel = (struct selector_t*) malloc(sizeof(struct selector_t));
		sel->element_name = $1;
		sel->id = $2;
		sel->e_class = NULL;
		sel->pseudo_class = $3;
		sel->pseudo_element = 0;
		sel->next = NULL;
		$$ = sel;
	}
| element_name class pseudo_class	{
		struct selector_t *sel = (struct selector_t*) malloc(sizeof(struct selector_t));
		sel->element_name = $1;
		sel->id = NULL;
		sel->e_class = $2;
		sel->pseudo_class = $3;
		sel->pseudo_element = 0;
		sel->next = NULL;
		$$ = sel;
	}
| element_name id class	{
		struct selector_t *sel = (struct selector_t*) malloc(sizeof(struct selector_t));
		sel->element_name = $1;
		sel->id = $2;
		sel->e_class = $3;
		sel->pseudo_class = 0;
		sel->pseudo_element = 0;
		sel->next = NULL;
		$$ = sel;
	}
| element_name id	{
		struct selector_t *sel = (struct selector_t*) malloc(sizeof(struct selector_t));
		sel->element_name = $1;
		sel->id = $2;
		sel->e_class = NULL;
		sel->pseudo_class = 0;
		sel->pseudo_element = 0;
		sel->next = NULL;
		$$ = sel;
	}
| element_name class	{
		struct selector_t *sel = (struct selector_t*) malloc(sizeof(struct selector_t));
		sel->element_name = $1;
		sel->id = NULL;
		sel->e_class = $2;
		sel->pseudo_class = 0;
		sel->pseudo_element = 0;
		sel->next = NULL;
		$$ = sel;
	}
| element_name pseudo_class	{
		struct selector_t *sel = (struct selector_t*) malloc(sizeof(struct selector_t));
		sel->element_name = $1;
		sel->id = NULL;
		sel->e_class = NULL;
		sel->pseudo_class = $2;
		sel->pseudo_element = 0;
		sel->next = NULL;
		$$ = sel;
	}
| element_name	{
		struct selector_t *sel = (struct selector_t*) malloc(sizeof(struct selector_t));
		sel->element_name = $1;
		sel->id = NULL;
		sel->e_class = NULL;
		sel->pseudo_class = 0;
		sel->pseudo_element = 0;
		sel->next = NULL;
		$$ = sel;
	}
| solitary_id class pseudo_class	{
		struct selector_t *sel = (struct selector_t*) malloc(sizeof(struct selector_t));
		sel->element_name = NULL;
		sel->id = $1;
		sel->e_class = $2;
		sel->pseudo_class = $3;
		sel->pseudo_element = 0;
		sel->next = NULL;
		$$ = sel;
	} /* eg: #xyz33 */
| solitary_id class	{
		struct selector_t *sel = (struct selector_t*) malloc(sizeof(struct selector_t));
		sel->element_name = NULL;
		sel->id = $1;
		sel->e_class = $2;
		sel->pseudo_class = 0;
		sel->pseudo_element = 0;
		sel->next = NULL;
		$$ = sel;
	}
| solitary_id pseudo_class	{
		struct selector_t *sel = (struct selector_t*) malloc(sizeof(struct selector_t));
		sel->element_name = NULL;
		sel->id = $1;
		sel->e_class = NULL;
		sel->pseudo_class = $2;
		sel->pseudo_element = 0;
		sel->next = NULL;
		$$ = sel;
	}
| solitary_id	{
		struct selector_t *sel = (struct selector_t*) malloc(sizeof(struct selector_t));
		sel->element_name = NULL;
		sel->id = $1;
		sel->e_class = NULL;
		sel->pseudo_class = 0;
		sel->pseudo_element = 0;
		sel->next = NULL;
		$$ = sel;
	}
| solitary_class pseudo_class	{
		struct selector_t *sel = (struct selector_t*) malloc(sizeof(struct selector_t));
		sel->element_name = NULL;
		sel->id = NULL;
		sel->e_class = $1;
		sel->pseudo_class = $2;
		sel->pseudo_element = 0;
		sel->next = NULL;
		$$ = sel;
	} /* eg: .author */
| solitary_class	{
		struct selector_t *sel = (struct selector_t*) malloc(sizeof(struct selector_t));
		sel->element_name = NULL;
		sel->id = NULL;
		sel->e_class = $1;
		sel->pseudo_class = 0;
		sel->pseudo_element = 0;
		sel->next = NULL;
		$$ = sel;
	}
| solitary_pseudo_class	{
		struct selector_t *sel = (struct selector_t*) malloc(sizeof(struct selector_t));
		sel->element_name = NULL;
		sel->id = NULL;
		sel->e_class = NULL;
		sel->pseudo_class = $1;
		sel->pseudo_element = 0;
		sel->next = NULL;
		$$ = sel;
	} /* eg: :link */
;
411 | 
/* Element type of a simple selector: the IDENT text, passed through. */
element_name
: IDENT { $$ = $1; }
;
415 | 
/* Only the presence of a pseudo-class is recorded (PS_CLASS); the text
 ** of its name is not kept, so the IDENT lexeme is released here rather
 ** than leaked.  Lexemes are heap-allocated: other rules of this
 ** grammar release them with free(). */
pseudo_class					/* as in:  A:link */
: ':' IDENT {
				free($2);
				$$ = PS_CLASS;
			}
;

solitary_pseudo_class				/* as in:  :link */
: ':' IDENT {
				free($2);
				$$ = PS_CLASS;
			}
;
423 | 
/* Class attached to an element name; yields the token's text. */
class						/* as in:  P.note */
: CLASS_AFTER_IDENT { $$ = $1; }
;

/* Class standing on its own; yields the token's text. */
solitary_class					/* as in:  .note */
: CLASS { $$ = $1; }
;

/* There is a constraint on the id and solitary_id that the
 ** part after the "#" must be a valid HTML ID value;
 ** e.g., "#x77" is OK, but "#77" is not.
 ** (The rules below do not check this themselves; they pass the
 ** token's text through.)
 */
id
: HASH_AFTER_IDENT { $$ = $1; }
;

solitary_id
: HASH			{ $$ = $1; }
;
443 | 
/* A single "property: expr" declaration.  The result is a freshly
 ** allocated property_t with count 0 and no successor; important is 1
 ** only when the expression is followed by prio.  A syntax error or an
 ** empty declaration yields NULL. */
declaration
: property ':' expr prio	{
								struct property_t *decl = (struct property_t*)
									malloc(sizeof(struct property_t));
								decl->name = $1;
								decl->val = $3;
								decl->important = 1;
								decl->count = 0;
								decl->next = NULL;
								$$ = decl;
							}
| property ':' expr			{
								struct property_t *decl = (struct property_t*)
									malloc(sizeof(struct property_t));
								decl->name = $1;
								decl->val = $3;
								decl->important = 0;
								decl->count = 0;
								decl->next = NULL;
								$$ = decl;
							}
| error						{ $$ = NULL; }
| /* empty */				{ $$ = NULL; } /* Prevents syntax errors... */
;
468 | 
/* Priority marker of a declaration; carries no value of its own, the
 ** "declaration" rule sets important to 1 when it is present. */
prio
: IMPORTANT_SYM	 { }		/* !important */
;
472 | 
/* Value of a declaration: terms joined by their separator character
 ** ('/', ',' or a space) into one heap-allocated string.  A syntax
 ** error after an expression keeps what was parsed so far. */
expr
: term					{ $$ = $1; }
| expr operator term	{
							/* both texts + one separator + terminating 0 */
							char *joined = (char*) malloc (strlen($1)+strlen($3)+2);
							sprintf(joined, "%s%c%s", $1, $2, $3);
							free($1);
							free($3);
							$$ = joined;
						}
| expr error			{ $$ = $1; }
;
487 | 
/* A value, optionally preceded by a sign.  With a sign, a new string
 ** "<sign><value>" is allocated and the old value text is released. */
term
: unary_operator value 	{
							/* sign + value text + terminating 0 */
							char *signed_value = (char*) malloc(strlen($2)+2);
							sprintf(signed_value, "%c%s", $1, $2);
							free($2);
							$$ = signed_value;
						}
| value { $$ = $1; }
;
499 | 
/* Any single value; the text of the token (or of hexcolor) is passed
 ** through unchanged. */
value
: NUMBER { $$ = $1; }
| STRING { $$ = $1; }
| PERCENTAGE { $$ = $1; }
| LENGTH { $$ = $1; }
| EMS { $$ = $1; }
| EXS { $$ = $1; }
| IDENT { $$ = $1; }
| hexcolor { $$ = $1; }
| URL { $$ = $1; }
| RGB { $$ = $1; }
; 
512 | 
513 | /* There is a constraint on the color that it must
514 |  ** have either 3 or 6 hex-digits (i.e., [0-9a-fA-F])
515 |  ** after the "#"; e.g., "#000" is OK, but "#abcd" is not.
516 |  */
/* Both hash tokens are turned into a new string consisting of '#'
 ** followed by the token's text; the token's own text is released. */
hexcolor
: HASH	{
			/* '#' + token text + terminating 0 */
			char *color = (char*) malloc (strlen($1)+2);
			color[0] = '#';
			strcpy(color + 1, $1);
			free($1);
			$$ = color;
		}
| HASH_AFTER_IDENT	{
						char *color = (char*) malloc (strlen($1)+2);
						color[0] = '#';
						strcpy(color + 1, $1);
						free($1);
						$$ = color;
					}
;
529 | 
530 | %%
531 | 
/* Entry point of the parser: sets up the lexer on buffer/buf_len, runs
 ** yyparse() and shuts the lexer down again.
 ** Returns the selector list stored by the "stylesheet" rule, or NULL
 ** if that rule was never reduced.  The return value of yyparse() is
 ** not examined. */
struct selector_list_t* css_parse(const char *buffer, int buf_len) {
	struct selector_list_t *parsed = NULL;

	/* yydebug = 1; */
	init_yylex(buffer, buf_len);
	yyparse(&parsed);
	end_yylex();

	return parsed;
}
540 | 


--------------------------------------------------------------------------------
/INSTALL:
--------------------------------------------------------------------------------
  1 | Installation Instructions
  2 | *************************
  3 | 
  4 | Copyright (C) 1994-1996, 1999-2002, 2004-2013 Free Software Foundation,
  5 | Inc.
  6 | 
  7 |    Copying and distribution of this file, with or without modification,
  8 | are permitted in any medium without royalty provided the copyright
  9 | notice and this notice are preserved.  This file is offered as-is,
 10 | without warranty of any kind.
 11 | 
 12 | Basic Installation
 13 | ==================
 14 | 
 15 |    Briefly, the shell command `./configure && make && make install'
 16 | should configure, build, and install this package.  The following
 17 | more-detailed instructions are generic; see the `README' file for
 18 | instructions specific to this package.  Some packages provide this
 19 | `INSTALL' file but do not implement all of the features documented
 20 | below.  The lack of an optional feature in a given package is not
 21 | necessarily a bug.  More recommendations for GNU packages can be found
 22 | in *note Makefile Conventions: (standards)Makefile Conventions.
 23 | 
 24 |    The `configure' shell script attempts to guess correct values for
 25 | various system-dependent variables used during compilation.  It uses
 26 | those values to create a `Makefile' in each directory of the package.
 27 | It may also create one or more `.h' files containing system-dependent
 28 | definitions.  Finally, it creates a shell script `config.status' that
 29 | you can run in the future to recreate the current configuration, and a
 30 | file `config.log' containing compiler output (useful mainly for
 31 | debugging `configure').
 32 | 
 33 |    It can also use an optional file (typically called `config.cache'
 34 | and enabled with `--cache-file=config.cache' or simply `-C') that saves
 35 | the results of its tests to speed up reconfiguring.  Caching is
 36 | disabled by default to prevent problems with accidental use of stale
 37 | cache files.
 38 | 
 39 |    If you need to do unusual things to compile the package, please try
 40 | to figure out how `configure' could check whether to do them, and mail
 41 | diffs or instructions to the address given in the `README' so they can
 42 | be considered for the next release.  If you are using the cache, and at
 43 | some point `config.cache' contains results you don't want to keep, you
 44 | may remove or edit it.
 45 | 
 46 |    The file `configure.ac' (or `configure.in') is used to create
 47 | `configure' by a program called `autoconf'.  You need `configure.ac' if
 48 | you want to change it or regenerate `configure' using a newer version
 49 | of `autoconf'.
 50 | 
 51 |    The simplest way to compile this package is:
 52 | 
 53 |   1. `cd' to the directory containing the package's source code and type
 54 |      `./configure' to configure the package for your system.
 55 | 
 56 |      Running `configure' might take a while.  While running, it prints
 57 |      some messages telling which features it is checking for.
 58 | 
 59 |   2. Type `make' to compile the package.
 60 | 
 61 |   3. Optionally, type `make check' to run any self-tests that come with
 62 |      the package, generally using the just-built uninstalled binaries.
 63 | 
 64 |   4. Type `make install' to install the programs and any data files and
 65 |      documentation.  When installing into a prefix owned by root, it is
 66 |      recommended that the package be configured and built as a regular
 67 |      user, and only the `make install' phase executed with root
 68 |      privileges.
 69 | 
 70 |   5. Optionally, type `make installcheck' to repeat any self-tests, but
 71 |      this time using the binaries in their final installed location.
 72 |      This target does not install anything.  Running this target as a
 73 |      regular user, particularly if the prior `make install' required
 74 |      root privileges, verifies that the installation completed
 75 |      correctly.
 76 | 
 77 |   6. You can remove the program binaries and object files from the
 78 |      source code directory by typing `make clean'.  To also remove the
 79 |      files that `configure' created (so you can compile the package for
 80 |      a different kind of computer), type `make distclean'.  There is
 81 |      also a `make maintainer-clean' target, but that is intended mainly
 82 |      for the package's developers.  If you use it, you may have to get
 83 |      all sorts of other programs in order to regenerate files that came
 84 |      with the distribution.
 85 | 
 86 |   7. Often, you can also type `make uninstall' to remove the installed
 87 |      files again.  In practice, not all packages have tested that
 88 |      uninstallation works correctly, even though it is required by the
 89 |      GNU Coding Standards.
 90 | 
 91 |   8. Some packages, particularly those that use Automake, provide `make
     distcheck', which can be used by developers to test that all other
 93 |      targets like `make install' and `make uninstall' work correctly.
 94 |      This target is generally not run by end users.
 95 | 
 96 | Compilers and Options
 97 | =====================
 98 | 
 99 |    Some systems require unusual options for compilation or linking that
100 | the `configure' script does not know about.  Run `./configure --help'
101 | for details on some of the pertinent environment variables.
102 | 
103 |    You can give `configure' initial values for configuration parameters
104 | by setting variables in the command line or in the environment.  Here
105 | is an example:
106 | 
107 |      ./configure CC=c99 CFLAGS=-g LIBS=-lposix
108 | 
109 |    *Note Defining Variables::, for more details.
110 | 
111 | Compiling For Multiple Architectures
112 | ====================================
113 | 
114 |    You can compile the package for more than one kind of computer at the
115 | same time, by placing the object files for each architecture in their
116 | own directory.  To do this, you can use GNU `make'.  `cd' to the
117 | directory where you want the object files and executables to go and run
118 | the `configure' script.  `configure' automatically checks for the
119 | source code in the directory that `configure' is in and in `..'.  This
120 | is known as a "VPATH" build.
121 | 
122 |    With a non-GNU `make', it is safer to compile the package for one
123 | architecture at a time in the source code directory.  After you have
124 | installed the package for one architecture, use `make distclean' before
125 | reconfiguring for another architecture.
126 | 
127 |    On MacOS X 10.5 and later systems, you can create libraries and
128 | executables that work on multiple system types--known as "fat" or
129 | "universal" binaries--by specifying multiple `-arch' options to the
130 | compiler but only a single `-arch' option to the preprocessor.  Like
131 | this:
132 | 
133 |      ./configure CC="gcc -arch i386 -arch x86_64 -arch ppc -arch ppc64" \
134 |                  CXX="g++ -arch i386 -arch x86_64 -arch ppc -arch ppc64" \
135 |                  CPP="gcc -E" CXXCPP="g++ -E"
136 | 
   This is not guaranteed to produce working output in all cases; you
138 | may have to build one architecture at a time and combine the results
139 | using the `lipo' tool if you have problems.
140 | 
141 | Installation Names
142 | ==================
143 | 
144 |    By default, `make install' installs the package's commands under
145 | `/usr/local/bin', include files under `/usr/local/include', etc.  You
146 | can specify an installation prefix other than `/usr/local' by giving
147 | `configure' the option `--prefix=PREFIX', where PREFIX must be an
148 | absolute file name.
149 | 
150 |    You can specify separate installation prefixes for
151 | architecture-specific files and architecture-independent files.  If you
152 | pass the option `--exec-prefix=PREFIX' to `configure', the package uses
153 | PREFIX as the prefix for installing programs and libraries.
154 | Documentation and other data files still use the regular prefix.
155 | 
156 |    In addition, if you use an unusual directory layout you can give
157 | options like `--bindir=DIR' to specify different values for particular
158 | kinds of files.  Run `configure --help' for a list of the directories
159 | you can set and what kinds of files go in them.  In general, the
160 | default for these options is expressed in terms of `${prefix}', so that
161 | specifying just `--prefix' will affect all of the other directory
162 | specifications that were not explicitly provided.
163 | 
164 |    The most portable way to affect installation locations is to pass the
165 | correct locations to `configure'; however, many packages provide one or
166 | both of the following shortcuts of passing variable assignments to the
167 | `make install' command line to change installation locations without
168 | having to reconfigure or recompile.
169 | 
170 |    The first method involves providing an override variable for each
171 | affected directory.  For example, `make install
172 | prefix=/alternate/directory' will choose an alternate location for all
173 | directory configuration variables that were expressed in terms of
174 | `${prefix}'.  Any directories that were specified during `configure',
175 | but not in terms of `${prefix}', must each be overridden at install
176 | time for the entire installation to be relocated.  The approach of
177 | makefile variable overrides for each directory variable is required by
178 | the GNU Coding Standards, and ideally causes no recompilation.
179 | However, some platforms have known limitations with the semantics of
180 | shared libraries that end up requiring recompilation when using this
181 | method, particularly noticeable in packages that use GNU Libtool.
182 | 
183 |    The second method involves providing the `DESTDIR' variable.  For
184 | example, `make install DESTDIR=/alternate/directory' will prepend
185 | `/alternate/directory' before all installation names.  The approach of
186 | `DESTDIR' overrides is not required by the GNU Coding Standards, and
187 | does not work on platforms that have drive letters.  On the other hand,
188 | it does better at avoiding recompilation issues, and works well even
189 | when some directory options were not specified in terms of `${prefix}'
190 | at `configure' time.
191 | 
192 | Optional Features
193 | =================
194 | 
195 |    If the package supports it, you can cause programs to be installed
196 | with an extra prefix or suffix on their names by giving `configure' the
197 | option `--program-prefix=PREFIX' or `--program-suffix=SUFFIX'.
198 | 
199 |    Some packages pay attention to `--enable-FEATURE' options to
200 | `configure', where FEATURE indicates an optional part of the package.
201 | They may also pay attention to `--with-PACKAGE' options, where PACKAGE
202 | is something like `gnu-as' or `x' (for the X Window System).  The
203 | `README' should mention any `--enable-' and `--with-' options that the
204 | package recognizes.
205 | 
206 |    For packages that use the X Window System, `configure' can usually
207 | find the X include and library files automatically, but if it doesn't,
208 | you can use the `configure' options `--x-includes=DIR' and
209 | `--x-libraries=DIR' to specify their locations.
210 | 
211 |    Some packages offer the ability to configure how verbose the
212 | execution of `make' will be.  For these packages, running `./configure
213 | --enable-silent-rules' sets the default to minimal output, which can be
214 | overridden with `make V=1'; while running `./configure
215 | --disable-silent-rules' sets the default to verbose, which can be
216 | overridden with `make V=0'.
217 | 
218 | Particular systems
219 | ==================
220 | 
221 |    On HP-UX, the default C compiler is not ANSI C compatible.  If GNU
222 | CC is not installed, it is recommended to use the following options in
223 | order to use an ANSI C compiler:
224 | 
225 |      ./configure CC="cc -Ae -D_XOPEN_SOURCE=500"
226 | 
227 | and if that doesn't work, install pre-built binaries of GCC for HP-UX.
228 | 
229 |    HP-UX `make' updates targets which have the same time stamps as
230 | their prerequisites, which makes it generally unusable when shipped
231 | generated files such as `configure' are involved.  Use GNU `make'
232 | instead.
233 | 
234 |    On OSF/1 a.k.a. Tru64, some versions of the default C compiler cannot
235 | parse its `<wchar.h>' header file.  The option `-nodtk' can be used as
236 | a workaround.  If GNU CC is not installed, it is therefore recommended
237 | to try
238 | 
239 |      ./configure CC="cc"
240 | 
241 | and if that doesn't work, try
242 | 
243 |      ./configure CC="cc -nodtk"
244 | 
245 |    On Solaris, don't put `/usr/ucb' early in your `PATH'.  This
246 | directory contains several dysfunctional programs; working variants of
247 | these programs are available in `/usr/bin'.  So, if you need `/usr/ucb'
248 | in your `PATH', put it _after_ `/usr/bin'.
249 | 
250 |    On Haiku, software installed for all users goes in `/boot/common',
251 | not `/usr/local'.  It is recommended to use the following options:
252 | 
253 |      ./configure --prefix=/boot/common
254 | 
255 | Specifying the System Type
256 | ==========================
257 | 
258 |    There may be some features `configure' cannot figure out
259 | automatically, but needs to determine by the type of machine the package
260 | will run on.  Usually, assuming the package is built to be run on the
261 | _same_ architectures, `configure' can figure that out, but if it prints
262 | a message saying it cannot guess the machine type, give it the
263 | `--build=TYPE' option.  TYPE can either be a short name for the system
264 | type, such as `sun4', or a canonical name which has the form:
265 | 
266 |      CPU-COMPANY-SYSTEM
267 | 
268 | where SYSTEM can have one of these forms:
269 | 
270 |      OS
271 |      KERNEL-OS
272 | 
273 |    See the file `config.sub' for the possible values of each field.  If
274 | `config.sub' isn't included in this package, then this package doesn't
275 | need to know the machine type.
276 | 
277 |    If you are _building_ compiler tools for cross-compiling, you should
278 | use the option `--target=TYPE' to select the type of system they will
279 | produce code for.
280 | 
281 |    If you want to _use_ a cross compiler, that generates code for a
282 | platform different from the build platform, you should specify the
283 | "host" platform (i.e., that on which the generated programs will
284 | eventually be run) with `--host=TYPE'.
285 | 
286 | Sharing Defaults
287 | ================
288 | 
289 |    If you want to set default values for `configure' scripts to share,
290 | you can create a site shell script called `config.site' that gives
291 | default values for variables like `CC', `cache_file', and `prefix'.
292 | `configure' looks for `PREFIX/share/config.site' if it exists, then
293 | `PREFIX/etc/config.site' if it exists.  Or, you can set the
294 | `CONFIG_SITE' environment variable to the location of the site script.
295 | A warning: not all `configure' scripts look for a site script.
296 | 
297 | Defining Variables
298 | ==================
299 | 
300 |    Variables not defined in a site shell script can be set in the
301 | environment passed to `configure'.  However, some packages may run
302 | configure again during the build, and the customized values of these
303 | variables may be lost.  In order to avoid this problem, you should set
304 | them in the `configure' command line, using `VAR=value'.  For example:
305 | 
306 |      ./configure CC=/usr/local2/bin/gcc
307 | 
308 | causes the specified `gcc' to be used as the C compiler (unless it is
309 | overridden in the site shell script).
310 | 
311 | Unfortunately, this technique does not work for `CONFIG_SHELL' due to
312 | an Autoconf limitation.  Until the limitation is lifted, you can use
313 | this workaround:
314 | 
315 |      CONFIG_SHELL=/bin/bash ./configure CONFIG_SHELL=/bin/bash
316 | 
317 | `configure' Invocation
318 | ======================
319 | 
320 |    `configure' recognizes the following options to control how it
321 | operates.
322 | 
323 | `--help'
324 | `-h'
325 |      Print a summary of all of the options to `configure', and exit.
326 | 
327 | `--help=short'
328 | `--help=recursive'
329 |      Print a summary of the options unique to this package's
330 |      `configure', and exit.  The `short' variant lists options used
331 |      only in the top level, while the `recursive' variant lists options
332 |      also present in any nested packages.
333 | 
334 | `--version'
335 | `-V'
336 |      Print the version of Autoconf used to generate the `configure'
337 |      script, and exit.
338 | 
339 | `--cache-file=FILE'
340 |      Enable the cache: use and save the results of the tests in FILE,
341 |      traditionally `config.cache'.  FILE defaults to `/dev/null' to
342 |      disable caching.
343 | 
344 | `--config-cache'
345 | `-C'
346 |      Alias for `--cache-file=config.cache'.
347 | 
348 | `--quiet'
349 | `--silent'
350 | `-q'
351 |      Do not print messages saying which checks are being made.  To
352 |      suppress all normal output, redirect it to `/dev/null' (any error
353 |      messages will still be shown).
354 | 
355 | `--srcdir=DIR'
356 |      Look for the package's source code in directory DIR.  Usually
357 |      `configure' can determine that directory automatically.
358 | 
359 | `--prefix=DIR'
360 |      Use DIR as the installation prefix.  See the section
361 |      `Installation Names' for more details, including other options
362 |      available for fine-tuning the installation locations.
363 | 
364 | `--no-create'
365 | `-n'
366 |      Run the configure checks, but stop before creating any output
367 |      files.
368 | 
369 | `configure' also accepts some other, not widely useful, options.  Run
370 | `configure --help' for more details.
371 | 


--------------------------------------------------------------------------------
/html/Uri.cc:
--------------------------------------------------------------------------------
  1 | #include "Uri.h"
  2 | 
  3 | #include "wincstring.h"
  4 | #include <sstream>
  5 | #include <cstdlib>
  6 | #include <cassert>
  7 | #include "tld.h"
  8 | 
  9 | //#define DEBUG
 10 | #include "debug.h"
 11 | 
 12 | using namespace std;
 13 | using namespace htmlcxx;
 14 | 
/**
 * One row of the scheme table: a URI scheme name paired with the port that
 * is used for that scheme when a URI does not spell one out.
 */
struct schemes_t {
    /** The name of the scheme; a NULL name marks the end of the table */
    const char *name;
    /** The default port for the scheme */
    unsigned int default_port;
};
 22 | 
/* Some WWW schemes and their default ports; this is basically /etc/services */
/* This will become global when the protocol abstraction comes */
/* As the schemes are searched by a linear search, */
/* they are sorted by their expected frequency */
/* Names are matched case-insensitively (strcasecmp) by the lookup functions. */
static schemes_t schemes[] =
{
    {"http",     Uri::URI_HTTP_DEFAULT_PORT},
    {"ftp",      Uri::URI_FTP_DEFAULT_PORT},
    {"https",    Uri::URI_HTTPS_DEFAULT_PORT},
    {"gopher",   Uri::URI_GOPHER_DEFAULT_PORT},
    {"ldap",     Uri::URI_LDAP_DEFAULT_PORT},
    {"nntp",     Uri::URI_NNTP_DEFAULT_PORT},
    {"snews",    Uri::URI_SNEWS_DEFAULT_PORT},
    {"imap",     Uri::URI_IMAP_DEFAULT_PORT},
    {"pop",      Uri::URI_POP_DEFAULT_PORT},
    {"sip",      Uri::URI_SIP_DEFAULT_PORT},
    {"rtsp",     Uri::URI_RTSP_DEFAULT_PORT},
    {"wais",     Uri::URI_WAIS_DEFAULT_PORT},
    {"z39.50r",  Uri::URI_WAIS_DEFAULT_PORT},
    {"z39.50s",  Uri::URI_WAIS_DEFAULT_PORT},
    {"prospero", Uri::URI_PROSPERO_DEFAULT_PORT},
    {"nfs",      Uri::URI_NFS_DEFAULT_PORT},
    {"tip",      Uri::URI_TIP_DEFAULT_PORT},
    {"acap",     Uri::URI_ACAP_DEFAULT_PORT},
    {"telnet",   Uri::URI_TELNET_DEFAULT_PORT},
    {"ssh",      Uri::URI_SSH_DEFAULT_PORT},
    /* End-of-table sentinel.  The lookup loops stop on the NULL name, so the
     * 0xFFFF stored here is never returned; unknown schemes yield 0 instead. */
    { NULL, 0xFFFF }     /* unknown port */
};
 51 | 
 52 | static unsigned int port_of_Scheme(const char *scheme_str)
 53 | {
 54 |     schemes_t *scheme;
 55 | 
 56 |     if (scheme_str) {
 57 |         for (scheme = schemes; scheme->name != NULL; ++scheme) {
 58 |             if (strcasecmp(scheme_str, scheme->name) == 0) {
 59 |                 return scheme->default_port;
 60 |             }
 61 |         }
 62 |     }
 63 |     return 0;
 64 | }
 65 | 
/* We have a 256-entry table (uri_delims) that we can index by character and
 * it tells us if the character is one of the interesting delimiters.  Note
 * that we even get compares for NUL for free -- it's just another delimiter.
 */

/* One bit per delimiter class; the table below stores these per character. */
#define T_COLON           0x01        /* ':' */
#define T_SLASH           0x02        /* '/' */
#define T_QUESTION        0x04        /* '?' */
#define T_HASH            0x08        /* '#' */
#define T_NUL             0x80        /* '\0' */

/* the uri_delims.h file is autogenerated by gen_uri_delims.c */
/* this file is automatically generated by gen_uri_delims, do not edit */
/* Rows hold 20 entries each: '#' is entry 35, '/' is 47, ':' is 58, '?' is 63. */
static const unsigned char uri_delims[256] = {
    T_NUL,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,T_HASH,0,0,0,0,
    0,0,0,0,0,0,0,T_SLASH,0,0,0,0,0,0,0,0,0,0,T_COLON,0,
    0,0,0,T_QUESTION,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 
};

/* it works like this:
    if (uri_delims[ch] & NOTEND_foobar) {
        then ch is one of the characters that ends foobar
    }
   The scanning loops in Uri::init() therefore keep advancing for as long as
   the masked value is 0, and stop on the first character where it is not.
*/

/* Note that we optimize the scheme scanning here, we cheat and let the
 * compiler know that it doesn't have to do the & masking.
 */
/* Scheme scanning stops at any delimiter at all (every bit is in the mask). */
#define NOTEND_SCHEME     (0xff)
/* Host information runs up to the first '/', '?', '#' or the end of string. */
#define NOTEND_HOSTINFO   (T_SLASH | T_QUESTION | T_HASH | T_NUL)
/* The path runs up to the first '?', '#' or the end of string. */
#define NOTEND_PATH       (T_QUESTION | T_HASH | T_NUL)
107 | 
108 | 
109 | static size_t wwwPrefixOffset(const std::string& hostname);
110 | 
111 | Uri::Uri()
112 | : mScheme(), mUser(), mPassword(), mHostname(), mPath(), mQuery(), mFragment(), mExistsQuery(false), mExistsFragment(false), mPort(0)
113 | {}
114 | 
115 | Uri::Uri(const string &uri_str)
116 | : mScheme(), mUser(), mPassword(), mHostname(), mPath(), mQuery(), mFragment(), mExistsQuery(false), mExistsFragment(false), mPort(0)
117 | {
118 | 	init(uri_str);
119 | }
120 | 
121 | void Uri::init(const string &uri_str)
122 | {
123 | 	DEBUGP("Parsing uri %s\n", uri_str.c_str());
124 | 
125 | 	if(uri_str.empty()) return;
126 | 	const char *uri = uri_str.c_str();
127 | 	const char *s;
128 | 	const char *s1;
129 | 	const char *hostinfo;
130 | 	char *endstr;
131 | 	
132 | 	/* We assume the processor has a branch predictor like most --
133 | 	 * it assumes forward branches are untaken and backwards are taken.  That's
134 | 	 * the reason for the gotos.  -djg
135 | 	 */
136 | 	if (uri[0] == '/') {
137 | 		deal_with_path:
138 | 		DEBUGP("Dealing with path\n");
139 | 		/* we expect uri to point to first character of path ... remember
140 | 		 * that the path could be empty -- http://foobar?query for example
141 | 		 */
142 | 		s = uri;
143 | 		while ((uri_delims[*(unsigned char *)s] & NOTEND_PATH) == 0) {
144 | 			++s;
145 | 		}
146 | 		if (s != uri) {
147 | 			mPath.assign(uri, s - uri);
148 | 			DEBUGP("Path is %s\n", mPath.c_str());
149 | 		}
150 | 		if (*s == 0) {
151 | 			return;
152 | 		}
153 | 		if (*s == '?') {
154 | 			++s;
155 | 			s1 = strchr(s, '#');
156 | 			if (s1) {
157 | 				mFragment.assign(s1 + 1);
158 | 				mExistsFragment = true;
159 | 				DEBUGP("Fragment is %s\n", mFragment.c_str());
160 | 				mQuery.assign(s, s1 - s);
161 | 				mExistsQuery = true;
162 | 				DEBUGP("Query is %s\n", mQuery.c_str());
163 | 			}
164 | 			else {
165 | 				mQuery.assign(s);
166 | 				mExistsQuery = true;
167 | 				DEBUGP("Query is %s\n", mQuery.c_str());
168 | 			}
169 | 			return;
170 | 		}
171 | 		/* otherwise it's a fragment */
172 | 		mFragment.assign(s + 1);
173 | 		mExistsFragment = true;
174 | 		DEBUGP("Fragment is %s\n", mFragment.c_str());
175 | 		return;
176 | 	}
177 | 
178 | 	DEBUGP("Dealing with scheme\n");
179 | 	/* find the scheme: */
180 | 	if (!isalpha(*uri)) goto deal_with_path;
181 | 	s = uri;
182 | 	while ((uri_delims[*(unsigned char *)s] & NOTEND_SCHEME) == 0) {
183 | 		++s;
184 | 	}
185 | 	/* scheme must be non-empty and followed by :// */
186 | 	if (s == uri || s[0] != ':' || s[1] != '/' || s[2] != '/') {
187 | 		goto deal_with_path;        /* backwards predicted taken! */
188 | 	}
189 | 
190 | 	mScheme.assign(uri, s - uri);
191 | 	DEBUGP("Scheme is %s\n", mScheme.c_str());
192 | 	s += 3;
193 | 
194 | 	DEBUGP("Finding hostinfo\n");
195 | 	hostinfo = s;
196 | 	DEBUGP("Hostinfo is %s\n", hostinfo);
197 | 	while ((uri_delims[*(unsigned char *)s] & NOTEND_HOSTINFO) == 0) {
198 | 		++s;
199 | 	}
200 | 	uri = s;        /* whatever follows hostinfo is start of uri */
201 | //	mHostinfo.assign(hostinfo, uri - hostinfo);
202 | 
203 | 	/* If there's a username:password@host:port, the @ we want is the last @...
204 | 	 * too bad there's no memrchr()... For the C purists, note that hostinfo
205 | 	 * is definately not the first character of the original uri so therefore
206 | 	 * &hostinfo[-1] < &hostinfo[0] ... and this loop is valid C.
207 | 	 */
208 | 	do {
209 | 		--s;
210 | 	} while (s >= hostinfo && *s != '@');
211 | 	if (s < hostinfo) {
212 | 		/* again we want the common case to be fall through */
213 | deal_with_host:
214 | 		DEBUGP("Dealing with host\n");
215 | 		/* We expect hostinfo to point to the first character of
216 | 		 * the hostname.  If there's a port it is the first colon.
217 | 		 */
218 | 		s = (char *)memchr(hostinfo, ':', uri - hostinfo);
219 | 		if (s == NULL) {
220 | 			/* we expect the common case to have no port */
221 | 			mHostname.assign(hostinfo, uri - hostinfo);
222 | 			DEBUGP("Hostname is %s\n", mHostname.c_str());
223 | 			goto deal_with_path;
224 | 		}
225 | 		mHostname.assign(hostinfo, s - hostinfo);
226 | 		DEBUGP("Hostname is %s\n", mHostname.c_str());
227 | 		++s;
228 | 		if (uri != s) {
229 | 			mPortStr.assign(s, uri - s);
230 | 			mPort = strtol(mPortStr.c_str(), &endstr, 10);
231 | 			if (*endstr == '\0') {
232 | 				goto deal_with_path;
233 | 			}
234 | 			/* Invalid characters after ':' found */
235 | 			DEBUGP("Throwing invalid url exception\n");
236 | 			throw Exception("Invalid character after ':'");
237 | 		}
238 | 		this->mPort = port_of_Scheme(mScheme.c_str());
239 | 		goto deal_with_path;
240 | 	}
241 | 
242 | 	/* first colon delimits username:password */
243 | 	s1 = (char *)memchr(hostinfo, ':', s - hostinfo);
244 | 	if (s1) {
245 | 		mUser.assign(hostinfo, s1 - hostinfo);
246 | 		++s1;
247 | 		mPassword.assign(s1, s - s1);
248 | 	}
249 | 	else {
250 | 		mUser.assign(hostinfo, s - hostinfo);
251 | 	}
252 | 	hostinfo = s + 1;
253 | 	goto deal_with_host;
254 | }
255 | 
256 | Uri::~Uri() {
257 | }
258 | 
259 | string Uri::scheme() const { return mScheme; }
260 | 
261 | void Uri::scheme(string scheme) {
262 | 	mScheme = scheme;
263 | }
264 | string Uri::user() const { return mUser; }
265 | void Uri::user(string user) {
266 | 	mUser = user;
267 | }
268 | string Uri::password() const { return mPassword; }
269 | void Uri::password(string password) {
270 | 	mPassword = password;
271 | }
272 | string Uri::hostname() const { return mHostname; }
273 | void Uri::hostname(string hostname) {
274 | 	mHostname = hostname;
275 | }
276 | string Uri::path() const { return mPath; }
277 | void Uri::path(string path) {
278 | 	mPath = path;
279 | }
280 | bool Uri::existsFragment() const { return mExistsFragment; }
281 | void Uri::existsFragment(bool existsFragment) {
282 | 	mExistsFragment = existsFragment;
283 | }
284 | bool Uri::existsQuery() const { return mExistsQuery; }
285 | void Uri::existsQuery(bool existsQuery) {
286 | 	mExistsQuery = existsQuery;
287 | }
288 | string Uri::query() const { return mQuery; }
289 | void Uri::query(string query) {
290 | 	mQuery = query;
291 | }
292 | string Uri::fragment() const { return mFragment; }
293 | void Uri::fragment(string fragment) {
294 | 	mFragment = fragment;
295 | }
296 | unsigned int Uri::port() const { return mPort; }
297 | void Uri::port(unsigned int port) { mPort = port; }
298 | 
/* Base names and extensions that unparse() looks for at the very end of the
 * path when Uri::REMOVE_DEFAULT_FILENAMES is set: a path ending in one of
 * the names directly followed by one of the extensions (for instance
 * "index.html") has that trailing part cut off.  Matching is case-sensitive
 * and both lists are terminated by a NULL entry. */
static const char *default_filenames[] = { "index", "default", NULL };
static const char *default_extensions[] = { ".html", ".htm", ".php", ".shtml", ".asp", ".cgi", NULL };
301 | 
302 | static unsigned short default_port_for_scheme(const char *scheme_str)
303 | {
304 | 	schemes_t *scheme;
305 | 
306 | 	if (scheme_str == NULL)
307 | 		return 0;
308 | 
309 | 	for (scheme = schemes; scheme->name != NULL; ++scheme)
310 | 		if (strcasecmp(scheme_str, scheme->name) == 0)
311 | 			return scheme->default_port;
312 | 
313 | 	return 0;
314 | }
315 | 
316 | Uri Uri::absolute(const Uri &base) const
317 | {
318 | 	if (mScheme.empty())
319 | 	{
320 | 		Uri root(base);
321 | 
322 | 		if (root.mPath.empty()) root.mPath = "/";
323 | 
324 | 		if (mPath.empty())
325 | 		{
326 | 			if (mExistsQuery)
327 | 			{
328 | 				root.mQuery = mQuery;
329 | 				root.mExistsQuery = mExistsQuery;
330 | 				root.mFragment = mFragment;
331 | 				root.mExistsFragment = mExistsFragment;
332 | 			}
333 | 			else if (mExistsFragment)
334 | 			{
335 | 				root.mFragment = mFragment;
336 | 				root.mExistsFragment = mExistsFragment;
337 | 			}
338 | 		}
339 | 		else if (mPath[0] == '/')
340 | 		{
341 | 			root.mPath = mPath;
342 | 			root.mQuery = mQuery;
343 | 			root.mExistsQuery = mExistsQuery;
344 | 			root.mFragment = mFragment;
345 | 			root.mExistsFragment = mExistsFragment;
346 | 		}
347 | 		else
348 | 		{
349 | 			string path(root.mPath);
350 | 			string::size_type find;
351 | 			find = path.rfind("/");
352 | 			if (find != string::npos) path.erase(find+1);
353 | 			path += mPath;
354 | 			root.mPath = path;
355 | 			root.mQuery = mQuery;
356 | 			root.mExistsQuery = mExistsQuery;
357 | 			root.mFragment = mFragment;
358 | 			root.mExistsFragment = mExistsFragment;
359 | 		}
360 | 
361 | 		return root;
362 | 	}
363 | 
364 | 	if (mPath.empty())
365 | 	{
366 | 		Uri root(*this);
367 | 		root.mPath = "/";
368 | 
369 | 		return root;
370 | 	}
371 | 
372 | 	return *this;
373 | }
374 | 
375 | string Uri::unparse(int flags ) const
376 | {
377 | 	string ret;
378 | 	ret.reserve(mScheme.length() + mUser.length() + mPassword.length() + mHostname.length() + mPath.length() + mQuery.length() + mFragment.length() + mPortStr.length());
379 | 
380 | 	DEBUGP("Unparsing scheme\n");
381 | 	if(!(Uri::REMOVE_SCHEME & flags)) {
382 | 		if(!mScheme.empty()) {
383 | 			ret +=  mScheme;
384 | 			ret += "://";
385 | 		}
386 | 	}
387 | 	DEBUGP("Unparsing hostname\n");
388 | 	if(!mHostname.empty()) { 
389 | 		size_t offset = 0;
390 | 		if(flags & Uri::REMOVE_WWW_PREFIX && mHostname.length() > 3) {
391 | 			offset = wwwPrefixOffset(mHostname);
392 | 		}
393 | 		ret += (mHostname.c_str() + offset);
394 | 	}
395 | 	DEBUGP("Unparsing port\n");
396 | 	if (!mPortStr.empty() && !(!mScheme.empty() && mPort == default_port_for_scheme(mScheme.c_str())))
397 | 	{
398 | 		ret += ':';
399 | 		ret += mPortStr;
400 | 	}
401 | 	DEBUGP("Unparsing path\n");
402 | 	if(!mPath.empty()) 
403 | 	{
404 | 		char *buf = new char[mPath.length() + 1];
405 | 		memcpy(buf, mPath.c_str(), mPath.length() + 1);
406 | 		if(flags & Uri::REMOVE_DEFAULT_FILENAMES) {
407 | 			const char **ptr = default_extensions;
408 | 			char *end = buf + mPath.length();
409 | 			size_t offset = 0;
410 | 			while(*ptr != NULL) {
411 | 				size_t len = strlen(*ptr);
412 | 				if((strcmp(end - len, *ptr)) == 0) {
413 | 					offset = len;
414 | 					break;
415 | 				}
416 | 				++ptr;
417 | 			} 
418 | 			if(offset == 0) goto remove_bar;
419 | 			ptr = default_filenames;
420 | 			bool found = false;
421 | 			while(*ptr != NULL) {
422 | 				size_t len = strlen(*ptr);
423 | 				if(strncmp(end - offset - len, *ptr, len) == 0) {
424 | 					offset += len; 
425 | 					found = true;
426 | 					break;
427 | 				}
428 | 				++ptr;
429 | 			}
430 | 			if(found) {
431 | 				*(end - offset) = 0; //cut filename
432 | 			}
433 | 
434 | 		}
435 | 		remove_bar:
436 | 		if(flags & Uri::REMOVE_TRAILING_BAR) {
437 | 			if(strlen(buf) > 1 && buf[strlen(buf) - 1] == '/') { //do not remove if path is only the bar
438 | 				buf[strlen(buf) - 1] = 0;
439 | 			}
440 | 		} 
441 | 		ret += buf;
442 | 		delete [] buf;
443 | 	}
444 | 	DEBUGP("Unparsing query\n");
445 | 	if(!(flags & Uri::REMOVE_QUERY) && mExistsQuery) {
446 | 		ret += '?';
447 | 		if(flags & Uri::REMOVE_QUERY_VALUES) {
448 | 			const char *ptr = mQuery.c_str();
449 | 			bool inside = false;
450 | 			while(*ptr) {
451 | 				if(*ptr == '=') {
452 | 					inside = true;
453 | 				}
454 | 				if(*ptr == '&') {
455 | 					inside = false;
456 | 				}
457 | 				if(inside) {
458 | 					++ptr;
459 | 				} else {
460 | 					ret += *ptr;
461 | 					++ptr;
462 | 				}
463 | 			}
464 | 		} else {
465 | 			ret += mQuery;
466 | 		}
467 | 	}
468 | 	DEBUGP("Unparsing fragment\n");
469 | 	if(!(flags & Uri::REMOVE_FRAGMENT) && mExistsFragment)
470 | 	{
471 | 		ret += '#';
472 | 		ret += mFragment;
473 | 	}
474 | 
475 | 	return ret;
476 | }
477 | 
/**
 * Measures a leading "www." or "www<digit>." label of a host name.
 *
 * The "www" part is matched case-insensitively.
 *
 * @param hostname host name to inspect
 * @return 4 for "www.", 5 for "www" + one decimal digit + ".", 0 otherwise
 */
static size_t wwwPrefixOffset(const std::string& hostname) 
{
	const std::string::size_type len = hostname.length();

	if (strncasecmp("www", hostname.c_str(), 3) != 0)
	{
		return 0;
	}
	if (len > 3 && hostname[3] == '.')
	{
		return 4;
	}
	/* isdigit() is only defined for unsigned char values (and EOF), so a
	 * possibly negative char must be converted before the call */
	if (len > 4 && isdigit(static_cast<unsigned char>(hostname[3])) && hostname[4] == '.')
	{
		return 5;
	}
	return 0;
}
494 | 
495 | 
496 | 	
497 | 
498 | std::string Uri::canonicalHostname(unsigned int maxDepth) const
499 | {
500 | 
501 | 	size_t prefixOffset = wwwPrefixOffset(mHostname);
502 | 	size_t suffixOffset = tldOffset(mHostname.c_str());
503 | 	unsigned int depth = 0;
504 | 	string::const_iterator canonicalStart = mHostname.begin() + prefixOffset;
505 | 	string::const_iterator ptr = mHostname.begin();
506 | 	ptr += mHostname.length() - suffixOffset;
507 | 	while (depth < maxDepth && ptr > canonicalStart) 
508 | 	{
509 | 		--ptr;
510 | 		if (*ptr == '.') ++depth;
511 | 	}
512 | 	if (*ptr == '.') ++ptr;
513 | 	return string(ptr, mHostname.end());
514 | }
515 | 
/** Returns the value (0-15) of a hexadecimal digit, or -1 for any other byte. */
static int hexDigitValue(unsigned char c)
{
	if (c >= '0' && c <= '9') return c - '0';
	if (c >= 'A' && c <= 'F') return c - 'A' + 10;
	if (c >= 'a' && c <= 'f') return c - 'a' + 10;
	return -1;
}

/**
 * Decodes the percent-escapes ("%XX") of a string.
 *
 * Both upper- and lower-case hexadecimal digits are accepted.  Input is
 * processed as a C string, i.e. up to the first NUL byte.
 *
 * Malformed escapes keep their historical treatment: a '%' followed by at
 * least one character that does not form two hex digits is dropped and the
 * characters after it are copied normally ("%zz" -> "zz"); a '%' that is the
 * very last character is copied unchanged.
 *
 * Note from RFC1630:  "Sequences which start with a percent sign
 * but are not followed by two hexadecimal characters (0-9,A-F) are reserved
 * for future extension"
 *
 * @param uri text to decode
 * @return the decoded text
 */
std::string Uri::decode(const std::string &uri)
{
	const unsigned char *ptr = (const unsigned char *)uri.c_str();
	std::string ret;
	ret.reserve(uri.length());
	for (; *ptr; ++ptr)
	{
		if (*ptr == '%' && ptr[1] != 0)
		{
			/* ptr[2] is at worst the terminating NUL, which is not a hex digit */
			const int hi = hexDigitValue(ptr[1]);
			const int lo = hexDigitValue(ptr[2]);
			if (hi < 0 || lo < 0) continue;   /* malformed: skip the '%' only */
			ret += (char)((hi << 4) | lo);
			ptr += 2;
			continue;
		}
		ret += *ptr;
	}
	return ret;
}
547 | 
548 | //This vector is generated by safechars.py. Please do not edit by hand.
549 | static const char safe[] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
550 | 
551 | 
552 | std::string Uri::encode(const std::string &uri)
553 | {
554 | 	string ret;	
555 | 	const unsigned char *ptr = (const unsigned char *)uri.c_str();
556 | 	ret.reserve(uri.length());
557 | 
558 | 	for (; *ptr ; ++ptr)
559 | 	{
560 | 		if (!safe[*ptr]) 
561 | 		{
562 | 			char buf[5];
563 | 			memset(buf, 0, 5);
564 | 			snprintf(buf, 5, "%%%X", (*ptr));
565 | 			ret.append(buf); 	
566 | 		}
567 | 		else 
568 | 		{
569 | 			ret += *ptr;
570 | 		}
571 | 	}
572 | 	return ret;
573 | }
574 | 
575 | 


--------------------------------------------------------------------------------
/LGPL_V2:
--------------------------------------------------------------------------------
  1 | 		  GNU LIBRARY GENERAL PUBLIC LICENSE
  2 | 		       Version 2, June 1991
  3 | 
  4 |  Copyright (C) 1991 Free Software Foundation, Inc.
  5 |                     51 Franklin Street, Fifth Floor
  6 |                     Boston, MA 02110-1301, USA.
  7 |  Everyone is permitted to copy and distribute verbatim copies
  8 |  of this license document, but changing it is not allowed.
  9 | 
 10 | [This is the first released version of the library GPL.  It is
 11 |  numbered 2 because it goes with version 2 of the ordinary GPL.]
 12 | 
 13 | 			    Preamble
 14 | 
 15 |   The licenses for most software are designed to take away your
 16 | freedom to share and change it.  By contrast, the GNU General Public
 17 | Licenses are intended to guarantee your freedom to share and change
 18 | free software--to make sure the software is free for all its users.
 19 | 
 20 |   This license, the Library General Public License, applies to some
 21 | specially designated Free Software Foundation software, and to any
 22 | other libraries whose authors decide to use it.  You can use it for
 23 | your libraries, too.
 24 | 
 25 |   When we speak of free software, we are referring to freedom, not
 26 | price.  Our General Public Licenses are designed to make sure that you
 27 | have the freedom to distribute copies of free software (and charge for
 28 | this service if you wish), that you receive source code or can get it
 29 | if you want it, that you can change the software or use pieces of it
 30 | in new free programs; and that you know you can do these things.
 31 | 
 32 |   To protect your rights, we need to make restrictions that forbid
 33 | anyone to deny you these rights or to ask you to surrender the rights.
 34 | These restrictions translate to certain responsibilities for you if
 35 | you distribute copies of the library, or if you modify it.
 36 | 
 37 |   For example, if you distribute copies of the library, whether gratis
 38 | or for a fee, you must give the recipients all the rights that we gave
 39 | you.  You must make sure that they, too, receive or can get the source
 40 | code.  If you link a program with the library, you must provide
 41 | complete object files to the recipients so that they can relink them
 42 | with the library, after making changes to the library and recompiling
 43 | it.  And you must show them these terms so they know their rights.
 44 | 
 45 |   Our method of protecting your rights has two steps: (1) copyright
 46 | the library, and (2) offer you this license which gives you legal
 47 | permission to copy, distribute and/or modify the library.
 48 | 
 49 |   Also, for each distributor's protection, we want to make certain
 50 | that everyone understands that there is no warranty for this free
 51 | library.  If the library is modified by someone else and passed on, we
 52 | want its recipients to know that what they have is not the original
 53 | version, so that any problems introduced by others will not reflect on
 54 | the original authors' reputations.
 55 | 
 56 |   Finally, any free program is threatened constantly by software
 57 | patents.  We wish to avoid the danger that companies distributing free
 58 | software will individually obtain patent licenses, thus in effect
 59 | transforming the program into proprietary software.  To prevent this,
 60 | we have made it clear that any patent must be licensed for everyone's
 61 | free use or not licensed at all.
 62 | 
 63 |   Most GNU software, including some libraries, is covered by the ordinary
 64 | GNU General Public License, which was designed for utility programs.  This
 65 | license, the GNU Library General Public License, applies to certain
 66 | designated libraries.  This license is quite different from the ordinary
 67 | one; be sure to read it in full, and don't assume that anything in it is
 68 | the same as in the ordinary license.
 69 | 
 70 |   The reason we have a separate public license for some libraries is that
 71 | they blur the distinction we usually make between modifying or adding to a
 72 | program and simply using it.  Linking a program with a library, without
 73 | changing the library, is in some sense simply using the library, and is
 74 | analogous to running a utility program or application program.  However, in
 75 | a textual and legal sense, the linked executable is a combined work, a
 76 | derivative of the original library, and the ordinary General Public License
 77 | treats it as such.
 78 | 
 79 |   Because of this blurred distinction, using the ordinary General
 80 | Public License for libraries did not effectively promote software
 81 | sharing, because most developers did not use the libraries.  We
 82 | concluded that weaker conditions might promote sharing better.
 83 | 
 84 |   However, unrestricted linking of non-free programs would deprive the
 85 | users of those programs of all benefit from the free status of the
 86 | libraries themselves.  This Library General Public License is intended to
 87 | permit developers of non-free programs to use free libraries, while
 88 | preserving your freedom as a user of such programs to change the free
 89 | libraries that are incorporated in them.  (We have not seen how to achieve
 90 | this as regards changes in header files, but we have achieved it as regards
 91 | changes in the actual functions of the Library.)  The hope is that this
 92 | will lead to faster development of free libraries.
 93 | 
 94 |   The precise terms and conditions for copying, distribution and
 95 | modification follow.  Pay close attention to the difference between a
 96 | "work based on the library" and a "work that uses the library".  The
 97 | former contains code derived from the library, while the latter only
 98 | works together with the library.
 99 | 
100 |   Note that it is possible for a library to be covered by the ordinary
101 | General Public License rather than by this special one.
102 | 
103 | 		  GNU LIBRARY GENERAL PUBLIC LICENSE
104 |    TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
105 | 
106 |   0. This License Agreement applies to any software library which
107 | contains a notice placed by the copyright holder or other authorized
108 | party saying it may be distributed under the terms of this Library
109 | General Public License (also called "this License").  Each licensee is
110 | addressed as "you".
111 | 
112 |   A "library" means a collection of software functions and/or data
113 | prepared so as to be conveniently linked with application programs
114 | (which use some of those functions and data) to form executables.
115 | 
116 |   The "Library", below, refers to any such software library or work
117 | which has been distributed under these terms.  A "work based on the
118 | Library" means either the Library or any derivative work under
119 | copyright law: that is to say, a work containing the Library or a
120 | portion of it, either verbatim or with modifications and/or translated
121 | straightforwardly into another language.  (Hereinafter, translation is
122 | included without limitation in the term "modification".)
123 | 
124 |   "Source code" for a work means the preferred form of the work for
125 | making modifications to it.  For a library, complete source code means
126 | all the source code for all modules it contains, plus any associated
127 | interface definition files, plus the scripts used to control compilation
128 | and installation of the library.
129 | 
130 |   Activities other than copying, distribution and modification are not
131 | covered by this License; they are outside its scope.  The act of
132 | running a program using the Library is not restricted, and output from
133 | such a program is covered only if its contents constitute a work based
134 | on the Library (independent of the use of the Library in a tool for
135 | writing it).  Whether that is true depends on what the Library does
136 | and what the program that uses the Library does.
137 |   
138 |   1. You may copy and distribute verbatim copies of the Library's
139 | complete source code as you receive it, in any medium, provided that
140 | you conspicuously and appropriately publish on each copy an
141 | appropriate copyright notice and disclaimer of warranty; keep intact
142 | all the notices that refer to this License and to the absence of any
143 | warranty; and distribute a copy of this License along with the
144 | Library.
145 | 
146 |   You may charge a fee for the physical act of transferring a copy,
147 | and you may at your option offer warranty protection in exchange for a
148 | fee.
149 | 
150 |   2. You may modify your copy or copies of the Library or any portion
151 | of it, thus forming a work based on the Library, and copy and
152 | distribute such modifications or work under the terms of Section 1
153 | above, provided that you also meet all of these conditions:
154 | 
155 |     a) The modified work must itself be a software library.
156 | 
157 |     b) You must cause the files modified to carry prominent notices
158 |     stating that you changed the files and the date of any change.
159 | 
160 |     c) You must cause the whole of the work to be licensed at no
161 |     charge to all third parties under the terms of this License.
162 | 
163 |     d) If a facility in the modified Library refers to a function or a
164 |     table of data to be supplied by an application program that uses
165 |     the facility, other than as an argument passed when the facility
166 |     is invoked, then you must make a good faith effort to ensure that,
167 |     in the event an application does not supply such function or
168 |     table, the facility still operates, and performs whatever part of
169 |     its purpose remains meaningful.
170 | 
171 |     (For example, a function in a library to compute square roots has
172 |     a purpose that is entirely well-defined independent of the
173 |     application.  Therefore, Subsection 2d requires that any
174 |     application-supplied function or table used by this function must
175 |     be optional: if the application does not supply it, the square
176 |     root function must still compute square roots.)
177 | 
178 | These requirements apply to the modified work as a whole.  If
179 | identifiable sections of that work are not derived from the Library,
180 | and can be reasonably considered independent and separate works in
181 | themselves, then this License, and its terms, do not apply to those
182 | sections when you distribute them as separate works.  But when you
183 | distribute the same sections as part of a whole which is a work based
184 | on the Library, the distribution of the whole must be on the terms of
185 | this License, whose permissions for other licensees extend to the
186 | entire whole, and thus to each and every part regardless of who wrote
187 | it.
188 | 
189 | Thus, it is not the intent of this section to claim rights or contest
190 | your rights to work written entirely by you; rather, the intent is to
191 | exercise the right to control the distribution of derivative or
192 | collective works based on the Library.
193 | 
194 | In addition, mere aggregation of another work not based on the Library
195 | with the Library (or with a work based on the Library) on a volume of
196 | a storage or distribution medium does not bring the other work under
197 | the scope of this License.
198 | 
199 |   3. You may opt to apply the terms of the ordinary GNU General Public
200 | License instead of this License to a given copy of the Library.  To do
201 | this, you must alter all the notices that refer to this License, so
202 | that they refer to the ordinary GNU General Public License, version 2,
203 | instead of to this License.  (If a newer version than version 2 of the
204 | ordinary GNU General Public License has appeared, then you can specify
205 | that version instead if you wish.)  Do not make any other change in
206 | these notices.
207 | 
208 |   Once this change is made in a given copy, it is irreversible for
209 | that copy, so the ordinary GNU General Public License applies to all
210 | subsequent copies and derivative works made from that copy.
211 | 
212 |   This option is useful when you wish to copy part of the code of
213 | the Library into a program that is not a library.
214 | 
215 |   4. You may copy and distribute the Library (or a portion or
216 | derivative of it, under Section 2) in object code or executable form
217 | under the terms of Sections 1 and 2 above provided that you accompany
218 | it with the complete corresponding machine-readable source code, which
219 | must be distributed under the terms of Sections 1 and 2 above on a
220 | medium customarily used for software interchange.
221 | 
222 |   If distribution of object code is made by offering access to copy
223 | from a designated place, then offering equivalent access to copy the
224 | source code from the same place satisfies the requirement to
225 | distribute the source code, even though third parties are not
226 | compelled to copy the source along with the object code.
227 | 
228 |   5. A program that contains no derivative of any portion of the
229 | Library, but is designed to work with the Library by being compiled or
230 | linked with it, is called a "work that uses the Library".  Such a
231 | work, in isolation, is not a derivative work of the Library, and
232 | therefore falls outside the scope of this License.
233 | 
234 |   However, linking a "work that uses the Library" with the Library
235 | creates an executable that is a derivative of the Library (because it
236 | contains portions of the Library), rather than a "work that uses the
237 | library".  The executable is therefore covered by this License.
238 | Section 6 states terms for distribution of such executables.
239 | 
240 |   When a "work that uses the Library" uses material from a header file
241 | that is part of the Library, the object code for the work may be a
242 | derivative work of the Library even though the source code is not.
243 | Whether this is true is especially significant if the work can be
244 | linked without the Library, or if the work is itself a library.  The
245 | threshold for this to be true is not precisely defined by law.
246 | 
247 |   If such an object file uses only numerical parameters, data
248 | structure layouts and accessors, and small macros and small inline
249 | functions (ten lines or less in length), then the use of the object
250 | file is unrestricted, regardless of whether it is legally a derivative
251 | work.  (Executables containing this object code plus portions of the
252 | Library will still fall under Section 6.)
253 | 
254 |   Otherwise, if the work is a derivative of the Library, you may
255 | distribute the object code for the work under the terms of Section 6.
256 | Any executables containing that work also fall under Section 6,
257 | whether or not they are linked directly with the Library itself.
258 | 
259 |   6. As an exception to the Sections above, you may also compile or
260 | link a "work that uses the Library" with the Library to produce a
261 | work containing portions of the Library, and distribute that work
262 | under terms of your choice, provided that the terms permit
263 | modification of the work for the customer's own use and reverse
264 | engineering for debugging such modifications.
265 | 
266 |   You must give prominent notice with each copy of the work that the
267 | Library is used in it and that the Library and its use are covered by
268 | this License.  You must supply a copy of this License.  If the work
269 | during execution displays copyright notices, you must include the
270 | copyright notice for the Library among them, as well as a reference
271 | directing the user to the copy of this License.  Also, you must do one
272 | of these things:
273 | 
274 |     a) Accompany the work with the complete corresponding
275 |     machine-readable source code for the Library including whatever
276 |     changes were used in the work (which must be distributed under
277 |     Sections 1 and 2 above); and, if the work is an executable linked
278 |     with the Library, with the complete machine-readable "work that
279 |     uses the Library", as object code and/or source code, so that the
280 |     user can modify the Library and then relink to produce a modified
281 |     executable containing the modified Library.  (It is understood
282 |     that the user who changes the contents of definitions files in the
283 |     Library will not necessarily be able to recompile the application
284 |     to use the modified definitions.)
285 | 
286 |     b) Accompany the work with a written offer, valid for at
287 |     least three years, to give the same user the materials
288 |     specified in Subsection 6a, above, for a charge no more
289 |     than the cost of performing this distribution.
290 | 
291 |     c) If distribution of the work is made by offering access to copy
292 |     from a designated place, offer equivalent access to copy the above
293 |     specified materials from the same place.
294 | 
295 |     d) Verify that the user has already received a copy of these
296 |     materials or that you have already sent this user a copy.
297 | 
298 |   For an executable, the required form of the "work that uses the
299 | Library" must include any data and utility programs needed for
300 | reproducing the executable from it.  However, as a special exception,
301 | the source code distributed need not include anything that is normally
302 | distributed (in either source or binary form) with the major
303 | components (compiler, kernel, and so on) of the operating system on
304 | which the executable runs, unless that component itself accompanies
305 | the executable.
306 | 
307 |   It may happen that this requirement contradicts the license
308 | restrictions of other proprietary libraries that do not normally
309 | accompany the operating system.  Such a contradiction means you cannot
310 | use both them and the Library together in an executable that you
311 | distribute.
312 | 
313 |   7. You may place library facilities that are a work based on the
314 | Library side-by-side in a single library together with other library
315 | facilities not covered by this License, and distribute such a combined
316 | library, provided that the separate distribution of the work based on
317 | the Library and of the other library facilities is otherwise
318 | permitted, and provided that you do these two things:
319 | 
320 |     a) Accompany the combined library with a copy of the same work
321 |     based on the Library, uncombined with any other library
322 |     facilities.  This must be distributed under the terms of the
323 |     Sections above.
324 | 
325 |     b) Give prominent notice with the combined library of the fact
326 |     that part of it is a work based on the Library, and explaining
327 |     where to find the accompanying uncombined form of the same work.
328 | 
329 |   8. You may not copy, modify, sublicense, link with, or distribute
330 | the Library except as expressly provided under this License.  Any
331 | attempt otherwise to copy, modify, sublicense, link with, or
332 | distribute the Library is void, and will automatically terminate your
333 | rights under this License.  However, parties who have received copies,
334 | or rights, from you under this License will not have their licenses
335 | terminated so long as such parties remain in full compliance.
336 | 
337 |   9. You are not required to accept this License, since you have not
338 | signed it.  However, nothing else grants you permission to modify or
339 | distribute the Library or its derivative works.  These actions are
340 | prohibited by law if you do not accept this License.  Therefore, by
341 | modifying or distributing the Library (or any work based on the
342 | Library), you indicate your acceptance of this License to do so, and
343 | all its terms and conditions for copying, distributing or modifying
344 | the Library or works based on it.
345 | 
346 |   10. Each time you redistribute the Library (or any work based on the
347 | Library), the recipient automatically receives a license from the
348 | original licensor to copy, distribute, link with or modify the Library
349 | subject to these terms and conditions.  You may not impose any further
350 | restrictions on the recipients' exercise of the rights granted herein.
351 | You are not responsible for enforcing compliance by third parties to
352 | this License.
353 | 
354 |   11. If, as a consequence of a court judgment or allegation of patent
355 | infringement or for any other reason (not limited to patent issues),
356 | conditions are imposed on you (whether by court order, agreement or
357 | otherwise) that contradict the conditions of this License, they do not
358 | excuse you from the conditions of this License.  If you cannot
359 | distribute so as to satisfy simultaneously your obligations under this
360 | License and any other pertinent obligations, then as a consequence you
361 | may not distribute the Library at all.  For example, if a patent
362 | license would not permit royalty-free redistribution of the Library by
363 | all those who receive copies directly or indirectly through you, then
364 | the only way you could satisfy both it and this License would be to
365 | refrain entirely from distribution of the Library.
366 | 
367 | If any portion of this section is held invalid or unenforceable under any
368 | particular circumstance, the balance of the section is intended to apply,
369 | and the section as a whole is intended to apply in other circumstances.
370 | 
371 | It is not the purpose of this section to induce you to infringe any
372 | patents or other property right claims or to contest validity of any
373 | such claims; this section has the sole purpose of protecting the
374 | integrity of the free software distribution system which is
375 | implemented by public license practices.  Many people have made
376 | generous contributions to the wide range of software distributed
377 | through that system in reliance on consistent application of that
378 | system; it is up to the author/donor to decide if he or she is willing
379 | to distribute software through any other system and a licensee cannot
380 | impose that choice.
381 | 
382 | This section is intended to make thoroughly clear what is believed to
383 | be a consequence of the rest of this License.
384 | 
385 |   12. If the distribution and/or use of the Library is restricted in
386 | certain countries either by patents or by copyrighted interfaces, the
387 | original copyright holder who places the Library under this License may add
388 | an explicit geographical distribution limitation excluding those countries,
389 | so that distribution is permitted only in or among countries not thus
390 | excluded.  In such case, this License incorporates the limitation as if
391 | written in the body of this License.
392 | 
393 |   13. The Free Software Foundation may publish revised and/or new
394 | versions of the Library General Public License from time to time.
395 | Such new versions will be similar in spirit to the present version,
396 | but may differ in detail to address new problems or concerns.
397 | 
398 | Each version is given a distinguishing version number.  If the Library
399 | specifies a version number of this License which applies to it and
400 | "any later version", you have the option of following the terms and
401 | conditions either of that version or of any later version published by
402 | the Free Software Foundation.  If the Library does not specify a
403 | license version number, you may choose any version ever published by
404 | the Free Software Foundation.
405 | 
406 |   14. If you wish to incorporate parts of the Library into other free
407 | programs whose distribution conditions are incompatible with these,
408 | write to the author to ask for permission.  For software which is
409 | copyrighted by the Free Software Foundation, write to the Free
410 | Software Foundation; we sometimes make exceptions for this.  Our
411 | decision will be guided by the two goals of preserving the free status
412 | of all derivatives of our free software and of promoting the sharing
413 | and reuse of software generally.
414 | 
415 | 			    NO WARRANTY
416 | 
417 |   15. BECAUSE THE LIBRARY IS LICENSED FREE OF CHARGE, THERE IS NO
418 | WARRANTY FOR THE LIBRARY, TO THE EXTENT PERMITTED BY APPLICABLE LAW.
419 | EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR
420 | OTHER PARTIES PROVIDE THE LIBRARY "AS IS" WITHOUT WARRANTY OF ANY
421 | KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE
422 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
423 | PURPOSE.  THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE
424 | LIBRARY IS WITH YOU.  SHOULD THE LIBRARY PROVE DEFECTIVE, YOU ASSUME
425 | THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
426 | 
427 |   16. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN
428 | WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY
429 | AND/OR REDISTRIBUTE THE LIBRARY AS PERMITTED ABOVE, BE LIABLE TO YOU
430 | FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR
431 | CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE
432 | LIBRARY (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING
433 | RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A
434 | FAILURE OF THE LIBRARY TO OPERATE WITH ANY OTHER SOFTWARE), EVEN IF
435 | SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH
436 | DAMAGES.
437 | 
438 | 		     END OF TERMS AND CONDITIONS
439 | 
440 |            How to Apply These Terms to Your New Libraries
441 | 
442 |   If you develop a new library, and you want it to be of the greatest
443 | possible use to the public, we recommend making it free software that
444 | everyone can redistribute and change.  You can do so by permitting
445 | redistribution under these terms (or, alternatively, under the terms of the
446 | ordinary General Public License).
447 | 
448 |   To apply these terms, attach the following notices to the library.  It is
449 | safest to attach them to the start of each source file to most effectively
450 | convey the exclusion of warranty; and each file should have at least the
451 | "copyright" line and a pointer to where the full notice is found.
452 | 
453 |     <one line to give the library's name and a brief idea of what it does.>
454 |     Copyright (C) <year>  <name of author>
455 | 
456 |     This library is free software; you can redistribute it and/or
457 |     modify it under the terms of the GNU Lesser General Public
458 |     License as published by the Free Software Foundation; either
459 |     version 2 of the License, or (at your option) any later version.
460 | 
461 |     This library is distributed in the hope that it will be useful,
462 |     but WITHOUT ANY WARRANTY; without even the implied warranty of
463 |     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
464 |     Lesser General Public License for more details.
465 | 
466 |     You should have received a copy of the GNU Lesser General Public
467 |     License along with this library; if not, write to the Free Software
468 |     Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
469 | 
470 | Also add information on how to contact you by electronic and paper mail.
471 | 
472 | You should also get your employer (if you work as a programmer) or your
473 | school, if any, to sign a "copyright disclaimer" for the library, if
474 | necessary.  Here is a sample; alter the names:
475 | 
476 |   Yoyodyne, Inc., hereby disclaims all copyright interest in the
477 |   library `Frob' (a library for tweaking knobs) written by James Random Hacker.
478 | 
479 |   <signature of Ty Coon>, 1 April 1990
480 |   Ty Coon, President of Vice
481 | 
482 | That's all there is to it!
483 | 


--------------------------------------------------------------------------------