├── NEWS ├── AUTHORS ├── html ├── tld.list ├── uriTests.cc ├── ParserSax.cc ├── wincstring.h ├── Extensions.h ├── CharsetConverter.h ├── Extensions.cc ├── ParserDom.h ├── Makefile.am ├── gen_tld.pl ├── ci_string.h ├── utils.h ├── debug.h ├── CharsetConverter.cc ├── ParserSax.h ├── Node.h ├── Node.cc ├── Uri.h ├── ParserDom.cc ├── profile.h ├── tld.h ├── tests.cc ├── ParserSax.tcc ├── utils.cc └── Uri.cc ├── css ├── css_lex.h ├── Makefile.am ├── parser.c ├── parser.h ├── parser_pp.h ├── css_syntax.h ├── css_lex.l ├── default.css ├── parser_pp.cc └── css_syntax.y ├── htmlcxx.pc.in ├── Makefile.am ├── COPYING ├── README.md ├── wingetopt.h ├── configure.ac ├── ChangeLog ├── htmlcxx.spec ├── htmlcxxapp.vcproj ├── README ├── htmlcxx.vcproj ├── htmlcxx.cc ├── wingetopt.c ├── ASF-2.0 ├── INSTALL └── LGPL_V2 /NEWS: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /AUTHORS: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/barmi/htmlcxx/HEAD/AUTHORS -------------------------------------------------------------------------------- /html/tld.list: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/barmi/htmlcxx/HEAD/html/tld.list -------------------------------------------------------------------------------- /html/uriTests.cc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/barmi/htmlcxx/HEAD/html/uriTests.cc -------------------------------------------------------------------------------- /css/css_lex.h: -------------------------------------------------------------------------------- 1 | #ifndef __CSS_LEX_H__ 2 | #define __CSS_LEX_H__ 3 | 4 | int init_yylex(const char *buffer, int buf_len); 5 | int end_yylex(); 6 | 7 | #endif 8 | 
-------------------------------------------------------------------------------- /html/ParserSax.cc: -------------------------------------------------------------------------------- 1 | #include "ParserSax.h" 2 | 3 | void htmlcxx::HTML::ParserSax::parse(const std::string &html) 4 | { 5 | // std::cerr << "Parsing string" << std::endl; 6 | parse(html.c_str(), html.c_str() + html.length()); 7 | } 8 | -------------------------------------------------------------------------------- /htmlcxx.pc.in: -------------------------------------------------------------------------------- 1 | url=http://htmlcxx.sourceforge.net/ 2 | prefix=@prefix@ 3 | exec_prefix=@exec_prefix@ 4 | libdir=@libdir@ 5 | includedir=@includedir@ 6 | 7 | Name: htmlcxx 8 | Description: html and css apis for c++ 9 | Version: @VERSION@ 10 | Libs: -L${libdir} -lhtmlcxx -lcss_parser_pp -lcss_parser 11 | Cflags: -I${includedir} 12 | URL: ${url} 13 | -------------------------------------------------------------------------------- /Makefile.am: -------------------------------------------------------------------------------- 1 | SUBDIRS = html css 2 | 3 | bin_PROGRAMS = htmlcxx 4 | htmlcxx_SOURCES = htmlcxx.cc wingetopt.h 5 | 6 | htmlcxx_LDADD = html/libhtmlcxx.la css/libcss_parser_pp.la css/libcss_parser.la 7 | 8 | EXTRA_DIST = ASF-2.0 LGPL_V2 wingetopt.c htmlcxx.spec htmlcxx.vcproj htmlcxxapp.vcproj htmlcxx.pc.in 9 | 10 | pkgconfigdir = $(libdir)/pkgconfig 11 | pkgconfig_DATA = htmlcxx.pc 12 | -------------------------------------------------------------------------------- /COPYING: -------------------------------------------------------------------------------- 1 | The htmlcxx code is covered by the LGPL license found in the LGPL_V2 file in 2 | this distribution. The tree.hh code, used in this code, is covered by the 3 | license in the original distribution at http://tree.phi-sci.com/. Only 4 | HTML::Node instances, which the only instances used by htmlcxx, are available 5 | under the LGPL. 
The uri parsing code is a derivative work of Apache web server 6 | uri parsing routine and covered by the ASF-2.0 file in this distribution. 7 | -------------------------------------------------------------------------------- /html/wincstring.h: -------------------------------------------------------------------------------- 1 | #ifndef __WINCSTRING_H__ 2 | #define __WINCSTRING_H__ 3 | 4 | #include 5 | 6 | #if defined(WIN32) && !defined(__MINGW32__) 7 | 8 | /* 9 | * some functions have strange names on windows 10 | */ 11 | #ifndef strcasecmp 12 | #define strcasecmp _stricmp 13 | #endif 14 | #ifndef strncasecmp 15 | #define strncasecmp _strnicmp 16 | #endif 17 | #ifndef snprintf 18 | #define snprintf _snprintf 19 | #endif 20 | 21 | #endif 22 | 23 | #endif 24 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | htmlcxx 2 | ======= 3 | * a simple html and css parser written in C++. 
clone of http://htmlcxx.sourceforge.net/ 4 | * new code repository : https://sourceforge.net/p/htmlcxx/code/ 5 | 6 | install 7 | ------- 8 | * sudo apt install autoconf libtool 9 | 10 | * ref: https://stackoverflow.com/questions/22603163/automake-error-ltmain-sh-not-found 11 | 12 | * libtoolize 13 | * aclocal 14 | * autoheader 15 | * autoconf 16 | * automake --add-missing 17 | 18 | * ./configure 19 | * make 20 | -------------------------------------------------------------------------------- /html/Extensions.h: -------------------------------------------------------------------------------- 1 | #ifndef __EXTENSIONS_H__ 2 | #define __EXTENSIONS_H__ 3 | 4 | #include 5 | #include 6 | #include "ci_string.h" 7 | 8 | namespace htmlcxx 9 | { 10 | class Extensions 11 | { 12 | public: 13 | Extensions() {} 14 | 15 | Extensions(const std::string &exts); 16 | 17 | ~Extensions() {} 18 | 19 | bool check(const std::string &url); 20 | void insert(const ci_string &ext) { mExts.insert(ext); } 21 | 22 | private: 23 | std::set mExts; 24 | }; 25 | } 26 | 27 | #endif 28 | -------------------------------------------------------------------------------- /css/Makefile.am: -------------------------------------------------------------------------------- 1 | lib_LTLIBRARIES = libcss_parser_pp.la libcss_parser.la 2 | 3 | libcss_parser_pp_la_SOURCES = parser_pp.h parser_pp.cc parser.c 4 | libcss_parser_pp_la_LDFLAGS = -version-info 0:0:0 5 | 6 | libcss_parser_la_SOURCES = css_syntax.y css_lex.l css_lex.h css_syntax.h parser.c parser.h 7 | libcss_parser_la_LDFLAGS = -version-info 0:0:0 8 | 9 | AM_YFLAGS = -d 10 | AM_LFLAGS = -i 11 | 12 | includedir = $(prefix)/include/htmlcxx/css 13 | include_HEADERS = parser.h parser_pp.h 14 | 15 | EXTRA_DIST = default.css 16 | cssdir = $(pkgdatadir)/css 17 | css_DATA = default.css 18 | -------------------------------------------------------------------------------- /html/CharsetConverter.h: 
-------------------------------------------------------------------------------- 1 | #ifndef __CHARSET_CONVERTER_H__ 2 | #define __CHARSET_CONVERTER_H__ 3 | 4 | #include 5 | #include 6 | #include 7 | 8 | namespace htmlcxx 9 | { 10 | class CharsetConverter 11 | { 12 | public: 13 | class Exception : public std::runtime_error 14 | { 15 | public: 16 | Exception(const std::string &arg) 17 | : std::runtime_error(arg) {} 18 | }; 19 | 20 | CharsetConverter(const std::string &from, const std::string &to) throw (Exception); 21 | ~CharsetConverter(); 22 | 23 | std::string convert(const std::string &input); 24 | 25 | private: 26 | iconv_t mIconvDescriptor;; 27 | }; 28 | } 29 | 30 | #endif 31 | -------------------------------------------------------------------------------- /wingetopt.h: -------------------------------------------------------------------------------- 1 | #ifdef __cplusplus 2 | extern "C" { 3 | #endif 4 | 5 | #if !defined(WIN32) || defined(__MINGW32__) 6 | #include 7 | #else 8 | #ifndef _GETOPT_ 9 | #define _GETOPT_ 10 | 11 | #include /* for EOF */ 12 | #include /* for strchr() */ 13 | 14 | char *optarg = NULL; /* pointer to the start of the option argument */ 15 | int optind = 1; /* number of the next argv[] to be evaluated */ 16 | int opterr = 1; /* non-zero if a question mark should be returned */ 17 | 18 | int getopt(int argc, char *argv[], char *opstring); 19 | #endif 20 | #endif 21 | 22 | #ifdef __cplusplus 23 | } 24 | #endif 25 | 26 | -------------------------------------------------------------------------------- /css/parser.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include "parser.h" 3 | 4 | void free_rulesets(struct selector_list_t *rules) { 5 | 6 | struct selector_list_t *pos = rules; 7 | while (pos != NULL) { 8 | struct selector_list_t *tmp = pos; 9 | struct selector_t *rule = pos->selector; 10 | struct property_t *property = rule->property; 11 | while (property != NULL) { 12 | struct 
property_t *tmp = property; 13 | property = property->next; 14 | tmp->count--; 15 | if (tmp->count == 0) { 16 | free(tmp->name); 17 | free(tmp->val); 18 | free(tmp); 19 | } 20 | } 21 | while (rule != NULL) { 22 | struct selector_t *tmp = rule; 23 | rule = rule->next; 24 | free(tmp->element_name); 25 | free(tmp->id); 26 | free(tmp->e_class); 27 | free(tmp); 28 | } 29 | pos = pos->next; 30 | free(tmp); 31 | } 32 | 33 | } 34 | -------------------------------------------------------------------------------- /html/Extensions.cc: -------------------------------------------------------------------------------- 1 | #include 2 | #include "Extensions.h" 3 | 4 | using namespace std; 5 | using namespace htmlcxx; 6 | 7 | Extensions::Extensions(const string &exts) 8 | { 9 | const char *begin = exts.c_str(); 10 | while (*begin) 11 | { 12 | while (*begin == ' ') ++begin; 13 | if (*begin == 0) break; 14 | 15 | const char *end = begin + 1; 16 | while (*end && *end != ' ') ++end; 17 | 18 | insert(ci_string(begin, end)); 19 | 20 | begin = end; 21 | } 22 | } 23 | 24 | bool Extensions::check(const string &url) 25 | { 26 | const char *slash; 27 | const char *dot; 28 | const char *question; 29 | 30 | question = strchr(url.c_str(), '?'); 31 | 32 | if (question) return false; 33 | 34 | slash = strrchr(url.c_str(), '/'); 35 | dot = strrchr(url.c_str(), '.'); 36 | 37 | if (slash >= dot) return false; 38 | 39 | ci_string ext(dot); 40 | 41 | return mExts.find(ext) != mExts.end(); 42 | } 43 | -------------------------------------------------------------------------------- /html/ParserDom.h: -------------------------------------------------------------------------------- 1 | #ifndef __HTML_PARSER_DOM_H__ 2 | #define __HTML_PARSER_DOM_H__ 3 | 4 | #include "ParserSax.h" 5 | #include "tree.h" 6 | 7 | namespace htmlcxx 8 | { 9 | namespace HTML 10 | { 11 | class ParserDom : public ParserSax 12 | { 13 | public: 14 | ParserDom() {} 15 | ~ParserDom() {} 16 | 17 | const tree &parseTree(const std::string 
#!/usr/bin/perl -w

# Generates the C source of the top-level-domain table and its lookup
# function (the emitted banner names tld.list as the input).
#
# NOTE(review): the input file handle was lost in the extracted source
# ("while()"). The diamond operator is assumed here: it reads the files named
# on the command line or, without arguments, standard input - confirm against
# the way the build invokes this script.

use strict;

# Collect the leading token (word characters and dots, followed by
# whitespace) of every input line, upper-cased.
my @tld;
while (my $line = <>) {
	chomp $line;
	if ($line =~ /^([\w\.]+)\s+/) {
		push @tld, uc $1;
	}
}

# An empty table would produce an invalid array initializer; fail loudly.
die "gen_tld.pl: no TLD entries found in input\n" unless @tld;

# Longest names first (ties in reverse alphabetical order): the generated
# lookup returns on the first match, so longer suffixes are tried first.
@tld = reverse sort { length($a) <=> length($b) || $a cmp $b } @tld;

print "/** Do not modify. This file is automatically generated\n";
print " * using gen_tld.pl and tld.list\n";
print " */\n\n";

print "static const char *tld[] = {\n";
print join(",\n", map { "\t\".$_\"" } @tld), "\n";
print "};\n";

# tldOffset() returns the length of the matching suffix (dot included), or 0.
# The length check keeps "end - len" inside the domain buffer when a table
# entry is longer than the domain itself.
print "static unsigned int tldOffset(const char *domain) {\n";
print "\tconst size_t domainLen = strlen(domain);\n";
print "\tconst char *end = domain + domainLen;\n";
print "\tfor(unsigned int i = 0; i < " . scalar(@tld) . "; ++i) {\n";
print "\t\tconst size_t len = strlen(tld[i]);\n";
print "\t\tif(len <= domainLen && strcasecmp(end - len, tld[i]) == 0) {\n";
print "\t\t\treturn (unsigned int)len;\n";
print "\t\t}\n";
print "\t}\n";
print "\treturn 0;\n";
print "}\n";
css_parse(const char *buffer, int buf_len); 42 | void free_rulesets(struct selector_list_t *rules); 43 | 44 | #ifdef __cplusplus 45 | } 46 | #endif 47 | 48 | #endif 49 | -------------------------------------------------------------------------------- /configure.ac: -------------------------------------------------------------------------------- 1 | # -*- Autoconf -*- 2 | # Process this file with autoconf to produce a configure script. 3 | 4 | #AC_PREREQ(2.57) 5 | AC_INIT(htmlcxx.cc) 6 | AM_INIT_AUTOMAKE(htmlcxx, 0.86) 7 | AM_CONFIG_HEADER([config.h]) 8 | AM_DISABLE_STATIC 9 | 10 | # Checks for programs. 11 | AC_PROG_CXX 12 | AC_PROG_CC 13 | AM_PROG_LEX 14 | AC_PROG_YACC 15 | AC_PROG_LIBTOOL 16 | 17 | # Checks for libraries. 18 | # Replace `main' with a function in -lfl: 19 | AC_CHECK_LIB(fl, main) 20 | AC_CHECK_LIB(iconv, iconv_open) 21 | 22 | 23 | # Checks for header files. 24 | AC_HEADER_STDC 25 | # AC_CHECK_HEADERS([]) 26 | 27 | # Checks for typedefs, structures, and compiler characteristics. 28 | AC_HEADER_STDBOOL 29 | AC_C_CONST 30 | AC_C_INLINE 31 | AC_TYPE_SIZE_T 32 | AC_HEADER_TIME 33 | 34 | # Checks for library functions. 
/**
 * Character traits that make std::basic_string compare its contents without
 * regard to case. Everything that is not overridden here is inherited from
 * std::char_traits<char>.
 */
struct ci_char_traits : public std::char_traits<char>
{
	static bool eq( char c1, char c2 ) {
		return lower(c1) == lower(c2);
	}

	static bool ne( char c1, char c2 ) {
		return lower(c1) != lower(c2);
	}

	static bool lt( char c1, char c2 ) {
		return lower(c1) < lower(c2);
	}

	/**
	 * Case-insensitive comparison of the first n characters of s1 and s2.
	 * All n characters are compared (an embedded NUL does not stop the
	 * comparison), which is what basic_string expects from its traits.
	 * @return negative, zero or positive, like memcmp
	 */
	static int compare( const char* s1,
			const char* s2,
			size_t n ) {
		for ( size_t i = 0; i < n; ++i ) {
			const int a = lower(s1[i]);
			const int b = lower(s2[i]);
			if ( a != b ) return a < b ? -1 : 1;
		}
		return 0;
	}

	/**
	 * Find the first of the n characters starting at s that equals a,
	 * ignoring case.
	 * @return pointer to the match, or a null pointer when there is none;
	 *         basic_string::find() relies on the null result to report npos
	 */
	static const char*
	find( const char* s, size_t n, char a ) {
		const int wanted = lower(a);
		for ( ; n > 0; --n, ++s ) {
			if ( lower(*s) == wanted ) return s;
		}
		return 0;
	}

private:
	// tolower() is only defined for values representable as unsigned char
	// (or EOF), so bytes >= 0x80 must not be passed as a negative char.
	static int lower( char c ) {
		return tolower( static_cast<unsigned char>(c) );
	}
};

/** std::string look-alike whose comparisons and searches ignore case. */
typedef std::basic_string<char, ci_char_traits> ci_string;
/**
 * printf-style debug output to stderr.
 *
 * The caller's format string is used unchanged. DEBUGP expands to this
 * function's bare name, so callers pass only their own arguments; there is
 * no file name or line number available to feed a "%s:%d " location prefix,
 * and adding such conversions would consume the caller's arguments with the
 * wrong types.
 *
 * @param format printf format string; a null pointer is ignored
 */
inline void debugprintf(const char *format, ...)
{
	va_list ap;
	if (!format) return;
	va_start(ap, format);
	vfprintf(stderr, format, ap);
	va_end(ap);
}
37 | {} 38 | #endif 39 | #endif 40 | 41 | #ifdef DEBUG 42 | #if defined(WIN32) && !defined(__MINGW32__) 43 | #define DEBUGP debugprintf 44 | #else 45 | #define DEBUGP(args...) do { fprintf(stderr, "%s:%d ", __FILE__, __LINE__); fprintf(stderr, ## args); } while(0) 46 | #endif 47 | #else 48 | #if defined(WIN32) && !defined(__MINGW32__) 49 | #define DEBUGP dummyprintf 50 | #else 51 | #define DEBUGP(args...) 52 | #endif 53 | #endif 54 | -------------------------------------------------------------------------------- /ChangeLog: -------------------------------------------------------------------------------- 1 | 2008-10-12 18:41 davi 2 | 3 | * Applied patch by Luca Bruno fixing gcc 4.3 compilation problems. 4 | 5 | 2007-08-11 20:23 davi 6 | 7 | * Bumped release again because sf.net problems. 8 | 9 | 2007-08-11 20:14 davi 10 | 11 | * Updated version. 12 | 13 | 2007-08-11 20:06 davi 14 | 15 | * 16 | Fixed long standing typename bug. 17 | 18 | 2006-06-16 12:45 davi 19 | 20 | * Added missing files. 21 | 22 | 2006-06-16 12:23 davi 23 | 24 | * Added missing files and incorporate some pending fixes. 25 | 26 | 2005-03-24 00:59 davi 27 | 28 | * Removed useless files. 29 | 30 | 2005-03-24 00:58 davi 31 | 32 | * Small fixes for distribution. 33 | 34 | 2005-02-22 01:47 davi 35 | 36 | * Lots of changes. Bumped version to 0.7.3. 37 | 38 | 2004-06-17 16:17 braga 39 | 40 | * Updated some documentation. 41 | 42 | 2004-06-16 13:38 braga 43 | 44 | * Commiting all changes from local cvs. 45 | 46 | 2004-03-27 10:56 davi 47 | 48 | * Added css parser. 49 | 50 | 2004-03-25 16:33 davi 51 | 52 | * Added sample htmlcxx application. 
This is a html/css1 parser whose policies try to mimic the behavior of Mozilla Firefox (http://www.mozilla.org), so you should expect parse trees similar to those created by Firefox. However, unlike Firefox, htmlcxx does not insert non-existent stuff in your html. Therefore, serializing the DOM tree gives exactly the same bytes contained in the original HTML document.
17 | 18 | %prep 19 | rm -Rf $RPM_BUILD_ROOT 20 | 21 | %setup 22 | mkdir $RPM_BUILD_ROOT 23 | mkdir $RPM_BUILD_ROOT/usr 24 | %configure --prefix=/usr 25 | 26 | %build 27 | make 28 | 29 | %install 30 | DESTDIR=$RPM_BUILD_ROOT/ make install 31 | 32 | %files 33 | %defattr(755,root,root) 34 | %_libdir 35 | %_bindir 36 | %_datadir 37 | 38 | %package dev 39 | Summary: Develoopment headers 40 | 41 | %description dev 42 | Development headers 43 | %files dev 44 | %_includedir 45 | 46 | %changelog 47 | * Thu Jun 16 2006 Davi de Castro Reis 48 | + Version 0.80 released 49 | * Thu Jun 9 2005 Davi de Castro Reis 50 | + Created spec 51 | -------------------------------------------------------------------------------- /html/CharsetConverter.cc: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include "CharsetConverter.h" 6 | 7 | using namespace std; 8 | using namespace htmlcxx; 9 | 10 | CharsetConverter::CharsetConverter(const string &from, const string &to) throw (Exception) 11 | { 12 | mIconvDescriptor = iconv_open(to.c_str(), from.c_str()); 13 | if (mIconvDescriptor == (iconv_t)(-1)) 14 | { 15 | const char *error_str = strerror(errno); 16 | int size = strlen(error_str) + from.length() + to.length() + 26; 17 | char error[size]; 18 | snprintf(error, size, "Can't convert from %s to %s: %s", from.c_str(), to.c_str(), error_str); 19 | throw Exception(error); 20 | } 21 | } 22 | 23 | CharsetConverter::~CharsetConverter() 24 | { 25 | iconv_close(mIconvDescriptor); 26 | } 27 | 28 | string CharsetConverter::convert(const string &input) 29 | { 30 | const char *inbuf = input.c_str(); 31 | size_t inbytesleft = input.length(); 32 | 33 | size_t outbuf_len = 2 * input.length(); 34 | char *outbuf_start = new char[outbuf_len]; 35 | char *outbuf = outbuf_start; 36 | size_t outbytesleft = outbuf_len; 37 | 38 | size_t ret; 39 | while (1) { 40 | ret = iconv(mIconvDescriptor, const_cast(&inbuf), &inbytesleft, 
&outbuf, &outbytesleft); 41 | if (ret == 0) break; 42 | if (ret == (size_t)-1 && errno == E2BIG) return string(); 43 | 44 | // fprintf(stderr, "invalid byte: %d\n", inbuf - input.c_str()); 45 | 46 | inbuf++; inbytesleft--; 47 | } 48 | 49 | string out(outbuf_start, outbuf_len - outbytesleft); 50 | 51 | delete [] outbuf_start; 52 | 53 | return out; 54 | } 55 | -------------------------------------------------------------------------------- /html/ParserSax.h: -------------------------------------------------------------------------------- 1 | #ifndef __HTML_PARSER_SAX_H__ 2 | #define __HTML_PARSER_SAX_H__ 3 | 4 | #include 5 | 6 | #include "Node.h" 7 | 8 | namespace htmlcxx 9 | { 10 | namespace HTML 11 | { 12 | class ParserSax 13 | { 14 | public: 15 | ParserSax() : mpLiteral(0), mCdata(false) {} 16 | virtual ~ParserSax() {} 17 | 18 | /** Parse the html code */ 19 | void parse(const std::string &html); 20 | 21 | template 22 | void parse(_Iterator begin, _Iterator end); 23 | 24 | protected: 25 | // Redefine this if you want to do some initialization before 26 | // the parsing 27 | virtual void beginParsing() {} 28 | 29 | virtual void foundTag(Node node, bool isEnd) {} 30 | virtual void foundText(Node node) {} 31 | virtual void foundComment(Node node) {} 32 | 33 | virtual void endParsing() {} 34 | 35 | 36 | template 37 | void parse(_Iterator &begin, _Iterator &end, 38 | std::forward_iterator_tag); 39 | 40 | template 41 | void parseHtmlTag(_Iterator b, _Iterator c); 42 | 43 | template 44 | void parseContent(_Iterator b, _Iterator c); 45 | 46 | template 47 | void parseComment(_Iterator b, _Iterator c); 48 | 49 | template 50 | _Iterator skipHtmlTag(_Iterator ptr, _Iterator end); 51 | 52 | template 53 | _Iterator skipHtmlComment(_Iterator ptr, _Iterator end); 54 | 55 | unsigned long mCurrentOffset; 56 | const char *mpLiteral; 57 | bool mCdata; 58 | }; 59 | 60 | }//namespace HTML 61 | }//namespace htmlcxx 62 | 63 | #include "ParserSax.tcc" 64 | 65 | #endif 66 | 
-------------------------------------------------------------------------------- /html/Node.h: -------------------------------------------------------------------------------- 1 | #ifndef __HTML_PARSER_NODE_H 2 | #define __HTML_PARSER_NODE_H 3 | 4 | #include 5 | #include 6 | #include 7 | 8 | namespace htmlcxx { 9 | namespace HTML { 10 | class Node { 11 | 12 | public: 13 | Node() {} 14 | //Node(const Node &rhs); //uses default 15 | ~Node() {} 16 | 17 | inline void text(const std::string& text) { this->mText = text; } 18 | inline const std::string& text() const { return this->mText; } 19 | 20 | inline void closingText(const std::string &text) { this->mClosingText = text; } 21 | inline const std::string& closingText() const { return mClosingText; } 22 | 23 | inline void offset(unsigned int offset) { this->mOffset = offset; } 24 | inline unsigned int offset() const { return this->mOffset; } 25 | 26 | inline void length(unsigned int length) { this->mLength = length; } 27 | inline unsigned int length() const { return this->mLength; } 28 | 29 | inline void tagName(const std::string& tagname) { this->mTagName = tagname; } 30 | inline const std::string& tagName() const { return this->mTagName; } 31 | 32 | bool isTag() const { return this->mIsHtmlTag; } 33 | void isTag(bool is_html_tag){ this->mIsHtmlTag = is_html_tag; } 34 | 35 | bool isComment() const { return this->mComment; } 36 | void isComment(bool comment){ this->mComment = comment; } 37 | 38 | std::pair attribute(const std::string &attr) const 39 | { 40 | std::map::const_iterator i = this->mAttributes.find(attr); 41 | if (i != this->mAttributes.end()) { 42 | return make_pair(true, i->second); 43 | } else { 44 | return make_pair(false, std::string()); 45 | } 46 | } 47 | 48 | operator std::string() const; 49 | std::ostream &operator<<(std::ostream &stream) const; 50 | 51 | const std::map& attributes() const { return this->mAttributes; } 52 | void parseAttributes(); 53 | 54 | bool operator==(const Node &rhs) const; 55 | 
56 | protected: 57 | 58 | std::string mText; 59 | std::string mClosingText; 60 | unsigned int mOffset; 61 | unsigned int mLength; 62 | std::string mTagName; 63 | std::map mAttributes; 64 | bool mIsHtmlTag; 65 | bool mComment; 66 | }; 67 | } 68 | } 69 | 70 | #endif 71 | -------------------------------------------------------------------------------- /css/parser_pp.h: -------------------------------------------------------------------------------- 1 | #ifndef __CSS_PARSER_PP_H__ 2 | #define __CSS_PARSER_PP_H__ 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | namespace htmlcxx { 10 | namespace CSS { 11 | 12 | extern const char *IE_CSS; 13 | class Parser 14 | { 15 | 16 | public: 17 | friend class Attribute; 18 | 19 | enum PseudoClass { NONE_CLASS, LINK, VISITED, ACTIVE }; 20 | enum PseudoElement { NONE_ELEMENT, FIRST_LETTER, FIRST_LINE }; 21 | 22 | class Selector 23 | { 24 | private: 25 | std::string mElement; 26 | std::string mId; 27 | std::string mEClass; 28 | PseudoClass mPsClass; 29 | PseudoElement mPsElement; 30 | 31 | public: 32 | Selector(); 33 | Selector(const std::string& e, const std::string& i, const std::string& c, const PseudoClass& pc, const PseudoElement &pe); 34 | void setElement(const std::string &str); 35 | void setId(const std::string &str); 36 | void setClass(const std::string &str); 37 | void setPseudoClass(enum PseudoClass p); 38 | void setPseudoElement(enum PseudoElement p); 39 | bool match(const Selector& s) const; 40 | bool operator==(const Selector& s) const; 41 | bool operator<(const Selector& s) const; 42 | friend std::ostream& operator<<(std::ostream& out, const Selector& s); 43 | }; 44 | 45 | private: 46 | static bool match(const std::vector& selector, const std::vector& path); 47 | 48 | class Attribute 49 | { 50 | public: 51 | Attribute() {} 52 | Attribute(const std::string& v, bool i) : mVal(v), mImportant(i) {} 53 | std::string mVal; 54 | bool mImportant; 55 | }; 56 | 57 | public: 58 | Parser() {} 59 | friend 
std::ostream& operator<<(std::ostream& out, const std::map& s); 60 | bool parse(const std::string& css); 61 | bool parse(const char *buf, int buf_len); 62 | void merge(const Parser& p); 63 | std::map getAttributes(const std::vector& sv) const; 64 | 65 | friend std::ostream& operator<<(std::ostream& out, const CSS::Parser& p); 66 | 67 | private: 68 | typedef std::map, std::map > RuleSet; 69 | RuleSet mRuleSets; 70 | }; 71 | 72 | std::string pse2str(const enum Parser::PseudoElement& s); 73 | std::string psc2str(const enum Parser::PseudoClass& s); 74 | std::ostream& operator<<(std::ostream& out, const std::map& s); 75 | 76 | }//namespace CSS 77 | }//namespace htmlcxx 78 | #endif 79 | -------------------------------------------------------------------------------- /html/Node.cc: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include "wincstring.h" 5 | #include "Node.h" 6 | 7 | //#define DEBUG 8 | #include "debug.h" 9 | 10 | using namespace std; 11 | using namespace htmlcxx; 12 | using namespace HTML; 13 | 14 | void Node::parseAttributes() 15 | { 16 | if (!(this->isTag())) return; 17 | 18 | const char *end; 19 | const char *ptr = mText.c_str(); 20 | if ((ptr = strchr(ptr, '<')) == 0) return; 21 | ++ptr; 22 | 23 | // Skip initial blankspace 24 | while (isspace(*ptr)) ++ptr; 25 | 26 | // Skip tagname 27 | if (!isalpha(*ptr)) return; 28 | while (!isspace(*ptr) && *ptr != '>') ++ptr; 29 | 30 | // Skip blankspace after tagname 31 | while (isspace(*ptr)) ++ptr; 32 | 33 | while (*ptr && *ptr != '>') 34 | { 35 | string key, val; 36 | 37 | // skip unrecognized 38 | while (*ptr && !isalnum(*ptr) && !isspace(*ptr)) ++ptr; 39 | 40 | // skip blankspace 41 | while (isspace(*ptr)) ++ptr; 42 | 43 | end = ptr; 44 | while (isalnum(*end) || *end == '-') ++end; 45 | key.assign(end - ptr, '\0'); 46 | transform(ptr, end, key.begin(), ::tolower); 47 | ptr = end; 48 | 49 | // skip blankspace 50 | while 
(isspace(*ptr)) ++ptr; 51 | 52 | if (*ptr == '=') 53 | { 54 | ++ptr; 55 | while (isspace(*ptr)) ++ptr; 56 | if (*ptr == '"' || *ptr == '\'') 57 | { 58 | char quote = *ptr; 59 | // fprintf(stderr, "Trying to find quote: %c\n", quote); 60 | const char *end = strchr(ptr + 1, quote); 61 | if (end == 0) 62 | { 63 | //b = mText.find_first_of(" >", a+1); 64 | const char *end1, *end2; 65 | end1 = strchr(ptr + 1, ' '); 66 | end2 = strchr(ptr + 1, '>'); 67 | if (end1 && end1 < end2) end = end1; 68 | else end = end2; 69 | if (end == 0) return; 70 | } 71 | const char *begin = ptr + 1; 72 | while (isspace(*begin) && begin < end) ++begin; 73 | const char *trimmed_end = end - 1; 74 | while (isspace(*trimmed_end) && trimmed_end >= begin) --trimmed_end; 75 | val.assign(begin, trimmed_end + 1); 76 | ptr = end + 1; 77 | } 78 | else 79 | { 80 | end = ptr; 81 | while (*end && !isspace(*end) && *end != '>') end++; 82 | val.assign(ptr, end); 83 | ptr = end; 84 | } 85 | 86 | // fprintf(stderr, "%s = %s\n", key.c_str(), val.c_str()); 87 | mAttributes.insert(make_pair(key, val)); 88 | } 89 | else 90 | { 91 | // fprintf(stderr, "D: %s\n", key.c_str()); 92 | if (!key.empty()) 93 | { 94 | mAttributes.insert(make_pair(key, string())); 95 | } 96 | } 97 | } 98 | } 99 | 100 | bool Node::operator==(const Node &n) const 101 | { 102 | if (!isTag() || !n.isTag()) return false; 103 | return !(strcasecmp(tagName().c_str(), n.tagName().c_str())); 104 | } 105 | 106 | Node::operator string() const { 107 | if (isTag()) return this->tagName(); 108 | return this->text(); 109 | } 110 | 111 | ostream &Node::operator<<(ostream &stream) const { 112 | stream << (string)(*this); 113 | return stream; 114 | } 115 | -------------------------------------------------------------------------------- /css/css_syntax.h: -------------------------------------------------------------------------------- 1 | /* A Bison parser, made by GNU Bison 3.0.4. 
*/ 2 | 3 | /* Bison interface for Yacc-like parsers in C 4 | 5 | Copyright (C) 1984, 1989-1990, 2000-2015 Free Software Foundation, Inc. 6 | 7 | This program is free software: you can redistribute it and/or modify 8 | it under the terms of the GNU General Public License as published by 9 | the Free Software Foundation, either version 3 of the License, or 10 | (at your option) any later version. 11 | 12 | This program is distributed in the hope that it will be useful, 13 | but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 | GNU General Public License for more details. 16 | 17 | You should have received a copy of the GNU General Public License 18 | along with this program. If not, see . */ 19 | 20 | /* As a special exception, you may create a larger work that contains 21 | part or all of the Bison parser skeleton and distribute that work 22 | under terms of your choice, so long as that work isn't itself a 23 | parser generator using the skeleton or a modified version thereof 24 | as a parser skeleton. Alternatively, if you modify or redistribute 25 | the parser skeleton itself, you may (at your option) remove this 26 | special exception, which will cause the skeleton and the resulting 27 | Bison output files to be licensed under the GNU General Public 28 | License without this special exception. 29 | 30 | This special exception was added by the Free Software Foundation in 31 | version 2.2 of Bison. */ 32 | 33 | #ifndef YY_YY_CSS_SYNTAX_H_INCLUDED 34 | # define YY_YY_CSS_SYNTAX_H_INCLUDED 35 | /* Debug traces. */ 36 | #ifndef YYDEBUG 37 | # define YYDEBUG 0 38 | #endif 39 | #if YYDEBUG 40 | extern int yydebug; 41 | #endif 42 | 43 | /* Token type. 
*/ 44 | #ifndef YYTOKENTYPE 45 | # define YYTOKENTYPE 46 | enum yytokentype 47 | { 48 | IMPORT_SYM = 258, 49 | IMPORTANT_SYM = 259, 50 | IDENT = 260, 51 | STRING = 261, 52 | NUMBER = 262, 53 | PERCENTAGE = 263, 54 | LENGTH = 264, 55 | EMS = 265, 56 | EXS = 266, 57 | PSCLASS_AFTER_IDENT = 267, 58 | HASH_AFTER_IDENT = 268, 59 | CLASS_AFTER_IDENT = 269, 60 | PSCLASS = 270, 61 | VISITED_PSCLASS = 271, 62 | ACTIVE_PSCLASS = 272, 63 | FIRST_LINE = 273, 64 | FIRST_LETTER = 274, 65 | HASH = 275, 66 | CLASS = 276, 67 | URL = 277, 68 | RGB = 278, 69 | CDO = 279, 70 | CDC = 280, 71 | CSL = 281 72 | }; 73 | #endif 74 | /* Tokens. */ 75 | #define IMPORT_SYM 258 76 | #define IMPORTANT_SYM 259 77 | #define IDENT 260 78 | #define STRING 261 79 | #define NUMBER 262 80 | #define PERCENTAGE 263 81 | #define LENGTH 264 82 | #define EMS 265 83 | #define EXS 266 84 | #define PSCLASS_AFTER_IDENT 267 85 | #define HASH_AFTER_IDENT 268 86 | #define CLASS_AFTER_IDENT 269 87 | #define PSCLASS 270 88 | #define VISITED_PSCLASS 271 89 | #define ACTIVE_PSCLASS 272 90 | #define FIRST_LINE 273 91 | #define FIRST_LETTER 274 92 | #define HASH 275 93 | #define CLASS 276 94 | #define URL 277 95 | #define RGB 278 96 | #define CDO 279 97 | #define CDC 280 98 | #define CSL 281 99 | 100 | /* Value type. */ 101 | #if ! defined YYSTYPE && ! 
defined YYSTYPE_IS_DECLARED 102 | 103 | union YYSTYPE 104 | { 105 | #line 15 "css_syntax.y" /* yacc.c:1909 */ 106 | 107 | char *lexeme; 108 | char letter; 109 | struct property_t *property; 110 | struct selector_t *selector; 111 | struct selector_list_t *selector_list; 112 | int pseudo_class; 113 | int pseudo_element; 114 | 115 | #line 116 "css_syntax.h" /* yacc.c:1909 */ 116 | }; 117 | 118 | typedef union YYSTYPE YYSTYPE; 119 | # define YYSTYPE_IS_TRIVIAL 1 120 | # define YYSTYPE_IS_DECLARED 1 121 | #endif 122 | 123 | 124 | 125 | int yyparse (void *yyparam); 126 | 127 | #endif /* !YY_YY_CSS_SYNTAX_H_INCLUDED */ 128 | -------------------------------------------------------------------------------- /html/Uri.h: -------------------------------------------------------------------------------- 1 | #ifndef __HTMLCXX__URI_H__ 2 | #define __HTMLCXX__URI_H__ 3 | 4 | #include 5 | #include 6 | #include 7 | 8 | namespace htmlcxx 9 | { 10 | class Uri 11 | { 12 | public: 13 | 14 | class Exception : public std::runtime_error 15 | { 16 | public: 17 | Exception(const std::string &arg) 18 | : std::runtime_error(arg) {} 19 | }; 20 | 21 | Uri(); 22 | Uri(const std::string &uri); 23 | ~Uri(); 24 | 25 | static std::string encode(const std::string &uri); 26 | static std::string decode(const std::string &uri); 27 | 28 | std::string unparse(int flags = 0) const; 29 | Uri absolute(const Uri &base) const; 30 | 31 | std::string canonicalHostname(unsigned int maxDepth = UINT_MAX) const; 32 | unsigned int hostnameDepth() const; 33 | 34 | static const unsigned int URI_FTP_DEFAULT_PORT = 21 ; /**< default FTP port */ 35 | static const unsigned int URI_SSH_DEFAULT_PORT = 22 ; /**< default SSH port */ 36 | static const unsigned int URI_TELNET_DEFAULT_PORT = 23 ; /**< default telnet port */ 37 | static const unsigned int URI_GOPHER_DEFAULT_PORT = 70 ; /**< default Gopher port */ 38 | static const unsigned int URI_HTTP_DEFAULT_PORT = 80 ; /**< default HTTP port */ 39 | static const unsigned int 
URI_POP_DEFAULT_PORT = 110 ; /**< default POP port */ 40 | static const unsigned int URI_NNTP_DEFAULT_PORT = 119 ; /**< default NNTP port */ 41 | static const unsigned int URI_IMAP_DEFAULT_PORT = 143 ; /**< default IMAP port */ 42 | static const unsigned int URI_PROSPERO_DEFAULT_PORT = 191 ; /**< default Prospero port */ 43 | static const unsigned int URI_WAIS_DEFAULT_PORT = 210 ; /**< default WAIS port */ 44 | static const unsigned int URI_LDAP_DEFAULT_PORT = 389 ; /**< default LDAP port */ 45 | static const unsigned int URI_HTTPS_DEFAULT_PORT = 443 ; /**< default HTTPS port */ 46 | static const unsigned int URI_RTSP_DEFAULT_PORT = 554 ; /**< default RTSP port */ 47 | static const unsigned int URI_SNEWS_DEFAULT_PORT = 563 ; /**< default SNEWS port */ 48 | static const unsigned int URI_ACAP_DEFAULT_PORT = 674 ; /**< default ACAP port */ 49 | static const unsigned int URI_NFS_DEFAULT_PORT = 2049 ; /**< default NFS port */ 50 | static const unsigned int URI_TIP_DEFAULT_PORT = 3372 ; /**< default TIP port */ 51 | static const unsigned int URI_SIP_DEFAULT_PORT = 5060 ; /**< default SIP port */ 52 | 53 | const static int REMOVE_WWW_PREFIX = 1; 54 | const static int REMOVE_TRAILING_BAR = 2; 55 | const static int REMOVE_SCHEME = 8; 56 | const static int REMOVE_QUERY_VALUES = 16; 57 | const static int REMOVE_QUERY = 32; 58 | const static int REMOVE_DEFAULT_FILENAMES = 64; 59 | const static int REMOVE_FRAGMENT = 128; 60 | 61 | std::string scheme() const; 62 | void scheme(std::string scheme); 63 | std::string user() const; 64 | void user(std::string user); 65 | std::string password() const; 66 | void password(std::string password); 67 | std::string hostname() const; 68 | void hostname(std::string hostname); 69 | std::string path() const; 70 | void path(std::string path); 71 | std::string query() const; 72 | void query(std::string query); 73 | std::string fragment() const; 74 | void fragment(std::string fragment); 75 | unsigned int port() const; 76 | void port(unsigned int 
port); 77 | bool existsFragment() const; 78 | void existsFragment(bool existsFragment); 79 | bool existsQuery() const; 80 | void existsQuery(bool existsQuery); 81 | protected: 82 | void init(const std::string &uri_str); 83 | 84 | std::string mScheme; 85 | std::string mUser; 86 | std::string mPassword; 87 | std::string mHostname; 88 | std::string mPath; 89 | std::string mQuery; 90 | std::string mFragment; 91 | std::string mPortStr;; 92 | bool mExistsQuery; 93 | bool mExistsFragment; 94 | unsigned int mPort; 95 | }; 96 | } 97 | 98 | #endif 99 | -------------------------------------------------------------------------------- /css/css_lex.l: -------------------------------------------------------------------------------- 1 | %{ 2 | 3 | #include 4 | #include "css_syntax.h" 5 | 6 | #define YY_DECL int yylex(YYSTYPE *lvalp) 7 | 8 | %} 9 | 10 | %option noyywrap 11 | 12 | unicode \\[0-9a-f]{1,4} 13 | latin1 [¡-ÿ] 14 | escape {unicode}|\\[ -~¡-ÿ] 15 | stringchar {escape}|{latin1}|[ !#$%&(-~] 16 | nmstrt [a-z_]|{latin1}|{escape} 17 | nmchar [-a-z0-9_]|{latin1}|{escape} 18 | ident {nmstrt}{nmchar}* 19 | name {nmchar}+ 20 | d [0-9] 21 | notnm [^-a-z0-9\\]|{latin1} 22 | w [ \t\r\n]* 23 | num {d}+|{d}*\.{d}+ 24 | string \"({stringchar}|\')*\"|\'({stringchar}|\")*\' 25 | 26 | %x COMMENT 27 | %s AFTER_IDENT 28 | 29 | %% 30 | 31 | "/*" {BEGIN(COMMENT);} 32 | [^*]* /* eat anything that's not a '*' */ 33 | "*"+[^*/]* /* eat up '*'s not followed by '/'s */ 34 | "*"+"/" BEGIN(0); 35 | @import {BEGIN(0); return IMPORT_SYM;} 36 | "!"{w}important {BEGIN(0); return IMPORTANT_SYM;} 37 | {ident} { 38 | BEGIN(AFTER_IDENT); 39 | lvalp->lexeme = strdup(yytext); 40 | return IDENT; 41 | } 42 | {string} { 43 | BEGIN(0); 44 | lvalp->lexeme = strdup(yytext); 45 | return STRING; 46 | } 47 | 48 | {num} { 49 | BEGIN(0); 50 | lvalp->lexeme = strdup(yytext); 51 | return NUMBER; 52 | } 53 | {num}"%" { 54 | BEGIN(0); 55 | lvalp->lexeme = strdup(yytext); 56 | return PERCENTAGE; 57 | } 58 | {num}pt/{notnm} { 
59 | BEGIN(0); 60 | lvalp->lexeme = strdup(yytext); 61 | return LENGTH; 62 | } 63 | {num}mm/{notnm} { 64 | BEGIN(0); 65 | lvalp->lexeme = strdup(yytext); 66 | return LENGTH; 67 | } 68 | {num}cm/{notnm} { 69 | BEGIN(0); 70 | lvalp->lexeme = strdup(yytext); 71 | return LENGTH; 72 | } 73 | {num}pc/{notnm} { 74 | BEGIN(0); 75 | lvalp->lexeme = strdup(yytext); 76 | return LENGTH; 77 | } 78 | {num}in/{notnm} { 79 | BEGIN(0); 80 | lvalp->lexeme = strdup(yytext); 81 | return LENGTH; 82 | } 83 | {num}px/{notnm} { 84 | BEGIN(0); 85 | lvalp->lexeme = strdup(yytext); 86 | return LENGTH; 87 | } 88 | {num}em/{notnm} { 89 | BEGIN(0); 90 | lvalp->lexeme = strdup(yytext); 91 | return EMS; 92 | } 93 | {num}ex/{notnm} { 94 | BEGIN(0); 95 | lvalp->lexeme = strdup(yytext); 96 | return EXS; 97 | } 98 | 99 | "#"{name} { 100 | lvalp->lexeme = strdup(yytext+1); 101 | return HASH_AFTER_IDENT; 102 | } 103 | "."{name} { 104 | lvalp->lexeme = strdup(yytext+1); 105 | return CLASS_AFTER_IDENT; 106 | } 107 | 108 | "#"{name} { 109 | BEGIN(AFTER_IDENT); 110 | lvalp->lexeme = strdup(yytext+1); 111 | return HASH; 112 | } 113 | "."{name} { 114 | BEGIN(AFTER_IDENT); 115 | lvalp->lexeme = strdup(yytext+1); 116 | return CLASS; 117 | } 118 | 119 | url\({w}{string}{w}\) | 120 | url\({w}([^ \r\n\'\")]|\\\ |\\\'|\\\"|\\\))+{w}\) { 121 | BEGIN(0); 122 | lvalp->lexeme = 123 | strdup(yytext); 124 | return URL; 125 | } 126 | rgb\({w}{num}%?{w}\,{w}{num}%?{w}\,{w}{num}%?{w}\) { 127 | BEGIN(0); 128 | lvalp->lexeme = 129 | strdup(yytext); 130 | return RGB; 131 | } 132 | 133 | [-/+{};,#:] {BEGIN(0); return *yytext;} 134 | [ \t\r]+ {BEGIN(0); /* ignore whitespace */} 135 | \n {BEGIN(0); /* ignore whitespace */} 136 | \<\!\-\- {BEGIN(0); return CDO;} 137 | \-\-\> {BEGIN(0); return CDC;} 138 | \/\/ {BEGIN(0); return CSL;} 139 | . 
{/*fprintf(stderr, "Illegal character (%d)\n", *yytext);*/} 140 | 141 | %% 142 | /* Point the scanner at the in-memory buffer of buf_len bytes. Always returns 0; previously control fell off the end of this non-void function. */ 143 | int init_yylex(const char *buffer, int buf_len) { 144 | yy_scan_bytes(buffer, buf_len); return 0; 145 | } 146 | /* Delete the scanner's current buffer. Always returns 0; previously control fell off the end of this non-void function. */ 147 | int end_yylex() { 148 | yy_delete_buffer(YY_CURRENT_BUFFER); return 0; 149 | } 150 | -------------------------------------------------------------------------------- /htmlcxxapp.vcproj: -------------------------------------------------------------------------------- 1 | 2 | 8 | 9 | 11 | 12 | 13 | 19 | 31 | 33 | 41 | 43 | 45 | 47 | 49 | 51 | 53 | 55 | 57 | 59 | 61 | 62 | 68 | 77 | 79 | 88 | 90 | 92 | 94 | 96 | 98 | 100 | 102 | 104 | 106 | 108 | 109 | 110 | 111 | 112 | 113 | 117 | 120 | 121 | 123 | 124 | 125 | 129 | 131 | 132 | 134 | 135 | 136 | 140 | 141 | 142 | 143 | 144 | 145 | -------------------------------------------------------------------------------- /html/ParserDom.cc: -------------------------------------------------------------------------------- 1 | #include "ParserDom.h" 2 | #include "wincstring.h" 3 | 4 | #include 5 | #include 6 | 7 | //#define DEBUG 8 | #include "debug.h" 9 | 10 | #define TAG_NAME_MAX 10 11 | 12 | using namespace std; 13 | using namespace htmlcxx; 14 | using namespace HTML; 15 | using namespace kp; 16 | 17 | const tree& ParserDom::parseTree(const std::string &html) 18 | { 19 | this->parse(html); 20 | return this->getTree(); 21 | } 22 | void ParserDom::beginParsing() 23 | { 24 | mHtmlTree.clear(); 25 | tree::iterator top = mHtmlTree.begin(); 26 | HTML::Node lambda_node; 27 | lambda_node.offset(0); 28 | lambda_node.length(0); 29 | lambda_node.isTag(true); 30 | lambda_node.isComment(false); 31 | mCurrentState = mHtmlTree.insert(top,lambda_node); 32 | } 33 | 34 | void ParserDom::endParsing() 35 | { 36 | tree::iterator top = mHtmlTree.begin(); 37 | top->length(mCurrentOffset); 38 | } 39 | 40 | void ParserDom::foundComment(Node node) 41 | { 42 | //Add child content node, but do not update current state 43 | mHtmlTree.append_child(mCurrentState, node); 44 | } 
45 | 46 | void ParserDom::foundText(Node node) 47 | { 48 | //Add child content node, but do not update current state 49 | mHtmlTree.append_child(mCurrentState, node); 50 | } 51 | 52 | void ParserDom::foundTag(Node node, bool isEnd) 53 | { 54 | if (!isEnd) 55 | { 56 | //append to current tree node 57 | tree::iterator next_state; 58 | next_state = mHtmlTree.append_child(mCurrentState, node); 59 | mCurrentState = next_state; 60 | } 61 | else 62 | { 63 | //Look if there is a pending open tag with that same name upwards 64 | //If mCurrentState tag isn't matching tag, maybe a some of its parents 65 | // matches 66 | vector< tree::iterator > path; 67 | tree::iterator i = mCurrentState; 68 | bool found_open = false; 69 | while (i != mHtmlTree.begin()) 70 | { 71 | #ifdef DEBUG 72 | cerr << "comparing " << node.tagName() << " with " << i->tagName()<tagName().length()) cerr << "Tag with no name at" << i->offset()<<";"<offset()+i->length(); 74 | #endif 75 | assert(i->isTag()); 76 | assert(i->tagName().length()); 77 | 78 | bool equal; 79 | const char *open = i->tagName().c_str(); 80 | const char *close = node.tagName().c_str(); 81 | equal = !(strcasecmp(open,close)); 82 | 83 | 84 | if (equal) 85 | { 86 | DEBUGP("Found matching tag %s\n", i->tagName().c_str()); 87 | //Closing tag closes this tag 88 | //Set length to full range between the opening tag and 89 | //closing tag 90 | i->length(node.offset() + node.length() - i->offset()); 91 | i->closingText(node.text()); 92 | 93 | mCurrentState = mHtmlTree.parent(i); 94 | found_open = true; 95 | break; 96 | } 97 | else 98 | { 99 | path.push_back(i); 100 | } 101 | 102 | i = mHtmlTree.parent(i); 103 | } 104 | 105 | if (found_open) 106 | { 107 | //If match was upper in the tree, so we need to invalidate child 108 | //nodes that were waiting for a close 109 | for (unsigned int j = 0; j < path.size(); ++j) 110 | { 111 | // path[j]->length(node.offset() - path[j]->offset()); 112 | mHtmlTree.flatten(path[j]); 113 | } 114 | } 115 | else 116 
| { 117 | DEBUGP("Unmatched tag %s\n", node.text().c_str()); 118 | 119 | // Treat as comment 120 | node.isTag(false); 121 | node.isComment(true); 122 | mHtmlTree.append_child(mCurrentState, node); 123 | } 124 | } 125 | } 126 | 127 | ostream &HTML::operator<<(ostream &stream, const tree &tr) 128 | { 129 | 130 | tree::pre_order_iterator it = tr.begin(); 131 | tree::pre_order_iterator end = tr.end(); 132 | 133 | int rootdepth = tr.depth(it); 134 | stream << "-----" << endl; 135 | 136 | unsigned int n = 0; 137 | while ( it != end ) 138 | { 139 | 140 | int cur_depth = tr.depth(it); 141 | for(int i=0; i < cur_depth - rootdepth; ++i) stream << " "; 142 | stream << n << "@"; 143 | stream << "[" << it->offset() << ";"; 144 | stream << it->offset() + it->length() << ") "; 145 | stream << (string)(*it) << endl; 146 | ++it, ++n; 147 | } 148 | stream << "-----" << endl; 149 | return stream; 150 | } 151 | -------------------------------------------------------------------------------- /README: -------------------------------------------------------------------------------- 1 | htmlcxx - html and css APIs for C++ 2 | 3 | --------------------------------------------- 4 | 5 | 6 | Description 7 | =========== 8 | 9 | htmlcxx is a simple non-validating css1 and html parser for C++. 10 | Although there are several other html parsers available, htmlcxx has some 11 | characteristics that make it unique: 12 | 13 | - STL-like navigation of DOM tree, using the excellent tree.hh library from 14 | Kasper Peeters 15 | - It is possible to reproduce exactly, character by character, the 16 | original document from the parse tree 17 | - Bundled css parser 18 | - Optional parsing of attributes 19 | - C++ code that looks like C++ (not so true anymore) 20 | - Offsets of tags/elements in the original document are stored in the 21 | nodes of the DOM tree 22 | 23 | The parsing policies of htmlcxx were created trying to mimic mozilla 24 | firefox (http://www.mozilla.org) behavior. 
 So you should expect parse 25 | trees similar to those created by firefox. However, unlike firefox, 26 | htmlcxx does not insert non-existent stuff in your html. Therefore, serializing 27 | the DOM tree gives exactly the same bytes contained in the original HTML 28 | document. 29 | 30 | News for version 0.86 31 | ===================== 32 | 33 | Fixed a few compilation problems. 34 | 35 | News for version 0.85 36 | ===================== 37 | 38 | Fixed gcc 4.3 compiler errors, several minor bug fixes, improved distribution 39 | of the css library. 40 | 41 | 42 | News for version 0.7.3 43 | ====================== 44 | 45 | Added utility code to escape/decode urls as defined by RFC 2396. 46 | Added new SAX interface. The API was slightly broken to support the new 47 | SAX interface :-(. 48 | Added Visual Studio 2003 projects for the WIN32 port. 49 | 50 | 51 | Examples 52 | ======== 53 | 54 | Using htmlcxx is quite simple. Take a look 55 | at this example. 56 | 57 | ----------------------------------------------------------------------- 58 | 59 | #include 60 | ... 
61 | using namespace std; 62 | using namespace htmlcxx; 63 | 64 | //Parse some html code 65 | string html = "hey"; 66 | HTML::ParserDom parser; 67 | tree dom = parser.parseTree(html); 68 | 69 | //Print whole DOM tree 70 | cout << dom << endl; 71 | 72 | //Dump all links in the tree 73 | tree::iterator it = dom.begin(); 74 | tree::iterator end = dom.end(); 75 | for (; it != end; ++it) 76 | { 77 | if (strcasecmp(it->tagName().c_str(), "A") == 0) 78 | { 79 | it->parseAttributes(); 80 | cout << it->attribute("href").second << endl; 81 | } 82 | } 83 | 84 | //Dump all text of the document 85 | it = dom.begin(); 86 | end = dom.end(); 87 | for (; it != end; ++it) 88 | { 89 | if ((!it->isTag()) && (!it->isComment())) 90 | { 91 | cout << it->text(); 92 | } 93 | } 94 | cout << endl; 95 | 96 | ------------------------------------------------- 97 | 98 | 99 | The htmlcxx application 100 | ======================= 101 | 102 | htmlcxx is the name of both the library and the utility 103 | application that comes with this package. Although the 104 | htmlcxx (the application) is mostly useless for programming, you can use it 105 | to easily see how htmlcxx (the library) would parse your html code. 106 | Just install and try htmlcxx -h. 107 | 108 | 109 | Downloads 110 | ========= 111 | 112 | Use the project page at sourceforge: http://sf.net/projects/htmlcxx 113 | 114 | 115 | License Stuff 116 | ============= 117 | 118 | Code is now under the LGPL. This was our initial intention, and is 119 | now possible thanks to the author of tree.hh, who allowed us to use it 120 | under LGPL only for HTML::Node template instances. Check 121 | http://www.fsf.org or the COPYING file in the distribution for details 122 | about the LGPL license. The uri parsing code is a derivative work of 123 | Apache web server uri parsing routines. Check 124 | www.apache.org/licenses/LICENSE-2.0 or the ASF-2.0 file in the 125 | distribution for details. 
126 | 127 | ---------------------------------------- 128 | 129 | Enjoy! 130 | 131 | Davi de Castro Reis - 132 | 133 | Robson Braga Araújo - 134 | 135 | Last Updated: Tue Dec 8 20:37:41 BRST 2015 136 | -------------------------------------------------------------------------------- /html/profile.h: -------------------------------------------------------------------------------- 1 | #ifndef __MY_PROFILE__ 2 | #define __MY_PROFILE__ 3 | 4 | #ifdef PROFILE 5 | 6 | #include 7 | #include 8 | #include 9 | 10 | #define USEC_PER_SEC 1000000 11 | static void __time_val_add(struct timeval *time_, long microseconds) 12 | { 13 | if (microseconds >= 0) 14 | { 15 | time_->tv_usec += microseconds % USEC_PER_SEC; 16 | time_->tv_sec += microseconds / USEC_PER_SEC; 17 | if (time_->tv_usec >= USEC_PER_SEC) 18 | { 19 | time_->tv_usec -= USEC_PER_SEC; 20 | time_->tv_sec++; 21 | } 22 | } 23 | else 24 | { 25 | microseconds *= -1; 26 | time_->tv_usec -= microseconds % USEC_PER_SEC; 27 | time_->tv_sec -= microseconds / USEC_PER_SEC; 28 | if (time_->tv_usec < 0) 29 | { 30 | time_->tv_usec += USEC_PER_SEC; 31 | time_->tv_sec--; 32 | } 33 | } 34 | } 35 | 36 | 37 | 38 | #define PROFILE_DECLARE_STATIC(x) static struct timeval __profile_start##x = {0,0}, __profile_total##x = {0,0}, __profile_end##x = {0,0}; 39 | #define PROFILE_DECLARE(x) struct timeval __profile_start##x = {0,0}, __profile_total##x = {0,0}, __profile_end##x = {0,0}; 40 | 41 | #define PROFILE_START(x) do { gettimeofday(&__profile_start##x, NULL); } while(0); 42 | #define PROFILE_END(x) do { gettimeofday(&__profile_end##x, NULL); } while(0); 43 | #define PROFILE_ADD(x) do { if(!(__profile_end##x.tv_sec > __profile_start##x.tv_sec || (__profile_end##x.tv_sec == __profile_start##x.tv_sec && __profile_end##x.tv_usec >= __profile_start##x.tv_usec))) {\ 44 | break;\ 45 | }\ 46 | __time_val_add(&(__profile_total##x), (__profile_end##x.tv_sec - __profile_start##x.tv_sec)*USEC_PER_SEC);\ 47 | __time_val_add(&(__profile_total##x), 
__profile_end##x.tv_usec);\ 48 | __time_val_add(&(__profile_total##x), __profile_start##x.tv_usec * -1);\ 49 | } while(0); 50 | #define PROFILE_END_ADD(x) do { PROFILE_END(x); PROFILE_ADD(x); } while(0); 51 | #define PROFILE_CLEAR(x) do { __profile_start##x.tv_sec = 0, __profile_start##x.tv_usec = 0, __profile_total##x.tv_sec = 0, __profile_total##x.tv_usec = 0, __profile_end##x.tv_sec = 0, __profile_end##x.tv_usec = 0; } while(0); 52 | 53 | #define __PROFILE_PRINT(prefix, stream, timeval) do {\ 54 | int microsec = 0;\ 55 | int sec = 0;\ 56 | int min = 0;\ 57 | int hour = 0;\ 58 | /*assert(timeval.tv_sec >= 0);*/\ 59 | hour = timeval.tv_sec / 3600;\ 60 | min = (timeval.tv_sec % 3600)/60;\ 61 | sec = timeval.tv_sec % 60;\ 62 | microsec = timeval.tv_usec;\ 63 | stream << prefix;\ 64 | stream << hour << "h" << min << "m" << sec << "s " << microsec << " microseconds";\ 65 | } while (0); 66 | 67 | #define PROFILE_PRINT(x, prefix, stream) __PROFILE_PRINT(prefix, stream, __profile_total##x) 68 | 69 | #define PROFILE_PRINT_CURRENT(x, prefix, stream) do {\ 70 | struct timeval sum;\ 71 | sum.tv_sec = 0;\ 72 | sum.tv_usec = 0;\ 73 | if(!(__profile_end##x.tv_sec > __profile_start##x.tv_sec || (__profile_end##x.tv_sec == __profile_start##x.tv_sec && __profile_end##x.tv_usec >= __profile_start##x.tv_usec))) {\ 74 | fprintf(stderr, "Refusing to print invalid profile measure\n");\ 75 | break;\ 76 | }\ 77 | __time_val_add(&sum, (__profile_end##x.tv_sec - __profile_start##x.tv_sec)*1000000);\ 78 | __time_val_add(&sum, __profile_end##x.tv_usec);\ 79 | __time_val_add(&sum, __profile_start##x.tv_usec * -1);\ 80 | __PROFILE_PRINT(prefix, stream, sum);\ 81 | } while(0); 82 | 83 | #define PROFILE_PRINT_CURRENT_NL(x, prefix, stream) do {\ 84 | PROFILE_PRINT_CURRENT(x, prefix, stream);\ 85 | stream << std::endl;\ 86 | } while(0); 87 | #define PROFILE_PRINT_NL(x, prefix, stream) do {\ 88 | PROFILE_PRINT(x, prefix, stream);\ 89 | stream << std::endl;\ 90 | } while(0); 91 | 92 | #else 93 | 
#define PROFILE_DECLARE(x) 94 | #define PROFILE_DECLARE_STATIC(x) 95 | #define PROFILE_START(x) 96 | #define PROFILE_END(x) 97 | #define PROFILE_ADD(x) 98 | #define PROFILE_END_ADD(x) 99 | #define PROFILE_PRINT(x, prefix, stream) 100 | #define PROFILE_PRINT_NL(x, prefix, stream) 101 | #define PROFILE_PRINT_CURRENT(x, prefix, stream) 102 | #define PROFILE_PRINT_CURRENT_NL(x, prefix, stream) 103 | #define PROFILE_CLEAR(x) 104 | #endif 105 | 106 | #endif //__MY_PROFILE__ 107 | 108 | 109 | /* 110 | fprintf(stderr, "Profiler ignoring invalid time measure from %d@%s\n", __LINE__, __FILE__);\ 111 | fprintf(stderr, "End: sec %u usec %u Start: sec %u usec %u\n", __profile_end##x.tv_sec, __profile_end##x.tv_usec, __profile_start##x.tv_sec, __profile_start##x.tv_usec);\ 112 | */ 113 | -------------------------------------------------------------------------------- /htmlcxx.vcproj: -------------------------------------------------------------------------------- 1 | 2 | 8 | 9 | 11 | 12 | 13 | 19 | 31 | 33 | 36 | 38 | 40 | 42 | 44 | 46 | 48 | 50 | 52 | 54 | 55 | 61 | 70 | 72 | 75 | 77 | 79 | 81 | 83 | 85 | 87 | 89 | 91 | 93 | 94 | 95 | 96 | 97 | 98 | 102 | 105 | 106 | 109 | 110 | 113 | 114 | 117 | 118 | 121 | 122 | 125 | 126 | 127 | 131 | 133 | 134 | 136 | 137 | 139 | 140 | 142 | 143 | 145 | 146 | 148 | 149 | 151 | 152 | 154 | 155 | 157 | 158 | 159 | 163 | 164 | 166 | 167 | 168 | 169 | 170 | 171 | -------------------------------------------------------------------------------- /html/tld.h: -------------------------------------------------------------------------------- 1 | /** Do not modify. 
This file is automatically generated 2 | * using gen_tld.pl and tld.list 3 | */ 4 | 5 | static const char *tld[] = { 6 | ".LOCALHOST", 7 | ".INVALID", 8 | ".EXAMPLE", 9 | ".COOP.BR", 10 | ".ZLG.BR", 11 | ".VET.BR", 12 | ".TUR.BR", 13 | ".TRD.BR", 14 | ".TMP.BR", 15 | ".SRV.BR", 16 | ".SLG.BR", 17 | ".REC.BR", 18 | ".QSL.BR", 19 | ".PSI.BR", 20 | ".PSC.BR", 21 | ".PRO.BR", 22 | ".PPG.BR", 23 | ".ORG.BR", 24 | ".ODO.BR", 25 | ".NTR.BR", 26 | ".NOT.BR", 27 | ".NOM.BR", 28 | ".NET.BR", 29 | ".MUSEUM", 30 | ".MUS.BR", 31 | ".MIL.BR", 32 | ".MED.BR", 33 | ".MAT.BR", 34 | ".LEL.BR", 35 | ".JOR.BR", 36 | ".INF.BR", 37 | ".IND.BR", 38 | ".IMB.BR", 39 | ".GOV.BR", 40 | ".GGF.BR", 41 | ".G12.BR", 42 | ".FST.BR", 43 | ".FOT.BR", 44 | ".FND.BR", 45 | ".FAR.BR", 46 | ".ETI.BR", 47 | ".ETC.BR", 48 | ".ESP.BR", 49 | ".ENG.BR", 50 | ".EDU.BR", 51 | ".ECN.BR", 52 | ".COM.BR", 53 | ".CNT.BR", 54 | ".CNG.BR", 55 | ".CIM.BR", 56 | ".BMD.BR", 57 | ".BIO.BR", 58 | ".ATO.BR", 59 | ".ART.BR", 60 | ".ARQ.BR", 61 | ".AGR.BR", 62 | ".ADV.BR", 63 | ".ADM.BR", 64 | ".TV.BR", 65 | ".FM.BR", 66 | ".AM.BR", 67 | ".TEST", 68 | ".NAME", 69 | ".INFO", 70 | ".COOP", 71 | ".ARPA", 72 | ".AERO", 73 | ".PRO", 74 | ".ORG", 75 | ".NET", 76 | ".MIL", 77 | ".INT", 78 | ".GOV", 79 | ".EDU", 80 | ".COM", 81 | ".BIZ", 82 | ".ZW", 83 | ".ZM", 84 | ".ZA", 85 | ".YU", 86 | ".YT", 87 | ".YE", 88 | ".WS", 89 | ".WF", 90 | ".VU", 91 | ".VN", 92 | ".VI", 93 | ".VG", 94 | ".VE", 95 | ".VC", 96 | ".VA", 97 | ".UZ", 98 | ".UY", 99 | ".US", 100 | ".UM", 101 | ".UK", 102 | ".UG", 103 | ".UA", 104 | ".TZ", 105 | ".TW", 106 | ".TV", 107 | ".TT", 108 | ".TR", 109 | ".TP", 110 | ".TO", 111 | ".TN", 112 | ".TM", 113 | ".TK", 114 | ".TJ", 115 | ".TH", 116 | ".TG", 117 | ".TF", 118 | ".TD", 119 | ".TC", 120 | ".SZ", 121 | ".SY", 122 | ".SV", 123 | ".SU", 124 | ".ST", 125 | ".SR", 126 | ".SO", 127 | ".SN", 128 | ".SM", 129 | ".SL", 130 | ".SK", 131 | ".SJ", 132 | ".SI", 133 | ".SH", 134 | ".SG", 135 | ".SE", 136 | ".SD", 137 | 
".SC", 138 | ".SB", 139 | ".SA", 140 | ".RW", 141 | ".RU", 142 | ".RO", 143 | ".RE", 144 | ".QA", 145 | ".PY", 146 | ".PW", 147 | ".PT", 148 | ".PS", 149 | ".PR", 150 | ".PN", 151 | ".PM", 152 | ".PL", 153 | ".PK", 154 | ".PH", 155 | ".PG", 156 | ".PF", 157 | ".PE", 158 | ".PA", 159 | ".OM", 160 | ".NZ", 161 | ".NU", 162 | ".NR", 163 | ".NP", 164 | ".NO", 165 | ".NL", 166 | ".NI", 167 | ".NG", 168 | ".NF", 169 | ".NE", 170 | ".NC", 171 | ".NA", 172 | ".MZ", 173 | ".MY", 174 | ".MX", 175 | ".MW", 176 | ".MV", 177 | ".MU", 178 | ".MT", 179 | ".MS", 180 | ".MR", 181 | ".MQ", 182 | ".MP", 183 | ".MO", 184 | ".MN", 185 | ".MM", 186 | ".ML", 187 | ".MK", 188 | ".MH", 189 | ".MG", 190 | ".MD", 191 | ".MC", 192 | ".MA", 193 | ".LY", 194 | ".LV", 195 | ".LU", 196 | ".LT", 197 | ".LS", 198 | ".LR", 199 | ".LK", 200 | ".LI", 201 | ".LC", 202 | ".LB", 203 | ".LA", 204 | ".KZ", 205 | ".KY", 206 | ".KW", 207 | ".KR", 208 | ".KP", 209 | ".KN", 210 | ".KM", 211 | ".KI", 212 | ".KH", 213 | ".KG", 214 | ".KE", 215 | ".JP", 216 | ".JO", 217 | ".JM", 218 | ".JE", 219 | ".IT", 220 | ".IS", 221 | ".IR", 222 | ".IQ", 223 | ".IO", 224 | ".IN", 225 | ".IM", 226 | ".IL", 227 | ".IE", 228 | ".ID", 229 | ".HU", 230 | ".HT", 231 | ".HR", 232 | ".HN", 233 | ".HM", 234 | ".HK", 235 | ".GY", 236 | ".GW", 237 | ".GU", 238 | ".GT", 239 | ".GS", 240 | ".GR", 241 | ".GQ", 242 | ".GP", 243 | ".GN", 244 | ".GM", 245 | ".GL", 246 | ".GI", 247 | ".GH", 248 | ".GG", 249 | ".GF", 250 | ".GE", 251 | ".GD", 252 | ".GB", 253 | ".GA", 254 | ".FR", 255 | ".FO", 256 | ".FM", 257 | ".FK", 258 | ".FJ", 259 | ".FI", 260 | ".ET", 261 | ".ES", 262 | ".ER", 263 | ".EH", 264 | ".EG", 265 | ".EE", 266 | ".EC", 267 | ".DZ", 268 | ".DO", 269 | ".DM", 270 | ".DK", 271 | ".DJ", 272 | ".DE", 273 | ".CZ", 274 | ".CY", 275 | ".CX", 276 | ".CV", 277 | ".CU", 278 | ".CR", 279 | ".CO", 280 | ".CN", 281 | ".CM", 282 | ".CL", 283 | ".CK", 284 | ".CI", 285 | ".CH", 286 | ".CG", 287 | ".CF", 288 | ".CD", 289 | ".CC", 290 | ".CA", 291 
| ".BZ", 292 | ".BY", 293 | ".BW", 294 | ".BV", 295 | ".BT", 296 | ".BS", 297 | ".BR", 298 | ".BO", 299 | ".BN", 300 | ".BM", 301 | ".BJ", 302 | ".BI", 303 | ".BH", 304 | ".BG", 305 | ".BF", 306 | ".BE", 307 | ".BD", 308 | ".BB", 309 | ".BA", 310 | ".AZ", 311 | ".AW", 312 | ".AU", 313 | ".AT", 314 | ".AS", 315 | ".AR", 316 | ".AQ", 317 | ".AO", 318 | ".AN", 319 | ".AM", 320 | ".AL", 321 | ".AI", 322 | ".AG", 323 | ".AF", 324 | ".AE", 325 | ".AD", 326 | ".AC" 327 | }; /* Returns the length of the first tld[] entry that is a case-insensitive suffix of domain, or 0 when none matches. NOTE: this file is generated by gen_tld.pl, so the same change must be made in the generator. */ 328 | static size_t tldOffset(const char *domain) { 329 | const size_t domainLen = strlen(domain); const char *end = domain + domainLen; 330 | for(size_t i = 0; i < sizeof(tld) / sizeof(tld[0]); ++i) 331 | { 332 | size_t len = strlen(tld[i]); 333 | /* skip entries longer than domain: end - len would point before the start of the buffer */ if(len <= domainLen && strcasecmp(end - len, tld[i]) == 0) 334 | { 335 | return len; 336 | } 337 | } 338 | return 0; 339 | } 340 | -------------------------------------------------------------------------------- /css/default.css: -------------------------------------------------------------------------------- 1 | /* rendered CSS1-addressable elements and all applicable non-inherited 2 | properties set to initial values and default display types */ 3 | 4 | A, ABBR, ACRONYM, ADDRESS, BDO, BLOCKQUOTE, BODY, BUTTON, CITE, CODE, DD, DEL, 5 | DFN, DIV, DL, DT, EM, FIELDSET, FORM, H1, H2, H3, H4, H5, H6, HTML, IFRAME, IMG, INS, 6 | KBD, LABEL, LI, OBJECT, OL, P, Q, SAMP, SPAN, STRONG, SUB, SUP, UL, VAR, 7 | APPLET, B, BIG, CENTER, DIR, FONT, HR, I, MENU, PRE, S, SMALL, STRIKE, TT, U { 8 | background: transparent; 9 | width: auto; 10 | height: auto; 11 | text-decoration: none; 12 | margin: 0; 13 | padding: 0; 14 | border: 0; 15 | float: none; 16 | clear: none; 17 | vertical-align: baseline; 18 | list-style-image: none; 19 | list-style-type: disc; 20 | list-style-position: outside; 21 | } 22 | 23 | ADDRESS, BLOCKQUOTE, BODY, DD, DIV, DL, DT, FIELDSET, FORM, H1, H2, H3, H4, H5, 24 | H6, OL, P, UL, CENTER, DIR, HR, MENU, PRE { 25 | display: block; 26 | } 27 | 28 | A, ABBR, ACRONYM, APPLET, BDO, BUTTON, CITE, CODE, DEL, DFN, EM, IFRAME, IMG, 
29 | INS, KBD, LABEL, OBJECT, Q, 30 | SAMP, SPAN, STRONG, SUB, SUP, VAR, B, BIG, FONT, I, S, SMALL, STRIKE, TT, U { 31 | 32 | display: inline; 33 | } 34 | 35 | LI { 36 | display: list-item; 37 | } 38 | 39 | /* Begin tree of inherited properties and cascades. */ 40 | 41 | /* Describes the default type, color, and link decoration specs of 42 | Mosaic-derivative browsers to the extent and degree of granularity that users 43 | may typically override. Uncomment for "factory settings." */ 44 | 45 | HTML { 46 | font-family: "Times New Roman", Times; 47 | font-size: medium; 48 | font-weight: normal; 49 | color: black; 50 | background-color: #BFBFBF; 51 | } 52 | 53 | PRE, TT, CODE, KBD, SAMP { 54 | font-family: "Courier New", Courier; 55 | } 56 | 57 | A:link, A:visited, A:active { 58 | text-decoration: underline; 59 | } 60 | 61 | A:link { 62 | color: #0000FF; 63 | } 64 | 65 | A:visited { 66 | color: #7F007F; 67 | } 68 | 69 | A:active { 70 | color: #0000FF; 71 | } 72 | 73 | /* end pre-CSS user settings */ 74 | 75 | HTML { 76 | line-height: 1.12; 77 | word-spacing: normal; 78 | letter-spacing: normal; 79 | text-transform: none; 80 | text-align: left; 81 | text-indent: 0; 82 | white-space: normal; 83 | } 84 | 85 | BODY { 86 | padding: 8px; 87 | } 88 | 89 | H1 { 90 | font-size: 2em; 91 | margin: .67em 0; 92 | } 93 | 94 | H2 { 95 | font-size: 1.5em; 96 | margin: .75em 0; 97 | } 98 | 99 | H3 { 100 | font-size: 1.17em; 101 | margin: .83em 0; 102 | } 103 | 104 | H4, P, BLOCKQUOTE, FIELDSET, FORM, UL, OL, DL, DIR, MENU { 105 | margin: 1.12em 0; 106 | } 107 | 108 | H5 { 109 | font-size: .83em; /* varies with pixels-per-em at document root */ 110 | margin: 1.5em 0; 111 | } 112 | 113 | H6 { 114 | font-size: .6em; /* varies with pixels-per-em at document root */ 115 | margin: 1.67em 0; 116 | } 117 | 118 | H1, H2, H3, H4, H5, H6, B, STRONG { 119 | font-weight: bolder; 120 | } 121 | 122 | BLOCKQUOTE { 123 | margin-left: 40px; 124 | margin-right: 40px; 125 | } 126 | 127 | I, CITE, EM, VAR, 
ADDRESS { 128 | font-style: italic; 129 | } 130 | 131 | PRE, TT, CODE, KBD, SAMP { 132 | font-family: monospace; 133 | } 134 | 135 | PRE { 136 | white-space: pre; 137 | } 138 | 139 | BIG { 140 | font-size: larger; 141 | } 142 | 143 | SMALL, SUB, SUP { 144 | font-size: smaller; 145 | } 146 | 147 | SUB { 148 | vertical-align: sub; 149 | } 150 | 151 | SUP { 152 | vertical-align: super; 153 | } 154 | 155 | S, STRIKE, DEL { 156 | text-decoration: line-through; 157 | } 158 | 159 | HR { 160 | border: 1px inset; /* questionable */ 161 | } 162 | 163 | OL, UL, DIR, MENU, DD { 164 | padding-left: 40px; 165 | } 166 | 167 | OL LI { 168 | list-style-type: decimal; 169 | } 170 | 171 | UL LI { 172 | list-style-type: disc; 173 | } 174 | 175 | UL UL, UL OL, UL MENU, UL DIR, MENU UL, MENU OL, MENU MENU, MENU DIR, DIR UL, 176 | DIR OL, DIR MENU, DIR DIR, OL UL, OL OL, OL MENU, OL DIR { 177 | margin-top: 0; 178 | margin-bottom: 0; 179 | } 180 | 181 | OL UL, UL UL, MENU UL, DIR UL, OL MENU, UL MENU, MENU MENU, DIR MENU, OL DIR, UL 182 | DIR, MENU DIR, DIR DIR { 183 | list-style-type: circle; 184 | } 185 | 186 | OL OL UL, OL UL UL, OL MENU UL, OL DIR UL, OL OL MENU, OL UL MENU, OL MENU MENU, 187 | OL DIR MENU, OL OL DIR, OL UL DIR, OL MENU DIR, OL DIR DIR, UL OL UL, UL UL UL, 188 | UL MENU UL, UL DIR UL, UL OL MENU, UL UL MENU, UL MENU MENU, UL DIR MENU, UL OL 189 | DIR, UL UL DIR, UL MENU DIR, UL DIR DIR, MENU OL UL, MENU UL UL, MENU MENU UL, 190 | MENU DIR UL, MENU OL MENU, MENU UL MENU, MENU MENU MENU, MENU DIR MENU, MENU OL 191 | DIR, MENU UL DIR, MENU MENU DIR, MENU DIR DIR, DIR OL UL, DIR UL UL, DIR MENU 192 | UL, DIR DIR UL, DIR OL MENU, DIR UL MENU, DIR MENU MENU, DIR DIR MENU, DIR OL 193 | DIR, DIR UL DIR, DIR MENU DIR, DIR DIR DIR { 194 | list-style-type: square; 195 | } 196 | 197 | U, INS { 198 | text-decoration: underline; 199 | } 200 | 201 | CENTER { 202 | text-align: center; 203 | } 204 | 205 | CAPTION, COL, COLGROUP, LEGEND, TABLE, TBODY, TD, TFOOT, TH, THEAD, TR { 206 | 
background: transparent; 207 | text-decoration: none; 208 | margin: 1px; 209 | padding: 1px; 210 | border: none; 211 | float: none; 212 | clear: none; 213 | } 214 | 215 | TABLE, TBODY, TFOOT, THEAD, TR { 216 | display: block; 217 | background-position: top left; 218 | width: auto; 219 | height: auto; 220 | } 221 | 222 | CAPTION, LEGEND, TD, TH { 223 | display: inline; 224 | vertical-align: baseline; 225 | font-size: 1em; 226 | line-height: 1.33em; 227 | color: black; 228 | word-spacing: normal; 229 | letter-spacing: normal; 230 | text-transform: none; 231 | text-align: left; 232 | text-indent: 0; 233 | white-space: normal; 234 | } 235 | 236 | TH { 237 | font-weight: bolder; 238 | text-align: center; 239 | } 240 | 241 | CAPTION { 242 | text-align: center; 243 | } 244 | 245 | /* not part of the legacy browser default sheet, but an obvious enhancement */ 246 | 247 | OL OL LI { 248 | list-style-type: lower-alpha; 249 | } 250 | 251 | OL OL OL LI { 252 | list-style-type: lower-roman 253 | } 254 | -------------------------------------------------------------------------------- /htmlcxx.cc: -------------------------------------------------------------------------------- 1 | #include "html/ParserDom.h" 2 | #include "html/utils.h" 3 | #include "html/wincstring.h" 4 | #include "css/parser_pp.h" 5 | #ifndef WIN32 6 | #include "config.h" 7 | #else 8 | #define VERSION "0.6" 9 | #endif 10 | 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | 18 | #include "wingetopt.h" 19 | 20 | using namespace std; 21 | using namespace htmlcxx; 22 | 23 | void usage(string prg) { 24 | cerr << "usage:\t" << prg << " [-h] [-V] file.html [file.css]" << endl; 25 | return; 26 | } 27 | 28 | void usage_long(string prg) { 29 | usage(prg); 30 | cerr << "Html and css parser" << endl << endl; 31 | cerr << " -V\t print version number and exit" << endl; 32 | cerr << " -h\t print this help text" << endl; 33 | cerr << " -C\t disable css parsing" << endl; 34 | return; 35 | } 

// Appends everything that can still be read from `in` to `out`.
static void read_all(istream &in, string &out)
{
	char buf[BUFSIZ];
	while (1)
	{
		in.read(buf, BUFSIZ);
		if (in.gcount() == 0) {
			break;
		}
		out.append(buf, in.gcount());
	}
}

// toupper() is only defined for values representable as unsigned char, so
// the byte is converted before the call.
static char to_upper_char(char ch)
{
	return static_cast<char>(::toupper(static_cast<unsigned char>(ch)));
}

/**
 * Command-line entry point: htmlcxx [-h] [-V] [-C] [-D] file.html [file.css]
 *
 * Stage 1 parses the HTML file into a DOM tree and prints one
 * "html:<TAG>:<count>" line per distinct tag plus "html-count:<n>".
 * Stage 2 (skipped with -C, unsupported on WIN32) feeds the optional CSS
 * file and every <style> element to the CSS parser and prints one
 * "css:<attribute>:<count>" line per attribute plus "css-count:<n>".
 * -D additionally dumps the tree and the per-element attributes.
 *
 * Exits with 0 on success and 1 on bad usage, unreadable input or any
 * exception.
 */
int main(int argc, char **argv)
{
	// NOTE(review): the template arguments in this function were missing from
	// the reviewed copy of the file; they are restored from how the values are
	// used (nodes are HTML::Node, counters are keyed by string) - confirm.
	tree<HTML::Node> tr;
	bool parse_css = true;
	bool detail_print = false;
	string css_code;
	try
	{
		while (1)
		{
			// getopt() returns int; keep the full value instead of narrowing it.
			int c = getopt(argc, argv, "hVCD");
			if(c == -1) break;
			switch(c) {
				case 'h':
					usage_long(argv[0]);
					exit(0);
					break;
				case 'V':
					cerr << VERSION << endl;
					exit(0);
				case 'C':
					parse_css = false;
					break;
				case 'D':
					detail_print = true;
					break;
				default:
					usage(argv[0]);
					exit(1);
					break;
			}
		}

		if (argc != optind + 1 && argc != optind + 2)
		{
			usage(argv[0]);
			exit(1);
		}

		ifstream file(argv[optind]);
		if (!file.is_open())
		{
			cerr << "Unable to open file " << argv[optind] << endl;
			exit(1);
		}
		string html;
		read_all(file, html);
		file.close();

		if(argc == optind + 2) //we have a separate css file
		{
			ifstream fcss(argv[optind + 1]);
			if(!fcss.is_open())
			{
				// Report the css file, not the html file, as the one that failed.
				cerr << "Unable to open file " << argv[optind + 1] << endl;
				exit(1);
			}
			read_all(fcss, css_code);
			fcss.close();
		}

		HTML::ParserDom parser;
		parser.parse(html);
		tr = parser.getTree();
		if (detail_print)
		{
			cout << "-- HTML:start ------------" << endl;
			cout << tr << endl;
			cout << "-- HTML:end --------------" << endl;
		}

		// Histogram of upper-cased tag nodes.
		map<string, int> count_tag;
		tree<HTML::Node>::pre_order_iterator it = tr.begin();
		tree<HTML::Node>::pre_order_iterator end = tr.end();

		while ( it != end )
		{
			string tag = (string)(*it);
			if ( it->isTag() && tag.length() > 0 )
			{
				transform(tag.begin(), tag.end(), tag.begin(), to_upper_char);
				count_tag[tag] += 1;
			}
			++it;
		}
		for (auto &p: count_tag)
		{
			// print count of html tags
			cout << "html:" << p.first << ":" << p.second << endl;
		}
		cout << "html-count:" << count_tag.size() << endl;

	} catch (exception &e) {
		cerr << "Exception " << e.what() << " caught" << endl;
		exit(1);
	} catch (...) {
		cerr << "Unknow exception caught " << endl;
		// Same outcome as the handler above: do not continue with a tree that
		// may be incomplete, and do not report success.
		exit(1);
	}

#ifdef WIN32
	if(parse_css)
	{
		cerr << "Css parsing not supported in win32" << endl;
		return 1;
	}
	return 0;
#else
	if (parse_css) try
	{
		CSS::Parser css_parser;
		tree<HTML::Node>::iterator it = tr.begin();
		tree<HTML::Node>::iterator end = tr.end();
		if(css_code.length()) {
			css_parser.parse(css_code);
		}
		cout << "----------css-------------" << endl;
		cout << css_parser << endl;
		cout << "----------(css)-------------" << endl;

		// Number of elements each css attribute applies to.
		map<string, int> count_css;
		while (it != end)
		{
			if (it->isTag())
			{
				it->parseAttributes();

				// Selector chain from this element up to, but not including,
				// the node at tr.begin().
				vector<CSS::Parser::Selector> v;
				tree<HTML::Node>::iterator k = it;
				while (k != tr.begin())
				{
					CSS::Parser::Selector s;
					s.setElement(k->tagName());
					s.setId(k->attribute("id").second);
					s.setClass(k->attribute("class").second);
					s.setPseudoClass(CSS::Parser::NONE_CLASS);
					s.setPseudoElement(CSS::Parser::NONE_ELEMENT);
					v.push_back(s);
					k = tr.parent(k);
				}

				const auto attributes = css_parser.getAttributes(v);

				if (detail_print)
				{
					string tag = it->tagName();
					transform(tag.begin(), tag.end(), tag.begin(), to_upper_char);
					cout << tag << "@[" << it->offset() << ":" << it->offset() + it->length() << ")" << endl;
				}
				for (auto mit = attributes.begin(); mit != attributes.end(); ++mit)
				{
					if (detail_print)
					{
						cout << mit->first << ": " << mit->second << endl;
					}
					count_css[mit->first] += 1;
				}
				if (detail_print)
				{
					cout << endl;
				}

				if (strcasecmp(it->tagName().c_str(), "STYLE") == 0)
				{
					// Collect the text nodes of the <style> subtree and feed
					// them to the css parser; later elements see these rules.
					tree<HTML::Node>::iterator style_begin = it;
					tree<HTML::Node>::iterator style_end = it;
					style_end.skip_children();
					++style_end;
					string css_snippet;

					for (; style_begin != style_end; ++style_begin)
					{
						if (!(style_begin->isTag())) css_snippet.append(style_begin->text());
					}

					css_parser.parse(css_snippet);
					cout << "----------css_snippet-------------" << endl;
					cout << css_parser << endl;
					cout << "----------(css_snippet)-------------" << endl;
				}
			}
			++it;
		}

		for (auto &p: count_css)
		{
			// print count of css attributes
			cout << "css:" << p.first << ":" << p.second << endl;
		}
		cout << "css-count:" << count_css.size() << endl;
	} catch (exception &e) {
		cerr << "Exception " << e.what() << " caught" << endl;
		exit(1);
	} catch (...) {
		cerr << "Unknow exception caught " << endl;
		// An unknown failure must not end in exit(0) below.
		exit(1);
	}

	exit(0);
#endif
}
-------------------------------------------------------------------------------- /html/tests.cc: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include "CharsetConverter.h" 7 | #include "Uri.h" 8 | #include "ParserDom.h" 9 | #include "utils.h" 10 | 11 | using namespace std; 12 | using namespace htmlcxx; 13 | 14 | #define myassert(x) \ 15 | do {\ 16 | if(!(x)) {\ 17 | fprintf(stderr, "Test at %s:%d failed!\n", __FILE__, __LINE__);\ 18 | exit(1);\ 19 | }\ 20 | } while(0) 21 | 22 | bool my_tree_compare(tree::iterator begin, tree::iterator end, tree::iterator ref) 23 | { 24 | tree::iterator it(begin); 25 | while (it != end && ref != end) 26 | { 27 | if (it.number_of_children() != ref.number_of_children()) 28 | return false; 29 | if (it->text() != ref->text()) 30 | return false; 31 | ++it; 32 | ++ref; 33 | } 34 | return true; 35 | } 36 | 37 | class HtmlTest { 38 | public: 39 | 40 | bool parse() { 41 | cerr << "Parsing some html... "; 42 | tree tr; 43 | string html = "\n\n\n\n
\n\n\n\n\n\n
\"adtAustralian Digital Theses Program
\n
\n
\n
\n"; 44 | 45 | HTML::ParserDom parser; 46 | parser.parse(html); 47 | tr = parser.getTree(); 48 | cerr << tr << endl; 49 | cerr << " ok" << endl; 50 | return true; 51 | } 52 | 53 | bool string_manip() { 54 | 55 | string root_link = "http://www.akwan.com.br/teste/acton.asp?q=atletico"; 56 | string root_link2 = "http://answerbook.ime.usp.br:8888/ab2"; 57 | string link1 = "../a.html"; 58 | string link2 = "//b.html"; 59 | string link3 = "serviço.html"; 60 | string link4 = "./d/c.html"; 61 | string link5 = "http://www.fadazan.com.br/../../../../../Download/teste/../jacobmacanhan,%203276.jpg"; 62 | string link6 = "search?q=galo"; 63 | string link7 = "http://casadebruxa.com.br/anuncio/../banner/vai.asp?id=21&url=http://www.clickdirect.com.br"; 64 | string link8 = "/ab2/Help_C/ONLINEACCESS/@Ab2HelpView/idmatch(help-library-info)"; 65 | string link9 = "/ab2/coll.67.3/@Ab2CollView?"; 66 | string link10 = "http://www.a.com.br"; 67 | string link11 = "'http://www.b.com.br"; 68 | string link12 = "?q=mineiro"; 69 | string entities = "nos somos do clube atletico mineiro á á brasil   &teste; ãä á â &end "; 70 | string comments = "hello brazil"; 71 | string multiblank = " 1 2 3\r\n 4 5 \r\n 6 \n"; 72 | string justblank = " \r\n \r\n \n"; 73 | string nonblank = "dsadasdada"; 74 | myassert(HTML::strip_comments(comments) == "hello brazil"); 75 | myassert(HTML::single_blank(multiblank) == "1 2 3 4 5 6"); 76 | myassert(HTML::single_blank(justblank) == ""); 77 | myassert(HTML::single_blank(nonblank) == nonblank); 78 | myassert(HTML::decode_entities(entities) == "nos somos do clube atletico mineiro � � brasil &teste; �� á � &end "); 79 | myassert(HTML::convert_link(link1, root_link) == "http://www.akwan.com.br/a.html"); 80 | myassert(HTML::convert_link(link2, root_link) == "http://www.akwan.com.br/b.html"); 81 | myassert(HTML::convert_link(link3, root_link) == "http://www.akwan.com.br/teste/servi�o.html"); 82 | myassert(HTML::convert_link(link4, root_link) == 
"http://www.akwan.com.br/teste/d/c.html"); 83 | myassert(HTML::convert_link(link5, root_link) == "http://www.fadazan.com.br/Download/jacobmacanhan,%203276.jpg"); 84 | myassert(HTML::convert_link(link6, root_link) == "http://www.akwan.com.br/teste/search?q=galo"); 85 | myassert(HTML::convert_link(link7, root_link) == "http://casadebruxa.com.br/banner/vai.asp?id=21&url=http://www.clickdirect.com.br"); 86 | myassert(HTML::convert_link(link8, root_link2) == "http://answerbook.ime.usp.br:8888/ab2/Help_C/ONLINEACCESS/@Ab2HelpView/idmatch(help-library-info)"); 87 | myassert(HTML::convert_link(link9, root_link2) == "http://answerbook.ime.usp.br:8888/ab2/coll.67.3/@Ab2CollView?"); 88 | myassert(HTML::convert_link(link10, root_link2) == "http://www.a.com.br/"); 89 | myassert(HTML::convert_link(link11, root_link) == "http://www.akwan.com.br/teste/'http:/www.b.com.br"); 90 | myassert(HTML::convert_link(link12, root_link) == "http://www.akwan.com.br/teste/acton.asp?q=mineiro"); 91 | 92 | Uri root(root_link); 93 | Uri fragment("#top"); 94 | Uri result(fragment.absolute(root)); 95 | myassert(result.unparse() == "http://www.akwan.com.br/teste/acton.asp?q=atletico#top"); 96 | 97 | return true; 98 | } 99 | 100 | }; 101 | 102 | class TagInitTest 103 | { 104 | public: 105 | void test(void) 106 | { 107 | string html(""); 108 | 109 | tree reference; 110 | HTML::Node n; 111 | 112 | reference.clear(); 113 | tree::iterator current = reference.begin(); 114 | 115 | n.offset(0); 116 | n.length(html.size()); 117 | n.isTag(true); 118 | n.isComment(false); 119 | current = reference.insert(current,n); 120 | 121 | n.offset(0); 122 | n.length(html.size()); 123 | n.isTag(true); 124 | n.isComment(false); 125 | n.text(""); 126 | n.tagName("html"); 127 | n.closingText(""); 128 | current = reference.append_child(current,n); 129 | 130 | n.offset(6); 131 | n.length(71); 132 | n.isTag(true); 133 | n.isComment(false); 134 | n.text(""); 135 | n.tagName("head"); 136 | n.closingText(""); 137 | current = 
reference.append_child(current,n); 138 | 139 | n.offset(12); 140 | n.length(58); 141 | n.isTag(true); 142 | n.isComment(false); 143 | n.text(""); 146 | current = reference.append_child(current,n); 147 | 148 | n.offset(40); 149 | n.length(21); 150 | n.isTag(false); 151 | n.isComment(false); 152 | n.text("if (0 < 2) saida = 1;"); 153 | n.tagName("if (0 < 2) saida = 1;"); 154 | n.closingText(""); 155 | current = reference.append_child(current,n); 156 | 157 | tree tr; 158 | HTML::ParserDom parser; 159 | parser.parse(html); 160 | tr = parser.getTree(); 161 | // cerr << reference << endl; 162 | // cerr << tr << endl; 163 | myassert(my_tree_compare(tr.begin(), tr.end(), reference.begin())); 164 | } 165 | }; 166 | 167 | class ParseAttrTest 168 | { 169 | public: 170 | bool test() 171 | { 172 | string html("= argc) { 102 | /* more command-line arguments than the argument count */ 103 | pIndexPosition = NULL; /* not in the middle of anything */ 104 | return EOF; /* used up all command-line arguments */ 105 | } 106 | 107 | /*--------------------------------------------------------------------- 108 | * If the next argv[] is not an option, there can be no more options. 109 | *-------------------------------------------------------------------*/ 110 | pArgString = argv[optind++]; /* set this to the next argument ptr */ 111 | 112 | if (('/' != *pArgString) && /* doesn't start with a slash or a dash? 
*/ 113 | ('-' != *pArgString)) { 114 | --optind; /* point to current arg once we're done */ 115 | optarg = NULL; /* no argument follows the option */ 116 | pIndexPosition = NULL; /* not in the middle of anything */ 117 | return EOF; /* used up all the command-line flags */ 118 | } 119 | 120 | /* check for special end-of-flags markers */ 121 | if ((strcmp(pArgString, "-") == 0) || 122 | (strcmp(pArgString, "--") == 0)) { 123 | optarg = NULL; /* no argument follows the option */ 124 | pIndexPosition = NULL; /* not in the middle of anything */ 125 | return EOF; /* encountered the special flag */ 126 | } 127 | 128 | pArgString++; /* look past the / or - */ 129 | } 130 | 131 | if (':' == *pArgString) { /* is it a colon? */ 132 | /*--------------------------------------------------------------------- 133 | * Rare case: if opterr is non-zero, return a question mark; 134 | * otherwise, just return the colon we're on. 135 | *-------------------------------------------------------------------*/ 136 | return (opterr ? (int)'?' : (int)':'); 137 | } 138 | else if ((pOptString = strchr(opstring, *pArgString)) == 0) { 139 | /*--------------------------------------------------------------------- 140 | * The letter on the command-line wasn't any good. 141 | *-------------------------------------------------------------------*/ 142 | optarg = NULL; /* no argument follows the option */ 143 | pIndexPosition = NULL; /* not in the middle of anything */ 144 | return (opterr ? (int)'?' : (int)*pArgString); 145 | } 146 | else { 147 | /*--------------------------------------------------------------------- 148 | * The letter on the command-line matches one we expect to see 149 | *-------------------------------------------------------------------*/ 150 | if (':' == _next_char(pOptString)) { /* is the next letter a colon? */ 151 | /* It is a colon. Look for an argument string. */ 152 | if ('\0' != _next_char(pArgString)) { /* argument in this argv? 
*/ 153 | optarg = &pArgString[1]; /* Yes, it is */ 154 | } 155 | else { 156 | /*------------------------------------------------------------- 157 | * The argument string must be in the next argv. 158 | * But, what if there is none (bad input from the user)? 159 | * In that case, return the letter, and optarg as NULL. 160 | *-----------------------------------------------------------*/ 161 | if (optind < argc) 162 | optarg = argv[optind++]; 163 | else { 164 | optarg = NULL; 165 | return (opterr ? (int)'?' : (int)*pArgString); 166 | } 167 | } 168 | pIndexPosition = NULL; /* not in the middle of anything */ 169 | } 170 | else { 171 | /* it's not a colon, so just return the letter */ 172 | optarg = NULL; /* no argument follows the option */ 173 | pIndexPosition = pArgString; /* point to the letter we're on */ 174 | } 175 | return (int)*pArgString; /* return the letter that matched */ 176 | } 177 | } 178 | #endif //WIN32 179 | -------------------------------------------------------------------------------- /html/ParserSax.tcc: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #if !defined(WIN32) || defined(__MINGW32__) 4 | #include 5 | #endif 6 | 7 | //#define DEBUG 8 | //#include "debug.h" 9 | 10 | static 11 | struct literal_tag { 12 | int len; 13 | const char* str; 14 | int is_cdata; 15 | } 16 | literal_mode_elem[] = 17 | { 18 | {6, "script", 1}, 19 | {5, "style", 1}, 20 | {3, "xmp", 1}, 21 | {9, "plaintext", 1}, 22 | {8, "textarea", 0}, 23 | {0, 0, 0} 24 | }; 25 | 26 | template 27 | void htmlcxx::HTML::ParserSax::parse(_Iterator begin, _Iterator end) 28 | { 29 | // std::cerr << "Parsing iterator" << std::endl; 30 | parse(begin, end, typename std::iterator_traits<_Iterator>::iterator_category()); 31 | } 32 | 33 | template 34 | void htmlcxx::HTML::ParserSax::parse(_Iterator &begin, _Iterator &end, std::forward_iterator_tag) 35 | { 36 | typedef _Iterator iterator; 37 | // std::cerr << "Parsing 
forward_iterator" << std::endl; 38 | mCdata = false; 39 | mpLiteral = 0; 40 | mCurrentOffset = 0; 41 | this->beginParsing(); 42 | 43 | // DEBUGP("Parsed text\n"); 44 | 45 | while (begin != end) 46 | { 47 | *begin; // This is for the multi_pass to release the buffer 48 | iterator c(begin); 49 | 50 | while (c != end) 51 | { 52 | // For some tags, the text inside it is considered literal and is 53 | // only closed for its counterpart 54 | while (mpLiteral) 55 | { 56 | // DEBUGP("Treating literal %s\n", mpLiteral); 57 | while (c != end && *c != '<') ++c; 58 | 59 | if (c == end) { 60 | if (c != begin) this->parseContent(begin, c); 61 | goto DONE; 62 | } 63 | 64 | iterator end_text(c); 65 | ++c; 66 | 67 | if (*c == '/') 68 | { 69 | ++c; 70 | const char *l = mpLiteral; 71 | while (*l && ::tolower(*c) == *l) 72 | { 73 | ++c; 74 | ++l; 75 | } 76 | 77 | // FIXME: Mozilla stops when it sees a /plaintext. Check 78 | // other browsers and decide what to do 79 | if (!*l && strcmp(mpLiteral, "plaintext")) 80 | { 81 | // matched all and is not tag plaintext 82 | while (isspace(*c)) ++c; 83 | 84 | if (*c == '>') 85 | { 86 | ++c; 87 | if (begin != end_text) 88 | this->parseContent(begin, end_text); 89 | mpLiteral = 0; 90 | c = end_text; 91 | begin = c; 92 | break; 93 | } 94 | } 95 | } 96 | else if (*c == '!') 97 | { 98 | // we may find a comment and we should support it 99 | iterator e(c); 100 | ++e; 101 | 102 | if (e != end && *e == '-' && ++e != end && *e == '-') 103 | { 104 | // DEBUGP("Parsing comment\n"); 105 | ++e; 106 | c = this->skipHtmlComment(e, end); 107 | } 108 | 109 | //if (begin != end_text) 110 | //this->parseContent(begin, end_text, end); 111 | 112 | //this->parseComment(end_text, c, end); 113 | 114 | // continue from the end of the comment 115 | //begin = c; 116 | } 117 | } 118 | 119 | if (*c == '<') 120 | { 121 | iterator d(c); 122 | ++d; 123 | if (d != end) 124 | { 125 | if (isalpha(*d)) 126 | { 127 | // beginning of tag 128 | if (begin != c) 129 | 
this->parseContent(begin, c); 130 | 131 | // DEBUGP("Parsing beginning of tag\n"); 132 | d = this->skipHtmlTag(d, end); 133 | this->parseHtmlTag(c, d); 134 | 135 | // continue from the end of the tag 136 | c = d; 137 | begin = c; 138 | break; 139 | } 140 | 141 | if (*d == '/') 142 | { 143 | if (begin != c) 144 | this->parseContent(begin, c); 145 | 146 | iterator e(d); 147 | ++e; 148 | if (e != end && isalpha(*e)) 149 | { 150 | // end of tag 151 | // DEBUGP("Parsing end of tag\n"); 152 | d = this->skipHtmlTag(d, end); 153 | this->parseHtmlTag(c, d); 154 | } 155 | else 156 | { 157 | // not a conforming end of tag, treat as comment 158 | // as Mozilla does 159 | // DEBUGP("Non conforming end of tag\n"); 160 | d = this->skipHtmlTag(d, end); 161 | this->parseComment(c, d); 162 | } 163 | 164 | // continue from the end of the tag 165 | c = d; 166 | begin = c; 167 | break; 168 | } 169 | 170 | if (*d == '!') 171 | { 172 | // comment 173 | if (begin != c) 174 | this->parseContent(begin, c); 175 | 176 | iterator e(d); 177 | ++e; 178 | 179 | if (e != end && *e == '-' && ++e != end && *e == '-') 180 | { 181 | // DEBUGP("Parsing comment\n"); 182 | ++e; 183 | d = this->skipHtmlComment(e, end); 184 | } 185 | else 186 | { 187 | d = this->skipHtmlTag(d, end); 188 | } 189 | 190 | this->parseComment(c, d); 191 | 192 | // continue from the end of the comment 193 | c = d; 194 | begin = c; 195 | break; 196 | } 197 | 198 | if (*d == '?' 
|| *d == '%') 199 | { 200 | // something like parseContent(begin, c); 203 | 204 | d = this->skipHtmlTag(d, end); 205 | 206 | this->parseComment(c, d); 207 | 208 | // continue from the end of the comment 209 | c = d; 210 | begin = c; 211 | break; 212 | } 213 | } 214 | } 215 | c++; 216 | } 217 | 218 | // There may be some text in the end of the document 219 | if (begin != c) 220 | { 221 | this->parseContent(begin, c); 222 | begin = c; 223 | } 224 | } 225 | 226 | DONE: 227 | this->endParsing(); 228 | return; 229 | } 230 | 231 | template 232 | void htmlcxx::HTML::ParserSax::parseComment(_Iterator b, _Iterator c) 233 | { 234 | // DEBUGP("Creating comment node %s\n", std::string(b, c).c_str()); 235 | htmlcxx::HTML::Node com_node; 236 | //FIXME: set_tagname shouldn't be needed, but first I must check 237 | //legacy code 238 | std::string comment(b, c); 239 | com_node.tagName(comment); 240 | com_node.text(comment); 241 | com_node.offset(mCurrentOffset); 242 | com_node.length((unsigned int)comment.length()); 243 | com_node.isTag(false); 244 | com_node.isComment(true); 245 | 246 | mCurrentOffset += com_node.length(); 247 | 248 | // Call callback method 249 | this->foundComment(com_node); 250 | } 251 | 252 | template 253 | void htmlcxx::HTML::ParserSax::parseContent(_Iterator b, _Iterator c) 254 | { 255 | // DEBUGP("Creating text node %s\n", (std::string(b, c)).c_str()); 256 | htmlcxx::HTML::Node txt_node; 257 | //FIXME: set_tagname shouldn't be needed, but first I must check 258 | //legacy code 259 | std::string text(b, c); 260 | txt_node.tagName(text); 261 | txt_node.text(text); 262 | txt_node.offset(mCurrentOffset); 263 | txt_node.length((unsigned int)text.length()); 264 | txt_node.isTag(false); 265 | txt_node.isComment(false); 266 | 267 | mCurrentOffset += txt_node.length(); 268 | 269 | // Call callback method 270 | this->foundText(txt_node); 271 | } 272 | 273 | template 274 | void htmlcxx::HTML::ParserSax::parseHtmlTag(_Iterator b, _Iterator c) 275 | { 276 | _Iterator 
name_begin(b); 277 | ++name_begin; 278 | bool is_end_tag = (*name_begin == '/'); 279 | if (is_end_tag) ++name_begin; 280 | 281 | _Iterator name_end(name_begin); 282 | while (name_end != c && isalnum(*name_end)) 283 | { 284 | ++name_end; 285 | } 286 | 287 | std::string name(name_begin, name_end); 288 | // DEBUGP("Found %s tag %s\n", is_end_tag ? "closing" : "opening", name.c_str()); 289 | 290 | if (!is_end_tag) 291 | { 292 | std::string::size_type tag_len = name.length(); 293 | for (int i = 0; literal_mode_elem[i].len; ++i) 294 | { 295 | if (tag_len == literal_mode_elem[i].len) 296 | { 297 | #if defined(WIN32) && !defined(__MINGW32__) 298 | if (!_stricmp(name.c_str(), literal_mode_elem[i].str)) 299 | #else 300 | if (!strcasecmp(name.c_str(), literal_mode_elem[i].str)) 301 | #endif 302 | { 303 | mpLiteral = literal_mode_elem[i].str; 304 | break; 305 | } 306 | } 307 | } 308 | } 309 | 310 | htmlcxx::HTML::Node tag_node; 311 | //by now, length is just the size of the tag 312 | std::string text(b, c); 313 | tag_node.length(static_cast(text.length())); 314 | tag_node.tagName(name); 315 | tag_node.text(text); 316 | tag_node.offset(mCurrentOffset); 317 | tag_node.isTag(true); 318 | tag_node.isComment(false); 319 | 320 | mCurrentOffset += tag_node.length(); 321 | 322 | this->foundTag(tag_node, is_end_tag); 323 | } 324 | 325 | template 326 | _Iterator 327 | htmlcxx::HTML::ParserSax::skipHtmlComment(_Iterator c, _Iterator end) 328 | { 329 | while ( c != end ) { 330 | if (*c++ == '-' && c != end && *c == '-') 331 | { 332 | _Iterator d(c); 333 | while (++c != end && isspace(*c)); 334 | if (c == end || *c++ == '>') break; 335 | c = d; 336 | } 337 | } 338 | 339 | return c; 340 | } 341 | 342 | namespace htmlcxx { namespace HTML { 343 | 344 | template 345 | static inline 346 | _Iterator find_next_quote(_Iterator c, _Iterator end, char quote) 347 | { 348 | // std::cerr << "generic find" << std::endl; 349 | while (c != end && *c != quote) ++c; 350 | return c; 351 | } 352 | 353 | 
template <> 354 | inline 355 | const char *find_next_quote(const char *c, const char *end, char quote) 356 | { 357 | // std::cerr << "fast find" << std::endl; 358 | const char *d = reinterpret_cast(memchr(c, quote, end - c)); 359 | 360 | if (d) return d; 361 | else return end; 362 | } 363 | 364 | }} 365 | 366 | template 367 | _Iterator htmlcxx::HTML::ParserSax::skipHtmlTag(_Iterator c, _Iterator end) 368 | { 369 | while (c != end && *c != '>') 370 | { 371 | if (*c != '=') 372 | { 373 | ++c; 374 | } 375 | else 376 | { // found an attribute 377 | ++c; 378 | while (c != end && isspace(*c)) ++c; 379 | 380 | if (c == end) break; 381 | 382 | if (*c == '\"' || *c == '\'') 383 | { 384 | _Iterator save(c); 385 | char quote = *c++; 386 | c = find_next_quote(c, end, quote); 387 | // while (c != end && *c != quote) ++c; 388 | // c = static_cast(memchr(c, quote, end - c)); 389 | if (c != end) 390 | { 391 | ++c; 392 | } 393 | else 394 | { 395 | c = save; 396 | ++c; 397 | } 398 | // DEBUGP("Quotes: %s\n", std::string(save, c).c_str()); 399 | } 400 | } 401 | } 402 | 403 | if (c != end) ++c; 404 | 405 | return c; 406 | } 407 | -------------------------------------------------------------------------------- /ASF-2.0: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. 
For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 
48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. 
Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. 
You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 
123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. 
In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | 179 | APPENDIX: How to apply the Apache License to your work. 180 | 181 | To apply the Apache License to your work, attach the following 182 | boilerplate notice, with the fields enclosed by brackets "[]" 183 | replaced with your own identifying information. (Don't include 184 | the brackets!) The text should be enclosed in the appropriate 185 | comment syntax for the file format. 
// Heuristically decides whether the buffer [begin, begin + size) is UTF-8.
//
// Returns true immediately when the buffer starts with the UTF-8 byte
// order mark (EF BB BF). Otherwise every byte is compared with its
// predecessor:
//   - a continuation byte (10xxxxxx) right after a lead byte (11xxxxxx)
//     counts as good;
//   - a continuation byte right after an ASCII byte, or a lead byte that
//     is not followed by a continuation byte, counts as bad.
// The result is true when good pairs outnumber bad ones. A null or empty
// buffer is reported as not UTF-8.
bool detect_utf8(const char *begin, int size)
{
	if (begin == NULL || size <= 0) return false;

	// The signature is spelled with escapes so that it does not depend on
	// the encoding of this source file, and it is only compared when the
	// buffer really holds three bytes.
	static const char signature[] = "\xEF\xBB\xBF";
	if (size >= 3 && memcmp(begin, signature, 3) == 0) return true;

	const char *end = begin + size;
	unsigned char previous_byte = 0;
	unsigned count_bad_utf = 0;
	unsigned count_good_utf = 0;

	for (const char *ptr = begin; ptr != end; ++ptr)
	{
		const unsigned char byte = static_cast<unsigned char>(*ptr);

		if ((byte & 0xC0) == 0x80)
		{
			// Continuation byte: judge it by what came before.
			if ((previous_byte & 0xC0) == 0xC0)
			{
				count_good_utf++;
			}
			else if ((previous_byte & 0x80) == 0x00)
			{
				count_bad_utf++;
			}
		}
		else if ((previous_byte & 0xC0) == 0xC0)
		{
			// A lead byte must be followed by a continuation byte.
			count_bad_utf++;
		}

		previous_byte = byte;
	}

	return count_good_utf > count_bad_utf;
}
// Returns a copy of str with HTML comments removed.
//
// A comment starts at "<!--" followed by a whitespace character and ends
// after the next "-->". Text that merely starts with "<!--" without the
// whitespace is kept as is. Everything after an unterminated comment
// opener is dropped.
std::string strip_comments(const std::string &str) {

	std::string ret;
	ret.reserve(str.size());

	const std::string::size_type len = str.length();
	std::string::size_type pos = 0;
	bool inside_comment = false;

	while (pos < len) {
		if (!inside_comment) {
			// isspace() is undefined for negative values, hence the
			// conversion to unsigned char.
			if (pos + 4 < len
					&& str.compare(pos, 4, "<!--") == 0
					&& isspace(static_cast<unsigned char>(str[pos + 4]))) {
				inside_comment = true;
				// Skip the opener; the whitespace after it cannot be part
				// of a "-->" terminator.
				pos += 4;
				continue;
			}
			ret += str[pos];
			++pos;
		} else {
			if (pos + 2 < len && str.compare(pos, 3, "-->") == 0) {
				inside_comment = false;
				pos += 3;
				// Re-examine the new position instead of copying it
				// blindly: another comment may start right here.
				continue;
			}
			++pos;
		}
	}

	return ret;
}
213 }, 195 | { "Ouml", 214 }, 196 | { "times", 215 }, 197 | /* 60 */ 198 | { "Oslash", 216 }, 199 | { "Ugrave", 217 }, 200 | { "Uacute", 218 }, 201 | { "Ucirc", 219 }, 202 | { "Uuml", 220 }, 203 | { "Yacute", 221 }, 204 | { "THORN", 222 }, 205 | { "szlig", 223 }, 206 | { "agrave", 224 }, 207 | { "aacute", 225 }, 208 | /* 70 */ 209 | { "acirc", 226 }, 210 | { "atilde", 227 }, 211 | { "auml", 228 }, 212 | { "aring", 229 }, 213 | { "aelig", 230 }, 214 | { "ccedil", 231 }, 215 | { "egrave", 232 }, 216 | { "eacute", 233 }, 217 | { "ecirc", 234 }, 218 | { "euml", 235 }, 219 | /* 80 */ 220 | { "igrave", 236 }, 221 | { "iacute", 237 }, 222 | { "icirc", 238 }, 223 | { "iuml", 239 }, 224 | { "ieth", 240 }, 225 | { "ntilde", 241 }, 226 | { "ograve", 242 }, 227 | { "oacute", 243 }, 228 | { "ocirc", 244 }, 229 | { "otilde", 245 }, 230 | /* 90 */ 231 | { "ouml", 246 }, 232 | { "divide", 247 }, 233 | { "oslash", 248 }, 234 | { "ugrave", 249 }, 235 | { "uacute", 250 }, 236 | { "ucirc", 251 }, 237 | { "uuml", 252 }, 238 | { "yacute", 253 }, 239 | { "thorn", 254 }, 240 | { "yuml", 255 }, 241 | /* 100 */ 242 | { NULL, 0 }, 243 | }; 244 | 245 | string decode_entities(const string &str) 246 | { 247 | unsigned int count = 0; 248 | const char *ptr = str.c_str(); 249 | const char *end; 250 | 251 | string ret(str); 252 | string entity; 253 | 254 | ptr = strchr(ptr, '&'); 255 | if (ptr == NULL) return ret; 256 | 257 | count += static_cast(ptr - str.c_str()); 258 | 259 | // printf("url_init: %s\n", str.c_str()); 260 | while (*ptr) 261 | { 262 | if (*ptr == '&' && ((end = strchr(ptr, ';')) != NULL)) 263 | { 264 | entity.assign(ptr + 1, end); 265 | // printf("Entity: %d %s\n", entity.length(), entity.c_str()); 266 | if (!entity.empty() && entity[0] == '#') 267 | { 268 | entity.erase(0, 1); 269 | int chr = atoi(entity.c_str()); 270 | if (chr > 0 && chr <= UCHAR_MAX) 271 | { 272 | ret[count++] = chr; 273 | } 274 | ptr = end + 1; 275 | } 276 | else 277 | { 278 | bool found = false; 279 | for (int 
// Lower-cases a single byte. tolower() is undefined for negative values,
// so the byte goes through unsigned char.
static char get_attribute_lower(char ch)
{
	return static_cast<char>(tolower(static_cast<unsigned char>(ch)));
}

// Extracts the value of attribute attr from the text of a tag.
//
// The name is matched case-insensitively. The first occurrence of the
// name that is followed (after optional whitespace) by '=' is used;
// occurrences without '=' -- for example the name appearing inside
// another attribute's value -- are skipped. The value may be enclosed in
// double or single quotes; an unquoted value ends at whitespace or '>'.
//
// Returns an empty string when the attribute is absent, attr is empty,
// nothing follows the '=', or a quoted value is not terminated.
std::string get_attribute(const std::string& tag, const std::string& attr) {
	std::string val;
	if (attr.empty()) return val;

	std::string low_tag(tag);
	std::string low_attr(attr);

	std::transform(low_attr.begin(), low_attr.end(), low_attr.begin(), get_attribute_lower);
	std::transform(low_tag.begin(), low_tag.end(), low_tag.begin(), get_attribute_lower);

	const std::string::size_type len = tag.length();

	for (std::string::size_type hit = low_tag.find(low_attr);
			hit != std::string::npos;
			hit = low_tag.find(low_attr, hit + 1))
	{
		std::string::size_type a = hit + attr.length();
		while (a < len && isspace(static_cast<unsigned char>(tag[a]))) a++;

		// Not an assignment: look for a later occurrence of the name.
		if (a == len || tag[a] != '=') continue;

		a++;
		while (a < len && isspace(static_cast<unsigned char>(tag[a]))) a++;
		if (a == len)
			return val;

		if (tag[a] == '"' || tag[a] == '\'') {
			// Quoted value: everything up to the matching quote.
			std::string::size_type b = tag.find(tag[a], a + 1);
			if (b != std::string::npos) val = tag.substr(a + 1, b - a - 1);
		} else {
			// Bare value: runs until whitespace or the end of the tag.
			while (a < len && !isspace(static_cast<unsigned char>(tag[a])) && tag[a] != '>') {
				val += tag[a++];
			}
		}
		return val;
	}

	return val;
}
ret(url); 360 | 361 | question = strchr(ptr, '?'); 362 | dash = strchr(ptr, '#'); 363 | if (question &&(!dash || question < dash)) question_dash = question; 364 | else question_dash = dash; 365 | if (question_dash == 0) question_dash = url.c_str() + url.length(); 366 | 367 | const char *problem; 368 | const char *problem1 = strstr(ptr, "//"); 369 | const char *problem2 = strstr(ptr, "/."); 370 | 371 | if (problem1 && (!problem2 || problem1 < problem2)) problem = problem1; 372 | else problem = problem2; 373 | 374 | if (problem && problem < question_dash) 375 | { 376 | ptr = problem; 377 | count = static_cast(ptr - url.c_str()); 378 | while (*ptr && ptr < question_dash) 379 | { 380 | switch (state) 381 | { 382 | case LASTSLASH: 383 | if (*ptr == '/') 384 | { 385 | ++ptr; 386 | state = LASTSLASH; 387 | } 388 | else if (*ptr == '.') 389 | { 390 | ++ptr; 391 | state = LASTDOTSLASH; 392 | } 393 | else 394 | { 395 | ret[count++] = *ptr; 396 | ++ptr; 397 | state = NONE; 398 | } 399 | break; 400 | case LASTDOTSLASH: 401 | if (*ptr == '/') 402 | { 403 | ++ptr; 404 | state = LASTSLASH; 405 | } 406 | else if (*ptr == '.') 407 | { 408 | ++ptr; 409 | state = LASTDOTDOTSLASH; 410 | } 411 | else 412 | { 413 | ret[count++] = '.'; 414 | ret[count++] = *ptr; 415 | ++ptr; 416 | state = NONE; 417 | } 418 | break; 419 | case LASTDOTDOTSLASH: 420 | if (*ptr == '/') 421 | { 422 | const char *last_slash = ret.c_str() + count - 2; 423 | while (last_slash >= ret.c_str() && *last_slash != '/') 424 | --last_slash; 425 | if (last_slash >= ret.c_str()) 426 | count = static_cast(last_slash - ret.c_str() + 1); 427 | ++ptr; 428 | state = LASTSLASH; 429 | } 430 | else 431 | { 432 | ret[count++] = '.'; 433 | ret[count++] = '.'; 434 | ret[count++] = *ptr; 435 | ++ptr; 436 | state = NONE; 437 | } 438 | break; 439 | default: 440 | if (*ptr == '/') 441 | { 442 | ret[count++] = *ptr; 443 | ++ptr; 444 | state = LASTSLASH; 445 | } 446 | else 447 | { 448 | ret[count++] = *ptr; 449 | ++ptr; 450 | state = 
NONE; 451 | } 452 | } 453 | } 454 | 455 | if (question_dash) 456 | { 457 | while (*ptr) 458 | { 459 | ret[count++] = *ptr; 460 | ++ptr; 461 | } 462 | } 463 | 464 | ret.erase(count); 465 | } 466 | 467 | return ret; 468 | } 469 | 470 | string convert_link(const string& relative, const Uri& root) 471 | { 472 | string url(relative); 473 | 474 | url = HTML::decode_entities(url); 475 | 476 | string::size_type a; 477 | a = 0; 478 | while ((a = url.find_first_of(" \r\n", a)) != string::npos) 479 | { 480 | switch (url[a]) 481 | { 482 | case ' ': 483 | url.replace(a, 1, "%20"); 484 | break; 485 | case '\r': 486 | url.erase(a, 1); 487 | break; 488 | case '\n': 489 | url.erase(a, 1); 490 | break; 491 | } 492 | } 493 | 494 | Uri uri; 495 | try 496 | { 497 | Uri rel(url); 498 | uri = rel.absolute(root); 499 | uri.path(normalize_slashs(uri.path())); 500 | } 501 | catch (Uri::Exception) 502 | { 503 | return string(); 504 | } 505 | 506 | return uri.unparse(Uri::REMOVE_FRAGMENT); 507 | } 508 | 509 | string __serialize_gml(const tree &tr, tree::iterator it, tree::iterator end, unsigned int parent_id, unsigned int& label) { 510 | 511 | using namespace std; 512 | ostringstream ret; 513 | tree::sibling_iterator sib = tr.begin(it); 514 | while(sib != tr.end(it)) { 515 | ret << "node [ id " << ++label << "\n label \"" << label << "\"\n]\n"; 516 | ret << "edge [ \n source " << parent_id << "\n target " << label << "\n]" << endl; 517 | ret << __serialize_gml(tr, sib, end, label, label); 518 | ++sib; 519 | } 520 | ret << ends; 521 | string str = ret.str(); 522 | return str; 523 | } 524 | 525 | 526 | string serialize_gml(const tree &tr) { 527 | 528 | using namespace std; 529 | 530 | tree::pre_order_iterator it = tr.begin(); 531 | tree::pre_order_iterator end = tr.end(); 532 | 533 | string ret; 534 | ret += "graph ["; 535 | ret += "directed 1\n"; 536 | ret += "node [ id 0\n label \"0\"\n ]\n"; 537 | unsigned int label = 0; 538 | ret += __serialize_gml(tr, it, end, 0, label); 539 | ret += "]"; 
540 | return ret; 541 | 542 | } 543 | 544 | }//namespace html 545 | }//namespace htmlcxx 546 | -------------------------------------------------------------------------------- /css/css_syntax.y: -------------------------------------------------------------------------------- 1 | %{ 2 | #include 3 | #include 4 | #include "css_lex.h" 5 | #include "parser.h" 6 | 7 | #define YYERROR_VERBOSE 1 8 | #define YYDEBUG 1 9 | 10 | %} 11 | 12 | %parse-param {void *yyparam} 13 | %pure-parser 14 | 15 | %union { 16 | char *lexeme; 17 | char letter; 18 | struct property_t *property; 19 | struct selector_t *selector; 20 | struct selector_list_t *selector_list; 21 | int pseudo_class; 22 | int pseudo_element; 23 | } 24 | 25 | %{ 26 | 27 | int yylex(YYSTYPE *lvalp); 28 | 29 | int yyerror(void *yyparam, const char *s) { 30 | #if YYDEBUG 31 | fprintf(stderr, "Error: %s\n", s); 32 | #endif 33 | return 0; 34 | } 35 | 36 | %} 37 | 38 | %token IMPORT_SYM 39 | %token IMPORTANT_SYM 40 | %token IDENT 41 | %token STRING 42 | %token NUMBER 43 | %token PERCENTAGE 44 | %token LENGTH 45 | %token EMS 46 | %token EXS 47 | %token PSCLASS_AFTER_IDENT 48 | %token HASH_AFTER_IDENT 49 | %token CLASS_AFTER_IDENT 50 | %token PSCLASS 51 | %token VISITED_PSCLASS 52 | %token ACTIVE_PSCLASS 53 | %token FIRST_LINE 54 | %token FIRST_LETTER 55 | %token HASH 56 | %token CLASS 57 | %token URL 58 | %token RGB 59 | %token CDO 60 | %token CDC 61 | %token CSL 62 | 63 | %type solitary_pseudo_class 64 | %type pseudo_class 65 | %type selector 66 | %type simple_selector 67 | %type simple_selectors 68 | %type selectors 69 | %type ruleset 70 | %type rulesets 71 | %type unary_operator 72 | %type operator 73 | %type declaration 74 | %type declarations 75 | %type NUMBER 76 | %type STRING 77 | %type PERCENTAGE 78 | %type LENGTH 79 | %type EMS 80 | %type EXS 81 | %type URL 82 | %type RGB 83 | %type IDENT 84 | %type HASH 85 | %type HASH_AFTER_IDENT 86 | %type CLASS_AFTER_IDENT 87 | %type CLASS 88 | %type hexcolor 89 | %type value 90 
| %type id 91 | %type solitary_id 92 | %type term 93 | %type property 94 | %type expr 95 | %type element_name 96 | %type class 97 | %type solitary_class 98 | %type string_or_url 99 | 100 | %% 101 | 102 | stylesheet 103 | : comments imports rulesets { 104 | *(struct selector_list_t**) yyparam = $3; 105 | } 106 | ; 107 | 108 | rulesets 109 | : ruleset comments rulesets { 110 | struct selector_list_t *pos = $1; 111 | if (pos != NULL) { 112 | while (pos->next != NULL) { 113 | pos = pos->next; 114 | } 115 | pos->next = $3; 116 | } else { 117 | $1 = $3; 118 | } 119 | $$ = $1; 120 | } 121 | | { $$ = NULL; } 122 | ; 123 | 124 | imports 125 | : import comments imports 126 | | 127 | ; 128 | 129 | comments 130 | : comments comment 131 | | 132 | ; 133 | 134 | comment 135 | : CDO 136 | | CDC 137 | | CSL 138 | ; 139 | 140 | import 141 | : IMPORT_SYM string_or_url ';' /* E.g., @import url(fun.css); */ 142 | ; 143 | 144 | string_or_url 145 | : STRING { $$ = $1; } 146 | | URL { 147 | char *begin = $1; 148 | char *end = $1 + strlen($1); 149 | 150 | /* Skip url( */ 151 | begin += 4; 152 | /* skip whitespace */ 153 | while (*begin == ' ') 154 | ++begin; 155 | 156 | /* Skip ) */ 157 | end -= 2; 158 | /* skip whitespace */ 159 | while (*end == ' ') 160 | --end; 161 | 162 | end[1] = 0; 163 | 164 | $$ = strdup(begin); 165 | free($1); 166 | } 167 | ; 168 | 169 | unary_operator 170 | : '-' { $$ = '-'; } 171 | | '+' { $$ = '+'; } 172 | ; 173 | 174 | operator 175 | : '/' { $$ = '/'; } 176 | | ',' { $$ = ','; } 177 | | /* empty */ {$$ = ' '; } 178 | ; 179 | 180 | property 181 | : IDENT { $$ = $1; } 182 | ; 183 | 184 | ruleset 185 | : selectors '{' declarations '}' { 186 | struct selector_list_t *pos = $1; 187 | while (pos != NULL) { 188 | struct property_t *i = $3; 189 | while (i != NULL) { 190 | i->count++; 191 | i = i->next; 192 | } 193 | pos->selector->property = $3; 194 | pos = pos->next; 195 | } 196 | $$ = $1; 197 | } 198 | ; 199 | 200 | selectors 201 | : selector { 202 | if ($1 != NULL) 
{ 203 | $$ = (struct selector_list_t*) 204 | malloc (sizeof(struct selector_list_t)); 205 | $$->selector = $1; 206 | $$->next = NULL; 207 | } else { 208 | $$ = NULL; 209 | } 210 | } 211 | | selector ',' selectors { 212 | if ($1 != NULL) { 213 | struct selector_list_t *new; 214 | new = (struct selector_list_t*) 215 | malloc (sizeof(struct selector_list_t)); 216 | new->selector = $1; 217 | new->next = $3; 218 | $$ = new; 219 | } else { 220 | $$ = $3; 221 | } 222 | } 223 | ; 224 | 225 | declarations 226 | : declaration { 227 | $$ = $1; 228 | } 229 | | declaration ';' declarations { 230 | if ($1 != NULL) { 231 | $1->next = $3; 232 | $$ = $1; 233 | } else { 234 | $$ = $3; 235 | } 236 | } 237 | ; 238 | 239 | selector 240 | : simple_selectors { $$ = $1; } 241 | | selector error { $$ = NULL; } 242 | | error { $$ = NULL; } 243 | ; 244 | 245 | simple_selectors 246 | : simple_selector { $$ = $1; } 247 | | simple_selector simple_selectors { 248 | $1->next = $2; 249 | $$ = $1; 250 | } 251 | ; 252 | 253 | /* An "id" is an ID that is attached to an element type 254 | ** on its left, as in: P#p007 255 | ** A "solitary_id" is an ID that is not so attached, 256 | ** as in: #p007 257 | ** Analogously for classes and pseudo-classes. 
/* A simple selector combines an optional element name with optional id,
** class and pseudo-class parts.  Every alternative below allocates one
** selector_t with malloc(), stores the parts that are present and sets
** the absent ones to NULL/0.  pseudo_element is always 0 and next is
** always NULL in this rule.
** NOTE(review): the malloc() result is dereferenced without a NULL check.
*/
simple_selector
	/* element + id + class + pseudo-class */
	: element_name id class pseudo_class {
		$$ = (struct selector_t*)
			malloc(sizeof(struct selector_t));
		$$->element_name = $1;
		$$->id = $2;
		$$->e_class = $3;
		$$->pseudo_class = $4;
		$$->pseudo_element = 0;
		$$->next = NULL;
	} /* eg: H1.subject */
	/* element + id + pseudo-class */
	| element_name id pseudo_class {
		$$ = (struct selector_t*)
			malloc(sizeof(struct selector_t));
		$$->element_name = $1;
		$$->id = $2;
		$$->e_class = NULL;
		$$->pseudo_class = $3;
		$$->pseudo_element = 0;
		$$->next = NULL;
	}
	/* element + class + pseudo-class */
	| element_name class pseudo_class {
		$$ = (struct selector_t*)
			malloc(sizeof(struct selector_t));
		$$->element_name = $1;
		$$->id = NULL;
		$$->e_class = $2;
		$$->pseudo_class = $3;
		$$->pseudo_element = 0;
		$$->next = NULL;
	}
	/* element + id + class */
	| element_name id class {
		$$ = (struct selector_t*)
			malloc(sizeof(struct selector_t));
		$$->element_name = $1;
		$$->id = $2;
		$$->e_class = $3;
		$$->pseudo_class = 0;
		$$->pseudo_element = 0;
		$$->next = NULL;
	}
	/* element + id */
	| element_name id {
		$$ = (struct selector_t*)
			malloc(sizeof(struct selector_t));
		$$->element_name = $1;
		$$->id = $2;
		$$->e_class = NULL;
		$$->pseudo_class = 0;
		$$->pseudo_element = 0;
		$$->next = NULL;
	}
	/* element + class */
	| element_name class {
		$$ = (struct selector_t*)
			malloc(sizeof(struct selector_t));
		$$->element_name = $1;
		$$->id = NULL;
		$$->e_class = $2;
		$$->pseudo_class = 0;
		$$->pseudo_element = 0;
		$$->next = NULL;
	}
	/* element + pseudo-class */
	| element_name pseudo_class {
		$$ = (struct selector_t*)
			malloc(sizeof(struct selector_t));
		$$->element_name = $1;
		$$->id = NULL;
		$$->e_class = NULL;
		$$->pseudo_class = $2;
		$$->pseudo_element = 0;
		$$->next = NULL;
	}
	/* element name only */
	| element_name {
		$$ = (struct selector_t*)
			malloc(sizeof(struct selector_t));
		$$->element_name = $1;
		$$->id = NULL;
		$$->e_class = NULL;
		$$->pseudo_class = 0;
		$$->pseudo_element = 0;
		$$->next = NULL;
	}
	/* stand-alone id + class + pseudo-class */
	| solitary_id class pseudo_class {
		$$ = (struct selector_t*)
			malloc(sizeof(struct selector_t));
		$$->element_name = NULL;
		$$->id = $1;
		$$->e_class = $2;
		$$->pseudo_class = $3;
		$$->pseudo_element = 0;
		$$->next = NULL;
	} /* eg: #xyz33 */
	/* stand-alone id + class */
	| solitary_id class {
		$$ = (struct selector_t*)
			malloc(sizeof(struct selector_t));
		$$->element_name = NULL;
		$$->id = $1;
		$$->e_class = $2;
		$$->pseudo_class = 0;
		$$->pseudo_element = 0;
		$$->next = NULL;
	}
	/* stand-alone id + pseudo-class */
	| solitary_id pseudo_class {
		$$ = (struct selector_t*)
			malloc(sizeof(struct selector_t));
		$$->element_name = NULL;
		$$->id = $1;
		$$->e_class = NULL;
		$$->pseudo_class = $2;
		$$->pseudo_element = 0;
		$$->next = NULL;
	}
	/* stand-alone id only */
	| solitary_id {
		$$ = (struct selector_t*)
			malloc(sizeof(struct selector_t));
		$$->element_name = NULL;
		$$->id = $1;
		$$->e_class = NULL;
		$$->pseudo_class = 0;
		$$->pseudo_element = 0;
		$$->next = NULL;
	}
	/* stand-alone class + pseudo-class */
	| solitary_class pseudo_class {
		$$ = (struct selector_t*)
			malloc(sizeof(struct selector_t));
		$$->element_name = NULL;
		$$->id = NULL;
		$$->e_class = $1;
		$$->pseudo_class = $2;
		$$->pseudo_element = 0;
		$$->next = NULL;
	}/* eg: .author */
	/* stand-alone class only */
	| solitary_class {
		$$ = (struct selector_t*)
			malloc(sizeof(struct selector_t));
		$$->element_name = NULL;
		$$->id = NULL;
		$$->e_class = $1;
		$$->pseudo_class = 0;
		$$->pseudo_element = 0;
		$$->next = NULL;
	}
	/* stand-alone pseudo-class only */
	| solitary_pseudo_class {
		$$ = (struct selector_t*)
			malloc(sizeof(struct selector_t));
		$$->element_name = NULL;
		$$->id = NULL;
		$$->e_class = NULL;
		$$->pseudo_class = $1;
		$$->pseudo_element = 0;
		$$->next = NULL;
	} /* eg: :link */
	;
/*
** Parse a CSS buffer and return the resulting selector list.
**
** buffer / buf_len are handed unchanged to init_yylex().  The list is
** whatever yyparse() stores through the pointer it is given; if it
** stores nothing the result is NULL.  The status returned by yyparse()
** is not inspected.
*/
struct selector_list_t* css_parse(const char *buffer, int buf_len)
{
	struct selector_list_t *parsed_list = NULL;

	/* yydebug = 1; */
	init_yylex(buffer, buf_len);
	yyparse(&parsed_list);
	end_yylex();

	return parsed_list;
}
6 | 7 | Copying and distribution of this file, with or without modification, 8 | are permitted in any medium without royalty provided the copyright 9 | notice and this notice are preserved. This file is offered as-is, 10 | without warranty of any kind. 11 | 12 | Basic Installation 13 | ================== 14 | 15 | Briefly, the shell command `./configure && make && make install' 16 | should configure, build, and install this package. The following 17 | more-detailed instructions are generic; see the `README' file for 18 | instructions specific to this package. Some packages provide this 19 | `INSTALL' file but do not implement all of the features documented 20 | below. The lack of an optional feature in a given package is not 21 | necessarily a bug. More recommendations for GNU packages can be found 22 | in *note Makefile Conventions: (standards)Makefile Conventions. 23 | 24 | The `configure' shell script attempts to guess correct values for 25 | various system-dependent variables used during compilation. It uses 26 | those values to create a `Makefile' in each directory of the package. 27 | It may also create one or more `.h' files containing system-dependent 28 | definitions. Finally, it creates a shell script `config.status' that 29 | you can run in the future to recreate the current configuration, and a 30 | file `config.log' containing compiler output (useful mainly for 31 | debugging `configure'). 32 | 33 | It can also use an optional file (typically called `config.cache' 34 | and enabled with `--cache-file=config.cache' or simply `-C') that saves 35 | the results of its tests to speed up reconfiguring. Caching is 36 | disabled by default to prevent problems with accidental use of stale 37 | cache files. 
38 | 39 | If you need to do unusual things to compile the package, please try 40 | to figure out how `configure' could check whether to do them, and mail 41 | diffs or instructions to the address given in the `README' so they can 42 | be considered for the next release. If you are using the cache, and at 43 | some point `config.cache' contains results you don't want to keep, you 44 | may remove or edit it. 45 | 46 | The file `configure.ac' (or `configure.in') is used to create 47 | `configure' by a program called `autoconf'. You need `configure.ac' if 48 | you want to change it or regenerate `configure' using a newer version 49 | of `autoconf'. 50 | 51 | The simplest way to compile this package is: 52 | 53 | 1. `cd' to the directory containing the package's source code and type 54 | `./configure' to configure the package for your system. 55 | 56 | Running `configure' might take a while. While running, it prints 57 | some messages telling which features it is checking for. 58 | 59 | 2. Type `make' to compile the package. 60 | 61 | 3. Optionally, type `make check' to run any self-tests that come with 62 | the package, generally using the just-built uninstalled binaries. 63 | 64 | 4. Type `make install' to install the programs and any data files and 65 | documentation. When installing into a prefix owned by root, it is 66 | recommended that the package be configured and built as a regular 67 | user, and only the `make install' phase executed with root 68 | privileges. 69 | 70 | 5. Optionally, type `make installcheck' to repeat any self-tests, but 71 | this time using the binaries in their final installed location. 72 | This target does not install anything. Running this target as a 73 | regular user, particularly if the prior `make install' required 74 | root privileges, verifies that the installation completed 75 | correctly. 76 | 77 | 6. You can remove the program binaries and object files from the 78 | source code directory by typing `make clean'. 
To also remove the 79 | files that `configure' created (so you can compile the package for 80 | a different kind of computer), type `make distclean'. There is 81 | also a `make maintainer-clean' target, but that is intended mainly 82 | for the package's developers. If you use it, you may have to get 83 | all sorts of other programs in order to regenerate files that came 84 | with the distribution. 85 | 86 | 7. Often, you can also type `make uninstall' to remove the installed 87 | files again. In practice, not all packages have tested that 88 | uninstallation works correctly, even though it is required by the 89 | GNU Coding Standards. 90 | 91 | 8. Some packages, particularly those that use Automake, provide `make 92 | distcheck', which can be used by developers to test that all other 93 | targets like `make install' and `make uninstall' work correctly. 94 | This target is generally not run by end users. 95 | 96 | Compilers and Options 97 | ===================== 98 | 99 | Some systems require unusual options for compilation or linking that 100 | the `configure' script does not know about. Run `./configure --help' 101 | for details on some of the pertinent environment variables. 102 | 103 | You can give `configure' initial values for configuration parameters 104 | by setting variables in the command line or in the environment. Here 105 | is an example: 106 | 107 | ./configure CC=c99 CFLAGS=-g LIBS=-lposix 108 | 109 | *Note Defining Variables::, for more details. 110 | 111 | Compiling For Multiple Architectures 112 | ==================================== 113 | 114 | You can compile the package for more than one kind of computer at the 115 | same time, by placing the object files for each architecture in their 116 | own directory. To do this, you can use GNU `make'. `cd' to the 117 | directory where you want the object files and executables to go and run 118 | the `configure' script.
`configure' automatically checks for the 119 | source code in the directory that `configure' is in and in `..'. This 120 | is known as a "VPATH" build. 121 | 122 | With a non-GNU `make', it is safer to compile the package for one 123 | architecture at a time in the source code directory. After you have 124 | installed the package for one architecture, use `make distclean' before 125 | reconfiguring for another architecture. 126 | 127 | On MacOS X 10.5 and later systems, you can create libraries and 128 | executables that work on multiple system types--known as "fat" or 129 | "universal" binaries--by specifying multiple `-arch' options to the 130 | compiler but only a single `-arch' option to the preprocessor. Like 131 | this: 132 | 133 | ./configure CC="gcc -arch i386 -arch x86_64 -arch ppc -arch ppc64" \ 134 | CXX="g++ -arch i386 -arch x86_64 -arch ppc -arch ppc64" \ 135 | CPP="gcc -E" CXXCPP="g++ -E" 136 | 137 | This is not guaranteed to produce working output in all cases, you 138 | may have to build one architecture at a time and combine the results 139 | using the `lipo' tool if you have problems. 140 | 141 | Installation Names 142 | ================== 143 | 144 | By default, `make install' installs the package's commands under 145 | `/usr/local/bin', include files under `/usr/local/include', etc. You 146 | can specify an installation prefix other than `/usr/local' by giving 147 | `configure' the option `--prefix=PREFIX', where PREFIX must be an 148 | absolute file name. 149 | 150 | You can specify separate installation prefixes for 151 | architecture-specific files and architecture-independent files. If you 152 | pass the option `--exec-prefix=PREFIX' to `configure', the package uses 153 | PREFIX as the prefix for installing programs and libraries. 154 | Documentation and other data files still use the regular prefix. 
155 | 156 | In addition, if you use an unusual directory layout you can give 157 | options like `--bindir=DIR' to specify different values for particular 158 | kinds of files. Run `configure --help' for a list of the directories 159 | you can set and what kinds of files go in them. In general, the 160 | default for these options is expressed in terms of `${prefix}', so that 161 | specifying just `--prefix' will affect all of the other directory 162 | specifications that were not explicitly provided. 163 | 164 | The most portable way to affect installation locations is to pass the 165 | correct locations to `configure'; however, many packages provide one or 166 | both of the following shortcuts of passing variable assignments to the 167 | `make install' command line to change installation locations without 168 | having to reconfigure or recompile. 169 | 170 | The first method involves providing an override variable for each 171 | affected directory. For example, `make install 172 | prefix=/alternate/directory' will choose an alternate location for all 173 | directory configuration variables that were expressed in terms of 174 | `${prefix}'. Any directories that were specified during `configure', 175 | but not in terms of `${prefix}', must each be overridden at install 176 | time for the entire installation to be relocated. The approach of 177 | makefile variable overrides for each directory variable is required by 178 | the GNU Coding Standards, and ideally causes no recompilation. 179 | However, some platforms have known limitations with the semantics of 180 | shared libraries that end up requiring recompilation when using this 181 | method, particularly noticeable in packages that use GNU Libtool. 182 | 183 | The second method involves providing the `DESTDIR' variable. For 184 | example, `make install DESTDIR=/alternate/directory' will prepend 185 | `/alternate/directory' before all installation names. 
The approach of 186 | `DESTDIR' overrides is not required by the GNU Coding Standards, and 187 | does not work on platforms that have drive letters. On the other hand, 188 | it does better at avoiding recompilation issues, and works well even 189 | when some directory options were not specified in terms of `${prefix}' 190 | at `configure' time. 191 | 192 | Optional Features 193 | ================= 194 | 195 | If the package supports it, you can cause programs to be installed 196 | with an extra prefix or suffix on their names by giving `configure' the 197 | option `--program-prefix=PREFIX' or `--program-suffix=SUFFIX'. 198 | 199 | Some packages pay attention to `--enable-FEATURE' options to 200 | `configure', where FEATURE indicates an optional part of the package. 201 | They may also pay attention to `--with-PACKAGE' options, where PACKAGE 202 | is something like `gnu-as' or `x' (for the X Window System). The 203 | `README' should mention any `--enable-' and `--with-' options that the 204 | package recognizes. 205 | 206 | For packages that use the X Window System, `configure' can usually 207 | find the X include and library files automatically, but if it doesn't, 208 | you can use the `configure' options `--x-includes=DIR' and 209 | `--x-libraries=DIR' to specify their locations. 210 | 211 | Some packages offer the ability to configure how verbose the 212 | execution of `make' will be. For these packages, running `./configure 213 | --enable-silent-rules' sets the default to minimal output, which can be 214 | overridden with `make V=1'; while running `./configure 215 | --disable-silent-rules' sets the default to verbose, which can be 216 | overridden with `make V=0'. 217 | 218 | Particular systems 219 | ================== 220 | 221 | On HP-UX, the default C compiler is not ANSI C compatible. 
If GNU 222 | CC is not installed, it is recommended to use the following options in 223 | order to use an ANSI C compiler: 224 | 225 | ./configure CC="cc -Ae -D_XOPEN_SOURCE=500" 226 | 227 | and if that doesn't work, install pre-built binaries of GCC for HP-UX. 228 | 229 | HP-UX `make' updates targets which have the same time stamps as 230 | their prerequisites, which makes it generally unusable when shipped 231 | generated files such as `configure' are involved. Use GNU `make' 232 | instead. 233 | 234 | On OSF/1 a.k.a. Tru64, some versions of the default C compiler cannot 235 | parse its `<wchar.h>' header file. The option `-nodtk' can be used as 236 | a workaround. If GNU CC is not installed, it is therefore recommended 237 | to try 238 | 239 | ./configure CC="cc" 240 | 241 | and if that doesn't work, try 242 | 243 | ./configure CC="cc -nodtk" 244 | 245 | On Solaris, don't put `/usr/ucb' early in your `PATH'. This 246 | directory contains several dysfunctional programs; working variants of 247 | these programs are available in `/usr/bin'. So, if you need `/usr/ucb' 248 | in your `PATH', put it _after_ `/usr/bin'. 249 | 250 | On Haiku, software installed for all users goes in `/boot/common', 251 | not `/usr/local'. It is recommended to use the following options: 252 | 253 | ./configure --prefix=/boot/common 254 | 255 | Specifying the System Type 256 | ========================== 257 | 258 | There may be some features `configure' cannot figure out 259 | automatically, but needs to determine by the type of machine the package 260 | will run on. Usually, assuming the package is built to be run on the 261 | _same_ architectures, `configure' can figure that out, but if it prints 262 | a message saying it cannot guess the machine type, give it the 263 | `--build=TYPE' option.
TYPE can either be a short name for the system 264 | type, such as `sun4', or a canonical name which has the form: 265 | 266 | CPU-COMPANY-SYSTEM 267 | 268 | where SYSTEM can have one of these forms: 269 | 270 | OS 271 | KERNEL-OS 272 | 273 | See the file `config.sub' for the possible values of each field. If 274 | `config.sub' isn't included in this package, then this package doesn't 275 | need to know the machine type. 276 | 277 | If you are _building_ compiler tools for cross-compiling, you should 278 | use the option `--target=TYPE' to select the type of system they will 279 | produce code for. 280 | 281 | If you want to _use_ a cross compiler, that generates code for a 282 | platform different from the build platform, you should specify the 283 | "host" platform (i.e., that on which the generated programs will 284 | eventually be run) with `--host=TYPE'. 285 | 286 | Sharing Defaults 287 | ================ 288 | 289 | If you want to set default values for `configure' scripts to share, 290 | you can create a site shell script called `config.site' that gives 291 | default values for variables like `CC', `cache_file', and `prefix'. 292 | `configure' looks for `PREFIX/share/config.site' if it exists, then 293 | `PREFIX/etc/config.site' if it exists. Or, you can set the 294 | `CONFIG_SITE' environment variable to the location of the site script. 295 | A warning: not all `configure' scripts look for a site script. 296 | 297 | Defining Variables 298 | ================== 299 | 300 | Variables not defined in a site shell script can be set in the 301 | environment passed to `configure'. However, some packages may run 302 | configure again during the build, and the customized values of these 303 | variables may be lost. In order to avoid this problem, you should set 304 | them in the `configure' command line, using `VAR=value'. 
For example: 305 | 306 | ./configure CC=/usr/local2/bin/gcc 307 | 308 | causes the specified `gcc' to be used as the C compiler (unless it is 309 | overridden in the site shell script). 310 | 311 | Unfortunately, this technique does not work for `CONFIG_SHELL' due to 312 | an Autoconf limitation. Until the limitation is lifted, you can use 313 | this workaround: 314 | 315 | CONFIG_SHELL=/bin/bash ./configure CONFIG_SHELL=/bin/bash 316 | 317 | `configure' Invocation 318 | ====================== 319 | 320 | `configure' recognizes the following options to control how it 321 | operates. 322 | 323 | `--help' 324 | `-h' 325 | Print a summary of all of the options to `configure', and exit. 326 | 327 | `--help=short' 328 | `--help=recursive' 329 | Print a summary of the options unique to this package's 330 | `configure', and exit. The `short' variant lists options used 331 | only in the top level, while the `recursive' variant lists options 332 | also present in any nested packages. 333 | 334 | `--version' 335 | `-V' 336 | Print the version of Autoconf used to generate the `configure' 337 | script, and exit. 338 | 339 | `--cache-file=FILE' 340 | Enable the cache: use and save the results of the tests in FILE, 341 | traditionally `config.cache'. FILE defaults to `/dev/null' to 342 | disable caching. 343 | 344 | `--config-cache' 345 | `-C' 346 | Alias for `--cache-file=config.cache'. 347 | 348 | `--quiet' 349 | `--silent' 350 | `-q' 351 | Do not print messages saying which checks are being made. To 352 | suppress all normal output, redirect it to `/dev/null' (any error 353 | messages will still be shown). 354 | 355 | `--srcdir=DIR' 356 | Look for the package's source code in directory DIR. Usually 357 | `configure' can determine that directory automatically. 358 | 359 | `--prefix=DIR' 360 | Use DIR as the installation prefix. *note Installation Names:: 361 | for more details, including other options available for fine-tuning 362 | the installation locations. 
363 | 364 | `--no-create' 365 | `-n' 366 | Run the configure checks, but stop before creating any output 367 | files. 368 | 369 | `configure' also accepts some other, not widely useful, options. Run 370 | `configure --help' for more details. 371 | -------------------------------------------------------------------------------- /html/Uri.cc: -------------------------------------------------------------------------------- 1 | #include "Uri.h" 2 | 3 | #include "wincstring.h" 4 | #include 5 | #include 6 | #include 7 | #include "tld.h" 8 | 9 | //#define DEBUG 10 | #include "debug.h" 11 | 12 | using namespace std; 13 | using namespace htmlcxx; 14 | 15 | /** Structure to store various schemes and their default ports */ 16 | struct schemes_t { 17 | /** The name of the scheme */ 18 | const char *name; 19 | /** The default port for the scheme */ 20 | unsigned int default_port; 21 | }; 22 | 23 | /* Some WWW schemes and their default ports; this is basically /etc/services */ 24 | /* This will become global when the protocol abstraction comes */ 25 | /* As the schemes are searched by a linear search, */ 26 | /* they are sorted by their expected frequency */ 27 | static schemes_t schemes[] = 28 | { 29 | {"http", Uri::URI_HTTP_DEFAULT_PORT}, 30 | {"ftp", Uri::URI_FTP_DEFAULT_PORT}, 31 | {"https", Uri::URI_HTTPS_DEFAULT_PORT}, 32 | {"gopher", Uri::URI_GOPHER_DEFAULT_PORT}, 33 | {"ldap", Uri::URI_LDAP_DEFAULT_PORT}, 34 | {"nntp", Uri::URI_NNTP_DEFAULT_PORT}, 35 | {"snews", Uri::URI_SNEWS_DEFAULT_PORT}, 36 | {"imap", Uri::URI_IMAP_DEFAULT_PORT}, 37 | {"pop", Uri::URI_POP_DEFAULT_PORT}, 38 | {"sip", Uri::URI_SIP_DEFAULT_PORT}, 39 | {"rtsp", Uri::URI_RTSP_DEFAULT_PORT}, 40 | {"wais", Uri::URI_WAIS_DEFAULT_PORT}, 41 | {"z39.50r", Uri::URI_WAIS_DEFAULT_PORT}, 42 | {"z39.50s", Uri::URI_WAIS_DEFAULT_PORT}, 43 | {"prospero", Uri::URI_PROSPERO_DEFAULT_PORT}, 44 | {"nfs", Uri::URI_NFS_DEFAULT_PORT}, 45 | {"tip", Uri::URI_TIP_DEFAULT_PORT}, 46 | {"acap", Uri::URI_ACAP_DEFAULT_PORT}, 47 | 
{"telnet", Uri::URI_TELNET_DEFAULT_PORT}, 48 | {"ssh", Uri::URI_SSH_DEFAULT_PORT}, 49 | { NULL, 0xFFFF } /* unknown port */ 50 | }; 51 | 52 | static unsigned int port_of_Scheme(const char *scheme_str) 53 | { 54 | schemes_t *scheme; 55 | 56 | if (scheme_str) { 57 | for (scheme = schemes; scheme->name != NULL; ++scheme) { 58 | if (strcasecmp(scheme_str, scheme->name) == 0) { 59 | return scheme->default_port; 60 | } 61 | } 62 | } 63 | return 0; 64 | } 65 | 66 | /* We have a apr_table_t that we can index by character and it tells us if the 67 | * character is one of the interesting delimiters. Note that we even get 68 | * compares for NUL for free -- it's just another delimiter. 69 | */ 70 | 71 | #define T_COLON 0x01 /* ':' */ 72 | #define T_SLASH 0x02 /* '/' */ 73 | #define T_QUESTION 0x04 /* '?' */ 74 | #define T_HASH 0x08 /* '#' */ 75 | #define T_NUL 0x80 /* '\0' */ 76 | 77 | /* the uri_delims.h file is autogenerated by gen_uri_delims.c */ 78 | /* this file is automatically generated by gen_uri_delims, do not edit */ 79 | static const unsigned char uri_delims[256] = { 80 | T_NUL,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 81 | 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,T_HASH,0,0,0,0, 82 | 0,0,0,0,0,0,0,T_SLASH,0,0,0,0,0,0,0,0,0,0,T_COLON,0, 83 | 0,0,0,T_QUESTION,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 84 | 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 85 | 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 86 | 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 87 | 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 88 | 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 89 | 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 90 | 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 91 | 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 92 | 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 93 | }; 94 | 95 | /* it works like this: 96 | if (uri_delims[ch] & NOTEND_foobar) { 97 | then we're not at a delimiter for foobar 98 | } 99 | */ 100 | 101 | /* Note that we optimize the scheme scanning here, we cheat and let the 102 | * compiler know that it doesn't have to do the & masking. 
103 | */ 104 | #define NOTEND_SCHEME (0xff) 105 | #define NOTEND_HOSTINFO (T_SLASH | T_QUESTION | T_HASH | T_NUL) 106 | #define NOTEND_PATH (T_QUESTION | T_HASH | T_NUL) 107 | 108 | 109 | static size_t wwwPrefixOffset(const std::string& hostname); 110 | 111 | Uri::Uri() 112 | : mScheme(), mUser(), mPassword(), mHostname(), mPath(), mQuery(), mFragment(), mExistsQuery(false), mExistsFragment(false), mPort(0) 113 | {} 114 | 115 | Uri::Uri(const string &uri_str) 116 | : mScheme(), mUser(), mPassword(), mHostname(), mPath(), mQuery(), mFragment(), mExistsQuery(false), mExistsFragment(false), mPort(0) 117 | { 118 | init(uri_str); 119 | } 120 | 121 | void Uri::init(const string &uri_str) 122 | { 123 | DEBUGP("Parsing uri %s\n", uri_str.c_str()); 124 | 125 | if(uri_str.empty()) return; 126 | const char *uri = uri_str.c_str(); 127 | const char *s; 128 | const char *s1; 129 | const char *hostinfo; 130 | char *endstr; 131 | 132 | /* We assume the processor has a branch predictor like most -- 133 | * it assumes forward branches are untaken and backwards are taken. That's 134 | * the reason for the gotos. -djg 135 | */ 136 | if (uri[0] == '/') { 137 | deal_with_path: 138 | DEBUGP("Dealing with path\n"); 139 | /* we expect uri to point to first character of path ... 
remember 140 | * that the path could be empty -- http://foobar?query for example 141 | */ 142 | s = uri; 143 | while ((uri_delims[*(unsigned char *)s] & NOTEND_PATH) == 0) { 144 | ++s; 145 | } 146 | if (s != uri) { 147 | mPath.assign(uri, s - uri); 148 | DEBUGP("Path is %s\n", mPath.c_str()); 149 | } 150 | if (*s == 0) { 151 | return; 152 | } 153 | if (*s == '?') { 154 | ++s; 155 | s1 = strchr(s, '#'); 156 | if (s1) { 157 | mFragment.assign(s1 + 1); 158 | mExistsFragment = true; 159 | DEBUGP("Fragment is %s\n", mFragment.c_str()); 160 | mQuery.assign(s, s1 - s); 161 | mExistsQuery = true; 162 | DEBUGP("Query is %s\n", mQuery.c_str()); 163 | } 164 | else { 165 | mQuery.assign(s); 166 | mExistsQuery = true; 167 | DEBUGP("Query is %s\n", mQuery.c_str()); 168 | } 169 | return; 170 | } 171 | /* otherwise it's a fragment */ 172 | mFragment.assign(s + 1); 173 | mExistsFragment = true; 174 | DEBUGP("Fragment is %s\n", mFragment.c_str()); 175 | return; 176 | } 177 | 178 | DEBUGP("Dealing with scheme\n"); 179 | /* find the scheme: */ 180 | if (!isalpha(*uri)) goto deal_with_path; 181 | s = uri; 182 | while ((uri_delims[*(unsigned char *)s] & NOTEND_SCHEME) == 0) { 183 | ++s; 184 | } 185 | /* scheme must be non-empty and followed by :// */ 186 | if (s == uri || s[0] != ':' || s[1] != '/' || s[2] != '/') { 187 | goto deal_with_path; /* backwards predicted taken! */ 188 | } 189 | 190 | mScheme.assign(uri, s - uri); 191 | DEBUGP("Scheme is %s\n", mScheme.c_str()); 192 | s += 3; 193 | 194 | DEBUGP("Finding hostinfo\n"); 195 | hostinfo = s; 196 | DEBUGP("Hostinfo is %s\n", hostinfo); 197 | while ((uri_delims[*(unsigned char *)s] & NOTEND_HOSTINFO) == 0) { 198 | ++s; 199 | } 200 | uri = s; /* whatever follows hostinfo is start of uri */ 201 | // mHostinfo.assign(hostinfo, uri - hostinfo); 202 | 203 | /* If there's a username:password@host:port, the @ we want is the last @... 204 | * too bad there's no memrchr()... 
For the C purists, note that hostinfo 205 | * is definately not the first character of the original uri so therefore 206 | * &hostinfo[-1] < &hostinfo[0] ... and this loop is valid C. 207 | */ 208 | do { 209 | --s; 210 | } while (s >= hostinfo && *s != '@'); 211 | if (s < hostinfo) { 212 | /* again we want the common case to be fall through */ 213 | deal_with_host: 214 | DEBUGP("Dealing with host\n"); 215 | /* We expect hostinfo to point to the first character of 216 | * the hostname. If there's a port it is the first colon. 217 | */ 218 | s = (char *)memchr(hostinfo, ':', uri - hostinfo); 219 | if (s == NULL) { 220 | /* we expect the common case to have no port */ 221 | mHostname.assign(hostinfo, uri - hostinfo); 222 | DEBUGP("Hostname is %s\n", mHostname.c_str()); 223 | goto deal_with_path; 224 | } 225 | mHostname.assign(hostinfo, s - hostinfo); 226 | DEBUGP("Hostname is %s\n", mHostname.c_str()); 227 | ++s; 228 | if (uri != s) { 229 | mPortStr.assign(s, uri - s); 230 | mPort = strtol(mPortStr.c_str(), &endstr, 10); 231 | if (*endstr == '\0') { 232 | goto deal_with_path; 233 | } 234 | /* Invalid characters after ':' found */ 235 | DEBUGP("Throwing invalid url exception\n"); 236 | throw Exception("Invalid character after ':'"); 237 | } 238 | this->mPort = port_of_Scheme(mScheme.c_str()); 239 | goto deal_with_path; 240 | } 241 | 242 | /* first colon delimits username:password */ 243 | s1 = (char *)memchr(hostinfo, ':', s - hostinfo); 244 | if (s1) { 245 | mUser.assign(hostinfo, s1 - hostinfo); 246 | ++s1; 247 | mPassword.assign(s1, s - s1); 248 | } 249 | else { 250 | mUser.assign(hostinfo, s - hostinfo); 251 | } 252 | hostinfo = s + 1; 253 | goto deal_with_host; 254 | } 255 | 256 | Uri::~Uri() { 257 | } 258 | 259 | string Uri::scheme() const { return mScheme; } 260 | 261 | void Uri::scheme(string scheme) { 262 | mScheme = scheme; 263 | } 264 | string Uri::user() const { return mUser; } 265 | void Uri::user(string user) { 266 | mUser = user; 267 | } 268 | string 
Uri::password() const { return mPassword; } 269 | void Uri::password(string password) { 270 | mPassword = password; 271 | } 272 | string Uri::hostname() const { return mHostname; } 273 | void Uri::hostname(string hostname) { 274 | mHostname = hostname; 275 | } 276 | string Uri::path() const { return mPath; } 277 | void Uri::path(string path) { 278 | mPath = path; 279 | } 280 | bool Uri::existsFragment() const { return mExistsFragment; } 281 | void Uri::existsFragment(bool existsFragment) { 282 | mExistsFragment = existsFragment; 283 | } 284 | bool Uri::existsQuery() const { return mExistsQuery; } 285 | void Uri::existsQuery(bool existsQuery) { 286 | mExistsQuery = existsQuery; 287 | } 288 | string Uri::query() const { return mQuery; } 289 | void Uri::query(string query) { 290 | mQuery = query; 291 | } 292 | string Uri::fragment() const { return mFragment; } 293 | void Uri::fragment(string fragment) { 294 | mFragment = fragment; 295 | } 296 | unsigned int Uri::port() const { return mPort; } 297 | void Uri::port(unsigned int port) { mPort = port; } 298 | 299 | static const char *default_filenames[] = { "index", "default", NULL }; 300 | static const char *default_extensions[] = { ".html", ".htm", ".php", ".shtml", ".asp", ".cgi", NULL }; 301 | 302 | static unsigned short default_port_for_scheme(const char *scheme_str) 303 | { 304 | schemes_t *scheme; 305 | 306 | if (scheme_str == NULL) 307 | return 0; 308 | 309 | for (scheme = schemes; scheme->name != NULL; ++scheme) 310 | if (strcasecmp(scheme_str, scheme->name) == 0) 311 | return scheme->default_port; 312 | 313 | return 0; 314 | } 315 | 316 | Uri Uri::absolute(const Uri &base) const 317 | { 318 | if (mScheme.empty()) 319 | { 320 | Uri root(base); 321 | 322 | if (root.mPath.empty()) root.mPath = "/"; 323 | 324 | if (mPath.empty()) 325 | { 326 | if (mExistsQuery) 327 | { 328 | root.mQuery = mQuery; 329 | root.mExistsQuery = mExistsQuery; 330 | root.mFragment = mFragment; 331 | root.mExistsFragment = mExistsFragment; 
332 | } 333 | else if (mExistsFragment) 334 | { 335 | root.mFragment = mFragment; 336 | root.mExistsFragment = mExistsFragment; 337 | } 338 | } 339 | else if (mPath[0] == '/') 340 | { 341 | root.mPath = mPath; 342 | root.mQuery = mQuery; 343 | root.mExistsQuery = mExistsQuery; 344 | root.mFragment = mFragment; 345 | root.mExistsFragment = mExistsFragment; 346 | } 347 | else 348 | { 349 | string path(root.mPath); 350 | string::size_type find; 351 | find = path.rfind("/"); 352 | if (find != string::npos) path.erase(find+1); 353 | path += mPath; 354 | root.mPath = path; 355 | root.mQuery = mQuery; 356 | root.mExistsQuery = mExistsQuery; 357 | root.mFragment = mFragment; 358 | root.mExistsFragment = mExistsFragment; 359 | } 360 | 361 | return root; 362 | } 363 | 364 | if (mPath.empty()) 365 | { 366 | Uri root(*this); 367 | root.mPath = "/"; 368 | 369 | return root; 370 | } 371 | 372 | return *this; 373 | } 374 | 375 | string Uri::unparse(int flags ) const 376 | { 377 | string ret; 378 | ret.reserve(mScheme.length() + mUser.length() + mPassword.length() + mHostname.length() + mPath.length() + mQuery.length() + mFragment.length() + mPortStr.length()); 379 | 380 | DEBUGP("Unparsing scheme\n"); 381 | if(!(Uri::REMOVE_SCHEME & flags)) { 382 | if(!mScheme.empty()) { 383 | ret += mScheme; 384 | ret += "://"; 385 | } 386 | } 387 | DEBUGP("Unparsing hostname\n"); 388 | if(!mHostname.empty()) { 389 | size_t offset = 0; 390 | if(flags & Uri::REMOVE_WWW_PREFIX && mHostname.length() > 3) { 391 | offset = wwwPrefixOffset(mHostname); 392 | } 393 | ret += (mHostname.c_str() + offset); 394 | } 395 | DEBUGP("Unparsing port\n"); 396 | if (!mPortStr.empty() && !(!mScheme.empty() && mPort == default_port_for_scheme(mScheme.c_str()))) 397 | { 398 | ret += ':'; 399 | ret += mPortStr; 400 | } 401 | DEBUGP("Unparsing path\n"); 402 | if(!mPath.empty()) 403 | { 404 | char *buf = new char[mPath.length() + 1]; 405 | memcpy(buf, mPath.c_str(), mPath.length() + 1); 406 | if(flags & 
Uri::REMOVE_DEFAULT_FILENAMES) { 407 | const char **ptr = default_extensions; 408 | char *end = buf + mPath.length(); 409 | size_t offset = 0; 410 | while(*ptr != NULL) { 411 | size_t len = strlen(*ptr); 412 | if((strcmp(end - len, *ptr)) == 0) { 413 | offset = len; 414 | break; 415 | } 416 | ++ptr; 417 | } 418 | if(offset == 0) goto remove_bar; 419 | ptr = default_filenames; 420 | bool found = false; 421 | while(*ptr != NULL) { 422 | size_t len = strlen(*ptr); 423 | if(strncmp(end - offset - len, *ptr, len) == 0) { 424 | offset += len; 425 | found = true; 426 | break; 427 | } 428 | ++ptr; 429 | } 430 | if(found) { 431 | *(end - offset) = 0; //cut filename 432 | } 433 | 434 | } 435 | remove_bar: 436 | if(flags & Uri::REMOVE_TRAILING_BAR) { 437 | if(strlen(buf) > 1 && buf[strlen(buf) - 1] == '/') { //do not remove if path is only the bar 438 | buf[strlen(buf) - 1] = 0; 439 | } 440 | } 441 | ret += buf; 442 | delete [] buf; 443 | } 444 | DEBUGP("Unparsing query\n"); 445 | if(!(flags & Uri::REMOVE_QUERY) && mExistsQuery) { 446 | ret += '?'; 447 | if(flags & Uri::REMOVE_QUERY_VALUES) { 448 | const char *ptr = mQuery.c_str(); 449 | bool inside = false; 450 | while(*ptr) { 451 | if(*ptr == '=') { 452 | inside = true; 453 | } 454 | if(*ptr == '&') { 455 | inside = false; 456 | } 457 | if(inside) { 458 | ++ptr; 459 | } else { 460 | ret += *ptr; 461 | ++ptr; 462 | } 463 | } 464 | } else { 465 | ret += mQuery; 466 | } 467 | } 468 | DEBUGP("Unparsing fragment\n"); 469 | if(!(flags & Uri::REMOVE_FRAGMENT) && mExistsFragment) 470 | { 471 | ret += '#'; 472 | ret += mFragment; 473 | } 474 | 475 | return ret; 476 | } 477 | 478 | static size_t wwwPrefixOffset(const std::string& hostname) 479 | { 480 | string::size_type len = hostname.length(); 481 | if(strncasecmp("www", hostname.c_str(), 3) == 0) 482 | { 483 | if(len > 3 && hostname[3] == '.') 484 | { 485 | return 4; 486 | } 487 | if(len > 4 && isdigit(hostname[3]) && hostname[4] == '.') 488 | { 489 | return 5; 490 | } 491 | } 492 
| return 0; 493 | } 494 | 495 | 496 | 497 | 498 | std::string Uri::canonicalHostname(unsigned int maxDepth) const 499 | { 500 | 501 | size_t prefixOffset = wwwPrefixOffset(mHostname); 502 | size_t suffixOffset = tldOffset(mHostname.c_str()); 503 | unsigned int depth = 0; 504 | string::const_iterator canonicalStart = mHostname.begin() + prefixOffset; 505 | string::const_iterator ptr = mHostname.begin(); 506 | ptr += mHostname.length() - suffixOffset; 507 | while (depth < maxDepth && ptr > canonicalStart) 508 | { 509 | --ptr; 510 | if (*ptr == '.') ++depth; 511 | } 512 | if (*ptr == '.') ++ptr; 513 | return string(ptr, mHostname.end()); 514 | } 515 | 516 | std::string Uri::decode(const std::string &uri) 517 | { 518 | //Note from RFC1630: "Sequences which start with a percent sign 519 | //but are not followed by two hexadecimal characters (0-9,A-F) are reserved 520 | //for future extension" 521 | const unsigned char *ptr = (const unsigned char *)uri.c_str(); 522 | string ret; 523 | ret.reserve(uri.length()); 524 | for (; *ptr; ++ptr) 525 | { 526 | if (*ptr == '%') 527 | { 528 | if (*(ptr + 1)) 529 | { 530 | char a = *(ptr + 1); 531 | char b = *(ptr + 2); 532 | if (!((a >= 0x30 && a < 0x40) || (a >= 0x41 && a < 0x47))) continue; 533 | if (!((b >= 0x30 && b < 0x40) || (b >= 0x41 && b < 0x47))) continue; 534 | char buf[3]; 535 | buf[0] = a; 536 | buf[1] = b; 537 | buf[2] = 0; 538 | ret += (char)strtoul(buf, NULL, 16); 539 | ptr += 2; 540 | continue; 541 | } 542 | } 543 | ret += *ptr; 544 | } 545 | return ret; 546 | } 547 | 548 | //This vector is generated by safechars.py. Please do not edit by hand. 
549 | static const char safe[] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; 550 | 551 | 552 | std::string Uri::encode(const std::string &uri) 553 | { 554 | string ret; 555 | const unsigned char *ptr = (const unsigned char *)uri.c_str(); 556 | ret.reserve(uri.length()); 557 | 558 | for (; *ptr ; ++ptr) 559 | { 560 | if (!safe[*ptr]) 561 | { 562 | char buf[5]; 563 | memset(buf, 0, 5); 564 | snprintf(buf, 5, "%%%X", (*ptr)); 565 | ret.append(buf); 566 | } 567 | else 568 | { 569 | ret += *ptr; 570 | } 571 | } 572 | return ret; 573 | } 574 | 575 | -------------------------------------------------------------------------------- /LGPL_V2: -------------------------------------------------------------------------------- 1 | GNU LIBRARY GENERAL PUBLIC LICENSE 2 | Version 2, June 1991 3 | 4 | Copyright (C) 1991 Free Software Foundation, Inc. 5 | 51 Franklin Street, Fifth Floor 6 | Boston, MA 02110-1301, USA. 7 | Everyone is permitted to copy and distribute verbatim copies 8 | of this license document, but changing it is not allowed. 9 | 10 | [This is the first released version of the library GPL. It is 11 | numbered 2 because it goes with version 2 of the ordinary GPL.] 
12 | 13 | Preamble 14 | 15 | The licenses for most software are designed to take away your 16 | freedom to share and change it. By contrast, the GNU General Public 17 | Licenses are intended to guarantee your freedom to share and change 18 | free software--to make sure the software is free for all its users. 19 | 20 | This license, the Library General Public License, applies to some 21 | specially designated Free Software Foundation software, and to any 22 | other libraries whose authors decide to use it. You can use it for 23 | your libraries, too. 24 | 25 | When we speak of free software, we are referring to freedom, not 26 | price. Our General Public Licenses are designed to make sure that you 27 | have the freedom to distribute copies of free software (and charge for 28 | this service if you wish), that you receive source code or can get it 29 | if you want it, that you can change the software or use pieces of it 30 | in new free programs; and that you know you can do these things. 31 | 32 | To protect your rights, we need to make restrictions that forbid 33 | anyone to deny you these rights or to ask you to surrender the rights. 34 | These restrictions translate to certain responsibilities for you if 35 | you distribute copies of the library, or if you modify it. 36 | 37 | For example, if you distribute copies of the library, whether gratis 38 | or for a fee, you must give the recipients all the rights that we gave 39 | you. You must make sure that they, too, receive or can get the source 40 | code. If you link a program with the library, you must provide 41 | complete object files to the recipients so that they can relink them 42 | with the library, after making changes to the library and recompiling 43 | it. And you must show them these terms so they know their rights. 
44 | 45 | Our method of protecting your rights has two steps: (1) copyright 46 | the library, and (2) offer you this license which gives you legal 47 | permission to copy, distribute and/or modify the library. 48 | 49 | Also, for each distributor's protection, we want to make certain 50 | that everyone understands that there is no warranty for this free 51 | library. If the library is modified by someone else and passed on, we 52 | want its recipients to know that what they have is not the original 53 | version, so that any problems introduced by others will not reflect on 54 | the original authors' reputations. 55 | 56 | Finally, any free program is threatened constantly by software 57 | patents. We wish to avoid the danger that companies distributing free 58 | software will individually obtain patent licenses, thus in effect 59 | transforming the program into proprietary software. To prevent this, 60 | we have made it clear that any patent must be licensed for everyone's 61 | free use or not licensed at all. 62 | 63 | Most GNU software, including some libraries, is covered by the ordinary 64 | GNU General Public License, which was designed for utility programs. This 65 | license, the GNU Library General Public License, applies to certain 66 | designated libraries. This license is quite different from the ordinary 67 | one; be sure to read it in full, and don't assume that anything in it is 68 | the same as in the ordinary license. 69 | 70 | The reason we have a separate public license for some libraries is that 71 | they blur the distinction we usually make between modifying or adding to a 72 | program and simply using it. Linking a program with a library, without 73 | changing the library, is in some sense simply using the library, and is 74 | analogous to running a utility program or application program. 
However, in 75 | a textual and legal sense, the linked executable is a combined work, a 76 | derivative of the original library, and the ordinary General Public License 77 | treats it as such. 78 | 79 | Because of this blurred distinction, using the ordinary General 80 | Public License for libraries did not effectively promote software 81 | sharing, because most developers did not use the libraries. We 82 | concluded that weaker conditions might promote sharing better. 83 | 84 | However, unrestricted linking of non-free programs would deprive the 85 | users of those programs of all benefit from the free status of the 86 | libraries themselves. This Library General Public License is intended to 87 | permit developers of non-free programs to use free libraries, while 88 | preserving your freedom as a user of such programs to change the free 89 | libraries that are incorporated in them. (We have not seen how to achieve 90 | this as regards changes in header files, but we have achieved it as regards 91 | changes in the actual functions of the Library.) The hope is that this 92 | will lead to faster development of free libraries. 93 | 94 | The precise terms and conditions for copying, distribution and 95 | modification follow. Pay close attention to the difference between a 96 | "work based on the library" and a "work that uses the library". The 97 | former contains code derived from the library, while the latter only 98 | works together with the library. 99 | 100 | Note that it is possible for a library to be covered by the ordinary 101 | General Public License rather than by this special one. 102 | 103 | GNU LIBRARY GENERAL PUBLIC LICENSE 104 | TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION 105 | 106 | 0. 
This License Agreement applies to any software library which 107 | contains a notice placed by the copyright holder or other authorized 108 | party saying it may be distributed under the terms of this Library 109 | General Public License (also called "this License"). Each licensee is 110 | addressed as "you". 111 | 112 | A "library" means a collection of software functions and/or data 113 | prepared so as to be conveniently linked with application programs 114 | (which use some of those functions and data) to form executables. 115 | 116 | The "Library", below, refers to any such software library or work 117 | which has been distributed under these terms. A "work based on the 118 | Library" means either the Library or any derivative work under 119 | copyright law: that is to say, a work containing the Library or a 120 | portion of it, either verbatim or with modifications and/or translated 121 | straightforwardly into another language. (Hereinafter, translation is 122 | included without limitation in the term "modification".) 123 | 124 | "Source code" for a work means the preferred form of the work for 125 | making modifications to it. For a library, complete source code means 126 | all the source code for all modules it contains, plus any associated 127 | interface definition files, plus the scripts used to control compilation 128 | and installation of the library. 129 | 130 | Activities other than copying, distribution and modification are not 131 | covered by this License; they are outside its scope. The act of 132 | running a program using the Library is not restricted, and output from 133 | such a program is covered only if its contents constitute a work based 134 | on the Library (independent of the use of the Library in a tool for 135 | writing it). Whether that is true depends on what the Library does 136 | and what the program that uses the Library does. 137 | 138 | 1. 
You may copy and distribute verbatim copies of the Library's 139 | complete source code as you receive it, in any medium, provided that 140 | you conspicuously and appropriately publish on each copy an 141 | appropriate copyright notice and disclaimer of warranty; keep intact 142 | all the notices that refer to this License and to the absence of any 143 | warranty; and distribute a copy of this License along with the 144 | Library. 145 | 146 | You may charge a fee for the physical act of transferring a copy, 147 | and you may at your option offer warranty protection in exchange for a 148 | fee. 149 | 150 | 2. You may modify your copy or copies of the Library or any portion 151 | of it, thus forming a work based on the Library, and copy and 152 | distribute such modifications or work under the terms of Section 1 153 | above, provided that you also meet all of these conditions: 154 | 155 | a) The modified work must itself be a software library. 156 | 157 | b) You must cause the files modified to carry prominent notices 158 | stating that you changed the files and the date of any change. 159 | 160 | c) You must cause the whole of the work to be licensed at no 161 | charge to all third parties under the terms of this License. 162 | 163 | d) If a facility in the modified Library refers to a function or a 164 | table of data to be supplied by an application program that uses 165 | the facility, other than as an argument passed when the facility 166 | is invoked, then you must make a good faith effort to ensure that, 167 | in the event an application does not supply such function or 168 | table, the facility still operates, and performs whatever part of 169 | its purpose remains meaningful. 170 | 171 | (For example, a function in a library to compute square roots has 172 | a purpose that is entirely well-defined independent of the 173 | application. 
Therefore, Subsection 2d requires that any 174 | application-supplied function or table used by this function must 175 | be optional: if the application does not supply it, the square 176 | root function must still compute square roots.) 177 | 178 | These requirements apply to the modified work as a whole. If 179 | identifiable sections of that work are not derived from the Library, 180 | and can be reasonably considered independent and separate works in 181 | themselves, then this License, and its terms, do not apply to those 182 | sections when you distribute them as separate works. But when you 183 | distribute the same sections as part of a whole which is a work based 184 | on the Library, the distribution of the whole must be on the terms of 185 | this License, whose permissions for other licensees extend to the 186 | entire whole, and thus to each and every part regardless of who wrote 187 | it. 188 | 189 | Thus, it is not the intent of this section to claim rights or contest 190 | your rights to work written entirely by you; rather, the intent is to 191 | exercise the right to control the distribution of derivative or 192 | collective works based on the Library. 193 | 194 | In addition, mere aggregation of another work not based on the Library 195 | with the Library (or with a work based on the Library) on a volume of 196 | a storage or distribution medium does not bring the other work under 197 | the scope of this License. 198 | 199 | 3. You may opt to apply the terms of the ordinary GNU General Public 200 | License instead of this License to a given copy of the Library. To do 201 | this, you must alter all the notices that refer to this License, so 202 | that they refer to the ordinary GNU General Public License, version 2, 203 | instead of to this License. (If a newer version than version 2 of the 204 | ordinary GNU General Public License has appeared, then you can specify 205 | that version instead if you wish.) 
Do not make any other change in 206 | these notices. 207 | 208 | Once this change is made in a given copy, it is irreversible for 209 | that copy, so the ordinary GNU General Public License applies to all 210 | subsequent copies and derivative works made from that copy. 211 | 212 | This option is useful when you wish to copy part of the code of 213 | the Library into a program that is not a library. 214 | 215 | 4. You may copy and distribute the Library (or a portion or 216 | derivative of it, under Section 2) in object code or executable form 217 | under the terms of Sections 1 and 2 above provided that you accompany 218 | it with the complete corresponding machine-readable source code, which 219 | must be distributed under the terms of Sections 1 and 2 above on a 220 | medium customarily used for software interchange. 221 | 222 | If distribution of object code is made by offering access to copy 223 | from a designated place, then offering equivalent access to copy the 224 | source code from the same place satisfies the requirement to 225 | distribute the source code, even though third parties are not 226 | compelled to copy the source along with the object code. 227 | 228 | 5. A program that contains no derivative of any portion of the 229 | Library, but is designed to work with the Library by being compiled or 230 | linked with it, is called a "work that uses the Library". Such a 231 | work, in isolation, is not a derivative work of the Library, and 232 | therefore falls outside the scope of this License. 233 | 234 | However, linking a "work that uses the Library" with the Library 235 | creates an executable that is a derivative of the Library (because it 236 | contains portions of the Library), rather than a "work that uses the 237 | library". The executable is therefore covered by this License. 238 | Section 6 states terms for distribution of such executables. 
239 | 240 | When a "work that uses the Library" uses material from a header file 241 | that is part of the Library, the object code for the work may be a 242 | derivative work of the Library even though the source code is not. 243 | Whether this is true is especially significant if the work can be 244 | linked without the Library, or if the work is itself a library. The 245 | threshold for this to be true is not precisely defined by law. 246 | 247 | If such an object file uses only numerical parameters, data 248 | structure layouts and accessors, and small macros and small inline 249 | functions (ten lines or less in length), then the use of the object 250 | file is unrestricted, regardless of whether it is legally a derivative 251 | work. (Executables containing this object code plus portions of the 252 | Library will still fall under Section 6.) 253 | 254 | Otherwise, if the work is a derivative of the Library, you may 255 | distribute the object code for the work under the terms of Section 6. 256 | Any executables containing that work also fall under Section 6, 257 | whether or not they are linked directly with the Library itself. 258 | 259 | 6. As an exception to the Sections above, you may also compile or 260 | link a "work that uses the Library" with the Library to produce a 261 | work containing portions of the Library, and distribute that work 262 | under terms of your choice, provided that the terms permit 263 | modification of the work for the customer's own use and reverse 264 | engineering for debugging such modifications. 265 | 266 | You must give prominent notice with each copy of the work that the 267 | Library is used in it and that the Library and its use are covered by 268 | this License. You must supply a copy of this License. If the work 269 | during execution displays copyright notices, you must include the 270 | copyright notice for the Library among them, as well as a reference 271 | directing the user to the copy of this License. 
Also, you must do one 272 | of these things: 273 | 274 | a) Accompany the work with the complete corresponding 275 | machine-readable source code for the Library including whatever 276 | changes were used in the work (which must be distributed under 277 | Sections 1 and 2 above); and, if the work is an executable linked 278 | with the Library, with the complete machine-readable "work that 279 | uses the Library", as object code and/or source code, so that the 280 | user can modify the Library and then relink to produce a modified 281 | executable containing the modified Library. (It is understood 282 | that the user who changes the contents of definitions files in the 283 | Library will not necessarily be able to recompile the application 284 | to use the modified definitions.) 285 | 286 | b) Accompany the work with a written offer, valid for at 287 | least three years, to give the same user the materials 288 | specified in Subsection 6a, above, for a charge no more 289 | than the cost of performing this distribution. 290 | 291 | c) If distribution of the work is made by offering access to copy 292 | from a designated place, offer equivalent access to copy the above 293 | specified materials from the same place. 294 | 295 | d) Verify that the user has already received a copy of these 296 | materials or that you have already sent this user a copy. 297 | 298 | For an executable, the required form of the "work that uses the 299 | Library" must include any data and utility programs needed for 300 | reproducing the executable from it. However, as a special exception, 301 | the source code distributed need not include anything that is normally 302 | distributed (in either source or binary form) with the major 303 | components (compiler, kernel, and so on) of the operating system on 304 | which the executable runs, unless that component itself accompanies 305 | the executable. 
306 | 307 | It may happen that this requirement contradicts the license 308 | restrictions of other proprietary libraries that do not normally 309 | accompany the operating system. Such a contradiction means you cannot 310 | use both them and the Library together in an executable that you 311 | distribute. 312 | 313 | 7. You may place library facilities that are a work based on the 314 | Library side-by-side in a single library together with other library 315 | facilities not covered by this License, and distribute such a combined 316 | library, provided that the separate distribution of the work based on 317 | the Library and of the other library facilities is otherwise 318 | permitted, and provided that you do these two things: 319 | 320 | a) Accompany the combined library with a copy of the same work 321 | based on the Library, uncombined with any other library 322 | facilities. This must be distributed under the terms of the 323 | Sections above. 324 | 325 | b) Give prominent notice with the combined library of the fact 326 | that part of it is a work based on the Library, and explaining 327 | where to find the accompanying uncombined form of the same work. 328 | 329 | 8. You may not copy, modify, sublicense, link with, or distribute 330 | the Library except as expressly provided under this License. Any 331 | attempt otherwise to copy, modify, sublicense, link with, or 332 | distribute the Library is void, and will automatically terminate your 333 | rights under this License. However, parties who have received copies, 334 | or rights, from you under this License will not have their licenses 335 | terminated so long as such parties remain in full compliance. 336 | 337 | 9. You are not required to accept this License, since you have not 338 | signed it. However, nothing else grants you permission to modify or 339 | distribute the Library or its derivative works. These actions are 340 | prohibited by law if you do not accept this License. 
Therefore, by 341 | modifying or distributing the Library (or any work based on the 342 | Library), you indicate your acceptance of this License to do so, and 343 | all its terms and conditions for copying, distributing or modifying 344 | the Library or works based on it. 345 | 346 | 10. Each time you redistribute the Library (or any work based on the 347 | Library), the recipient automatically receives a license from the 348 | original licensor to copy, distribute, link with or modify the Library 349 | subject to these terms and conditions. You may not impose any further 350 | restrictions on the recipients' exercise of the rights granted herein. 351 | You are not responsible for enforcing compliance by third parties to 352 | this License. 353 | 354 | 11. If, as a consequence of a court judgment or allegation of patent 355 | infringement or for any other reason (not limited to patent issues), 356 | conditions are imposed on you (whether by court order, agreement or 357 | otherwise) that contradict the conditions of this License, they do not 358 | excuse you from the conditions of this License. If you cannot 359 | distribute so as to satisfy simultaneously your obligations under this 360 | License and any other pertinent obligations, then as a consequence you 361 | may not distribute the Library at all. For example, if a patent 362 | license would not permit royalty-free redistribution of the Library by 363 | all those who receive copies directly or indirectly through you, then 364 | the only way you could satisfy both it and this License would be to 365 | refrain entirely from distribution of the Library. 366 | 367 | If any portion of this section is held invalid or unenforceable under any 368 | particular circumstance, the balance of the section is intended to apply, 369 | and the section as a whole is intended to apply in other circumstances. 
370 | 371 | It is not the purpose of this section to induce you to infringe any 372 | patents or other property right claims or to contest validity of any 373 | such claims; this section has the sole purpose of protecting the 374 | integrity of the free software distribution system which is 375 | implemented by public license practices. Many people have made 376 | generous contributions to the wide range of software distributed 377 | through that system in reliance on consistent application of that 378 | system; it is up to the author/donor to decide if he or she is willing 379 | to distribute software through any other system and a licensee cannot 380 | impose that choice. 381 | 382 | This section is intended to make thoroughly clear what is believed to 383 | be a consequence of the rest of this License. 384 | 385 | 12. If the distribution and/or use of the Library is restricted in 386 | certain countries either by patents or by copyrighted interfaces, the 387 | original copyright holder who places the Library under this License may add 388 | an explicit geographical distribution limitation excluding those countries, 389 | so that distribution is permitted only in or among countries not thus 390 | excluded. In such case, this License incorporates the limitation as if 391 | written in the body of this License. 392 | 393 | 13. The Free Software Foundation may publish revised and/or new 394 | versions of the Library General Public License from time to time. 395 | Such new versions will be similar in spirit to the present version, 396 | but may differ in detail to address new problems or concerns. 397 | 398 | Each version is given a distinguishing version number. If the Library 399 | specifies a version number of this License which applies to it and 400 | "any later version", you have the option of following the terms and 401 | conditions either of that version or of any later version published by 402 | the Free Software Foundation. 
If the Library does not specify a 403 | license version number, you may choose any version ever published by 404 | the Free Software Foundation. 405 | 406 | 14. If you wish to incorporate parts of the Library into other free 407 | programs whose distribution conditions are incompatible with these, 408 | write to the author to ask for permission. For software which is 409 | copyrighted by the Free Software Foundation, write to the Free 410 | Software Foundation; we sometimes make exceptions for this. Our 411 | decision will be guided by the two goals of preserving the free status 412 | of all derivatives of our free software and of promoting the sharing 413 | and reuse of software generally. 414 | 415 | NO WARRANTY 416 | 417 | 15. BECAUSE THE LIBRARY IS LICENSED FREE OF CHARGE, THERE IS NO 418 | WARRANTY FOR THE LIBRARY, TO THE EXTENT PERMITTED BY APPLICABLE LAW. 419 | EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR 420 | OTHER PARTIES PROVIDE THE LIBRARY "AS IS" WITHOUT WARRANTY OF ANY 421 | KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE 422 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 423 | PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE 424 | LIBRARY IS WITH YOU. SHOULD THE LIBRARY PROVE DEFECTIVE, YOU ASSUME 425 | THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION. 426 | 427 | 16. 
IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN 428 | WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY 429 | AND/OR REDISTRIBUTE THE LIBRARY AS PERMITTED ABOVE, BE LIABLE TO YOU 430 | FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR 431 | CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE 432 | LIBRARY (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING 433 | RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A 434 | FAILURE OF THE LIBRARY TO OPERATE WITH ANY OTHER SOFTWARE), EVEN IF 435 | SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH 436 | DAMAGES. 437 | 438 | END OF TERMS AND CONDITIONS 439 | 440 | How to Apply These Terms to Your New Libraries 441 | 442 | If you develop a new library, and you want it to be of the greatest 443 | possible use to the public, we recommend making it free software that 444 | everyone can redistribute and change. You can do so by permitting 445 | redistribution under these terms (or, alternatively, under the terms of the 446 | ordinary General Public License). 447 | 448 | To apply these terms, attach the following notices to the library. It is 449 | safest to attach them to the start of each source file to most effectively 450 | convey the exclusion of warranty; and each file should have at least the 451 | "copyright" line and a pointer to where the full notice is found. 452 | 453 | <one line to give the library's name and a brief idea of what it does.> 454 | Copyright (C) <year> <name of author> 455 | 456 | This library is free software; you can redistribute it and/or 457 | modify it under the terms of the GNU Lesser General Public 458 | License as published by the Free Software Foundation; either 459 | version 2 of the License, or (at your option) any later version. 460 | 461 | This library is distributed in the hope that it will be useful, 462 | but WITHOUT ANY WARRANTY; without even the implied warranty of 463 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
See the GNU 464 | Lesser General Public License for more details. 465 | 466 | You should have received a copy of the GNU Lesser General Public 467 | License along with this library; if not, write to the Free Software 468 | Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 469 | 470 | Also add information on how to contact you by electronic and paper mail. 471 | 472 | You should also get your employer (if you work as a programmer) or your 473 | school, if any, to sign a "copyright disclaimer" for the library, if 474 | necessary. Here is a sample; alter the names: 475 | 476 | Yoyodyne, Inc., hereby disclaims all copyright interest in the 477 | library `Frob' (a library for tweaking knobs) written by James Random Hacker. 478 | 479 | <signature of Ty Coon>, 1 April 1990 480 | Ty Coon, President of Vice 481 | 482 | That's all there is to it! 483 | --------------------------------------------------------------------------------