├── appveyor.yml ├── benchmarks ├── arabic_newspapers.html └── benchmark.cc ├── visualc ├── gumbo.vcxproj.user ├── include │ └── strings.h ├── gumbo.vcxproj.filters └── gumbo.vcxproj ├── README.md ├── .gitmodules ├── gumbo.pc.in ├── src ├── tag_sizes.h ├── attribute.h ├── token_type.h ├── string_piece.h ├── attribute.c ├── tag.in ├── string_piece.c ├── util.c ├── insertion_mode.h ├── tag_strings.h ├── util.h ├── parser.h ├── char_ref.h ├── vector.h ├── tag.c ├── string_buffer.h ├── tag_enum.h ├── tokenizer_states.h ├── string_buffer.c ├── vector.c ├── tokenizer.h ├── utf8.h ├── tag_gperf.h └── error.h ├── THANKS ├── .travis.yml ├── genperf.py ├── configure.ac ├── gentags.py ├── .gitignore ├── autogen.sh ├── python └── gumbo │ ├── __init__.py │ ├── soup_adapter_test.py │ ├── gumboc_tags.py │ ├── soup_adapter.py │ ├── gumboc_test.py │ ├── html5lib_adapter.py │ └── html5lib_adapter_test.py ├── tests ├── attribute.cc ├── string_piece.cc ├── test_utils.h ├── string_buffer.cc ├── vector.cc ├── char_ref.cc └── test_utils.cc ├── examples ├── find_links.cc ├── clean_text.cc ├── get_title.c ├── positions_of_class.cc └── serialize.cc ├── .clang-format ├── gumbo_parser.gyp ├── CHANGES.md ├── CONTRIBUTING.md ├── DEBUGGING.md ├── Makefile.am ├── gtest.gyp ├── setup.py └── original-README.md /appveyor.yml: -------------------------------------------------------------------------------- 1 | version: 1.0.{build} 2 | build: 3 | project: visualc/gumbo.vcxproj 4 | verbosity: minimal 5 | -------------------------------------------------------------------------------- /benchmarks/arabic_newspapers.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/google/gumbo-parser/HEAD/benchmarks/arabic_newspapers.html -------------------------------------------------------------------------------- /visualc/gumbo.vcxproj.user: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 
-------------------------------------------------------------------------------- /visualc/include/strings.h: -------------------------------------------------------------------------------- 1 | /*Dummy file to satisfy source file dependencies on Windows platform*/ 2 | #define strcasecmp _stricmp 3 | #define strncasecmp _strnicmp 4 | #define inline __inline 5 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Gumbo - A pure-C HTML5 parser. 2 | ============ 3 | 4 | This project has been unmaintained since 2016 and should not be used. 5 | 6 | The [original README](original-README.md) is available for historical reference. -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "third_party/gtest"] 2 | path = third_party/gtest 3 | url = https://chromium.googlesource.com/external/googletest/ 4 | [submodule "testdata"] 5 | path = testdata 6 | url = https://github.com/html5lib/html5lib-tests.git 7 | -------------------------------------------------------------------------------- /gumbo.pc.in: -------------------------------------------------------------------------------- 1 | prefix=@prefix@ 2 | exec_prefix=@exec_prefix@ 3 | libdir=@libdir@ 4 | includedir=@includedir@ 5 | 6 | Name: Gumbo 7 | Description: A fully-compliant HTML5 parser. 8 | Version: @PACKAGE_VERSION@ 9 | Libs: -L${libdir} -lgumbo 10 | Libs.private: @LIBS@ 11 | Cflags: -I${includedir} 12 | -------------------------------------------------------------------------------- /src/tag_sizes.h: -------------------------------------------------------------------------------- 1 | // Generated via `gentags.py src/tag.in`. 2 | // Do not edit; edit src/tag.in instead. 
3 | // clang-format off 4 | 4, 4, 5, 4, 4, 4, 5, 6, 8, 8, 4, 7, 7, 3, 5, 2, 2, 2, 2, 2, 2, 6, 6, 6, 7, 1, 2, 3, 10, 2, 2, 2, 2, 2, 2, 6, 10, 4, 3, 1, 2, 6, 5, 1, 4, 1, 3, 4, 4, 4, 4, 3, 4, 3, 3, 3, 1, 1, 1, 4, 4, 2, 2, 3, 3, 4, 2, 3, 3, 3, 5, 3, 6, 5, 6, 5, 5, 5, 6, 5, 6, 3, 4, 4, 2, 2, 2, 2, 5, 6, 10, 14, 3, 13, 4, 5, 7, 8, 3, 5, 5, 5, 2, 2, 2, 4, 8, 6, 5, 5, 6, 6, 8, 8, 6, 8, 6, 6, 8, 5, 7, 7, 4, 8, 6, 7, 7, 3, 5, 8, 8, 7, 7, 3, 6, 7, 9, 2, 6, 8, 3, 5, 6, 4, 7, 8, 4, 6, 2, 3, -------------------------------------------------------------------------------- /THANKS: -------------------------------------------------------------------------------- 1 | Gumbo HTML parser THANKS file 2 | 3 | Gumbo was originally written by Jonathan Tang, but many people helped out through suggestions, question-answering, code reviews, bugfixes, and organizational support. Here is a list of these people. Help me keep it complete and exempt of errors. 4 | 5 | Adam Barth 6 | Adam Roben 7 | Ben Noordhuis 8 | Bowen Han 9 | Constantinos Michael 10 | Craig Barnes 11 | Geoffrey Sneddon 12 | Ian Hickson 13 | Jack Deng 14 | Joel Low 15 | Jonathan Shneier 16 | Kevin Hendricks 17 | Mason Tang 18 | Maxim Zakharov 19 | Michal Zalewski 20 | Neal Norwitz 21 | Othar Hansson 22 | Ryan Grove 23 | Stefan Haustein 24 | Steffen Meschkat 25 | Steven Kabbes 26 | Thiago Farina 27 | Vicent Marti 28 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: c++ 2 | 3 | compiler: 4 | - gcc 5 | - clang 6 | 7 | os: 8 | - linux 9 | - osx 10 | 11 | install: 12 | - wget 'https://googletest.googlecode.com/files/gtest-1.7.0.zip' 13 | - unzip gtest-1.7.0.zip 14 | - ln -s gtest-1.7.0 gtest 15 | - sudo pip install BeautifulSoup 16 | - sudo pip install html5lib==0.95 17 | 18 | script: 19 | - ./autogen.sh && ./configure && make && make check 20 | - python python/gumbo/gumboc_test.py 21 | - 
"""Post-process GPerf output into Gumbo's tag hash function and tag map.

Reads the C source generated by GPerf from stdin and writes to stdout:

  * the generated `hash` function, renamed to `tag_hash`;
  * `kGumboTagMap`, the GPerf word list rewritten as GUMBO_TAG_* names.
"""
import re
import sys

# Captures the parameter list (group 1) and body (group 2) of the `hash`
# function emitted by GPerf.
HASH_FUNC_RE = re.compile(
    r"static unsigned int\s+hash\s+\((.*?)\)\s\{(.*?)\n\}", re.DOTALL)

# Captures the contents of the `wordlist[]` initializer emitted by GPerf.
WORDLIST_RE = re.compile(r"wordlist\[\]\s+=\s+\{(.*?)\}", re.DOTALL)


def process_wordlist(text):
    """Convert the comma-separated GPerf word list into enum names.

    Each quoted word becomes "\\tGUMBO_TAG_<WORD>", upper-cased with '-'
    replaced by '_'. Empty slots in the table become "\\tGUMBO_TAG_LAST".
    """
    wordlist = [w.strip().replace('"', '') for w in text.split(',')]
    taglist = [
        "\tGUMBO_TAG_" + (w.upper().replace('-', '_') if w else 'LAST')
        for w in wordlist]
    return taglist


def main():
    """Read GPerf output from stdin and print the transformed tables."""
    gperf_output = sys.stdin.read()

    hash_func = HASH_FUNC_RE.search(gperf_output)
    if not hash_func:
        # Raising a bare string is itself a TypeError on Python >= 2.6, which
        # hid this message; raise a real exception instead.
        raise RuntimeError("Failed to detect hash function in GPerf output")

    wordlist = WORDLIST_RE.search(gperf_output)
    if not wordlist:
        raise RuntimeError("Failed to detect word list in GPerf output")

    # Single parenthesised argument: prints identically on Python 2 and 3.
    print("static unsigned int tag_hash(%s)\n{%s\n}" % (
        hash_func.group(1), hash_func.group(2)))
    print("")
    print("static const unsigned char kGumboTagMap[] = {\n%s\n};" % (
        ',\n'.join(process_wordlist(wordlist.group(1)))))


if __name__ == '__main__':
    main()
"""Generate the tag tables from a tag list file (one tag name per line).

Usage: gentags.py src/tag.in

Writes src/tag_strings.h, src/tag_enum.h, src/tag_sizes.h and
python/gumbo/gumboc_tags.py, relative to the current directory.
"""
import sys


def open_and_write_header(filename, comment_prefix):
    """Open `filename` for writing and emit the generated-file banner.

    Each banner line starts with `comment_prefix`. Returns the open file
    object; the caller is responsible for closing it.
    """
    f = open(filename, 'w')
    f.write(comment_prefix + ' Generated via `gentags.py src/tag.in`.\n')
    f.write(comment_prefix + ' Do not edit; edit src/tag.in instead.\n')
    f.write(comment_prefix + ' clang-format off\n')
    return f


def main(argv):
    """Regenerate all tag tables from the tag list named by argv[1]."""
    if len(argv) != 2:
        sys.stderr.write('Usage: gentags.py src/tag.in\n')
        return 1

    # Read the whole input first so the existing outputs are not truncated
    # when the tag list cannot be opened.
    with open(argv[1]) as tagfile:
        tags = [line.strip() for line in tagfile]
    # A blank line would otherwise produce an empty string entry and an
    # invalid `GUMBO_TAG_` enumerator.
    tags = [tag for tag in tags if tag]

    outputs = []
    try:
        tag_strings = open_and_write_header('src/tag_strings.h', '//')
        outputs.append(tag_strings)
        tag_enum = open_and_write_header('src/tag_enum.h', '//')
        outputs.append(tag_enum)
        tag_sizes = open_and_write_header('src/tag_sizes.h', '//')
        outputs.append(tag_sizes)
        tag_py = open_and_write_header('python/gumbo/gumboc_tags.py', '#')
        outputs.append(tag_py)

        tag_py.write('TagNames = [\n')
        for tag in tags:
            tag_upper = tag.upper().replace('-', '_')
            tag_strings.write('"%s",\n' % tag)
            tag_enum.write('GUMBO_TAG_%s,\n' % tag_upper)
            tag_sizes.write('%d, ' % len(tag))
            tag_py.write(' "%s",\n' % tag_upper)
        tag_py.write(']\n')
    finally:
        # Close whatever was opened, even if a later open or write failed.
        for f in outputs:
            f.close()
    return 0


if __name__ == '__main__':
    sys.exit(main(sys.argv))
-------------------------------------------------------------------------------- 1 | // Copyright 2010 Google Inc. All Rights Reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | // 15 | // Author: jdtang@google.com (Jonathan Tang) 16 | 17 | #ifndef GUMBO_ATTRIBUTE_H_ 18 | #define GUMBO_ATTRIBUTE_H_ 19 | 20 | #include "gumbo.h" 21 | 22 | #ifdef __cplusplus 23 | extern "C" { 24 | #endif 25 | 26 | struct GumboInternalParser; 27 | 28 | // Release the memory used for a GumboAttribute, including the attribute 29 | // itself. 
30 | void gumbo_destroy_attribute( 31 | struct GumboInternalParser* parser, GumboAttribute* attribute); 32 | 33 | #ifdef __cplusplus 34 | } 35 | #endif 36 | 37 | #endif // GUMBO_ATTRIBUTE_H_ 38 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Compilation artifacts 2 | *.o 3 | *.lo 4 | *.la 5 | 6 | # Editor swap files 7 | *.swp 8 | *.swo 9 | *.swn 10 | 11 | #emacs editor leftovers 12 | *.*~ 13 | 14 | #diff leftovers 15 | *.orig 16 | 17 | # gtest pieces 18 | gtest 19 | gtest-1.7.0 20 | 21 | # Other build artifacts 22 | /Debug 23 | /visualc/Debug 24 | /visualc/Release 25 | /visualc/gumbo.sdf 26 | /visualc/gumbo.opensdf 27 | /build 28 | .log 29 | .sdf 30 | .opensdf 31 | .deps 32 | .dirstamp 33 | .libs 34 | Makefile 35 | Makefile.in 36 | aclocal.m4 37 | autom4te.cache 38 | compile 39 | config.guess 40 | config.log 41 | config.status 42 | config.sub 43 | configure 44 | depcomp 45 | gumbo.pc 46 | gumbo_test 47 | gumbo_test.log 48 | gumbo_test.trs 49 | install-sh 50 | libtool 51 | ltmain.sh 52 | m4/ 53 | missing 54 | test-driver 55 | test-suite.log 56 | 57 | # gyp android artifacts 58 | gumbo_parser.target.mk 59 | 60 | # `make dist` artifacts 61 | /gumbo-[0-9].[0-9].tar.gz 62 | /gumbo-[0-9].[0-9]/ 63 | 64 | # Python dist artifacts 65 | *.pyc 66 | *.dylib 67 | dist 68 | build 69 | python/gumbo.egg-info 70 | python/gumbo/libgumbo.so 71 | 72 | # Example binaries 73 | benchmark 74 | clean_text 75 | find_links 76 | get_title 77 | positions_of_class 78 | prettyprint 79 | serialize 80 | -------------------------------------------------------------------------------- /src/token_type.h: -------------------------------------------------------------------------------- 1 | // Copyright 2011 Google Inc. All Rights Reserved. 
// Identifies the kind of a token. The enumerators are numbered consecutively
// from zero in declaration order; the values are spelled out so the mapping
// is visible at a glance.
typedef enum {
  GUMBO_TOKEN_DOCTYPE = 0,
  GUMBO_TOKEN_START_TAG = 1,
  GUMBO_TOKEN_END_TAG = 2,
  GUMBO_TOKEN_COMMENT = 3,
  GUMBO_TOKEN_WHITESPACE = 4,
  GUMBO_TOKEN_CHARACTER = 5,
  GUMBO_TOKEN_CDATA = 6,
  GUMBO_TOKEN_NULL = 7,
  GUMBO_TOKEN_EOF = 8
} GumboTokenType;
# Regenerates the autotools build files by running libtoolize, aclocal,
# autoconf and automake in that order. Each tool can be overridden through the
# LIBTOOLIZE, ACLOCAL, AUTOCONF and AUTOMAKE environment variables.

# On Darwin, when the caller has not chosen a libtoolize, prefer glibtoolize
# and fall back to libtoolize; give up if neither is on PATH.
if test -z "$LIBTOOLIZE" -a "`uname`" = "Darwin"; then
  if command -v "glibtoolize" >/dev/null; then
    LIBTOOLIZE=glibtoolize
  elif command -v "libtoolize" >/dev/null; then
    LIBTOOLIZE=libtoolize
  else
    echo "autogen.sh: line $LINENO: command glibtoolize or libtoolize not found"
    exit 1
  fi
fi

# From here on, echo every command and stop at the first failure.
set -ex
${LIBTOOLIZE:-libtoolize}
# The default aclocal invocation also searches the local m4/ directory.
${ACLOCAL:-aclocal -I m4}
${AUTOCONF:-autoconf}
# --add-missing lets automake install any standard auxiliary files it needs.
${AUTOMAKE:-automake} --add-missing
14 | // 15 | // Author: jdtang@google.com (Jonathan Tang) 16 | 17 | #ifndef GUMBO_STRING_PIECE_H_ 18 | #define GUMBO_STRING_PIECE_H_ 19 | 20 | #include "gumbo.h" 21 | 22 | #ifdef __cplusplus 23 | extern "C" { 24 | #endif 25 | 26 | struct GumboInternalParser; 27 | 28 | // Performs a deep-copy of a GumboStringPiece, allocating a fresh buffer in the 29 | // destination and copying over the characters from source. Dest should be 30 | // empty, with no buffer allocated; otherwise, this leaks it. 31 | void gumbo_string_copy(struct GumboInternalParser* parser, 32 | GumboStringPiece* dest, const GumboStringPiece* source); 33 | 34 | #ifdef __cplusplus 35 | } 36 | #endif 37 | 38 | #endif // GUMBO_STRING_PIECE_H_ 39 | -------------------------------------------------------------------------------- /python/gumbo/__init__.py: -------------------------------------------------------------------------------- 1 | """Gumbo HTML parser. 2 | 3 | These are the Python bindings for Gumbo. All public API classes and functions 4 | are exported from this module. They include: 5 | 6 | - CTypes representations of all structs and enums defined in gumbo.h. The 7 | naming convention is to take the C name and strip off the "Gumbo" prefix. 8 | 9 | - A low-level wrapper around the gumbo_parse function, returning the classes 10 | exposed above. Usage: 11 | 12 | import gumbo 13 | with gumbo.parse(text, **options) as output: 14 | do_stuff_with_doctype(output.document) 15 | do_stuff_with_parse_tree(output.root) 16 | 17 | - Higher-level bindings that mimic the API provided by html5lib. Usage: 18 | 19 | from gumbo import html5lib 20 | 21 | This requires that html5lib be installed (it uses their treebuilders), and is 22 | intended as a drop-in replacement. 23 | 24 | - Similarly, higher-level bindings that mimic BeautifulSoup and return 25 | BeautifulSoup objects. 
For this, use: 26 | 27 | import gumbo 28 | soup = gumbo.soup_parse(text, **options) 29 | 30 | It will give you back a soup object like BeautifulSoup.BeautifulSoup(text). 31 | """ 32 | 33 | from gumbo.gumboc import * 34 | 35 | try: 36 | from gumbo import html5lib_adapter as html5lib 37 | except ImportError: 38 | # html5lib not installed 39 | pass 40 | 41 | try: 42 | from gumbo.soup_adapter import parse as soup_parse 43 | except ImportError: 44 | # BeautifulSoup not installed 45 | pass 46 | -------------------------------------------------------------------------------- /src/attribute.c: -------------------------------------------------------------------------------- 1 | // Copyright 2010 Google Inc. All Rights Reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | // 15 | // Author: jdtang@google.com (Jonathan Tang) 16 | 17 | #include "attribute.h" 18 | 19 | #include 20 | #include 21 | #include 22 | #include 23 | 24 | #include "util.h" 25 | 26 | struct GumboInternalParser; 27 | 28 | GumboAttribute* gumbo_get_attribute( 29 | const GumboVector* attributes, const char* name) { 30 | for (unsigned int i = 0; i < attributes->length; ++i) { 31 | GumboAttribute* attr = attributes->data[i]; 32 | if (!strcasecmp(attr->name, name)) { 33 | return attr; 34 | } 35 | } 36 | return NULL; 37 | } 38 | 39 | void gumbo_destroy_attribute( 40 | struct GumboInternalParser* parser, GumboAttribute* attribute) { 41 | gumbo_parser_deallocate(parser, (void*) attribute->name); 42 | gumbo_parser_deallocate(parser, (void*) attribute->value); 43 | gumbo_parser_deallocate(parser, (void*) attribute); 44 | } 45 | -------------------------------------------------------------------------------- /tests/attribute.cc: -------------------------------------------------------------------------------- 1 | // Copyright 2011 Google Inc. All Rights Reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | // 15 | // Author: jdtang@google.com (Jonathan Tang) 16 | 17 | #include "attribute.h" 18 | 19 | #include 20 | #include 21 | 22 | #include "gtest/gtest.h" 23 | #include "test_utils.h" 24 | #include "vector.h" 25 | 26 | namespace { 27 | 28 | class GumboAttributeTest : public GumboTest { 29 | protected: 30 | GumboAttributeTest() { gumbo_vector_init(&parser_, 2, &vector_); } 31 | 32 | ~GumboAttributeTest() { gumbo_vector_destroy(&parser_, &vector_); } 33 | 34 | GumboVector vector_; 35 | }; 36 | 37 | TEST_F(GumboAttributeTest, GetAttribute) { 38 | GumboAttribute attr1; 39 | GumboAttribute attr2; 40 | attr1.name = ""; 41 | attr2.name = "foo"; 42 | 43 | gumbo_vector_add(&parser_, &attr1, &vector_); 44 | gumbo_vector_add(&parser_, &attr2, &vector_); 45 | EXPECT_EQ(&attr2, gumbo_get_attribute(&vector_, "foo")); 46 | EXPECT_EQ(NULL, gumbo_get_attribute(&vector_, "bar")); 47 | } 48 | 49 | } // namespace 50 | -------------------------------------------------------------------------------- /src/tag.in: -------------------------------------------------------------------------------- 1 | html 2 | head 3 | title 4 | base 5 | link 6 | meta 7 | style 8 | script 9 | noscript 10 | template 11 | body 12 | article 13 | section 14 | nav 15 | aside 16 | h1 17 | h2 18 | h3 19 | h4 20 | h5 21 | h6 22 | hgroup 23 | header 24 | footer 25 | address 26 | p 27 | hr 28 | pre 29 | blockquote 30 | ol 31 | ul 32 | li 33 | dl 34 | dt 35 | dd 36 | figure 37 | figcaption 38 | main 39 | div 40 | a 41 | em 42 | strong 43 | small 44 | s 45 | cite 46 | q 47 | dfn 48 | abbr 49 | data 50 | time 51 | code 52 | var 53 | samp 54 | kbd 55 | sub 56 | sup 57 | i 58 | b 59 | u 60 | mark 61 | ruby 62 | rt 63 | rp 64 | bdi 65 | bdo 66 | span 67 | br 68 | wbr 69 | ins 70 | del 71 | image 72 | img 73 | iframe 74 | embed 75 | object 76 | param 77 | video 78 | audio 79 | source 80 | track 81 | canvas 82 | map 83 | area 84 | math 85 | mi 86 | mo 87 | mn 88 | ms 89 | mtext 90 | mglyph 91 | malignmark 92 | 
annotation-xml 93 | svg 94 | foreignobject 95 | desc 96 | table 97 | caption 98 | colgroup 99 | col 100 | tbody 101 | thead 102 | tfoot 103 | tr 104 | td 105 | th 106 | form 107 | fieldset 108 | legend 109 | label 110 | input 111 | button 112 | select 113 | datalist 114 | optgroup 115 | option 116 | textarea 117 | keygen 118 | output 119 | progress 120 | meter 121 | details 122 | summary 123 | menu 124 | menuitem 125 | applet 126 | acronym 127 | bgsound 128 | dir 129 | frame 130 | frameset 131 | noframes 132 | isindex 133 | listing 134 | xmp 135 | nextid 136 | noembed 137 | plaintext 138 | rb 139 | strike 140 | basefont 141 | big 142 | blink 143 | center 144 | font 145 | marquee 146 | multicol 147 | nobr 148 | spacer 149 | tt 150 | rtc 151 | -------------------------------------------------------------------------------- /src/string_piece.c: -------------------------------------------------------------------------------- 1 | // Copyright 2010 Google Inc. All Rights Reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | // 15 | // Author: jdtang@google.com (Jonathan Tang) 16 | 17 | #include "string_piece.h" 18 | 19 | #include 20 | #include 21 | #include 22 | #include 23 | 24 | #include "util.h" 25 | 26 | struct GumboInternalParser; 27 | 28 | const GumboStringPiece kGumboEmptyString = {NULL, 0}; 29 | 30 | bool gumbo_string_equals( 31 | const GumboStringPiece* str1, const GumboStringPiece* str2) { 32 | return str1->length == str2->length && 33 | !memcmp(str1->data, str2->data, str1->length); 34 | } 35 | 36 | bool gumbo_string_equals_ignore_case( 37 | const GumboStringPiece* str1, const GumboStringPiece* str2) { 38 | return str1->length == str2->length && 39 | !strncasecmp(str1->data, str2->data, str1->length); 40 | } 41 | 42 | void gumbo_string_copy(struct GumboInternalParser* parser, 43 | GumboStringPiece* dest, const GumboStringPiece* source) { 44 | dest->length = source->length; 45 | char* buffer = gumbo_parser_allocate(parser, source->length); 46 | memcpy(buffer, source->data, source->length); 47 | dest->data = buffer; 48 | } 49 | -------------------------------------------------------------------------------- /src/util.c: -------------------------------------------------------------------------------- 1 | // Copyright 2010 Google Inc. All Rights Reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | // 15 | // Author: jdtang@google.com (Jonathan Tang) 16 | 17 | #include "util.h" 18 | 19 | #include 20 | #include 21 | #include 22 | #include 23 | #include 24 | #include 25 | 26 | #include "gumbo.h" 27 | #include "parser.h" 28 | 29 | // TODO(jdtang): This should be elsewhere, but there's no .c file for 30 | // SourcePositions and yet the constant needs some linkage, so this is as good 31 | // as any. 32 | const GumboSourcePosition kGumboEmptySourcePosition = {0, 0, 0}; 33 | 34 | void* gumbo_parser_allocate(GumboParser* parser, size_t num_bytes) { 35 | return parser->_options->allocator(parser->_options->userdata, num_bytes); 36 | } 37 | 38 | void gumbo_parser_deallocate(GumboParser* parser, void* ptr) { 39 | parser->_options->deallocator(parser->_options->userdata, ptr); 40 | } 41 | 42 | char* gumbo_copy_stringz(GumboParser* parser, const char* str) { 43 | char* buffer = gumbo_parser_allocate(parser, strlen(str) + 1); 44 | strcpy(buffer, str); 45 | return buffer; 46 | } 47 | 48 | // Debug function to trace operation of the parser. Pass --copts=-DGUMBO_DEBUG 49 | // to use. 50 | void gumbo_debug(const char* format, ...) { 51 | #ifdef GUMBO_DEBUG 52 | va_list args; 53 | va_start(args, format); 54 | vprintf(format, args); 55 | va_end(args); 56 | fflush(stdout); 57 | #endif 58 | } 59 | -------------------------------------------------------------------------------- /src/insertion_mode.h: -------------------------------------------------------------------------------- 1 | // Copyright 2011 Google Inc. All Rights Reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 
// http://www.whatwg.org/specs/web-apps/current-work/complete/parsing.html#insertion-mode
// The values are consecutive from zero in declaration order and are spelled
// out explicitly. If new enum values are added, be sure to update the
// kTokenHandlers dispatch table in parser.c.
typedef enum {
  GUMBO_INSERTION_MODE_INITIAL = 0,
  GUMBO_INSERTION_MODE_BEFORE_HTML = 1,
  GUMBO_INSERTION_MODE_BEFORE_HEAD = 2,
  GUMBO_INSERTION_MODE_IN_HEAD = 3,
  GUMBO_INSERTION_MODE_IN_HEAD_NOSCRIPT = 4,
  GUMBO_INSERTION_MODE_AFTER_HEAD = 5,
  GUMBO_INSERTION_MODE_IN_BODY = 6,
  GUMBO_INSERTION_MODE_TEXT = 7,
  GUMBO_INSERTION_MODE_IN_TABLE = 8,
  GUMBO_INSERTION_MODE_IN_TABLE_TEXT = 9,
  GUMBO_INSERTION_MODE_IN_CAPTION = 10,
  GUMBO_INSERTION_MODE_IN_COLUMN_GROUP = 11,
  GUMBO_INSERTION_MODE_IN_TABLE_BODY = 12,
  GUMBO_INSERTION_MODE_IN_ROW = 13,
  GUMBO_INSERTION_MODE_IN_CELL = 14,
  GUMBO_INSERTION_MODE_IN_SELECT = 15,
  GUMBO_INSERTION_MODE_IN_SELECT_IN_TABLE = 16,
  GUMBO_INSERTION_MODE_IN_TEMPLATE = 17,
  GUMBO_INSERTION_MODE_AFTER_BODY = 18,
  GUMBO_INSERTION_MODE_IN_FRAMESET = 19,
  GUMBO_INSERTION_MODE_AFTER_FRAMESET = 20,
  GUMBO_INSERTION_MODE_AFTER_AFTER_BODY = 21,
  GUMBO_INSERTION_MODE_AFTER_AFTER_FRAMESET = 22
} GumboInsertionMode;
/examples/find_links.cc: -------------------------------------------------------------------------------- 1 | // Copyright 2013 Google Inc. All Rights Reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | // 15 | // Author: jdtang@google.com (Jonathan Tang) 16 | // 17 | // Finds the URLs of all links in the page. 18 | 19 | #include 20 | 21 | #include 22 | #include 23 | #include 24 | 25 | #include "gumbo.h" 26 | 27 | static void search_for_links(GumboNode* node) { 28 | if (node->type != GUMBO_NODE_ELEMENT) { 29 | return; 30 | } 31 | GumboAttribute* href; 32 | if (node->v.element.tag == GUMBO_TAG_A && 33 | (href = gumbo_get_attribute(&node->v.element.attributes, "href"))) { 34 | std::cout << href->value << std::endl; 35 | } 36 | 37 | GumboVector* children = &node->v.element.children; 38 | for (unsigned int i = 0; i < children->length; ++i) { 39 | search_for_links(static_cast(children->data[i])); 40 | } 41 | } 42 | 43 | int main(int argc, char** argv) { 44 | if (argc != 2) { 45 | std::cout << "Usage: find_links .\n"; 46 | exit(EXIT_FAILURE); 47 | } 48 | const char* filename = argv[1]; 49 | 50 | std::ifstream in(filename, std::ios::in | std::ios::binary); 51 | if (!in) { 52 | std::cout << "File " << filename << " not found!\n"; 53 | exit(EXIT_FAILURE); 54 | } 55 | 56 | std::string contents; 57 | in.seekg(0, std::ios::end); 58 | contents.resize(in.tellg()); 59 | in.seekg(0, std::ios::beg); 60 | 
in.read(&contents[0], contents.size()); 61 | in.close(); 62 | 63 | GumboOutput* output = gumbo_parse(contents.c_str()); 64 | search_for_links(output->root); 65 | gumbo_destroy_output(&kGumboDefaultOptions, output); 66 | } 67 | -------------------------------------------------------------------------------- /.clang-format: -------------------------------------------------------------------------------- 1 | --- 2 | Language: Cpp 3 | # BasedOnStyle: Google 4 | AccessModifierOffset: -1 5 | AlignAfterOpenBracket: false 6 | AlignEscapedNewlinesLeft: true 7 | AlignOperands: true 8 | AlignTrailingComments: true 9 | AllowAllParametersOfDeclarationOnNextLine: true 10 | AllowShortBlocksOnASingleLine: false 11 | AllowShortCaseLabelsOnASingleLine: false 12 | AllowShortIfStatementsOnASingleLine: true 13 | AllowShortLoopsOnASingleLine: true 14 | AllowShortFunctionsOnASingleLine: All 15 | AlwaysBreakAfterDefinitionReturnType: false 16 | AlwaysBreakTemplateDeclarations: true 17 | AlwaysBreakBeforeMultilineStrings: true 18 | BreakBeforeBinaryOperators: None 19 | BreakBeforeTernaryOperators: true 20 | BreakConstructorInitializersBeforeComma: false 21 | BinPackParameters: true 22 | BinPackArguments: true 23 | ColumnLimit: 80 24 | ConstructorInitializerAllOnOneLineOrOnePerLine: true 25 | ConstructorInitializerIndentWidth: 4 26 | DerivePointerAlignment: true 27 | ExperimentalAutoDetectBinPacking: false 28 | IndentCaseLabels: true 29 | IndentWrappedFunctionNames: false 30 | IndentFunctionDeclarationAfterType: false 31 | MaxEmptyLinesToKeep: 1 32 | KeepEmptyLinesAtTheStartOfBlocks: false 33 | NamespaceIndentation: None 34 | ObjCBlockIndentWidth: 2 35 | ObjCSpaceAfterProperty: false 36 | ObjCSpaceBeforeProtocolList: false 37 | PenaltyBreakBeforeFirstCallParameter: 1 38 | PenaltyBreakComment: 300 39 | PenaltyBreakString: 1000 40 | PenaltyBreakFirstLessLess: 120 41 | PenaltyExcessCharacter: 1000000 42 | PenaltyReturnTypeOnItsOwnLine: 200 43 | PointerAlignment: Left 44 | 
SpacesBeforeTrailingComments: 2 45 | Cpp11BracedListStyle: true 46 | Standard: Auto 47 | IndentWidth: 2 48 | TabWidth: 8 49 | UseTab: Never 50 | BreakBeforeBraces: Attach 51 | SpacesInParentheses: false 52 | SpacesInSquareBrackets: false 53 | SpacesInAngles: false 54 | SpaceInEmptyParentheses: false 55 | SpacesInCStyleCastParentheses: false 56 | SpaceAfterCStyleCast: true 57 | SpacesInContainerLiterals: true 58 | SpaceBeforeAssignmentOperators: true 59 | ContinuationIndentWidth: 4 60 | CommentPragmas: '^ IWYU pragma:' 61 | ForEachMacros: [ foreach, Q_FOREACH, BOOST_FOREACH ] 62 | SpaceBeforeParens: ControlStatements 63 | DisableFormat: false 64 | ... 65 | 66 | -------------------------------------------------------------------------------- /src/tag_strings.h: -------------------------------------------------------------------------------- 1 | // Generated via `gentags.py src/tag.in`. 2 | // Do not edit; edit src/tag.in instead. 3 | // clang-format off 4 | "html", 5 | "head", 6 | "title", 7 | "base", 8 | "link", 9 | "meta", 10 | "style", 11 | "script", 12 | "noscript", 13 | "template", 14 | "body", 15 | "article", 16 | "section", 17 | "nav", 18 | "aside", 19 | "h1", 20 | "h2", 21 | "h3", 22 | "h4", 23 | "h5", 24 | "h6", 25 | "hgroup", 26 | "header", 27 | "footer", 28 | "address", 29 | "p", 30 | "hr", 31 | "pre", 32 | "blockquote", 33 | "ol", 34 | "ul", 35 | "li", 36 | "dl", 37 | "dt", 38 | "dd", 39 | "figure", 40 | "figcaption", 41 | "main", 42 | "div", 43 | "a", 44 | "em", 45 | "strong", 46 | "small", 47 | "s", 48 | "cite", 49 | "q", 50 | "dfn", 51 | "abbr", 52 | "data", 53 | "time", 54 | "code", 55 | "var", 56 | "samp", 57 | "kbd", 58 | "sub", 59 | "sup", 60 | "i", 61 | "b", 62 | "u", 63 | "mark", 64 | "ruby", 65 | "rt", 66 | "rp", 67 | "bdi", 68 | "bdo", 69 | "span", 70 | "br", 71 | "wbr", 72 | "ins", 73 | "del", 74 | "image", 75 | "img", 76 | "iframe", 77 | "embed", 78 | "object", 79 | "param", 80 | "video", 81 | "audio", 82 | "source", 83 | "track", 84 | 
"canvas", 85 | "map", 86 | "area", 87 | "math", 88 | "mi", 89 | "mo", 90 | "mn", 91 | "ms", 92 | "mtext", 93 | "mglyph", 94 | "malignmark", 95 | "annotation-xml", 96 | "svg", 97 | "foreignobject", 98 | "desc", 99 | "table", 100 | "caption", 101 | "colgroup", 102 | "col", 103 | "tbody", 104 | "thead", 105 | "tfoot", 106 | "tr", 107 | "td", 108 | "th", 109 | "form", 110 | "fieldset", 111 | "legend", 112 | "label", 113 | "input", 114 | "button", 115 | "select", 116 | "datalist", 117 | "optgroup", 118 | "option", 119 | "textarea", 120 | "keygen", 121 | "output", 122 | "progress", 123 | "meter", 124 | "details", 125 | "summary", 126 | "menu", 127 | "menuitem", 128 | "applet", 129 | "acronym", 130 | "bgsound", 131 | "dir", 132 | "frame", 133 | "frameset", 134 | "noframes", 135 | "isindex", 136 | "listing", 137 | "xmp", 138 | "nextid", 139 | "noembed", 140 | "plaintext", 141 | "rb", 142 | "strike", 143 | "basefont", 144 | "big", 145 | "blink", 146 | "center", 147 | "font", 148 | "marquee", 149 | "multicol", 150 | "nobr", 151 | "spacer", 152 | "tt", 153 | "rtc", 154 | -------------------------------------------------------------------------------- /src/util.h: -------------------------------------------------------------------------------- 1 | // Copyright 2010 Google Inc. All Rights Reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | // 15 | // Author: jdtang@google.com (Jonathan Tang) 16 | // 17 | // This contains some utility functions that didn't fit into any of the other 18 | // headers. 19 | 20 | #ifndef GUMBO_UTIL_H_ 21 | #define GUMBO_UTIL_H_ 22 | #ifdef _MSC_VER 23 | #define _CRT_SECURE_NO_WARNINGS 24 | #endif 25 | #include 26 | #include 27 | 28 | #ifdef __cplusplus 29 | extern "C" { 30 | #endif 31 | 32 | // Forward declaration since it's passed into some of the functions in this 33 | // header. 34 | struct GumboInternalParser; 35 | 36 | // Utility function for allocating & copying a null-terminated string into a 37 | // freshly-allocated buffer. This is necessary for proper memory management; we 38 | // have the convention that all const char* in parse tree structures are 39 | // freshly-allocated, so if we didn't copy, we'd try to delete a literal string 40 | // when the parse tree is destroyed. 41 | char* gumbo_copy_stringz(struct GumboInternalParser* parser, const char* str); 42 | 43 | // Allocate a chunk of memory, using the allocator specified in the Parser's 44 | // config options. 45 | void* gumbo_parser_allocate( 46 | struct GumboInternalParser* parser, size_t num_bytes); 47 | 48 | // Deallocate a chunk of memory, using the deallocator specified in the Parser's 49 | // config options. 50 | void gumbo_parser_deallocate(struct GumboInternalParser* parser, void* ptr); 51 | 52 | // Debug wrapper for printf, to make it easier to turn off debugging info when 53 | // required. 54 | void gumbo_debug(const char* format, ...); 55 | 56 | #ifdef __cplusplus 57 | } 58 | #endif 59 | 60 | #endif // GUMBO_UTIL_H_ 61 | -------------------------------------------------------------------------------- /src/parser.h: -------------------------------------------------------------------------------- 1 | // Copyright 2010 Google Inc. All Rights Reserved. 
2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | // 15 | // Author: jdtang@google.com (Jonathan Tang) 16 | // 17 | // Contains the definition of the top-level GumboParser structure that's 18 | // threaded through basically every internal function in the library. 19 | 20 | #ifndef GUMBO_PARSER_H_ 21 | #define GUMBO_PARSER_H_ 22 | 23 | #ifdef __cplusplus 24 | extern "C" { 25 | #endif 26 | 27 | struct GumboInternalParserState; 28 | struct GumboInternalOutput; 29 | struct GumboInternalOptions; 30 | struct GumboInternalTokenizerState; 31 | 32 | // An overarching struct that's threaded through (nearly) all functions in the 33 | // library, OOP-style. This gives each function access to the options and 34 | // output, along with any internal state needed for the parse. 35 | typedef struct GumboInternalParser { 36 | // Settings for this parse run. 37 | const struct GumboInternalOptions* _options; 38 | 39 | // Output for the parse. 40 | struct GumboInternalOutput* _output; 41 | 42 | // The internal tokenizer state, defined as a pointer to avoid a cyclic 43 | // dependency on html5tokenizer.h. The main parse routine is responsible for 44 | // initializing this on parse start, and destroying it on parse end. 45 | // End-users will never see a non-garbage value in this pointer. 46 | struct GumboInternalTokenizerState* _tokenizer_state; 47 | 48 | // The internal parser state. 
Initialized on parse start and destroyed on 49 | // parse end; end-users will never see a non-garbage value in this pointer. 50 | struct GumboInternalParserState* _parser_state; 51 | } GumboParser; 52 | 53 | #ifdef __cplusplus 54 | } 55 | #endif 56 | 57 | #endif // GUMBO_PARSER_H_ 58 | -------------------------------------------------------------------------------- /gumbo_parser.gyp: -------------------------------------------------------------------------------- 1 | # Copyright 2013 Google Inc. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | { 16 | 'targets': [ 17 | { 18 | 'target_name': 'gumbo_parser', 19 | 'type': 'static_library', 20 | 'cflags': ['-std=c99', '-Wall'], 21 | 'sources': [ 22 | 'src/attribute.c', 23 | 'src/attribute.h', 24 | 'src/char_ref.c', 25 | 'src/char_ref.h', 26 | 'src/error.c', 27 | 'src/error.h', 28 | 'src/gumbo.h', 29 | 'src/insertion_mode.h', 30 | 'src/parser.c', 31 | 'src/parser.h', 32 | 'src/string_buffer.c', 33 | 'src/string_buffer.h', 34 | 'src/string_piece.c', 35 | 'src/string_piece.h', 36 | 'src/tag.c', 37 | 'src/token_type.h', 38 | 'src/tokenizer.c', 39 | 'src/tokenizer.h', 40 | 'src/tokenizer_states.h', 41 | 'src/utf8.c', 42 | 'src/utf8.h', 43 | 'src/util.c', 44 | 'src/util.h', 45 | 'src/vector.c', 46 | 'src/vector.h', 47 | ], 48 | }, 49 | { 50 | 'target_name': 'gumbo_parser_unittests', 51 | 'type': 'executable', 52 | 'dependencies': [ 53 | 'gtest.gyp:gtest', 54 | 'gtest.gyp:gtest_main', 55 | 'gumbo_parser', 56 | ], 57 | 'include_dirs': [ 58 | '.', 59 | '..', 60 | 'src', 61 | ], 62 | 'sources': [ 63 | 'tests/attribute.cc', 64 | 'tests/char_ref.cc', 65 | 'tests/parser.cc', 66 | 'tests/string_buffer.cc', 67 | 'tests/string_piece.cc', 68 | 'tests/test_utils.cc', 69 | 'tests/test_utils.h', 70 | 'tests/tokenizer.cc', 71 | 'tests/utf8.cc', 72 | 'tests/vector.cc', 73 | ], 74 | }, 75 | ], 76 | } 77 | -------------------------------------------------------------------------------- /python/gumbo/soup_adapter_test.py: -------------------------------------------------------------------------------- 1 | # Copyright 2012 Google Inc. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # 15 | """Tests for the Gumbo's BeautifulSoup Python adapter.""" 16 | 17 | __author__ = 'jdtang@google.com (Jonathan Tang)' 18 | 19 | import unittest 20 | 21 | import soup_adapter 22 | 23 | 24 | class SoupAdapterTest(unittest.TestCase): 25 | 26 | def testSimpleParse(self): 27 | soup = soup_adapter.parse( 28 | """ 29 | 35 | """) 36 | 37 | head = soup.head 38 | self.assertEquals(soup, head.parent.parent) 39 | self.assertEquals(u'head', head.name) 40 | self.assertEquals(0, len(head)) 41 | 42 | body = soup.body 43 | self.assertEquals(head, body.previousSibling) 44 | self.assertEquals(2, len(body)) #
    + trailing whitespace 45 | self.assertEquals(u'ul', body.contents[0].name) 46 | self.assertEquals(body, head.next) 47 | self.assertEquals(head, body.previous) 48 | 49 | list_items = body.findAll('li') 50 | self.assertEquals(4, len(list_items)) 51 | 52 | evens = body('li', 'even') 53 | self.assertEquals(2, len(evens)) 54 | 55 | a2 = body.find('a', href='two.html') 56 | self.assertEquals(u'a', a2.name) 57 | self.assertEquals(u'Two', a2.contents[0]) 58 | self.assertEquals(a2, evens[0].next) 59 | self.assertEquals(evens[0], a2.previous) 60 | 61 | li2 = a2.parent 62 | self.assertEquals(u'li', li2.name) 63 | self.assertEquals(u'even', li2['class']) 64 | self.assertEquals(list_items[1], li2) 65 | self.assertEquals(evens[0], li2) 66 | 67 | if __name__ == '__main__': 68 | unittest.main() 69 | -------------------------------------------------------------------------------- /examples/clean_text.cc: -------------------------------------------------------------------------------- 1 | // Copyright 2013 Google Inc. All Rights Reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | // 15 | // Author: jdtang@google.com (Jonathan Tang) 16 | // 17 | // Gets the cleantext of a page. 
18 | 19 | #include 20 | #include 21 | #include 22 | #include 23 | 24 | #include "gumbo.h" 25 | 26 | static std::string cleantext(GumboNode* node) { 27 | if (node->type == GUMBO_NODE_TEXT) { 28 | return std::string(node->v.text.text); 29 | } else if (node->type == GUMBO_NODE_ELEMENT && 30 | node->v.element.tag != GUMBO_TAG_SCRIPT && 31 | node->v.element.tag != GUMBO_TAG_STYLE) { 32 | std::string contents = ""; 33 | GumboVector* children = &node->v.element.children; 34 | for (unsigned int i = 0; i < children->length; ++i) { 35 | const std::string text = cleantext((GumboNode*) children->data[i]); 36 | if (i != 0 && !text.empty()) { 37 | contents.append(" "); 38 | } 39 | contents.append(text); 40 | } 41 | return contents; 42 | } else { 43 | return ""; 44 | } 45 | } 46 | 47 | int main(int argc, char** argv) { 48 | if (argc != 2) { 49 | std::cout << "Usage: clean_text \n"; 50 | exit(EXIT_FAILURE); 51 | } 52 | const char* filename = argv[1]; 53 | 54 | std::ifstream in(filename, std::ios::in | std::ios::binary); 55 | if (!in) { 56 | std::cout << "File " << filename << " not found!\n"; 57 | exit(EXIT_FAILURE); 58 | } 59 | 60 | std::string contents; 61 | in.seekg(0, std::ios::end); 62 | contents.resize(in.tellg()); 63 | in.seekg(0, std::ios::beg); 64 | in.read(&contents[0], contents.size()); 65 | in.close(); 66 | 67 | GumboOutput* output = gumbo_parse(contents.c_str()); 68 | std::cout << cleantext(output->root) << std::endl; 69 | gumbo_destroy_output(&kGumboDefaultOptions, output); 70 | } 71 | -------------------------------------------------------------------------------- /src/char_ref.h: -------------------------------------------------------------------------------- 1 | // Copyright 2011 Google Inc. All Rights Reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 
5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | // 15 | // Author: jdtang@google.com (Jonathan Tang) 16 | // 17 | // Internal header for character reference handling; this should not be exposed 18 | // transitively by any public API header. This is why the functions aren't 19 | // namespaced. 20 | 21 | #ifndef GUMBO_CHAR_REF_H_ 22 | #define GUMBO_CHAR_REF_H_ 23 | 24 | #include 25 | 26 | #ifdef __cplusplus 27 | extern "C" { 28 | #endif 29 | 30 | struct GumboInternalParser; 31 | struct GumboInternalUtf8Iterator; 32 | 33 | // Value that indicates no character was produced. 34 | extern const int kGumboNoChar; 35 | 36 | // Certain named character references generate two codepoints, not one, and so 37 | // the consume_char_ref subroutine needs to return this instead of an int. The 38 | // first field will be kGumboNoChar if no character reference was found; the 39 | // second field will be kGumboNoChar if that is the case or if the character 40 | // reference returns only a single codepoint. 41 | typedef struct { 42 | int first; 43 | int second; 44 | } OneOrTwoCodepoints; 45 | 46 | // Implements the "consume a character reference" section of the spec. 47 | // This reads in characters from the input as necessary, and fills in a 48 | // OneOrTwoCodepoints struct containing the characters read. It may add parse 49 | // errors to the GumboParser's errors vector, if the spec calls for it. Pass a 50 | // space for the "additional allowed char" when the spec says "with no 51 | // additional allowed char". Returns false on parse error, true otherwise. 
52 | bool consume_char_ref(struct GumboInternalParser* parser, 53 | struct GumboInternalUtf8Iterator* input, int additional_allowed_char, 54 | bool is_in_attribute, OneOrTwoCodepoints* output); 55 | 56 | #ifdef __cplusplus 57 | } 58 | #endif 59 | 60 | #endif // GUMBO_CHAR_REF_H_ 61 | -------------------------------------------------------------------------------- /benchmarks/benchmark.cc: -------------------------------------------------------------------------------- 1 | // Copyright 2013 Google Inc. All Rights Reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | // 15 | // Author: jdtang@google.com (Jonathan Tang) 16 | // 17 | 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include 23 | #include 24 | #include 25 | 26 | #include "gumbo.h" 27 | 28 | static const int kNumReps = 10; 29 | 30 | int main(int argc, char** argv) { 31 | if (argc != 1) { 32 | std::cout << "Usage: benchmarks\n"; 33 | exit(EXIT_FAILURE); 34 | } 35 | 36 | DIR* dir; 37 | struct dirent* file; 38 | 39 | if ((dir = opendir("benchmarks")) == NULL) { 40 | std::cout << "Couldn't find 'benchmarks' directory. 
" 41 | << "Run from root of distribution.\n"; 42 | exit(EXIT_FAILURE); 43 | } 44 | 45 | while ((file = readdir(dir)) != NULL) { 46 | std::string filename(file->d_name); 47 | if (filename.length() > 5 && filename.compare(filename.length() - 5, 5, ".html") == 0) { 48 | std::string full_filename = "benchmarks/" + filename; 49 | std::ifstream in(full_filename.c_str(), std::ios::in | std::ios::binary); 50 | if (!in) { 51 | std::cout << "File " << full_filename << " couldn't be read!\n"; 52 | exit(EXIT_FAILURE); 53 | } 54 | 55 | std::string contents; 56 | in.seekg(0, std::ios::end); 57 | contents.resize(in.tellg()); 58 | in.seekg(0, std::ios::beg); 59 | in.read(&contents[0], contents.size()); 60 | in.close(); 61 | 62 | clock_t start_time = clock(); 63 | for (int i = 0; i < kNumReps; ++i) { 64 | GumboOutput* output = gumbo_parse(contents.c_str()); 65 | gumbo_destroy_output(&kGumboDefaultOptions, output); 66 | } 67 | clock_t end_time = clock(); 68 | std::cout << filename << ": " 69 | << (1000000 * (end_time - start_time) / (kNumReps * CLOCKS_PER_SEC)) 70 | << " microseconds.\n"; 71 | } 72 | } 73 | closedir(dir); 74 | } 75 | -------------------------------------------------------------------------------- /python/gumbo/gumboc_tags.py: -------------------------------------------------------------------------------- 1 | # Generated via `gentags.py src/tag.in`. 2 | # Do not edit; edit src/tag.in instead. 
3 | # clang-format off 4 | TagNames = [ 5 | "HTML", 6 | "HEAD", 7 | "TITLE", 8 | "BASE", 9 | "LINK", 10 | "META", 11 | "STYLE", 12 | "SCRIPT", 13 | "NOSCRIPT", 14 | "TEMPLATE", 15 | "BODY", 16 | "ARTICLE", 17 | "SECTION", 18 | "NAV", 19 | "ASIDE", 20 | "H1", 21 | "H2", 22 | "H3", 23 | "H4", 24 | "H5", 25 | "H6", 26 | "HGROUP", 27 | "HEADER", 28 | "FOOTER", 29 | "ADDRESS", 30 | "P", 31 | "HR", 32 | "PRE", 33 | "BLOCKQUOTE", 34 | "OL", 35 | "UL", 36 | "LI", 37 | "DL", 38 | "DT", 39 | "DD", 40 | "FIGURE", 41 | "FIGCAPTION", 42 | "MAIN", 43 | "DIV", 44 | "A", 45 | "EM", 46 | "STRONG", 47 | "SMALL", 48 | "S", 49 | "CITE", 50 | "Q", 51 | "DFN", 52 | "ABBR", 53 | "DATA", 54 | "TIME", 55 | "CODE", 56 | "VAR", 57 | "SAMP", 58 | "KBD", 59 | "SUB", 60 | "SUP", 61 | "I", 62 | "B", 63 | "U", 64 | "MARK", 65 | "RUBY", 66 | "RT", 67 | "RP", 68 | "BDI", 69 | "BDO", 70 | "SPAN", 71 | "BR", 72 | "WBR", 73 | "INS", 74 | "DEL", 75 | "IMAGE", 76 | "IMG", 77 | "IFRAME", 78 | "EMBED", 79 | "OBJECT", 80 | "PARAM", 81 | "VIDEO", 82 | "AUDIO", 83 | "SOURCE", 84 | "TRACK", 85 | "CANVAS", 86 | "MAP", 87 | "AREA", 88 | "MATH", 89 | "MI", 90 | "MO", 91 | "MN", 92 | "MS", 93 | "MTEXT", 94 | "MGLYPH", 95 | "MALIGNMARK", 96 | "ANNOTATION_XML", 97 | "SVG", 98 | "FOREIGNOBJECT", 99 | "DESC", 100 | "TABLE", 101 | "CAPTION", 102 | "COLGROUP", 103 | "COL", 104 | "TBODY", 105 | "THEAD", 106 | "TFOOT", 107 | "TR", 108 | "TD", 109 | "TH", 110 | "FORM", 111 | "FIELDSET", 112 | "LEGEND", 113 | "LABEL", 114 | "INPUT", 115 | "BUTTON", 116 | "SELECT", 117 | "DATALIST", 118 | "OPTGROUP", 119 | "OPTION", 120 | "TEXTAREA", 121 | "KEYGEN", 122 | "OUTPUT", 123 | "PROGRESS", 124 | "METER", 125 | "DETAILS", 126 | "SUMMARY", 127 | "MENU", 128 | "MENUITEM", 129 | "APPLET", 130 | "ACRONYM", 131 | "BGSOUND", 132 | "DIR", 133 | "FRAME", 134 | "FRAMESET", 135 | "NOFRAMES", 136 | "ISINDEX", 137 | "LISTING", 138 | "XMP", 139 | "NEXTID", 140 | "NOEMBED", 141 | "PLAINTEXT", 142 | "RB", 143 | "STRIKE", 144 | "BASEFONT", 145 | 
"BIG", 146 | "BLINK", 147 | "CENTER", 148 | "FONT", 149 | "MARQUEE", 150 | "MULTICOL", 151 | "NOBR", 152 | "SPACER", 153 | "TT", 154 | "RTC", 155 | ] 156 | -------------------------------------------------------------------------------- /src/vector.h: -------------------------------------------------------------------------------- 1 | // Copyright 2010 Google Inc. All Rights Reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | // 15 | // Author: jdtang@google.com (Jonathan Tang) 16 | 17 | #ifndef GUMBO_VECTOR_H_ 18 | #define GUMBO_VECTOR_H_ 19 | 20 | #include "gumbo.h" 21 | 22 | #ifdef __cplusplus 23 | extern "C" { 24 | #endif 25 | 26 | // Forward declaration since it's passed into some of the functions in this 27 | // header. 28 | struct GumboInternalParser; 29 | 30 | // Initializes a new GumboVector with the specified initial capacity. 31 | void gumbo_vector_init(struct GumboInternalParser* parser, 32 | size_t initial_capacity, GumboVector* vector); 33 | 34 | // Frees the memory used by an GumboVector. Does not free the contained 35 | // pointers. 36 | void gumbo_vector_destroy( 37 | struct GumboInternalParser* parser, GumboVector* vector); 38 | 39 | // Adds a new element to an GumboVector. 40 | void gumbo_vector_add( 41 | struct GumboInternalParser* parser, void* element, GumboVector* vector); 42 | 43 | // Removes and returns the element most recently added to the GumboVector. 
44 | // Ownership is transferred to caller. Capacity is unchanged. If the vector is 45 | // empty, NULL is returned. 46 | void* gumbo_vector_pop(struct GumboInternalParser* parser, GumboVector* vector); 47 | 48 | // Inserts an element at a specific index. This is potentially O(N) time, but 49 | // is necessary for some of the spec's behavior. 50 | void gumbo_vector_insert_at(struct GumboInternalParser* parser, void* element, 51 | unsigned int index, GumboVector* vector); 52 | 53 | // Removes an element from the vector, or does nothing if the element is not in 54 | // the vector. 55 | void gumbo_vector_remove( 56 | struct GumboInternalParser* parser, void* element, GumboVector* vector); 57 | 58 | // Removes and returns an element at a specific index. Note that this is 59 | // potentially O(N) time and should be used sparingly. 60 | void* gumbo_vector_remove_at(struct GumboInternalParser* parser, 61 | unsigned int index, GumboVector* vector); 62 | 63 | #ifdef __cplusplus 64 | } 65 | #endif 66 | 67 | #endif // GUMBO_VECTOR_H_ 68 | -------------------------------------------------------------------------------- /CHANGES.md: -------------------------------------------------------------------------------- 1 | ## Gumbo 0.10.1 (2015-04-30) 2 | 3 | Same as 0.10.0, but with the version number bumped because the last version-number commit to v0.9.4 makes GitHub think that v0.9.4 is the latest version and so it's not highlighted on the webpage. 4 | 5 | ## Gumbo 0.10.0 (2015-04-30) 6 | 7 | * Full support for `