├── appveyor.yml
├── benchmarks
├── arabic_newspapers.html
└── benchmark.cc
├── visualc
├── gumbo.vcxproj.user
├── include
│ └── strings.h
├── gumbo.vcxproj.filters
└── gumbo.vcxproj
├── README.md
├── .gitmodules
├── gumbo.pc.in
├── src
├── tag_sizes.h
├── attribute.h
├── token_type.h
├── string_piece.h
├── attribute.c
├── tag.in
├── string_piece.c
├── util.c
├── insertion_mode.h
├── tag_strings.h
├── util.h
├── parser.h
├── char_ref.h
├── vector.h
├── tag.c
├── string_buffer.h
├── tag_enum.h
├── tokenizer_states.h
├── string_buffer.c
├── vector.c
├── tokenizer.h
├── utf8.h
├── tag_gperf.h
└── error.h
├── THANKS
├── .travis.yml
├── genperf.py
├── configure.ac
├── gentags.py
├── .gitignore
├── autogen.sh
├── python
└── gumbo
│ ├── __init__.py
│ ├── soup_adapter_test.py
│ ├── gumboc_tags.py
│ ├── soup_adapter.py
│ ├── gumboc_test.py
│ ├── html5lib_adapter.py
│ └── html5lib_adapter_test.py
├── tests
├── attribute.cc
├── string_piece.cc
├── test_utils.h
├── string_buffer.cc
├── vector.cc
├── char_ref.cc
└── test_utils.cc
├── examples
├── find_links.cc
├── clean_text.cc
├── get_title.c
├── positions_of_class.cc
└── serialize.cc
├── .clang-format
├── gumbo_parser.gyp
├── CHANGES.md
├── CONTRIBUTING.md
├── DEBUGGING.md
├── Makefile.am
├── gtest.gyp
├── setup.py
└── original-README.md
/appveyor.yml:
--------------------------------------------------------------------------------
1 | version: 1.0.{build}
2 | build:
3 | project: visualc/gumbo.vcxproj
4 | verbosity: minimal
5 |
--------------------------------------------------------------------------------
/benchmarks/arabic_newspapers.html:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/google/gumbo-parser/HEAD/benchmarks/arabic_newspapers.html
--------------------------------------------------------------------------------
/visualc/gumbo.vcxproj.user:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
--------------------------------------------------------------------------------
/visualc/include/strings.h:
--------------------------------------------------------------------------------
1 | /* Dummy strings.h that satisfies the source files' dependency on that header on the Windows platform; it only defines the three replacement macros below. */
2 | #define strcasecmp _stricmp
3 | #define strncasecmp _strnicmp
4 | #define inline __inline
5 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | Gumbo - A pure-C HTML5 parser.
2 | ============
3 |
4 | This project has been unmaintained since 2016 and should not be used.
5 |
6 | The [original README](original-README.md) is available for historical reference.
--------------------------------------------------------------------------------
/.gitmodules:
--------------------------------------------------------------------------------
1 | [submodule "third_party/gtest"]
2 | path = third_party/gtest
3 | url = https://chromium.googlesource.com/external/googletest/
4 | [submodule "testdata"]
5 | path = testdata
6 | url = https://github.com/html5lib/html5lib-tests.git
7 |
--------------------------------------------------------------------------------
/gumbo.pc.in:
--------------------------------------------------------------------------------
1 | prefix=@prefix@
2 | exec_prefix=@exec_prefix@
3 | libdir=@libdir@
4 | includedir=@includedir@
5 |
6 | Name: Gumbo
7 | Description: A fully-compliant HTML5 parser.
8 | Version: @PACKAGE_VERSION@
9 | Libs: -L${libdir} -lgumbo
10 | Libs.private: @LIBS@
11 | Cflags: -I${includedir}
12 |
--------------------------------------------------------------------------------
/src/tag_sizes.h:
--------------------------------------------------------------------------------
1 | // Generated via `gentags.py src/tag.in`.
2 | // Do not edit; edit src/tag.in instead.
3 | // clang-format off
4 | 4, 4, 5, 4, 4, 4, 5, 6, 8, 8, 4, 7, 7, 3, 5, 2, 2, 2, 2, 2, 2, 6, 6, 6, 7, 1, 2, 3, 10, 2, 2, 2, 2, 2, 2, 6, 10, 4, 3, 1, 2, 6, 5, 1, 4, 1, 3, 4, 4, 4, 4, 3, 4, 3, 3, 3, 1, 1, 1, 4, 4, 2, 2, 3, 3, 4, 2, 3, 3, 3, 5, 3, 6, 5, 6, 5, 5, 5, 6, 5, 6, 3, 4, 4, 2, 2, 2, 2, 5, 6, 10, 14, 3, 13, 4, 5, 7, 8, 3, 5, 5, 5, 2, 2, 2, 4, 8, 6, 5, 5, 6, 6, 8, 8, 6, 8, 6, 6, 8, 5, 7, 7, 4, 8, 6, 7, 7, 3, 5, 8, 8, 7, 7, 3, 6, 7, 9, 2, 6, 8, 3, 5, 6, 4, 7, 8, 4, 6, 2, 3,
--------------------------------------------------------------------------------
/THANKS:
--------------------------------------------------------------------------------
1 | Gumbo HTML parser THANKS file
2 |
3 | Gumbo was originally written by Jonathan Tang, but many people helped out through suggestions, question-answering, code reviews, bugfixes, and organizational support. Here is a list of these people. Help me keep it complete and free of errors.
4 |
5 | Adam Barth
6 | Adam Roben
7 | Ben Noordhuis
8 | Bowen Han
9 | Constantinos Michael
10 | Craig Barnes
11 | Geoffrey Sneddon
12 | Ian Hickson
13 | Jack Deng
14 | Joel Low
15 | Jonathan Shneier
16 | Kevin Hendricks
17 | Mason Tang
18 | Maxim Zakharov
19 | Michal Zalewski
20 | Neal Norwitz
21 | Othar Hansson
22 | Ryan Grove
23 | Stefan Haustein
24 | Steffen Meschkat
25 | Steven Kabbes
26 | Thiago Farina
27 | Vicent Marti
28 |
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | language: c++
2 |
3 | compiler:
4 | - gcc
5 | - clang
6 |
7 | os:
8 | - linux
9 | - osx
10 |
11 | install:
12 | - wget 'https://googletest.googlecode.com/files/gtest-1.7.0.zip'
13 | - unzip gtest-1.7.0.zip
14 | - ln -s gtest-1.7.0 gtest
15 | - sudo pip install BeautifulSoup
16 | - sudo pip install html5lib==0.95
17 |
18 | script:
19 | - ./autogen.sh && ./configure && make && make check
20 | - python python/gumbo/gumboc_test.py
21 | - python python/gumbo/html5lib_adapter_test.py
22 | - python python/gumbo/soup_adapter_test.py
23 | - sudo make install
24 | - g++ examples/clean_text.cc `pkg-config --cflags --libs gumbo`
25 | - sudo python setup.py sdist install
26 | - python -c 'import gumbo; gumbo.parse("Foo")'
27 |
--------------------------------------------------------------------------------
/genperf.py:
--------------------------------------------------------------------------------
1 | import sys, re
2 |
3 | gperf_output = sys.stdin.read()
4 |
5 | hash_func = re.search(
6 | "static unsigned int\s+hash\s+\((.*?)\)\s{(.*?)\n}",
7 | gperf_output, re.DOTALL)
8 |
9 | if not hash_func:
10 | raise "Failed to detect hash function in GPerf output"
11 |
12 | wordlist = re.search(
13 | "wordlist\[\]\s+=\s+{(.*?)}",
14 | gperf_output, re.DOTALL)
15 |
16 | if not wordlist:
17 | raise "Failed to detect word list in GPerf output"
18 |
19 | def process_wordlist(text):
20 | wordlist = [w.strip().replace('"', '') for w in text.split(',')]
21 | taglist = [
22 | "\tGUMBO_TAG_" + (w.upper().replace('-', '_') if w else 'LAST')
23 | for w in wordlist]
24 | return taglist
25 |
26 | print("static unsigned int tag_hash(%s)\n{%s\n}" % (  # one-argument print(): same output on Python 2 and 3
27 |     hash_func.group(1), hash_func.group(2)))
28 | print("")  # blank separator line between the function and the table
29 | print("static const unsigned char kGumboTagMap[] = {\n%s\n};" % (
30 |     ',\n'.join(process_wordlist(wordlist.group(1)))))
31 |
--------------------------------------------------------------------------------
/configure.ac:
--------------------------------------------------------------------------------
1 | # -*- Autoconf -*-
2 | # Process this file with autoconf to produce a configure script.
3 |
4 | AC_PREREQ([2.65])
5 | AC_INIT([gumbo], [0.10.1], [jonathan.d.tang@gmail.com])
6 | AC_CONFIG_MACRO_DIR([m4])
7 | AC_CONFIG_SRCDIR([src/parser.c])
8 | #AC_CONFIG_HEADERS([config.h])
9 | AC_CONFIG_FILES([Makefile gumbo.pc])
10 |
11 | # Checks for programs.
12 | AC_PROG_CXX
13 | AC_PROG_CC_C99
14 |
15 | # Checks for libraries.
16 |
17 | # Checks for header files.
18 | AC_CHECK_HEADERS([stddef.h stdlib.h string.h strings.h])
19 |
20 | # Checks for typedefs, structures, and compiler characteristics.
21 | AC_C_INLINE
22 | AC_TYPE_SIZE_T
23 |
24 | # Checks for the gtest_main library; the result sets HAVE_SHARED_LIBGTEST.
25 | AC_CHECK_LIB([gtest_main],
26 | [main],
27 | AM_CONDITIONAL(HAVE_SHARED_LIBGTEST, [true]),
28 | AM_CONDITIONAL(HAVE_SHARED_LIBGTEST, [false]))
29 |
30 | # Init Automake & libtool
31 | AM_INIT_AUTOMAKE([foreign subdir-objects])
32 | m4_ifdef([AM_SILENT_RULES], [AM_SILENT_RULES([yes])])
33 | LT_INIT
34 |
35 | AC_OUTPUT
36 |
--------------------------------------------------------------------------------
/gentags.py:
--------------------------------------------------------------------------------
1 | import sys
2 |
3 | def open_and_write_header(filename, comment_prefix):
4 | f = open(filename, 'w')
5 | f.write(comment_prefix + ' Generated via `gentags.py src/tag.in`.\n')
6 | f.write(comment_prefix + ' Do not edit; edit src/tag.in instead.\n')
7 | f.write(comment_prefix + ' clang-format off\n')
8 | return f
9 |
10 | tag_strings = open_and_write_header('src/tag_strings.h', '//')
11 | tag_enum = open_and_write_header('src/tag_enum.h', '//')
12 | tag_sizes = open_and_write_header('src/tag_sizes.h', '//')
13 |
14 | tag_py = open_and_write_header('python/gumbo/gumboc_tags.py', '#')
15 | tag_py.write('TagNames = [\n')
16 |
17 | tagfile = open(sys.argv[1])
18 |
19 | for tag in tagfile:
20 | tag = tag.strip()
21 | tag_upper = tag.upper().replace('-', '_')
22 | tag_strings.write('"%s",\n' % tag)
23 | tag_enum.write('GUMBO_TAG_%s,\n' % tag_upper)
24 | tag_sizes.write('%d, ' % len(tag))
25 | tag_py.write(' "%s",\n' % tag_upper)
26 |
27 | tagfile.close()
28 |
29 | tag_strings.close()
30 | tag_enum.close()
31 | tag_sizes.close()
32 |
33 | tag_py.write(']\n')
34 | tag_py.close()
35 |
--------------------------------------------------------------------------------
/src/attribute.h:
--------------------------------------------------------------------------------
1 | // Copyright 2010 Google Inc. All Rights Reserved.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 | //
15 | // Author: jdtang@google.com (Jonathan Tang)
16 |
17 | #ifndef GUMBO_ATTRIBUTE_H_
18 | #define GUMBO_ATTRIBUTE_H_
19 |
20 | #include "gumbo.h"
21 |
22 | #ifdef __cplusplus
23 | extern "C" {
24 | #endif
25 |
26 | struct GumboInternalParser;
27 |
28 | // Releases the memory used for a GumboAttribute: its name, its value, and the
29 | // attribute struct itself, all through the given parser.
30 | void gumbo_destroy_attribute(
31 | struct GumboInternalParser* parser, GumboAttribute* attribute);
32 |
33 | #ifdef __cplusplus
34 | }
35 | #endif
36 |
37 | #endif // GUMBO_ATTRIBUTE_H_
38 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Compilation artifacts
2 | *.o
3 | *.lo
4 | *.la
5 |
6 | # Editor swap files
7 | *.swp
8 | *.swo
9 | *.swn
10 |
11 | #emacs editor leftovers
12 | *.*~
13 |
14 | #diff leftovers
15 | *.orig
16 |
17 | # gtest pieces
18 | gtest
19 | gtest-1.7.0
20 |
21 | # Other build artifacts
22 | /Debug
23 | /visualc/Debug
24 | /visualc/Release
25 | /visualc/gumbo.sdf
26 | /visualc/gumbo.opensdf
27 | /build
28 | .log
29 | .sdf
30 | .opensdf
31 | .deps
32 | .dirstamp
33 | .libs
34 | Makefile
35 | Makefile.in
36 | aclocal.m4
37 | autom4te.cache
38 | compile
39 | config.guess
40 | config.log
41 | config.status
42 | config.sub
43 | configure
44 | depcomp
45 | gumbo.pc
46 | gumbo_test
47 | gumbo_test.log
48 | gumbo_test.trs
49 | install-sh
50 | libtool
51 | ltmain.sh
52 | m4/
53 | missing
54 | test-driver
55 | test-suite.log
56 |
57 | # gyp android artifacts
58 | gumbo_parser.target.mk
59 |
60 | # `make dist` artifacts
61 | /gumbo-[0-9].[0-9].tar.gz
62 | /gumbo-[0-9].[0-9]/
63 |
64 | # Python dist artifacts
65 | *.pyc
66 | *.dylib
67 | dist
68 | build
69 | python/gumbo.egg-info
70 | python/gumbo/libgumbo.so
71 |
72 | # Example binaries
73 | benchmark
74 | clean_text
75 | find_links
76 | get_title
77 | positions_of_class
78 | prettyprint
79 | serialize
80 |
--------------------------------------------------------------------------------
/src/token_type.h:
--------------------------------------------------------------------------------
1 | // Copyright 2011 Google Inc. All Rights Reserved.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 | //
15 | // Author: jdtang@google.com (Jonathan Tang)
16 |
17 | #ifndef GUMBO_TOKEN_TYPE_H_
18 | #define GUMBO_TOKEN_TYPE_H_
19 |
20 | #ifdef __cplusplus
21 | extern "C" {
22 | #endif
23 |
24 | // The kinds of token named by GumboTokenType. No explicit values are given, so the enumerators count up from 0 (DOCTYPE) to 8 (EOF) in declaration order.
25 | typedef enum {
26 | GUMBO_TOKEN_DOCTYPE,
27 | GUMBO_TOKEN_START_TAG,
28 | GUMBO_TOKEN_END_TAG,
29 | GUMBO_TOKEN_COMMENT,
30 | GUMBO_TOKEN_WHITESPACE,
31 | GUMBO_TOKEN_CHARACTER,
32 | GUMBO_TOKEN_CDATA,
33 | GUMBO_TOKEN_NULL,
34 | GUMBO_TOKEN_EOF
35 | } GumboTokenType;
36 |
37 | #ifdef __cplusplus
38 | } // extern C
39 | #endif
40 |
41 | #endif // GUMBO_TOKEN_TYPE_H_
42 |
--------------------------------------------------------------------------------
/autogen.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 |
3 | # Copyright (c) 2013, Ben Noordhuis
4 | #
5 | # Permission to use, copy, modify, and/or distribute this software for any
6 | # purpose with or without fee is hereby granted, provided that the above
7 | # copyright notice and this permission notice appear in all copies.
8 | #
9 | # THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
10 | # WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
11 | # MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
12 | # ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
13 | # WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
14 | # ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
15 | # OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
16 |
17 | if test -z "$LIBTOOLIZE" -a "`uname`" = "Darwin"; then  # no LIBTOOLIZE override and uname reports Darwin
18 | if command -v "glibtoolize" >/dev/null; then  # glibtoolize is tried first
19 | LIBTOOLIZE=glibtoolize
20 | elif command -v "libtoolize" >/dev/null; then
21 | LIBTOOLIZE=libtoolize
22 | else
23 | echo "autogen.sh: line $LINENO: command glibtoolize or libtoolize not found"
24 | exit 1
25 | fi
26 | fi
27 | # From here on: echo each command (-x) and stop at the first failure (-e). Each tool can be replaced through its environment variable.
28 | set -ex
29 | ${LIBTOOLIZE:-libtoolize}
30 | ${ACLOCAL:-aclocal -I m4}
31 | ${AUTOCONF:-autoconf}
32 | ${AUTOMAKE:-automake} --add-missing
33 |
--------------------------------------------------------------------------------
/src/string_piece.h:
--------------------------------------------------------------------------------
1 | // Copyright 2010 Google Inc. All Rights Reserved.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 | //
15 | // Author: jdtang@google.com (Jonathan Tang)
16 |
17 | #ifndef GUMBO_STRING_PIECE_H_
18 | #define GUMBO_STRING_PIECE_H_
19 |
20 | #include "gumbo.h"
21 |
22 | #ifdef __cplusplus
23 | extern "C" {
24 | #endif
25 |
26 | struct GumboInternalParser;
27 |
28 | // Performs a deep copy of a GumboStringPiece: allocates a fresh buffer for the
29 | // destination through the parser and copies the source characters into it. Dest
30 | // should be empty, with no buffer allocated; otherwise, this leaks it.
31 | void gumbo_string_copy(struct GumboInternalParser* parser,
32 | GumboStringPiece* dest, const GumboStringPiece* source);
33 |
34 | #ifdef __cplusplus
35 | }
36 | #endif
37 |
38 | #endif // GUMBO_STRING_PIECE_H_
39 |
--------------------------------------------------------------------------------
/python/gumbo/__init__.py:
--------------------------------------------------------------------------------
1 | """Gumbo HTML parser.
2 |
3 | These are the Python bindings for Gumbo. All public API classes and functions
4 | are exported from this module. They include:
5 |
6 | - CTypes representations of all structs and enums defined in gumbo.h. The
7 | naming convention is to take the C name and strip off the "Gumbo" prefix.
8 |
9 | - A low-level wrapper around the gumbo_parse function, returning the classes
10 | exposed above. Usage:
11 |
12 | import gumbo
13 | with gumbo.parse(text, **options) as output:
14 | do_stuff_with_doctype(output.document)
15 | do_stuff_with_parse_tree(output.root)
16 |
17 | - Higher-level bindings that mimic the API provided by html5lib. Usage:
18 |
19 | from gumbo import html5lib
20 |
21 | This requires that html5lib be installed (it uses their treebuilders), and is
22 | intended as a drop-in replacement.
23 |
24 | - Similarly, higher-level bindings that mimic BeautifulSoup and return
25 | BeautifulSoup objects. For this, use:
26 |
27 | import gumbo
28 | soup = gumbo.soup_parse(text, **options)
29 |
30 | It will give you back a soup object like BeautifulSoup.BeautifulSoup(text).
31 | """
32 |
33 | from gumbo.gumboc import *
34 |
35 | try:
36 | from gumbo import html5lib_adapter as html5lib
37 | except ImportError:
38 | # html5lib not installed
39 | pass
40 |
41 | try:
42 | from gumbo.soup_adapter import parse as soup_parse
43 | except ImportError:
44 | # BeautifulSoup not installed
45 | pass
46 |
--------------------------------------------------------------------------------
/src/attribute.c:
--------------------------------------------------------------------------------
1 | // Copyright 2010 Google Inc. All Rights Reserved.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 | //
15 | // Author: jdtang@google.com (Jonathan Tang)
16 |
17 | #include "attribute.h"
18 |
19 | #include
20 | #include
21 | #include
22 | #include
23 |
24 | #include "util.h"
25 |
26 | struct GumboInternalParser;
27 |
28 | GumboAttribute* gumbo_get_attribute(
29 |     const GumboVector* attributes, const char* name) {
30 |   // Returns the first attribute whose name equals `name`, ignoring case; NULL if none.
31 |   GumboAttribute* match = NULL;
32 |   for (unsigned int pos = 0; match == NULL && pos < attributes->length; ++pos) {
33 |     GumboAttribute* candidate = attributes->data[pos];
34 |     if (strcasecmp(candidate->name, name) == 0) match = candidate;
35 |   }
36 |   return match;
37 | }
38 |
39 | void gumbo_destroy_attribute(
40 |     struct GumboInternalParser* parser, GumboAttribute* attribute) {
41 |   // Release the name and value strings first, then the struct that held them.
42 |   void* mem[] = {(void*) attribute->name, (void*) attribute->value, attribute};
43 |   for (int i = 0; i < 3; ++i) gumbo_parser_deallocate(parser, mem[i]);
44 | }
45 |
--------------------------------------------------------------------------------
/tests/attribute.cc:
--------------------------------------------------------------------------------
1 | // Copyright 2011 Google Inc. All Rights Reserved.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 | //
15 | // Author: jdtang@google.com (Jonathan Tang)
16 |
17 | #include "attribute.h"
18 |
19 | #include
20 | #include
21 |
22 | #include "gtest/gtest.h"
23 | #include "test_utils.h"
24 | #include "vector.h"
25 |
26 | namespace {
27 |
28 | class GumboAttributeTest : public GumboTest {  // Fixture that owns one GumboVector for the attribute tests.
29 | protected:
30 | GumboAttributeTest() { gumbo_vector_init(&parser_, 2, &vector_); }  // parser_ is not declared here; presumably inherited from GumboTest.
31 | // The destructor releases the vector that the constructor initialized.
32 | ~GumboAttributeTest() { gumbo_vector_destroy(&parser_, &vector_); }
33 |
34 | GumboVector vector_;  // The vector handed to gumbo_get_attribute by the tests.
35 | };
36 |
37 | TEST_F(GumboAttributeTest, GetAttribute) {  // Lookup finds a stored name and yields NULL for a missing one.
38 | GumboAttribute attr1;
39 | GumboAttribute attr2;
40 | attr1.name = "";
41 | attr2.name = "foo";
42 | // Only the name fields are set; the addresses of these stack objects go into the vector.
43 | gumbo_vector_add(&parser_, &attr1, &vector_);
44 | gumbo_vector_add(&parser_, &attr2, &vector_);
45 | EXPECT_EQ(&attr2, gumbo_get_attribute(&vector_, "foo"));
46 | EXPECT_EQ(NULL, gumbo_get_attribute(&vector_, "bar"));
47 | }
48 |
49 | } // namespace
50 |
--------------------------------------------------------------------------------
/src/tag.in:
--------------------------------------------------------------------------------
1 | html
2 | head
3 | title
4 | base
5 | link
6 | meta
7 | style
8 | script
9 | noscript
10 | template
11 | body
12 | article
13 | section
14 | nav
15 | aside
16 | h1
17 | h2
18 | h3
19 | h4
20 | h5
21 | h6
22 | hgroup
23 | header
24 | footer
25 | address
26 | p
27 | hr
28 | pre
29 | blockquote
30 | ol
31 | ul
32 | li
33 | dl
34 | dt
35 | dd
36 | figure
37 | figcaption
38 | main
39 | div
40 | a
41 | em
42 | strong
43 | small
44 | s
45 | cite
46 | q
47 | dfn
48 | abbr
49 | data
50 | time
51 | code
52 | var
53 | samp
54 | kbd
55 | sub
56 | sup
57 | i
58 | b
59 | u
60 | mark
61 | ruby
62 | rt
63 | rp
64 | bdi
65 | bdo
66 | span
67 | br
68 | wbr
69 | ins
70 | del
71 | image
72 | img
73 | iframe
74 | embed
75 | object
76 | param
77 | video
78 | audio
79 | source
80 | track
81 | canvas
82 | map
83 | area
84 | math
85 | mi
86 | mo
87 | mn
88 | ms
89 | mtext
90 | mglyph
91 | malignmark
92 | annotation-xml
93 | svg
94 | foreignobject
95 | desc
96 | table
97 | caption
98 | colgroup
99 | col
100 | tbody
101 | thead
102 | tfoot
103 | tr
104 | td
105 | th
106 | form
107 | fieldset
108 | legend
109 | label
110 | input
111 | button
112 | select
113 | datalist
114 | optgroup
115 | option
116 | textarea
117 | keygen
118 | output
119 | progress
120 | meter
121 | details
122 | summary
123 | menu
124 | menuitem
125 | applet
126 | acronym
127 | bgsound
128 | dir
129 | frame
130 | frameset
131 | noframes
132 | isindex
133 | listing
134 | xmp
135 | nextid
136 | noembed
137 | plaintext
138 | rb
139 | strike
140 | basefont
141 | big
142 | blink
143 | center
144 | font
145 | marquee
146 | multicol
147 | nobr
148 | spacer
149 | tt
150 | rtc
151 |
--------------------------------------------------------------------------------
/src/string_piece.c:
--------------------------------------------------------------------------------
1 | // Copyright 2010 Google Inc. All Rights Reserved.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 | //
15 | // Author: jdtang@google.com (Jonathan Tang)
16 |
17 | #include "string_piece.h"
18 |
19 | #include
20 | #include
21 | #include
22 | #include
23 |
24 | #include "util.h"
25 |
26 | struct GumboInternalParser;
27 |
28 | const GumboStringPiece kGumboEmptyString = {NULL, 0};  // The empty string piece, initialized with NULL and 0.
29 | // True iff both pieces hold the same bytes. Empty pieces (NULL data) skip memcmp.
30 | bool gumbo_string_equals(
31 |     const GumboStringPiece* str1, const GumboStringPiece* str2) {
32 |   if (str1->length != str2->length) return false;
33 |   return str1->length == 0 || !memcmp(str1->data, str2->data, str1->length);
34 | }
35 | // True iff both pieces match ignoring case. Empty pieces skip strncasecmp.
36 | bool gumbo_string_equals_ignore_case(
37 |     const GumboStringPiece* str1, const GumboStringPiece* str2) {
38 |   if (str1->length != str2->length) return false;
39 |   return str1->length == 0 || !strncasecmp(str1->data, str2->data, str1->length);
40 | }
41 | // Deep-copies source into dest; dest gets a buffer of its own from the parser.
42 | void gumbo_string_copy(struct GumboInternalParser* parser,
43 |     GumboStringPiece* dest, const GumboStringPiece* source) {
44 |   dest->length = source->length;  // 0 for an empty source, whose data may be NULL
45 |   char* buffer = gumbo_parser_allocate(parser, source->length);
46 |   if (source->length > 0) memcpy(buffer, source->data, source->length);
47 |   dest->data = buffer;  // stored even when nothing was copied
48 | }
49 |
--------------------------------------------------------------------------------
/src/util.c:
--------------------------------------------------------------------------------
1 | // Copyright 2010 Google Inc. All Rights Reserved.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 | //
15 | // Author: jdtang@google.com (Jonathan Tang)
16 |
17 | #include "util.h"
18 |
19 | #include
20 | #include
21 | #include
22 | #include
23 | #include
24 | #include
25 |
26 | #include "gumbo.h"
27 | #include "parser.h"
28 |
29 | // TODO(jdtang): This belongs elsewhere, but SourcePositions have no .c file of
30 | // their own and the constant needs a definition with linkage somewhere, so it
31 | // lives here for now.
32 | const GumboSourcePosition kGumboEmptySourcePosition = {0, 0, 0};
33 | // Requests num_bytes from the allocator callback stored in the parser's options.
34 | void* gumbo_parser_allocate(GumboParser* parser, size_t num_bytes) {
35 |   return (*parser->_options->allocator)(parser->_options->userdata, num_bytes);
36 | }
37 | // Passes ptr to the deallocator callback stored in the parser's options.
38 | void gumbo_parser_deallocate(GumboParser* parser, void* ptr) {
39 |   (*parser->_options->deallocator)(parser->_options->userdata, ptr);
40 | }
41 | // Copies the NUL-terminated str into a buffer obtained via gumbo_parser_allocate.
42 | char* gumbo_copy_stringz(GumboParser* parser, const char* str) {
43 |   const size_t size_with_nul = strlen(str) + 1;
44 |   char* copy = gumbo_parser_allocate(parser, size_with_nul);
45 |   return memcpy(copy, str, size_with_nul);
46 | }
47 |
48 | // Parser tracing hook: printf-style output to stdout, flushed on every call.
49 | // A no-op unless built with GUMBO_DEBUG defined (pass --copts=-DGUMBO_DEBUG).
50 | void gumbo_debug(const char* format, ...) {
51 | #ifdef GUMBO_DEBUG
52 |   va_list ap;
53 |   va_start(ap, format);
54 |   vfprintf(stdout, format, ap);
55 |   va_end(ap);
56 |   fflush(stdout);
57 | #endif
58 | }
59 |
--------------------------------------------------------------------------------
/src/insertion_mode.h:
--------------------------------------------------------------------------------
1 | // Copyright 2011 Google Inc. All Rights Reserved.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 | //
15 | // Author: jdtang@google.com (Jonathan Tang)
16 |
17 | #ifndef GUMBO_INSERTION_MODE_H_
18 | #define GUMBO_INSERTION_MODE_H_
19 |
20 | #ifdef __cplusplus
21 | extern "C" {
22 | #endif
23 |
24 | // Parser insertion modes; see http://www.whatwg.org/specs/web-apps/current-work/complete/parsing.html#insertion-mode
25 | // When adding an enum value, also update the kTokenHandlers dispatch table in
26 | // parser.c.
27 | typedef enum {
28 | GUMBO_INSERTION_MODE_INITIAL,
29 | GUMBO_INSERTION_MODE_BEFORE_HTML,
30 | GUMBO_INSERTION_MODE_BEFORE_HEAD,
31 | GUMBO_INSERTION_MODE_IN_HEAD,
32 | GUMBO_INSERTION_MODE_IN_HEAD_NOSCRIPT,
33 | GUMBO_INSERTION_MODE_AFTER_HEAD,
34 | GUMBO_INSERTION_MODE_IN_BODY,
35 | GUMBO_INSERTION_MODE_TEXT,
36 | GUMBO_INSERTION_MODE_IN_TABLE,
37 | GUMBO_INSERTION_MODE_IN_TABLE_TEXT,
38 | GUMBO_INSERTION_MODE_IN_CAPTION,
39 | GUMBO_INSERTION_MODE_IN_COLUMN_GROUP,
40 | GUMBO_INSERTION_MODE_IN_TABLE_BODY,
41 | GUMBO_INSERTION_MODE_IN_ROW,
42 | GUMBO_INSERTION_MODE_IN_CELL,
43 | GUMBO_INSERTION_MODE_IN_SELECT,
44 | GUMBO_INSERTION_MODE_IN_SELECT_IN_TABLE,
45 | GUMBO_INSERTION_MODE_IN_TEMPLATE,
46 | GUMBO_INSERTION_MODE_AFTER_BODY,
47 | GUMBO_INSERTION_MODE_IN_FRAMESET,
48 | GUMBO_INSERTION_MODE_AFTER_FRAMESET,
49 | GUMBO_INSERTION_MODE_AFTER_AFTER_BODY,
50 | GUMBO_INSERTION_MODE_AFTER_AFTER_FRAMESET
51 | } GumboInsertionMode;
52 |
53 | #ifdef __cplusplus
54 | } // extern C
55 | #endif
56 |
57 | #endif // GUMBO_INSERTION_MODE_H_
58 |
--------------------------------------------------------------------------------
/examples/find_links.cc:
--------------------------------------------------------------------------------
1 | // Copyright 2013 Google Inc. All Rights Reserved.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 | //
15 | // Author: jdtang@google.com (Jonathan Tang)
16 | //
17 | // Finds the URLs of all links in the page.
18 |
19 | #include
20 |
21 | #include
22 | #include
23 | #include
24 |
25 | #include "gumbo.h"
26 | // Prints the href of every <a> element in the subtree rooted at node, one per line.
27 | static void search_for_links(GumboNode* node) {
28 |   if (node->type != GUMBO_NODE_ELEMENT) {
29 |     return;  // Everything below reads node->v.element, so skip other node types.
30 |   }
31 |   GumboAttribute* href;
32 |   if (node->v.element.tag == GUMBO_TAG_A &&
33 |       (href = gumbo_get_attribute(&node->v.element.attributes, "href"))) {
34 |     std::cout << href->value << std::endl;
35 |   }
36 |   // The vector's entries are untyped pointers; cast each back to GumboNode* to recurse.
37 |   GumboVector* children = &node->v.element.children;
38 |   for (unsigned int i = 0; i < children->length; ++i) {
39 |     search_for_links(static_cast<GumboNode*>(children->data[i]));
40 |   }
41 | }
42 | // Reads the file named by argv[1] into memory, parses it and prints its links.
43 | int main(int argc, char** argv) {
44 |   if (argc != 2) {
45 |     std::cout << "Usage: find_links <html filename>.\n";
46 |     exit(EXIT_FAILURE);
47 |   }
48 |   const char* filename = argv[1];
49 |
50 |   std::ifstream in(filename, std::ios::in | std::ios::binary);
51 |   if (!in) {
52 |     std::cout << "File " << filename << " not found!\n";
53 |     exit(EXIT_FAILURE);
54 |   }
55 |   // Size the buffer from the end-of-file offset, then read the file in one go.
56 |   std::string contents;
57 |   in.seekg(0, std::ios::end);
58 |   contents.resize(in.tellg());
59 |   in.seekg(0, std::ios::beg);
60 |   in.read(&contents[0], contents.size());
61 |   in.close();
62 |
63 |   GumboOutput* output = gumbo_parse(contents.c_str());
64 |   search_for_links(output->root);
65 |   gumbo_destroy_output(&kGumboDefaultOptions, output);
66 | }
67 |
--------------------------------------------------------------------------------
/.clang-format:
--------------------------------------------------------------------------------
1 | ---
2 | Language: Cpp
3 | # BasedOnStyle: Google
4 | AccessModifierOffset: -1
5 | AlignAfterOpenBracket: false
6 | AlignEscapedNewlinesLeft: true
7 | AlignOperands: true
8 | AlignTrailingComments: true
9 | AllowAllParametersOfDeclarationOnNextLine: true
10 | AllowShortBlocksOnASingleLine: false
11 | AllowShortCaseLabelsOnASingleLine: false
12 | AllowShortIfStatementsOnASingleLine: true
13 | AllowShortLoopsOnASingleLine: true
14 | AllowShortFunctionsOnASingleLine: All
15 | AlwaysBreakAfterDefinitionReturnType: false
16 | AlwaysBreakTemplateDeclarations: true
17 | AlwaysBreakBeforeMultilineStrings: true
18 | BreakBeforeBinaryOperators: None
19 | BreakBeforeTernaryOperators: true
20 | BreakConstructorInitializersBeforeComma: false
21 | BinPackParameters: true
22 | BinPackArguments: true
23 | ColumnLimit: 80
24 | ConstructorInitializerAllOnOneLineOrOnePerLine: true
25 | ConstructorInitializerIndentWidth: 4
26 | DerivePointerAlignment: true
27 | ExperimentalAutoDetectBinPacking: false
28 | IndentCaseLabels: true
29 | IndentWrappedFunctionNames: false
30 | IndentFunctionDeclarationAfterType: false
31 | MaxEmptyLinesToKeep: 1
32 | KeepEmptyLinesAtTheStartOfBlocks: false
33 | NamespaceIndentation: None
34 | ObjCBlockIndentWidth: 2
35 | ObjCSpaceAfterProperty: false
36 | ObjCSpaceBeforeProtocolList: false
37 | PenaltyBreakBeforeFirstCallParameter: 1
38 | PenaltyBreakComment: 300
39 | PenaltyBreakString: 1000
40 | PenaltyBreakFirstLessLess: 120
41 | PenaltyExcessCharacter: 1000000
42 | PenaltyReturnTypeOnItsOwnLine: 200
43 | PointerAlignment: Left
44 | SpacesBeforeTrailingComments: 2
45 | Cpp11BracedListStyle: true
46 | Standard: Auto
47 | IndentWidth: 2
48 | TabWidth: 8
49 | UseTab: Never
50 | BreakBeforeBraces: Attach
51 | SpacesInParentheses: false
52 | SpacesInSquareBrackets: false
53 | SpacesInAngles: false
54 | SpaceInEmptyParentheses: false
55 | SpacesInCStyleCastParentheses: false
56 | SpaceAfterCStyleCast: true
57 | SpacesInContainerLiterals: true
58 | SpaceBeforeAssignmentOperators: true
59 | ContinuationIndentWidth: 4
60 | CommentPragmas: '^ IWYU pragma:'
61 | ForEachMacros: [ foreach, Q_FOREACH, BOOST_FOREACH ]
62 | SpaceBeforeParens: ControlStatements
63 | DisableFormat: false
64 | ...
65 |
66 |
--------------------------------------------------------------------------------
/src/tag_strings.h:
--------------------------------------------------------------------------------
1 | // Generated via `gentags.py src/tag.in`.
2 | // Do not edit; edit src/tag.in instead.
3 | // clang-format off
4 | "html",
5 | "head",
6 | "title",
7 | "base",
8 | "link",
9 | "meta",
10 | "style",
11 | "script",
12 | "noscript",
13 | "template",
14 | "body",
15 | "article",
16 | "section",
17 | "nav",
18 | "aside",
19 | "h1",
20 | "h2",
21 | "h3",
22 | "h4",
23 | "h5",
24 | "h6",
25 | "hgroup",
26 | "header",
27 | "footer",
28 | "address",
29 | "p",
30 | "hr",
31 | "pre",
32 | "blockquote",
33 | "ol",
34 | "ul",
35 | "li",
36 | "dl",
37 | "dt",
38 | "dd",
39 | "figure",
40 | "figcaption",
41 | "main",
42 | "div",
43 | "a",
44 | "em",
45 | "strong",
46 | "small",
47 | "s",
48 | "cite",
49 | "q",
50 | "dfn",
51 | "abbr",
52 | "data",
53 | "time",
54 | "code",
55 | "var",
56 | "samp",
57 | "kbd",
58 | "sub",
59 | "sup",
60 | "i",
61 | "b",
62 | "u",
63 | "mark",
64 | "ruby",
65 | "rt",
66 | "rp",
67 | "bdi",
68 | "bdo",
69 | "span",
70 | "br",
71 | "wbr",
72 | "ins",
73 | "del",
74 | "image",
75 | "img",
76 | "iframe",
77 | "embed",
78 | "object",
79 | "param",
80 | "video",
81 | "audio",
82 | "source",
83 | "track",
84 | "canvas",
85 | "map",
86 | "area",
87 | "math",
88 | "mi",
89 | "mo",
90 | "mn",
91 | "ms",
92 | "mtext",
93 | "mglyph",
94 | "malignmark",
95 | "annotation-xml",
96 | "svg",
97 | "foreignobject",
98 | "desc",
99 | "table",
100 | "caption",
101 | "colgroup",
102 | "col",
103 | "tbody",
104 | "thead",
105 | "tfoot",
106 | "tr",
107 | "td",
108 | "th",
109 | "form",
110 | "fieldset",
111 | "legend",
112 | "label",
113 | "input",
114 | "button",
115 | "select",
116 | "datalist",
117 | "optgroup",
118 | "option",
119 | "textarea",
120 | "keygen",
121 | "output",
122 | "progress",
123 | "meter",
124 | "details",
125 | "summary",
126 | "menu",
127 | "menuitem",
128 | "applet",
129 | "acronym",
130 | "bgsound",
131 | "dir",
132 | "frame",
133 | "frameset",
134 | "noframes",
135 | "isindex",
136 | "listing",
137 | "xmp",
138 | "nextid",
139 | "noembed",
140 | "plaintext",
141 | "rb",
142 | "strike",
143 | "basefont",
144 | "big",
145 | "blink",
146 | "center",
147 | "font",
148 | "marquee",
149 | "multicol",
150 | "nobr",
151 | "spacer",
152 | "tt",
153 | "rtc",
154 |
--------------------------------------------------------------------------------
/src/util.h:
--------------------------------------------------------------------------------
1 | // Copyright 2010 Google Inc. All Rights Reserved.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 | //
15 | // Author: jdtang@google.com (Jonathan Tang)
16 | //
17 | // This contains some utility functions that didn't fit into any of the other
18 | // headers.
19 |
20 | #ifndef GUMBO_UTIL_H_
21 | #define GUMBO_UTIL_H_
22 | #ifdef _MSC_VER
23 | #define _CRT_SECURE_NO_WARNINGS
24 | #endif
25 | #include <stdbool.h>
26 | #include <stddef.h>
27 |
28 | #ifdef __cplusplus
29 | extern "C" {
30 | #endif
31 |
32 | // Forward declaration since it's passed into some of the functions in this
33 | // header.
34 | struct GumboInternalParser;
35 |
36 | // Utility function for allocating & copying a null-terminated string into a
37 | // freshly-allocated buffer. This is necessary for proper memory management; we
38 | // have the convention that all const char* in parse tree structures are
39 | // freshly-allocated, so if we didn't copy, we'd try to delete a literal string
40 | // when the parse tree is destroyed.
41 | char* gumbo_copy_stringz(struct GumboInternalParser* parser, const char* str);
42 |
43 | // Allocate a chunk of memory, using the allocator specified in the Parser's
44 | // config options.
45 | void* gumbo_parser_allocate(
46 | struct GumboInternalParser* parser, size_t num_bytes);
47 |
48 | // Deallocate a chunk of memory, using the deallocator specified in the Parser's
49 | // config options.
50 | void gumbo_parser_deallocate(struct GumboInternalParser* parser, void* ptr);
51 |
52 | // Debug wrapper for printf, to make it easier to turn off debugging info when
53 | // required.
54 | void gumbo_debug(const char* format, ...);
55 |
56 | #ifdef __cplusplus
57 | }
58 | #endif
59 |
60 | #endif // GUMBO_UTIL_H_
61 |
--------------------------------------------------------------------------------
/src/parser.h:
--------------------------------------------------------------------------------
1 | // Copyright 2010 Google Inc. All Rights Reserved.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 | //
15 | // Author: jdtang@google.com (Jonathan Tang)
16 | //
17 | // Contains the definition of the top-level GumboParser structure that's
18 | // threaded through basically every internal function in the library.
19 |
20 | #ifndef GUMBO_PARSER_H_
21 | #define GUMBO_PARSER_H_
22 |
23 | #ifdef __cplusplus
24 | extern "C" {
25 | #endif
26 |
27 | struct GumboInternalParserState;
28 | struct GumboInternalOutput;
29 | struct GumboInternalOptions;
30 | struct GumboInternalTokenizerState;
31 |
32 | // An overarching struct that's threaded through (nearly) all functions in the
33 | // library, OOP-style. This gives each function access to the options and
34 | // output, along with any internal state needed for the parse.
35 | typedef struct GumboInternalParser {
36 | // Settings for this parse run.
37 | const struct GumboInternalOptions* _options;
38 |
39 | // Output for the parse.
40 | struct GumboInternalOutput* _output;
41 |
42 | // The internal tokenizer state, defined as a pointer to avoid a cyclic
43 | // dependency on tokenizer.h. The main parse routine is responsible for
44 | // initializing this on parse start, and destroying it on parse end.
45 | // End-users will never see a non-garbage value in this pointer.
46 | struct GumboInternalTokenizerState* _tokenizer_state;
47 |
48 | // The internal parser state. Initialized on parse start and destroyed on
49 | // parse end; end-users will never see a non-garbage value in this pointer.
50 | struct GumboInternalParserState* _parser_state;
51 | } GumboParser;
52 |
53 | #ifdef __cplusplus
54 | }
55 | #endif
56 |
57 | #endif // GUMBO_PARSER_H_
58 |
--------------------------------------------------------------------------------
/gumbo_parser.gyp:
--------------------------------------------------------------------------------
1 | # Copyright 2013 Google Inc. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | {
16 | 'targets': [
17 | {
18 | 'target_name': 'gumbo_parser',
19 | 'type': 'static_library',
20 | 'cflags': ['-std=c99', '-Wall'],
21 | 'sources': [
22 | 'src/attribute.c',
23 | 'src/attribute.h',
24 | 'src/char_ref.c',
25 | 'src/char_ref.h',
26 | 'src/error.c',
27 | 'src/error.h',
28 | 'src/gumbo.h',
29 | 'src/insertion_mode.h',
30 | 'src/parser.c',
31 | 'src/parser.h',
32 | 'src/string_buffer.c',
33 | 'src/string_buffer.h',
34 | 'src/string_piece.c',
35 | 'src/string_piece.h',
36 | 'src/tag.c',
37 | 'src/token_type.h',
38 | 'src/tokenizer.c',
39 | 'src/tokenizer.h',
40 | 'src/tokenizer_states.h',
41 | 'src/utf8.c',
42 | 'src/utf8.h',
43 | 'src/util.c',
44 | 'src/util.h',
45 | 'src/vector.c',
46 | 'src/vector.h',
47 | ],
48 | },
49 | {
50 | 'target_name': 'gumbo_parser_unittests',
51 | 'type': 'executable',
52 | 'dependencies': [
53 | 'gtest.gyp:gtest',
54 | 'gtest.gyp:gtest_main',
55 | 'gumbo_parser',
56 | ],
57 | 'include_dirs': [
58 | '.',
59 | '..',
60 | 'src',
61 | ],
62 | 'sources': [
63 | 'tests/attribute.cc',
64 | 'tests/char_ref.cc',
65 | 'tests/parser.cc',
66 | 'tests/string_buffer.cc',
67 | 'tests/string_piece.cc',
68 | 'tests/test_utils.cc',
69 | 'tests/test_utils.h',
70 | 'tests/tokenizer.cc',
71 | 'tests/utf8.cc',
72 | 'tests/vector.cc',
73 | ],
74 | },
75 | ],
76 | }
77 |
--------------------------------------------------------------------------------
/python/gumbo/soup_adapter_test.py:
--------------------------------------------------------------------------------
1 | # Copyright 2012 Google Inc. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | #
15 | """Tests for the Gumbo's BeautifulSoup Python adapter."""
16 |
17 | __author__ = 'jdtang@google.com (Jonathan Tang)'
18 |
19 | import unittest
20 |
21 | import soup_adapter
22 |
23 |
24 | class SoupAdapterTest(unittest.TestCase):
25 |
26 | def testSimpleParse(self):
27 | soup = soup_adapter.parse(
28 | """
29 |
35 | """)
36 |
37 | head = soup.head
38 | self.assertEquals(soup, head.parent.parent)
39 | self.assertEquals(u'head', head.name)
40 | self.assertEquals(0, len(head))
41 |
42 | body = soup.body
43 | self.assertEquals(head, body.previousSibling)
44 | self.assertEquals(2, len(body)) # + trailing whitespace
45 | self.assertEquals(u'ul', body.contents[0].name)
46 | self.assertEquals(body, head.next)
47 | self.assertEquals(head, body.previous)
48 |
49 | list_items = body.findAll('li')
50 | self.assertEquals(4, len(list_items))
51 |
52 | evens = body('li', 'even')
53 | self.assertEquals(2, len(evens))
54 |
55 | a2 = body.find('a', href='two.html')
56 | self.assertEquals(u'a', a2.name)
57 | self.assertEquals(u'Two', a2.contents[0])
58 | self.assertEquals(a2, evens[0].next)
59 | self.assertEquals(evens[0], a2.previous)
60 |
61 | li2 = a2.parent
62 | self.assertEquals(u'li', li2.name)
63 | self.assertEquals(u'even', li2['class'])
64 | self.assertEquals(list_items[1], li2)
65 | self.assertEquals(evens[0], li2)
66 |
67 | if __name__ == '__main__':
68 | unittest.main()
69 |
--------------------------------------------------------------------------------
/examples/clean_text.cc:
--------------------------------------------------------------------------------
1 | // Copyright 2013 Google Inc. All Rights Reserved.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 | //
15 | // Author: jdtang@google.com (Jonathan Tang)
16 | //
17 | // Gets the cleantext of a page.
18 |
19 | #include <stdlib.h>
20 | #include <fstream>
21 | #include <iostream>
22 | #include <string>
23 |
24 | #include "gumbo.h"
25 |
26 | // Collects the text beneath node, skipping <script> and <style> subtrees.
27 | static std::string cleantext(GumboNode* node) {
28 |   if (node->type == GUMBO_NODE_TEXT) return std::string(node->v.text.text);
29 |   // Anything that is not an element, plus script/style elements, adds nothing.
30 |   if (node->type != GUMBO_NODE_ELEMENT ||
31 |       node->v.element.tag == GUMBO_TAG_SCRIPT ||
32 |       node->v.element.tag == GUMBO_TAG_STYLE) {
33 |     return "";
34 |   }
35 |   // Join the text of the children in the order they are stored.
36 |   std::string joined = "";
37 |   GumboVector* kids = &node->v.element.children;
38 |   for (unsigned int idx = 0; idx < kids->length; ++idx) {
39 |     const std::string piece = cleantext((GumboNode*) kids->data[idx]);
40 |     // A space precedes every non-empty piece except that of the first child.
41 |     if (idx != 0 && !piece.empty()) joined.append(" ");
42 |     joined.append(piece);
43 |   }
44 |   return joined;
45 | }
46 |
47 | int main(int argc, char** argv) {  // Expects one argument: the HTML file.
48 |   if (argc != 2) {
49 |     std::cout << "Usage: clean_text <html filename>\n";
50 |     exit(EXIT_FAILURE);
51 |   }
52 |   const char* filename = argv[1];
53 |   // Open in binary mode; a stream that failed to open is reported as missing.
54 |   std::ifstream in(filename, std::ios::in | std::ios::binary);
55 |   if (!in) {
56 |     std::cout << "File " << filename << " not found!\n";
57 |     exit(EXIT_FAILURE);
58 |   }
59 |   // Size the buffer from the end-of-file offset, then read the file in one go.
60 |   std::string contents;
61 |   in.seekg(0, std::ios::end);
62 |   contents.resize(in.tellg());
63 |   in.seekg(0, std::ios::beg);
64 |   in.read(&contents[0], contents.size());
65 |   in.close();
66 |   // Parse, print the text gathered from the root, then destroy the output.
67 |   GumboOutput* output = gumbo_parse(contents.c_str());
68 |   std::cout << cleantext(output->root) << std::endl;
69 |   gumbo_destroy_output(&kGumboDefaultOptions, output);
70 | }
71 |
--------------------------------------------------------------------------------
/src/char_ref.h:
--------------------------------------------------------------------------------
1 | // Copyright 2011 Google Inc. All Rights Reserved.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 | //
15 | // Author: jdtang@google.com (Jonathan Tang)
16 | //
17 | // Internal header for character reference handling; this should not be exposed
18 | // transitively by any public API header. This is why the functions aren't
19 | // namespaced.
20 |
21 | #ifndef GUMBO_CHAR_REF_H_
22 | #define GUMBO_CHAR_REF_H_
23 |
24 | #include <stdbool.h>
25 |
26 | #ifdef __cplusplus
27 | extern "C" {
28 | #endif
29 |
30 | struct GumboInternalParser;
31 | struct GumboInternalUtf8Iterator;
32 |
33 | // Value that indicates no character was produced.
34 | extern const int kGumboNoChar;
35 |
36 | // Certain named character references generate two codepoints, not one, and so
37 | // the consume_char_ref subroutine needs to return this instead of an int. The
38 | // first field will be kGumboNoChar if no character reference was found; the
39 | // second field will be kGumboNoChar if that is the case or if the character
40 | // reference returns only a single codepoint.
41 | typedef struct {
42 | int first;  // kGumboNoChar when no character reference was found.
43 | int second;  // kGumboNoChar unless the reference yields two codepoints.
44 | } OneOrTwoCodepoints;
45 |
46 | // Implements the "consume a character reference" section of the spec.
47 | // This reads in characters from the input as necessary, and fills in a
48 | // OneOrTwoCodepoints struct containing the characters read. It may add parse
49 | // errors to the GumboParser's errors vector, if the spec calls for it. Pass a
50 | // space for the "additional allowed char" when the spec says "with no
51 | // additional allowed char". Returns false on parse error, true otherwise.
52 | bool consume_char_ref(struct GumboInternalParser* parser,
53 | struct GumboInternalUtf8Iterator* input, int additional_allowed_char,
54 | bool is_in_attribute, OneOrTwoCodepoints* output);
55 |
56 | #ifdef __cplusplus
57 | }
58 | #endif
59 |
60 | #endif // GUMBO_CHAR_REF_H_
61 |
--------------------------------------------------------------------------------
/benchmarks/benchmark.cc:
--------------------------------------------------------------------------------
1 | // Copyright 2013 Google Inc. All Rights Reserved.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 | //
15 | // Author: jdtang@google.com (Jonathan Tang)
16 | //
17 |
18 | #include <dirent.h>
19 | #include <stdio.h>
20 | #include <stdlib.h>
21 | #include <time.h>
22 | #include <fstream>
23 | #include <iostream>
24 | #include <string>
25 |
26 | #include "gumbo.h"
27 |
28 | static const int kNumReps = 10;
29 |
30 | int main(int argc, char** argv) {  // Times gumbo_parse on benchmarks/*.html.
31 |   if (argc != 1) {
32 |     std::cout << "Usage: benchmarks\n";
33 |     exit(EXIT_FAILURE);
34 |   }
35 |
36 |   DIR* dir;
37 |   struct dirent* file;
38 |
39 |   if ((dir = opendir("benchmarks")) == NULL) {
40 |     std::cout << "Couldn't find 'benchmarks' directory. "
41 |               << "Run from root of distribution.\n";
42 |     exit(EXIT_FAILURE);
43 |   }
44 |   // Benchmark each directory entry whose name ends in ".html".
45 |   while ((file = readdir(dir)) != NULL) {
46 |     std::string filename(file->d_name);
47 |     if (filename.length() > 5 && filename.compare(filename.length() - 5, 5, ".html") == 0) {
48 |       std::string full_filename = "benchmarks/" + filename;
49 |       std::ifstream in(full_filename.c_str(), std::ios::in | std::ios::binary);
50 |       if (!in) {
51 |         std::cout << "File " << full_filename << " couldn't be read!\n";
52 |         exit(EXIT_FAILURE);
53 |       }
54 |
55 |       std::string contents;
56 |       in.seekg(0, std::ios::end);
57 |       contents.resize(in.tellg());
58 |       in.seekg(0, std::ios::beg);
59 |       in.read(&contents[0], contents.size());
60 |       in.close();
61 |       // Time kNumReps parse+destroy cycles; double math avoids clock_t overflow.
62 |       clock_t start_time = clock();
63 |       for (int i = 0; i < kNumReps; ++i) {
64 |         GumboOutput* output = gumbo_parse(contents.c_str());
65 |         gumbo_destroy_output(&kGumboDefaultOptions, output);
66 |       }
67 |       clock_t end_time = clock();
68 |       const double total_us = 1e6 * (end_time - start_time) / CLOCKS_PER_SEC;
69 |       std::cout << filename << ": " << static_cast<long>(total_us / kNumReps)
70 |                 << " microseconds.\n";
71 |     }
72 |   }
73 |   closedir(dir);
74 | }
75 |
--------------------------------------------------------------------------------
/python/gumbo/gumboc_tags.py:
--------------------------------------------------------------------------------
1 | # Generated via `gentags.py src/tag.in`.
2 | # Do not edit; edit src/tag.in instead.
3 | # clang-format off
4 | TagNames = [
5 | "HTML",
6 | "HEAD",
7 | "TITLE",
8 | "BASE",
9 | "LINK",
10 | "META",
11 | "STYLE",
12 | "SCRIPT",
13 | "NOSCRIPT",
14 | "TEMPLATE",
15 | "BODY",
16 | "ARTICLE",
17 | "SECTION",
18 | "NAV",
19 | "ASIDE",
20 | "H1",
21 | "H2",
22 | "H3",
23 | "H4",
24 | "H5",
25 | "H6",
26 | "HGROUP",
27 | "HEADER",
28 | "FOOTER",
29 | "ADDRESS",
30 | "P",
31 | "HR",
32 | "PRE",
33 | "BLOCKQUOTE",
34 | "OL",
35 | "UL",
36 | "LI",
37 | "DL",
38 | "DT",
39 | "DD",
40 | "FIGURE",
41 | "FIGCAPTION",
42 | "MAIN",
43 | "DIV",
44 | "A",
45 | "EM",
46 | "STRONG",
47 | "SMALL",
48 | "S",
49 | "CITE",
50 | "Q",
51 | "DFN",
52 | "ABBR",
53 | "DATA",
54 | "TIME",
55 | "CODE",
56 | "VAR",
57 | "SAMP",
58 | "KBD",
59 | "SUB",
60 | "SUP",
61 | "I",
62 | "B",
63 | "U",
64 | "MARK",
65 | "RUBY",
66 | "RT",
67 | "RP",
68 | "BDI",
69 | "BDO",
70 | "SPAN",
71 | "BR",
72 | "WBR",
73 | "INS",
74 | "DEL",
75 | "IMAGE",
76 | "IMG",
77 | "IFRAME",
78 | "EMBED",
79 | "OBJECT",
80 | "PARAM",
81 | "VIDEO",
82 | "AUDIO",
83 | "SOURCE",
84 | "TRACK",
85 | "CANVAS",
86 | "MAP",
87 | "AREA",
88 | "MATH",
89 | "MI",
90 | "MO",
91 | "MN",
92 | "MS",
93 | "MTEXT",
94 | "MGLYPH",
95 | "MALIGNMARK",
96 | "ANNOTATION_XML",
97 | "SVG",
98 | "FOREIGNOBJECT",
99 | "DESC",
100 | "TABLE",
101 | "CAPTION",
102 | "COLGROUP",
103 | "COL",
104 | "TBODY",
105 | "THEAD",
106 | "TFOOT",
107 | "TR",
108 | "TD",
109 | "TH",
110 | "FORM",
111 | "FIELDSET",
112 | "LEGEND",
113 | "LABEL",
114 | "INPUT",
115 | "BUTTON",
116 | "SELECT",
117 | "DATALIST",
118 | "OPTGROUP",
119 | "OPTION",
120 | "TEXTAREA",
121 | "KEYGEN",
122 | "OUTPUT",
123 | "PROGRESS",
124 | "METER",
125 | "DETAILS",
126 | "SUMMARY",
127 | "MENU",
128 | "MENUITEM",
129 | "APPLET",
130 | "ACRONYM",
131 | "BGSOUND",
132 | "DIR",
133 | "FRAME",
134 | "FRAMESET",
135 | "NOFRAMES",
136 | "ISINDEX",
137 | "LISTING",
138 | "XMP",
139 | "NEXTID",
140 | "NOEMBED",
141 | "PLAINTEXT",
142 | "RB",
143 | "STRIKE",
144 | "BASEFONT",
145 | "BIG",
146 | "BLINK",
147 | "CENTER",
148 | "FONT",
149 | "MARQUEE",
150 | "MULTICOL",
151 | "NOBR",
152 | "SPACER",
153 | "TT",
154 | "RTC",
155 | ]
156 |
--------------------------------------------------------------------------------
/src/vector.h:
--------------------------------------------------------------------------------
1 | // Copyright 2010 Google Inc. All Rights Reserved.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 | //
15 | // Author: jdtang@google.com (Jonathan Tang)
16 |
17 | #ifndef GUMBO_VECTOR_H_
18 | #define GUMBO_VECTOR_H_
19 |
20 | #include "gumbo.h"
21 |
22 | #ifdef __cplusplus
23 | extern "C" {
24 | #endif
25 |
26 | // Forward declaration since it's passed into some of the functions in this
27 | // header.
28 | struct GumboInternalParser;
29 |
30 | // Initializes a new GumboVector with the specified initial capacity.
31 | void gumbo_vector_init(struct GumboInternalParser* parser,
32 | size_t initial_capacity, GumboVector* vector);
33 |
34 | // Frees the memory used by a GumboVector. Does not free the contained
35 | // pointers.
36 | void gumbo_vector_destroy(
37 | struct GumboInternalParser* parser, GumboVector* vector);
38 |
39 | // Adds a new element to a GumboVector.
40 | void gumbo_vector_add(
41 | struct GumboInternalParser* parser, void* element, GumboVector* vector);
42 |
43 | // Removes and returns the element most recently added to the GumboVector.
44 | // Ownership is transferred to caller. Capacity is unchanged. If the vector is
45 | // empty, NULL is returned.
46 | void* gumbo_vector_pop(struct GumboInternalParser* parser, GumboVector* vector);
47 |
48 | // Inserts an element at a specific index. This is potentially O(N) time, but
49 | // is necessary for some of the spec's behavior.
50 | void gumbo_vector_insert_at(struct GumboInternalParser* parser, void* element,
51 | unsigned int index, GumboVector* vector);
52 |
53 | // Removes an element from the vector, or does nothing if the element is not in
54 | // the vector.
55 | void gumbo_vector_remove(
56 | struct GumboInternalParser* parser, void* element, GumboVector* vector);
57 |
58 | // Removes and returns an element at a specific index. Note that this is
59 | // potentially O(N) time and should be used sparingly.
60 | void* gumbo_vector_remove_at(struct GumboInternalParser* parser,
61 | unsigned int index, GumboVector* vector);
62 |
63 | #ifdef __cplusplus
64 | }
65 | #endif
66 |
67 | #endif // GUMBO_VECTOR_H_
68 |
--------------------------------------------------------------------------------
/CHANGES.md:
--------------------------------------------------------------------------------
1 | ## Gumbo 0.10.1 (2015-04-30)
2 |
3 | Same as 0.10.0, but with the version number bumped because the last version-number commit to v0.9.4 makes GitHub think that v0.9.4 is the latest version and so it's not highlighted on the webpage.
4 |
5 | ## Gumbo 0.10.0 (2015-04-30)
6 |
7 | * Full support for `<template>` tag (kevinhendricks, nostrademons).
8 | * Some fixes for `