├── .clang-format ├── .gitignore ├── .gitmodules ├── .travis.yml ├── CHANGES.md ├── CONTRIBUTING.md ├── COPYING ├── DEBUGGING.md ├── Doxyfile ├── Makefile.am ├── README.md ├── THANKS ├── appveyor.yml ├── autogen.sh ├── benchmarks ├── arabic_newspapers.html ├── baidu.html ├── bbc.html ├── benchmark.cc ├── google.html ├── hacker_news.html ├── html5_spec.html ├── wikipedia.html ├── xinhua.html └── yahoo.html ├── configure.ac ├── examples ├── clean_text.cc ├── find_links.cc ├── get_title.c ├── positions_of_class.cc ├── prettyprint.cc └── serialize.cc ├── genperf.py ├── gentags.py ├── gtest.gyp ├── gumbo.pc.in ├── gumbo_parser.gyp ├── original-README.md ├── python └── gumbo │ ├── __init__.py │ ├── gumboc.py │ ├── gumboc_tags.py │ ├── gumboc_test.py │ ├── html5lib_adapter.py │ ├── html5lib_adapter_test.py │ ├── soup_adapter.py │ └── soup_adapter_test.py ├── setup.py ├── src ├── attribute.c ├── attribute.h ├── char_ref.c ├── char_ref.h ├── char_ref.rl ├── error.c ├── error.h ├── gumbo.h ├── insertion_mode.h ├── parser.c ├── parser.h ├── string_buffer.c ├── string_buffer.h ├── string_piece.c ├── string_piece.h ├── tag.c ├── tag.in ├── tag_enum.h ├── tag_gperf.h ├── tag_sizes.h ├── tag_strings.h ├── token_type.h ├── tokenizer.c ├── tokenizer.h ├── tokenizer_states.h ├── utf8.c ├── utf8.h ├── util.c ├── util.h ├── vector.c └── vector.h ├── tests ├── attribute.cc ├── char_ref.cc ├── parser.cc ├── string_buffer.cc ├── string_piece.cc ├── test_utils.cc ├── test_utils.h ├── tokenizer.cc ├── utf8.cc └── vector.cc └── visualc ├── gumbo.vcxproj ├── gumbo.vcxproj.filters ├── gumbo.vcxproj.user └── include └── strings.h /.clang-format: -------------------------------------------------------------------------------- 1 | --- 2 | Language: Cpp 3 | # BasedOnStyle: Google 4 | AccessModifierOffset: -1 5 | AlignAfterOpenBracket: false 6 | AlignEscapedNewlinesLeft: true 7 | AlignOperands: true 8 | AlignTrailingComments: true 9 | AllowAllParametersOfDeclarationOnNextLine: true 10 | 
AllowShortBlocksOnASingleLine: false 11 | AllowShortCaseLabelsOnASingleLine: false 12 | AllowShortIfStatementsOnASingleLine: true 13 | AllowShortLoopsOnASingleLine: true 14 | AllowShortFunctionsOnASingleLine: All 15 | AlwaysBreakAfterDefinitionReturnType: false 16 | AlwaysBreakTemplateDeclarations: true 17 | AlwaysBreakBeforeMultilineStrings: true 18 | BreakBeforeBinaryOperators: None 19 | BreakBeforeTernaryOperators: true 20 | BreakConstructorInitializersBeforeComma: false 21 | BinPackParameters: true 22 | BinPackArguments: true 23 | ColumnLimit: 80 24 | ConstructorInitializerAllOnOneLineOrOnePerLine: true 25 | ConstructorInitializerIndentWidth: 4 26 | DerivePointerAlignment: true 27 | ExperimentalAutoDetectBinPacking: false 28 | IndentCaseLabels: true 29 | IndentWrappedFunctionNames: false 30 | IndentFunctionDeclarationAfterType: false 31 | MaxEmptyLinesToKeep: 1 32 | KeepEmptyLinesAtTheStartOfBlocks: false 33 | NamespaceIndentation: None 34 | ObjCBlockIndentWidth: 2 35 | ObjCSpaceAfterProperty: false 36 | ObjCSpaceBeforeProtocolList: false 37 | PenaltyBreakBeforeFirstCallParameter: 1 38 | PenaltyBreakComment: 300 39 | PenaltyBreakString: 1000 40 | PenaltyBreakFirstLessLess: 120 41 | PenaltyExcessCharacter: 1000000 42 | PenaltyReturnTypeOnItsOwnLine: 200 43 | PointerAlignment: Left 44 | SpacesBeforeTrailingComments: 2 45 | Cpp11BracedListStyle: true 46 | Standard: Auto 47 | IndentWidth: 2 48 | TabWidth: 8 49 | UseTab: Never 50 | BreakBeforeBraces: Attach 51 | SpacesInParentheses: false 52 | SpacesInSquareBrackets: false 53 | SpacesInAngles: false 54 | SpaceInEmptyParentheses: false 55 | SpacesInCStyleCastParentheses: false 56 | SpaceAfterCStyleCast: true 57 | SpacesInContainerLiterals: true 58 | SpaceBeforeAssignmentOperators: true 59 | ContinuationIndentWidth: 4 60 | CommentPragmas: '^ IWYU pragma:' 61 | ForEachMacros: [ foreach, Q_FOREACH, BOOST_FOREACH ] 62 | SpaceBeforeParens: ControlStatements 63 | DisableFormat: false 64 | ... 
65 | 66 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Compilation artifacts 2 | *.o 3 | *.lo 4 | *.la 5 | 6 | # Editor swap files 7 | *.swp 8 | *.swo 9 | *.swn 10 | 11 | #emacs editor leftovers 12 | *.*~ 13 | 14 | #diff leftovers 15 | *.orig 16 | 17 | # gtest pieces 18 | gtest 19 | gtest-1.7.0 20 | 21 | # Other build artifacts 22 | /Debug 23 | /visualc/Debug 24 | /visualc/Release 25 | /visualc/gumbo.sdf 26 | /visualc/gumbo.opensdf 27 | /build 28 | .log 29 | .sdf 30 | .opensdf 31 | .deps 32 | .dirstamp 33 | .libs 34 | Makefile 35 | Makefile.in 36 | aclocal.m4 37 | autom4te.cache 38 | compile 39 | config.guess 40 | config.log 41 | config.status 42 | config.sub 43 | configure 44 | depcomp 45 | gumbo.pc 46 | gumbo_test 47 | gumbo_test.log 48 | gumbo_test.trs 49 | install-sh 50 | libtool 51 | ltmain.sh 52 | m4/ 53 | missing 54 | test-driver 55 | test-suite.log 56 | 57 | # gyp android artifacts 58 | gumbo_parser.target.mk 59 | 60 | # `make dist` artifacts 61 | /gumbo-[0-9].[0-9].tar.gz 62 | /gumbo-[0-9].[0-9]/ 63 | 64 | # Python dist artifacts 65 | *.pyc 66 | *.dylib 67 | dist 68 | build 69 | python/gumbo.egg-info 70 | python/gumbo/libgumbo.so 71 | 72 | # Example binaries 73 | benchmark 74 | clean_text 75 | find_links 76 | get_title 77 | positions_of_class 78 | prettyprint 79 | serialize 80 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "third_party/gtest"] 2 | path = third_party/gtest 3 | url = https://chromium.googlesource.com/external/googletest/ 4 | [submodule "testdata"] 5 | path = testdata 6 | url = https://github.com/html5lib/html5lib-tests.git 7 | -------------------------------------------------------------------------------- /.travis.yml: 
-------------------------------------------------------------------------------- 1 | language: c++ 2 | 3 | compiler: 4 | - gcc 5 | - clang 6 | 7 | os: 8 | - linux 9 | - osx 10 | 11 | install: 12 | - wget 'https://googletest.googlecode.com/files/gtest-1.7.0.zip' 13 | - unzip gtest-1.7.0.zip 14 | - ln -s gtest-1.7.0 gtest 15 | - sudo pip install BeautifulSoup 16 | - sudo pip install html5lib==0.95 17 | 18 | script: 19 | - ./autogen.sh && ./configure && make && make check 20 | - python python/gumbo/gumboc_test.py 21 | - python python/gumbo/html5lib_adapter_test.py 22 | - python python/gumbo/soup_adapter_test.py 23 | - sudo make install 24 | - g++ examples/clean_text.cc `pkg-config --cflags --libs gumbo` 25 | - sudo python setup.py sdist install 26 | - python -c 'import gumbo; gumbo.parse("Foo")' 27 | -------------------------------------------------------------------------------- /CHANGES.md: -------------------------------------------------------------------------------- 1 | ## Gumbo 0.10.1 (2015-04-30) 2 | 3 | Same as 0.10.0, but with the version number bumped: the last version-number commit to v0.9.4 made GitHub treat v0.9.4 as the latest version, so 0.10.0 was not highlighted on the webpage. 4 | 5 | ## Gumbo 0.10.0 (2015-04-30) 6 | 7 | * Full support for `