├── .dockerignore ├── .gitignore ├── .travis.yml ├── Dockerfile ├── LICENSE ├── MANIFEST ├── MANIFEST.in ├── Makefile ├── README.md ├── gumbocy.cpp ├── gumbocy.pxd ├── gumbocy.pyx ├── re2cy.pxd ├── requirements-benchmark.txt ├── requirements.txt ├── scripts └── git-set-file-times ├── setup.py └── tests ├── benchmark_parsers.py ├── conftest.py ├── test_analyze.py ├── test_hyperlinks.py ├── test_listnodes.py └── test_word_groups.py /.dockerignore: -------------------------------------------------------------------------------- 1 | venv 2 | .git 3 | .cache -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | 27 | # PyInstaller 28 | # Usually these files are written by a python script from a template 29 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
30 | *.manifest 31 | *.spec 32 | 33 | # Installer logs 34 | pip-log.txt 35 | pip-delete-this-directory.txt 36 | 37 | # Unit test / coverage reports 38 | htmlcov/ 39 | .tox/ 40 | .coverage 41 | .coverage.* 42 | .cache 43 | nosetests.xml 44 | coverage.xml 45 | *,cover 46 | .hypothesis/ 47 | 48 | # Translations 49 | *.mo 50 | *.pot 51 | 52 | # Django stuff: 53 | *.log 54 | 55 | # Sphinx documentation 56 | docs/_build/ 57 | 58 | # PyBuilder 59 | target/ 60 | 61 | #Ipython Notebook 62 | .ipynb_checkpoints 63 | 64 | # Gumbocy-specific 65 | gumbocy.c 66 | gumbocy.html 67 | venv/ 68 | *.rst 69 | gumbo-parser 70 | /tests/_benchmark_fixture.html 71 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | sudo: required 2 | 3 | services: 4 | - docker 5 | 6 | env: 7 | - GUMBOCY_PYTHON_VERSION=py27 8 | - GUMBOCY_PYTHON_VERSION=pypy 9 | 10 | before_install: 11 | - docker ps 12 | - docker info 13 | - docker version 14 | - ./scripts/git-set-file-times 15 | - docker pull commonsearch/gumbocy 16 | - make docker_build 17 | 18 | script: 19 | - make docker_test 20 | 21 | notifications: 22 | irc: "chat.freenode.net#commonsearch" 23 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM debian:jessie 2 | 3 | RUN apt-get update && apt-get upgrade -y && apt-get install -y --no-install-recommends \ 4 | curl \ 5 | automake \ 6 | gcc \ 7 | g++ \ 8 | make \ 9 | libtool \ 10 | ca-certificates \ 11 | # python3-pip \ 12 | # python3-dev \ 13 | python-pip \ 14 | python-dev \ 15 | bzip2 16 | 17 | # Upgrade pip 18 | # RUN pip3 install --upgrade --ignore-installed pip 19 | RUN pip install --upgrade --ignore-installed pip 20 | 21 | # Install Gumbo 22 | ENV GUMBO_VERSION 0.10.1 23 | RUN curl -sL 
https://github.com/google/gumbo-parser/archive/v$GUMBO_VERSION.tar.gz > gumbo.tgz && \ 24 | rm -rf gumbo-parser-$GUMBO_VERSION gumbo-parser && \ 25 | tar zxf gumbo.tgz && \ 26 | mv gumbo-parser-$GUMBO_VERSION gumbo-parser && \ 27 | cd gumbo-parser && ./autogen.sh && ./configure && make && \ 28 | make install && ldconfig && cd .. && \ 29 | rm -rf gumbo.tgz gumbo-parser 30 | 31 | 32 | # Optional dependencies for benchmarking 33 | RUN apt-get install -y --no-install-recommends \ 34 | libxml2-dev \ 35 | libxslt1-dev \ 36 | zlib1g-dev 37 | 38 | # RUN ln -s /usr/local/lib/libgumbo.so /usr/lib/python2.7/dist-packages/gumbo/libgumbo.so 39 | 40 | # Install PyPy 41 | RUN curl -L 'https://bitbucket.org/squeaky/portable-pypy/downloads/pypy-5.3.1-linux_x86_64-portable.tar.bz2' -o /pypy.tar.bz2 && \ 42 | mkdir -p /opt/pypy/ && tar jxvf /pypy.tar.bz2 -C /opt/pypy/ --strip-components=1 && \ 43 | rm /pypy.tar.bz2 44 | 45 | RUN /opt/pypy/bin/pypy -m ensurepip 46 | RUN /opt/pypy/bin/pip install --upgrade --ignore-installed pip 47 | 48 | # Install RE2 49 | RUN mkdir -p /tmp/re2 && \ 50 | curl -L 'https://github.com/google/re2/archive/636bc71728b7488c43f9441ecfc80bdb1905b3f0.tar.gz' -o /tmp/re2/re2.tar.gz && \ 51 | cd /tmp/re2 && tar zxvf re2.tar.gz --strip-components=1 && \ 52 | make && make install && \ 53 | rm -rf /tmp/re2 && \ 54 | ldconfig 55 | 56 | # Install Python dependencies 57 | 58 | ADD requirements-benchmark.txt /requirements-benchmark.txt 59 | ADD requirements.txt /requirements.txt 60 | # RUN pip3 install -r requirements.txt 61 | # RUN pip3 install -r requirements-benchmark.txt 62 | RUN pip install -r requirements.txt 63 | RUN pip install -r requirements-benchmark.txt 64 | RUN /opt/pypy/bin/pip install -r /requirements.txt 65 | RUN /opt/pypy/bin/pip install setuptools==18.5 # Because of html5lib 66 | RUN /opt/pypy/bin/pip install -r /requirements-benchmark.txt 67 | 68 | RUN mkdir -p /cosr/gumbocy 69 | 
-------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 
39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. 
Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright 2016 Common Search contributors 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /MANIFEST: -------------------------------------------------------------------------------- 1 | # file GENERATED by distutils, do NOT edit 2 | LICENSE 3 | Makefile 4 | README.md 5 | gumbocy.c 6 | gumbocy.pyx 7 | setup.py 8 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include *.md 2 | include *.c 3 | include *.pyx 4 | include *.pyd 5 | include LICENSE 6 | include Makefile -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | clean: 2 | rm -rf *.so build *.c *.cpp *.html dist .cache tests/__pycache__ *.rst 3 | 4 | cythonize: 5 | cython --cplus -2 --warning-extra --annotate gumbocy.pyx 6 | 7 | build_ext: clean cythonize 8 | ifeq ($(GUMBOCY_PYTHON_VERSION), pypy) 9 | /opt/pypy/bin/pypy setup.py build_ext --inplace -Igumbo-parser/src -Lgumbo-parser/.libs -Rgumbo-parser/.libs 10 | else 11 | 
python setup.py build_ext --inplace -Igumbo-parser/src -Lgumbo-parser/.libs -Rgumbo-parser/.libs 12 | endif 13 | 14 | rst: 15 | pandoc --from=markdown --to=rst --output=README.rst README.md 16 | 17 | virtualenv: 18 | rm -rf venv 19 | virtualenv venv 20 | venv/bin/pip install -r requirements.txt 21 | 22 | test: build_ext 23 | ifeq ($(GUMBOCY_PYTHON_VERSION), pypy) 24 | /opt/pypy/bin/py.test tests/ -vs 25 | else 26 | py.test tests/ -vs 27 | endif 28 | 29 | docker_build: 30 | docker build -t commonsearch/gumbocy . 31 | 32 | docker_shell: 33 | docker run -e GUMBOCY_PYTHON_VERSION -v "$(PWD):/cosr/gumbocy:rw" -w /cosr/gumbocy -i -t commonsearch/gumbocy bash 34 | 35 | docker_test: 36 | docker run -e GUMBOCY_PYTHON_VERSION -v "$(PWD):/cosr/gumbocy:rw" -w /cosr/gumbocy -i -t commonsearch/gumbocy make test 37 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # gumbocy 2 | 3 | [![Build Status](https://travis-ci.org/commonsearch/gumbocy.svg?branch=master)](https://travis-ci.org/commonsearch/gumbocy) [![Apache License 2.0](https://img.shields.io/github/license/commonsearch/gumbocy.svg)](LICENSE) 4 | 5 | **gumbocy** is an alternative Python binding for the excellent [Gumbo](https://github.com/google/gumbo-parser) HTML5 parser, originally written for [Common Search](http://about.commonsearch.org). 6 | 7 | It differs from the [official Python binding](https://github.com/google/gumbo-parser/tree/master/python/gumbo) in a few ways: 8 | 9 | - It is optimized for performance by using [Cython](http://cython.org/). 10 | - It has a smaller feature set and doesn't aim to be a general-purpose binding. 11 | - Its `listnodes()` API just returns nodes as a flat list of tuples. 12 | - Its `analyze()` API traverses the HTML tree and returns high-level data like groups of words and lists of hyperlinks. 13 | - It is generally restrictive. 
For instance, attributes have to be whitelisted. 14 | 15 | ## Installation 16 | 17 | The only dependency is [Gumbo](https://github.com/google/gumbo-parser). You need to install it (possibly with `make gumbo_build`) if you are not using the Docker method below. 18 | 19 | ### From PyPI 20 | 21 | ``` 22 | pip install gumbocy 23 | ``` 24 | 25 | ### From source with Docker 26 | 27 | Clone this repository, then: 28 | 29 | ``` 30 | make docker_build 31 | make docker_shell 32 | ``` 33 | 34 | You will end up in a container with Gumbo and Gumbocy already installed. 35 | 36 | You can then run the tests for Python 2.7 and PyPy: 37 | 38 | ``` 39 | make docker_test 40 | GUMBOCY_PYTHON_VERSION=pypy make docker_test 41 | ``` 42 | 43 | ### From source without Docker 44 | 45 | This is an unsupported method. 46 | 47 | ``` 48 | make virtualenv 49 | source venv/bin/activate 50 | make build_ext 51 | ``` 52 | 53 | ## Running the tests 54 | 55 | ``` 56 | make test 57 | ``` 58 | 59 | ## Quickstart 60 | 61 | ``` 62 | import gumbocy 63 | 64 | parser = gumbocy.HTMLParser(options={}) 65 | parser.parse("""<html><head><title>Hello</title></head><body>world!</body></html>""") 66 | print parser.listnodes() 67 | 68 | => [(0, "html"), (1, "head"), (2, "title"), (3, None, "Hello"), (1, "body"), (2, None, "world!")] 69 | 70 | print parser.analyze() 71 | 72 | => {'word_groups': [('world!', 'body')], 'external_hyperlinks': [], 'internal_hyperlinks': [], 'title': 'Hello'} 73 | 74 | ``` 75 | 76 | For more usage examples, see the [tests](https://github.com/commonsearch/gumbocy/blob/master/tests/). 77 | 78 | ## Options reference 79 | 80 | - **attributes_whitelist**: a set of attributes which, if present, will be returned in a dict as the 3rd element of a node tuple by `listnodes()`. Note that "class" is returned as a frozenset. Defaults to `set()`. 81 | - **nesting_limit**: an integer to specify the maximum nesting level that will be returned. Defaults to `999`. 
82 | - **head_only**: a boolean that will make gumbocy return only the elements in the `<head>` of the document. Useful for parsing only tags such as `<title>` or `<meta>`, for instance. Defaults to `False`. 83 | - **tags_ignore**: a list of tag names that won't be returned (as well as their children). 84 | - **ids_ignore**: a list of IDs for which matching elements (and their children) won't be returned. 85 | - **classes_ignore**: a list of classes for which matching elements (and their children) won't be returned. 86 | 87 | 88 | ## Contributing 89 | 90 | If you are using Sublime Text, we recommend installing [Cython support](https://github.com/NotSqrt/sublime-cython). 91 | 92 | All contributions are welcome! Feel free to use the [Issues tab](https://github.com/commonsearch/gumbocy/issues) or send us your Pull Requests. 93 | 94 | ## Changelog 95 | 96 | ### 0.2 97 | - New `analyze()` API, moving most of the tree traversal that was happening in `cosr-back` to Cython, resulting in a ~3x speedup in indexing speed. 98 | - More tests 99 | 100 | ### 0.1 101 | - Initial public release 102 | -------------------------------------------------------------------------------- /gumbocy.pxd: -------------------------------------------------------------------------------- 1 | # https://github.com/google/gumbo-parser/blob/master/src/gumbo.h 2 | 3 | 4 | cdef extern from "gumbo.h": 5 | 6 | ctypedef enum GumboNamespaceEnum: 7 | GUMBO_NAMESPACE_HTML 8 | GUMBO_NAMESPACE_SVG 9 | GUMBO_NAMESPACE_MATHML 10 | 11 | ctypedef enum GumboAttributeNamespaceEnum: 12 | GUMBO_ATTR_NAMESPACE_NONE, 13 | GUMBO_ATTR_NAMESPACE_XLINK, 14 | GUMBO_ATTR_NAMESPACE_XML, 15 | GUMBO_ATTR_NAMESPACE_XMLNS, 16 | 17 | ctypedef enum GumboQuirksModeEnum: 18 | GUMBO_DOCTYPE_NO_QUIRKS, 19 | GUMBO_DOCTYPE_QUIRKS, 20 | GUMBO_DOCTYPE_LIMITED_QUIRKS 21 | 22 | ctypedef enum GumboTag: 23 | GUMBO_TAG_HTML, 24 | GUMBO_TAG_HEAD, 25 | GUMBO_TAG_TITLE, 26 | GUMBO_TAG_BASE, 27 | GUMBO_TAG_LINK, 28 | GUMBO_TAG_META, 29 | GUMBO_TAG_STYLE, 30 | GUMBO_TAG_SCRIPT, 31 | 
GUMBO_TAG_NOSCRIPT, 32 | GUMBO_TAG_TEMPLATE, 33 | GUMBO_TAG_BODY, 34 | GUMBO_TAG_ARTICLE, 35 | GUMBO_TAG_SECTION, 36 | GUMBO_TAG_NAV, 37 | GUMBO_TAG_ASIDE, 38 | GUMBO_TAG_H1, 39 | GUMBO_TAG_H2, 40 | GUMBO_TAG_H3, 41 | GUMBO_TAG_H4, 42 | GUMBO_TAG_H5, 43 | GUMBO_TAG_H6, 44 | GUMBO_TAG_HGROUP, 45 | GUMBO_TAG_HEADER, 46 | GUMBO_TAG_FOOTER, 47 | GUMBO_TAG_ADDRESS, 48 | GUMBO_TAG_P, 49 | GUMBO_TAG_HR, 50 | GUMBO_TAG_PRE, 51 | GUMBO_TAG_BLOCKQUOTE, 52 | GUMBO_TAG_OL, 53 | GUMBO_TAG_UL, 54 | GUMBO_TAG_LI, 55 | GUMBO_TAG_DL, 56 | GUMBO_TAG_DT, 57 | GUMBO_TAG_DD, 58 | GUMBO_TAG_FIGURE, 59 | GUMBO_TAG_FIGCAPTION, 60 | GUMBO_TAG_MAIN, 61 | GUMBO_TAG_DIV, 62 | GUMBO_TAG_A, 63 | GUMBO_TAG_EM, 64 | GUMBO_TAG_STRONG, 65 | GUMBO_TAG_SMALL, 66 | GUMBO_TAG_S, 67 | GUMBO_TAG_CITE, 68 | GUMBO_TAG_Q, 69 | GUMBO_TAG_DFN, 70 | GUMBO_TAG_ABBR, 71 | GUMBO_TAG_DATA, 72 | GUMBO_TAG_TIME, 73 | GUMBO_TAG_CODE, 74 | GUMBO_TAG_VAR, 75 | GUMBO_TAG_SAMP, 76 | GUMBO_TAG_KBD, 77 | GUMBO_TAG_SUB, 78 | GUMBO_TAG_SUP, 79 | GUMBO_TAG_I, 80 | GUMBO_TAG_B, 81 | GUMBO_TAG_U, 82 | GUMBO_TAG_MARK, 83 | GUMBO_TAG_RUBY, 84 | GUMBO_TAG_RT, 85 | GUMBO_TAG_RP, 86 | GUMBO_TAG_BDI, 87 | GUMBO_TAG_BDO, 88 | GUMBO_TAG_SPAN, 89 | GUMBO_TAG_BR, 90 | GUMBO_TAG_WBR, 91 | GUMBO_TAG_INS, 92 | GUMBO_TAG_DEL, 93 | GUMBO_TAG_IMAGE, 94 | GUMBO_TAG_IMG, 95 | GUMBO_TAG_IFRAME, 96 | GUMBO_TAG_EMBED, 97 | GUMBO_TAG_OBJECT, 98 | GUMBO_TAG_PARAM, 99 | GUMBO_TAG_VIDEO, 100 | GUMBO_TAG_AUDIO, 101 | GUMBO_TAG_SOURCE, 102 | GUMBO_TAG_TRACK, 103 | GUMBO_TAG_CANVAS, 104 | GUMBO_TAG_MAP, 105 | GUMBO_TAG_AREA, 106 | GUMBO_TAG_MATH, 107 | GUMBO_TAG_MI, 108 | GUMBO_TAG_MO, 109 | GUMBO_TAG_MN, 110 | GUMBO_TAG_MS, 111 | GUMBO_TAG_MTEXT, 112 | GUMBO_TAG_MGLYPH, 113 | GUMBO_TAG_MALIGNMARK, 114 | GUMBO_TAG_ANNOTATION_XML, 115 | GUMBO_TAG_SVG, 116 | GUMBO_TAG_FOREIGNOBJECT, 117 | GUMBO_TAG_DESC, 118 | GUMBO_TAG_TABLE, 119 | GUMBO_TAG_CAPTION, 120 | GUMBO_TAG_COLGROUP, 121 | GUMBO_TAG_COL, 122 | GUMBO_TAG_TBODY, 123 | GUMBO_TAG_THEAD, 124 | 
GUMBO_TAG_TFOOT, 125 | GUMBO_TAG_TR, 126 | GUMBO_TAG_TD, 127 | GUMBO_TAG_TH, 128 | GUMBO_TAG_FORM, 129 | GUMBO_TAG_FIELDSET, 130 | GUMBO_TAG_LEGEND, 131 | GUMBO_TAG_LABEL, 132 | GUMBO_TAG_INPUT, 133 | GUMBO_TAG_BUTTON, 134 | GUMBO_TAG_SELECT, 135 | GUMBO_TAG_DATALIST, 136 | GUMBO_TAG_OPTGROUP, 137 | GUMBO_TAG_OPTION, 138 | GUMBO_TAG_TEXTAREA, 139 | GUMBO_TAG_KEYGEN, 140 | GUMBO_TAG_OUTPUT, 141 | GUMBO_TAG_PROGRESS, 142 | GUMBO_TAG_METER, 143 | GUMBO_TAG_DETAILS, 144 | GUMBO_TAG_SUMMARY, 145 | GUMBO_TAG_MENU, 146 | GUMBO_TAG_MENUITEM, 147 | GUMBO_TAG_APPLET, 148 | GUMBO_TAG_ACRONYM, 149 | GUMBO_TAG_BGSOUND, 150 | GUMBO_TAG_DIR, 151 | GUMBO_TAG_FRAME, 152 | GUMBO_TAG_FRAMESET, 153 | GUMBO_TAG_NOFRAMES, 154 | GUMBO_TAG_ISINDEX, 155 | GUMBO_TAG_LISTING, 156 | GUMBO_TAG_XMP, 157 | GUMBO_TAG_NEXTID, 158 | GUMBO_TAG_NOEMBED, 159 | GUMBO_TAG_PLAINTEXT, 160 | GUMBO_TAG_RB, 161 | GUMBO_TAG_STRIKE, 162 | GUMBO_TAG_BASEFONT, 163 | GUMBO_TAG_BIG, 164 | GUMBO_TAG_BLINK, 165 | GUMBO_TAG_CENTER, 166 | GUMBO_TAG_FONT, 167 | GUMBO_TAG_MARQUEE, 168 | GUMBO_TAG_MULTICOL, 169 | GUMBO_TAG_NOBR, 170 | GUMBO_TAG_SPACER, 171 | GUMBO_TAG_TT, 172 | GUMBO_TAG_RTC, 173 | GUMBO_TAG_UNKNOWN, 174 | GUMBO_TAG_LAST 175 | 176 | ctypedef enum GumboParseFlags: 177 | 178 | GUMBO_INSERTION_NORMAL = 0, 179 | 180 | GUMBO_INSERTION_BY_PARSER = 1 << 0, 181 | 182 | GUMBO_INSERTION_IMPLICIT_END_TAG = 1 << 1, 183 | 184 | GUMBO_INSERTION_IMPLIED = 1 << 3, 185 | 186 | GUMBO_INSERTION_CONVERTED_FROM_END_TAG = 1 << 4, 187 | 188 | GUMBO_INSERTION_FROM_ISINDEX = 1 << 5, 189 | 190 | GUMBO_INSERTION_FROM_IMAGE = 1 << 6, 191 | 192 | GUMBO_INSERTION_RECONSTRUCTED_FORMATTING_ELEMENT = 1 << 7, 193 | 194 | GUMBO_INSERTION_ADOPTION_AGENCY_CLONED = 1 << 8, 195 | 196 | GUMBO_INSERTION_ADOPTION_AGENCY_MOVED = 1 << 9, 197 | 198 | GUMBO_INSERTION_FOSTER_PARENTED = 1 << 10, 199 | 200 | 201 | ctypedef struct GumboVector: 202 | void** data 203 | unsigned int length 204 | unsigned int capacity 205 | 206 | ctypedef struct 
GumboStringPiece: 207 | const char* data 208 | size_t length 209 | 210 | ctypedef struct GumboStringPiece: 211 | const char* data 212 | size_t length 213 | 214 | 215 | ctypedef struct GumboAttribute: 216 | GumboAttributeNamespaceEnum attr_namespace 217 | const char* name 218 | GumboStringPiece original_name 219 | const char* value 220 | GumboStringPiece original_value 221 | 222 | ctypedef struct GumboDocument: 223 | GumboVector children 224 | bint has_doctype 225 | const char* name 226 | const char* public_identifier 227 | const char* system_identifier 228 | GumboQuirksModeEnum doc_type_quirks_mode 229 | 230 | ctypedef struct GumboElement: 231 | GumboVector children 232 | GumboTag tag 233 | GumboNamespaceEnum tag_namespace 234 | GumboStringPiece original_tag 235 | GumboStringPiece original_end_tag 236 | GumboVector attributes 237 | 238 | ctypedef struct GumboText: 239 | const char* text 240 | GumboStringPiece original_text 241 | 242 | ctypedef enum GumboNodeType: 243 | GUMBO_NODE_DOCUMENT 244 | GUMBO_NODE_ELEMENT 245 | GUMBO_NODE_TEXT 246 | GUMBO_NODE_CDATA 247 | GUMBO_NODE_COMMENT 248 | GUMBO_NODE_WHITESPACE 249 | GUMBO_NODE_TEMPLATE 250 | 251 | # ctypedef struct GumboNode: 252 | # pass 253 | 254 | ctypedef union GumboNodeData: 255 | GumboDocument document 256 | GumboElement element 257 | GumboText text 258 | 259 | ctypedef struct GumboNode: 260 | GumboNodeType type 261 | GumboNode* parent 262 | size_t index_within_parent 263 | GumboParseFlags parse_flags 264 | GumboNodeData v 265 | 266 | ctypedef struct GumboOutput: 267 | GumboNode* document 268 | GumboNode* root 269 | GumboVector errors 270 | 271 | ctypedef struct GumboOptions: 272 | pass 273 | 274 | extern const GumboOptions kGumboDefaultOptions 275 | 276 | GumboOutput* gumbo_parse(const char* buffer) 277 | 278 | void gumbo_destroy_output(const GumboOptions* options, GumboOutput* output) 279 | 280 | bint gumbo_string_equals(const GumboStringPiece* str1, const GumboStringPiece* str2) 281 | bint 
gumbo_string_equals_ignore_case(const GumboStringPiece* str1, const GumboStringPiece* str2)

    int gumbo_vector_index_of(GumboVector* vector, const void* element)

    const char* gumbo_normalized_tagname(GumboTag tag)

    void gumbo_tag_from_original_text(GumboStringPiece* text)

    GumboTag gumbo_tag_enum(const char* tagname)
    GumboTag gumbo_tagn_enum(const char* tagname, unsigned int length)

    GumboAttribute* gumbo_get_attribute(const GumboVector* attrs, const char* name)

--------------------------------------------------------------------------------
/gumbocy.pyx:
--------------------------------------------------------------------------------
import re
import urlparse
cimport gumbocy
cimport re2cy
from libcpp.unordered_set cimport unordered_set
from cython.operator cimport dereference as deref
from libcpp.vector cimport vector
from libcpp.map cimport map


cdef extern from "stdio.h":
    int printf(const char* format, ...);

# re2_search() never asks RE2 for capture groups, so it passes zero arguments.
# RE2 does not read the argument array when the count is 0, so a NULL pointer
# is enough (the previous code took the address of element 0 of an empty
# std::vector, which is undefined behaviour).
cdef re2cy.ArgPtr *empty_args = NULL


cdef bint re2_search(const char* s, re2cy.RE2 &pattern):
    """ Return True if `pattern` matches anywhere in the C string `s` (unanchored search, no captures) """
    return re2cy.RE2.PartialMatchN(s, pattern, empty_args, 0)


# Module-level patterns, compiled once at import time and kept for the life of the process.
cdef re2cy.RE2 *_RE2_SEARCH_STYLE_HIDDEN = new re2cy.RE2(r"(display\s*\:\s*none)|(visibility\s*\:\s*hidden)")
cdef re2cy.RE2 *_RE2_ABSOLUTE_HREF = new re2cy.RE2(r"^(?:[A-Za-z0-9\+\.\-]+\:)?\/\/")
cdef re2cy.RE2 *_RE2_IGNORED_HREF = new re2cy.RE2(r"^(?:javascript|mailto|ftp|about)\:")

_RE_SPLIT_WHITESPACE = re.compile(r"\s+")

# Keys of Attributes.values
ctypedef enum AttributeNames:
    ATTR_ID,
    ATTR_ROLE,
    ATTR_HREF,
    ATTR_STYLE,
    ATTR_REL,
    ATTR_SRC,
    ATTR_ALT,
    ATTR_NAME,
    ATTR_PROPERTY,
    ATTR_CONTENT


cdef class Attributes:
    """ Whitelisted attributes of one element, as collected by HTMLParser.get_attributes() """

    # Number of distinct class names; `classes` is only assigned when this is > 0
    cdef int size_classes
    # Maps AttributeNames members to the attribute value
    cdef dict values
    cdef list classes
    # Set when a whitelisted `hidden` or `aria-hidden="true"` attribute was seen
    cdef bint has_hidden


cdef class HTMLParser:
    """ Parses HTML with gumbo and walks the resulting tree.

    parse() builds the tree. analyze() returns a dict of word groups, hyperlinks,
    title and <head> metadata. listnodes() returns the older flat list of tuples.

    Options read from the `options` dict by __cinit__:
    nesting_limit, head_only, analyze_external_hyperlinks, analyze_internal_hyperlinks,
    analyze_word_groups, attributes_whitelist, classes_ignore, ids_ignore,
    classes_hidden, ids_hidden, classes_boilerplate, ids_boilerplate,
    roles_boilerplate, metas_whitelist, tags_ignore, tags_boilerplate,
    tags_boilerplate_bypass, tags_separators.
    """

    # Global parser variables
    cdef int nesting_limit
    cdef bint head_only
    cdef bint has_ids_ignore
    cdef bint has_classes_ignore
    cdef bint has_ids_hidden
    cdef bint has_classes_hidden
    cdef bint has_attributes_whitelist
    cdef bint has_classes_boilerplate
    cdef bint has_ids_boilerplate
    cdef bint has_roles_boilerplate
    cdef bint has_metas_whitelist

    cdef unordered_set[int] tags_ignore
    cdef unordered_set[int] tags_ignore_head_only
    cdef unordered_set[int] tags_boilerplate
    cdef unordered_set[int] tags_boilerplate_bypass
    cdef unordered_set[int] tags_separators

    # Each pattern is only allocated when the matching has_* flag is set;
    # all of them are released in __dealloc__.
    cdef re2cy.RE2* attributes_whitelist
    cdef re2cy.RE2* metas_whitelist
    cdef re2cy.RE2* classes_ignore
    cdef re2cy.RE2* ids_ignore
    cdef re2cy.RE2* classes_hidden
    cdef re2cy.RE2* ids_hidden
    cdef re2cy.RE2* classes_boilerplate
    cdef re2cy.RE2* ids_boilerplate
    cdef re2cy.RE2* roles_boilerplate

    cdef bint analyze_internal_hyperlinks
    cdef bint analyze_external_hyperlinks
    cdef bint analyze_word_groups

    # Variables reinitialized at each parse()
    cdef list current_stack

    cdef bint has_url
    cdef char* url
    cdef char* netloc
    cdef char* scheme
    cdef re2cy.RE2* internal_netloc_search

    cdef dict analysis

    cdef object current_word_group
    cdef object current_hyperlink

    cdef bint has_output
    cdef gumbocy.GumboOutput* output
    cdef list nodes

    def __cinit__(self, dict options=None):
        """ Read the options dict and compile every configured name list into an anchored RE2 alternation """

        options = options or {}

        self.nesting_limit = options.get("nesting_limit", 999)
        self.head_only = options.get("head_only")

        self.analyze_external_hyperlinks = bool(options.get("analyze_external_hyperlinks", True))
        self.analyze_internal_hyperlinks = bool(options.get("analyze_internal_hyperlinks", True))
        self.analyze_word_groups = bool(options.get("analyze_word_groups", True))

        attributes_whitelist = set(options.get("attributes_whitelist") or [])

        classes_ignore = frozenset(options.get("classes_ignore") or [])
        if len(classes_ignore) > 0:
            self.has_classes_ignore = True
            self.classes_ignore = new re2cy.RE2("^(?:" + "|".join(classes_ignore) + ")$")
            attributes_whitelist.add("class")

        ids_ignore = frozenset(options.get("ids_ignore") or [])
        if len(ids_ignore) > 0:
            self.has_ids_ignore = True
            self.ids_ignore = new re2cy.RE2("^(?:" + "|".join(ids_ignore) + ")$")
            attributes_whitelist.add("id")

        classes_hidden = frozenset(options.get("classes_hidden") or [])
        if len(classes_hidden) > 0:
            self.has_classes_hidden = True
            self.classes_hidden = new re2cy.RE2("^(?:" + "|".join(classes_hidden) + ")$")
            attributes_whitelist.add("class")

        ids_hidden = frozenset(options.get("ids_hidden") or [])
        if len(ids_hidden) > 0:
            self.has_ids_hidden = True
            self.ids_hidden = new re2cy.RE2("^(?:" + "|".join(ids_hidden) + ")$")
            attributes_whitelist.add("id")

        classes_boilerplate = frozenset(options.get("classes_boilerplate") or [])
        if len(classes_boilerplate) > 0:
            self.has_classes_boilerplate = True
            self.classes_boilerplate = new re2cy.RE2("^(?:" + "|".join(classes_boilerplate) + ")$")
            attributes_whitelist.add("class")

        ids_boilerplate = frozenset(options.get("ids_boilerplate") or [])
        if len(ids_boilerplate) > 0:
            self.has_ids_boilerplate = True
            self.ids_boilerplate = new re2cy.RE2("^(?:" + "|".join(ids_boilerplate) + ")$")
            attributes_whitelist.add("id")

        roles_boilerplate = frozenset(options.get("roles_boilerplate") or [])
        if len(roles_boilerplate) > 0:
            self.has_roles_boilerplate = True
            self.roles_boilerplate = new re2cy.RE2("^(?:" + "|".join(roles_boilerplate) + ")$")
            attributes_whitelist.add("role")

        metas_whitelist = frozenset(options.get("metas_whitelist") or [])
        if len(metas_whitelist) > 0:
            self.has_metas_whitelist = True
            self.metas_whitelist = new re2cy.RE2("^(?:" + "|".join(metas_whitelist) + ")$")
            attributes_whitelist.add("name")
            attributes_whitelist.add("property")
            attributes_whitelist.add("content")

        # Some options add attributes to the whitelist
        if self.analyze_external_hyperlinks or self.analyze_internal_hyperlinks:
            attributes_whitelist.add("href")
            attributes_whitelist.add("rel")

        # Finally, freeze the attributes whitelist
        self.has_attributes_whitelist = len(attributes_whitelist) > 0
        if self.has_attributes_whitelist:
            self.attributes_whitelist = new re2cy.RE2("^(?:" + "|".join(attributes_whitelist) + ")$")

        # In head_only mode, reaching any of these tags stops the traversal
        self.tags_ignore_head_only.insert(<int> gumbocy.GUMBO_TAG_BODY)
        self.tags_ignore_head_only.insert(<int> gumbocy.GUMBO_TAG_P)
        self.tags_ignore_head_only.insert(<int> gumbocy.GUMBO_TAG_DIV)
        self.tags_ignore_head_only.insert(<int> gumbocy.GUMBO_TAG_SPAN)

        # Tag names gumbo does not know are silently dropped from the sets below
        for tag_name in options.get("tags_ignore", []):
            tag = gumbocy.gumbo_tag_enum(tag_name)
            if tag != gumbocy.GUMBO_TAG_UNKNOWN:
                self.tags_ignore.insert(<int> gumbocy.gumbo_tag_enum(tag_name))

        for tag_name in options.get("tags_boilerplate", []):
            tag = gumbocy.gumbo_tag_enum(tag_name)
            if tag != gumbocy.GUMBO_TAG_UNKNOWN:
                self.tags_boilerplate.insert(<int> gumbocy.gumbo_tag_enum(tag_name))

        for tag_name in options.get("tags_boilerplate_bypass", []):
            tag = gumbocy.gumbo_tag_enum(tag_name)
            if tag != gumbocy.GUMBO_TAG_UNKNOWN:
                self.tags_boilerplate_bypass.insert(<int> gumbocy.gumbo_tag_enum(tag_name))

        for tag_name in options.get("tags_separators", []):
            tag = gumbocy.gumbo_tag_enum(tag_name)
            if tag != gumbocy.GUMBO_TAG_UNKNOWN:
                self.tags_separators.insert(<int> gumbocy.gumbo_tag_enum(tag_name))

        # <body> always separates word groups
        self.tags_separators.insert(<int> gumbocy.GUMBO_TAG_BODY)

    cdef bint guess_node_hidden(self, gumbocy.GumboNode* node, Attributes attrs):
        """ Rough guess to check if the element is explicitly hidden.

            Not intended to combat spam!
        """

        if not self.has_attributes_whitelist:
            return False

        # From the HTML5 spec
        if attrs.has_hidden:
            return True

        if self.has_ids_hidden and attrs.values.get(ATTR_ID):
            if re2_search(attrs.values[ATTR_ID], deref(self.ids_hidden)):
                return True

        if self.has_classes_hidden and attrs.size_classes > 0:
            for k in attrs.classes:
                if re2_search(k, deref(self.classes_hidden)):
                    return True

        if attrs.values.get(ATTR_STYLE):
            if re2_search(attrs.values[ATTR_STYLE], deref(_RE2_SEARCH_STYLE_HIDDEN)):
                return True

        return False

    cdef bint guess_node_boilerplate(self, gumbocy.GumboNode* node, Attributes attrs):
        """ Rough guess to check if the element is boilerplate """

        if self.tags_boilerplate.count(<int> node.v.element.tag):
            return True

        # http://html5doctor.com/understanding-aside/
        if node.v.element.tag == gumbocy.GUMBO_TAG_ASIDE:
            if "article" not in self.current_stack:
                return True

        if self.has_classes_boilerplate and attrs.size_classes > 0:
            for k in attrs.classes:
                if re2_search(k, deref(self.classes_boilerplate)):
                    return True

        if self.has_ids_boilerplate and attrs.values.get(ATTR_ID):
            if re2_search(attrs.values[ATTR_ID], deref(self.ids_boilerplate)):
                return True

        if self.has_roles_boilerplate and attrs.values.get(ATTR_ROLE):
            if re2_search(attrs.values[ATTR_ROLE], deref(self.roles_boilerplate)):
                return True

        return False

    cdef Attributes get_attributes(self, gumbocy.GumboNode* node):
        """ Build an Attributes object with all the whitelisted attributes of an element node.

            id, role, rel, name and property values are lowercased; class is split on
            whitespace into a de-duplicated, lowercased list.
        """

        attrs = Attributes()
        attrs.size_classes = 0
        attrs.has_hidden = 0
        attrs.values = {}

        # Without a whitelist self.attributes_whitelist is NULL and must not be dereferenced
        if not self.has_attributes_whitelist:
            return attrs

        for i in range(node.v.element.attributes.length):

            attr = <gumbocy.GumboAttribute *> node.v.element.attributes.data[i]

            if re2_search(attr.name, deref(self.attributes_whitelist)):

                if attr.name == b"class":
                    multiple_value = frozenset(_RE_SPLIT_WHITESPACE.split(attr.value.strip().lower()))
                    attrs.size_classes = len(multiple_value)
                    if attrs.size_classes > 0:
                        attrs.classes = list(multiple_value)

                elif attr.name == b"id":
                    pystr = str(attr.value).lower()
                    attrs.values[ATTR_ID] = pystr

                elif attr.name == b"style":
                    attrs.values[ATTR_STYLE] = attr.value

                elif attr.name == b"href":
                    attrs.values[ATTR_HREF] = attr.value

                elif attr.name == b"role":
                    pystr = str(attr.value).lower()
                    attrs.values[ATTR_ROLE] = pystr

                elif attr.name == b"rel":
                    pystr = str(attr.value).lower()
                    attrs.values[ATTR_REL] = pystr

                elif attr.name == b"aria-hidden" and attr.value == b"true":
                    attrs.has_hidden = 1

                elif attr.name == b"hidden":
                    attrs.has_hidden = 1

                elif attr.name == b"alt":
                    attrs.values[ATTR_ALT] = attr.value

                elif attr.name == b"src":
                    attrs.values[ATTR_SRC] = attr.value

                elif attr.name == b"name":
                    pystr = str(attr.value).lower()
                    attrs.values[ATTR_NAME] = pystr

                elif attr.name == b"property":
                    pystr = str(attr.value).lower()
                    attrs.values[ATTR_PROPERTY] = pystr

                elif attr.name == b"content":
                    attrs.values[ATTR_CONTENT] = attr.value

        return attrs

    cdef void close_word_group(self):
        """ Close the current word group and append it to analysis["word_groups"] as a tuple """

        if self.current_word_group:
            self.analysis["word_groups"].append(tuple(self.current_word_group))
            self.current_word_group = None

    cdef void add_text(self, text):
        """ Adds inner text to the current word group.

            A new group is [stripped text, tag currently on top of the stack];
            further text is appended to it, separated by a single space.
        """

        if not self.current_word_group:
            self.current_word_group = [text.strip(), self.current_stack[-1]]
        else:
            self.current_word_group[0] += " " + text.strip()

    cdef void add_hyperlink_text(self, text):
        """ Adds inner text to the currently open hyperlink """

        if self.current_hyperlink:
            self.current_hyperlink[1] += text

    cdef void open_hyperlink(self, Attributes attrs):
        """ Opens a new hyperlink, closing any hyperlink that is still open.

            Nothing is opened when href is missing, empty, or uses an ignored scheme.
        """

        if not self.analyze_external_hyperlinks and not self.analyze_internal_hyperlinks:
            return

        if not attrs.values.get(ATTR_HREF):
            return

        if len(attrs.values[ATTR_HREF]) == 0:
            return

        if re2_search(attrs.values[ATTR_HREF], deref(_RE2_IGNORED_HREF)):
            return

        self.close_hyperlink()

        # href, text, rel
        self.current_hyperlink = [attrs.values[ATTR_HREF], "", attrs.values.get(ATTR_REL)]

    cdef void close_hyperlink(self):
        """ Closes the current hyperlink if any, and decides if it's an external or internal link """

        cdef bint is_external = 0

        if not self.analyze_external_hyperlinks and not self.analyze_internal_hyperlinks:
            return

        if self.current_hyperlink:
            href = self.current_hyperlink[0]

            if re2_search(href, deref(_RE2_ABSOLUTE_HREF)):
                is_external = 1

                if self.has_url:

                    # Protocol-relative link: borrow the scheme of the page URL
                    if href.startswith("//"):
                        href = self.scheme + ":" + href

                    # This may be an absolute link but to the same domain
                    if re2_search(href, deref(self.internal_netloc_search)):
                        is_external = 0
                        href = href.split(self.netloc, 1)[1]

            if is_external:
                if self.analyze_external_hyperlinks:
                    self.analysis["external_hyperlinks"].append(
                        (href, self.current_hyperlink[1], self.current_hyperlink[2])
                    )

            elif self.analyze_internal_hyperlinks:
                self.analysis["internal_hyperlinks"].append(
                    (href, self.current_hyperlink[1], self.current_hyperlink[2])
                )

            self.current_hyperlink = None

    cdef bint _traverse_node(self, int level, gumbocy.GumboNode* node, bint is_head, bint is_hidden, bint is_boilerplate, bint is_boilerplate_bypassed, bint is_hyperlink):
        """ Traverses the node tree. Return 1 to stop at this level.

            The is_* flags describe the ancestors of `node`; they are passed by value,
            so changes made for this element only reach its own subtree.
        """

        cdef GumboStringPiece gsp
        cdef const char* tag_name
        cdef int tag_n

        if level > self.nesting_limit:
            return 0

        if node.type == gumbocy.GUMBO_NODE_TEXT:

            if (self.analyze_internal_hyperlinks or self.analyze_external_hyperlinks) and is_hyperlink:
                self.add_hyperlink_text(node.v.text.text)

            if self.analyze_word_groups and not is_head and not is_hidden and (not is_boilerplate or is_boilerplate_bypassed):
                self.add_text(node.v.text.text)

        elif node.type == gumbocy.GUMBO_NODE_ELEMENT:

            tag_n = <int> node.v.element.tag

            if self.head_only and self.tags_ignore_head_only.count(tag_n):
                return 1

            if self.tags_ignore.count(tag_n):
                return 0

            tag_name = gumbocy.gumbo_normalized_tagname(node.v.element.tag)

            # When we find an unknown tag, find its tag_name in the buffer
            if tag_name == b"":
                gsp = node.v.element.original_tag
                gumbo_tag_from_original_text(&gsp)
                # Copy exactly gsp.length bytes: gsp.data is not NUL-terminated at the
                # end of the tag name, so str(gsp.data) would copy the rest of the buffer.
                py_tag_name = gsp.data[:gsp.length].lower()
                tag_name = py_tag_name

            attrs = self.get_attributes(node)

            if self.has_classes_ignore and attrs.size_classes > 0:
                for v in attrs.classes:
                    if re2_search(v, deref(self.classes_ignore)):
                        return 0

            if self.has_ids_ignore and attrs.values.get(ATTR_ID):
                if re2_search(attrs.values[ATTR_ID], deref(self.ids_ignore)):
                    return 0

            if node.v.element.tag == gumbocy.GUMBO_TAG_TITLE:
                # Only the first <title> with a leading text child is recorded
                if not self.analysis.get("title"):
                    if node.v.element.children.length > 0:
                        first_child = <gumbocy.GumboNode *> node.v.element.children.data[0]
                        if first_child.type == gumbocy.GUMBO_NODE_TEXT:
                            self.analysis["title"] = first_child.v.text.text
                return 0

            self.current_stack.append(tag_name)

            if node.v.element.tag == gumbocy.GUMBO_TAG_HEAD:
                is_head = 1

            elif node.v.element.tag == gumbocy.GUMBO_TAG_A:
                self.open_hyperlink(attrs)
                is_hyperlink = 1

            elif node.v.element.tag == gumbocy.GUMBO_TAG_IMG:
                # The alt text of an image forms a word group of its own
                self.close_word_group()
                if attrs.values.get(ATTR_ALT):
                    self.add_text(attrs.values[ATTR_ALT])
                    self.close_word_group()

                # Text extraction from image filenames disabled for now
                # if attrs.get("src"):
                #     if not attrs["src"].startswith("data:"):
                #         self.add_text(self._split_filename_words(attrs["src"]))
                #         self.close_word_group()

            if is_head:
                if node.v.element.tag == gumbocy.GUMBO_TAG_LINK:

                    # TODO: more properties
                    if attrs.values.get(ATTR_REL) and attrs.values.get(ATTR_HREF):
                        self.analysis.setdefault("head_links", [])
                        self.analysis["head_links"].append({"rel": attrs.values[ATTR_REL], "href": attrs.values[ATTR_HREF]})

                elif self.has_metas_whitelist and node.v.element.tag == gumbocy.GUMBO_TAG_META:

                    if attrs.values.get(ATTR_CONTENT):

                        if attrs.values.get(ATTR_NAME):
                            if re2_search(attrs.values[ATTR_NAME], deref(self.metas_whitelist)):
                                self.analysis.setdefault("head_metas", {})
                                self.analysis["head_metas"][attrs.values[ATTR_NAME]] = str(attrs.values[ATTR_CONTENT]).strip()

                        elif attrs.values.get(ATTR_PROPERTY):
                            if re2_search(attrs.values[ATTR_PROPERTY], deref(self.metas_whitelist)):
                                self.analysis.setdefault("head_metas", {})
                                self.analysis["head_metas"][attrs.values[ATTR_PROPERTY]] = str(attrs.values[ATTR_CONTENT]).strip()

                elif node.v.element.tag == gumbocy.GUMBO_TAG_BASE:
                    if attrs.values.get(ATTR_HREF) and "base_url" not in self.analysis:
                        self.analysis["base_url"] = attrs.values[ATTR_HREF]

            # TODO is_article

            if not is_hidden:
                is_hidden = self.guess_node_hidden(node, attrs)

            if is_boilerplate and not is_boilerplate_bypassed:
                if self.tags_boilerplate_bypass.count(tag_n):
                    is_boilerplate_bypassed = True

            if not is_boilerplate:
                is_boilerplate = self.guess_node_boilerplate(node, attrs)

            # Close the word group
            if self.tags_separators.count(tag_n):
                self.close_word_group()

            # Call _traverse_node() recursively for each of the children
            for i in range(node.v.element.children.length):
                child = <gumbocy.GumboNode *> node.v.element.children.data[i]
                if self._traverse_node(level + 1, child, is_head, is_hidden, is_boilerplate, is_boilerplate_bypassed, is_hyperlink) == 1:
                    break

            # Close the word group
            if self.tags_separators.count(tag_n):
                self.close_word_group()

            self.current_stack.pop()

            if node.v.element.tag == gumbocy.GUMBO_TAG_A:
                self.close_hyperlink()

            if node.v.element.tag == gumbocy.GUMBO_TAG_HEAD:
                if self.head_only:
                    return 1

        return 0

    def parse(self, char* html):
        """ Do the actual parsing of the HTML with gumbo.

            Any tree from a previous parse() is destroyed first.
        """

        # NOTE(review): the traversal reads node.v.element.original_tag for unknown tags,
        # which presumably points into `html` rather than into a copy - callers should
        # keep the object they passed alive until they are done with analyze() /
        # listnodes(). Confirm against gumbo.h.
        self.free()
        self.output = gumbocy.gumbo_parse(html)
        self.has_output = 1

    def analyze(self, url=None):
        """ Traverse the parsed tree and return the results as a dict.

            `url` is the URL of the page; when given, absolute links pointing to the
            same netloc are reported as internal, with scheme and netloc removed.

            Raises RuntimeError if parse() has not been called.
        """

        if not self.has_output:
            raise RuntimeError("parse() must be called before analyze()")

        self.analysis = {}
        self.has_url = 0

        # Release the pattern built by a previous analyze() call
        if self.internal_netloc_search != NULL:
            del self.internal_netloc_search
            self.internal_netloc_search = NULL

        if self.analyze_internal_hyperlinks or self.analyze_external_hyperlinks:

            if url:
                self.has_url = 1
                self.url = url
                parsed = urlparse.urlparse(url)
                netloc = parsed.netloc.lower()
                self.netloc = netloc
                self.scheme = parsed.scheme
                # The netloc must be followed by the end of the authority, otherwise
                # "http://example.com.evil.org/" would count as internal to "example.com".
                self.internal_netloc_search = new re2cy.RE2("^http(?:s)?://%s(?:[/?#]|$)" % re.escape(self.netloc))

            if self.analyze_internal_hyperlinks:
                self.analysis["internal_hyperlinks"] = []

            if self.analyze_external_hyperlinks:
                self.analysis["external_hyperlinks"] = []

        if self.analyze_word_groups:
            self.analysis["word_groups"] = []

        self.current_stack = []
        self.current_word_group = None
        self.current_hyperlink = None

        self._traverse_node(0, self.output.root, 0, 0, 0, 0, 0)

        return self.analysis

    #
    # Older listnodes() API support
    #

    def listnodes(self):
        """ Return the nodes as a flat list of tuples.

            Text nodes are (level, None, text); elements are (level, tag_name) or
            (level, tag_name, attrs) when whitelisted attributes were found.

            Raises RuntimeError if parse() has not been called.
        """

        if not self.has_output:
            raise RuntimeError("parse() must be called before listnodes()")

        self.nodes = []

        self._traverse_node_simple(0, self.output.root)

        return self.nodes

    cdef bint _traverse_node_simple(self, int level, gumbocy.GumboNode* node):
        """ Traverses the node tree for listnodes(). Return 1 to stop at this level """

        cdef GumboStringPiece gsp

        if level > self.nesting_limit:
            return 0

        if node.type == gumbocy.GUMBO_NODE_TEXT:
            self.nodes.append((level, None, node.v.text.text))

        elif node.type == gumbocy.GUMBO_NODE_ELEMENT:

            tag_n = <int> node.v.element.tag

            if self.head_only and self.tags_ignore_head_only.count(tag_n):
                return 1

            if self.tags_ignore.count(tag_n):
                return 0

            tag_name = gumbocy.gumbo_normalized_tagname(node.v.element.tag)

            # When we find an unknown tag, find its tag_name in the buffer
            if tag_name == b"":
                gsp = node.v.element.original_tag
                gumbo_tag_from_original_text(&gsp)
                # Copy only the gsp.length bytes of the tag name
                py_tag_name = gsp.data[:gsp.length].lower()
                tag_name = py_tag_name

            if self.has_attributes_whitelist:

                # Build a dict with all the whitelisted attributes
                has_attrs = False
                attrs = False
                for i in range(node.v.element.attributes.length):
                    attr = <gumbocy.GumboAttribute *> node.v.element.attributes.data[i]
                    attr_name = str(attr.name)
                    if re2_search(attr_name, deref(self.attributes_whitelist)):
                        if attr_name == b"class":
                            multiple_value = frozenset(_RE_SPLIT_WHITESPACE.split(attr.value.strip().lower()))
                            if len(multiple_value):
                                if self.has_classes_ignore:
                                    for v in multiple_value:
                                        if re2_search(v, deref(self.classes_ignore)):
                                            return 0

                                if not has_attrs:
                                    attrs = {}
                                    has_attrs = True
                                attrs[attr_name] = multiple_value

                        else:

                            if not has_attrs:
                                attrs = {}
                                has_attrs = True
                            attrs[attr_name] = attr.value

                if not has_attrs:
                    self.nodes.append((level, tag_name))

                else:

                    if self.has_ids_ignore:
                        if attrs.get("id") and re2_search(attrs["id"].lower(), deref(self.ids_ignore)):
                            return 0

                    self.nodes.append((level, tag_name, attrs))

            else:
                self.nodes.append((level, tag_name))

            # Call _traverse_node_simple() recursively for each of the children
            for i in range(node.v.element.children.length):
                child = <gumbocy.GumboNode *> node.v.element.children.data[i]
                if self._traverse_node_simple(level + 1, child) == 1:
                    break

            if node.v.element.tag == gumbocy.GUMBO_TAG_HEAD and self.head_only:
                return 1

        return 0

    def __dealloc__(self):
        """ Cleanup gumbo and RE2 memory when the parser is deallocated by Python """
        self.free()

        # Pointers that were never allocated are still NULL, and deleting NULL is a no-op
        del self.attributes_whitelist
        del self.metas_whitelist
        del self.classes_ignore
        del self.ids_ignore
        del self.classes_hidden
        del self.ids_hidden
        del self.classes_boilerplate
        del self.ids_boilerplate
        del self.roles_boilerplate
        del self.internal_netloc_search

    cdef free(self):
        """ Destroy the gumbo output of the last parse(), if any """
        if self.has_output:
            gumbocy.gumbo_destroy_output(&gumbocy.kGumboDefaultOptions, self.output)
            self.has_output = 0

--------------------------------------------------------------------------------
/re2cy.pxd:
--------------------------------------------------------------------------------
from libcpp.string cimport string

ctypedef Arg* ArgPtr


cdef extern from "re2/stringpiece.h" namespace "re2":
    cdef cppclass StringPiece:
        # Eliding some constructors on purpose.
        StringPiece(const char*) except +
        StringPiece(const string&) except +

        const char* data()
        int length()


cdef extern from "re2/re2.h" namespace "re2":

    cdef cppclass Arg "RE2::Arg":
        Arg()

    cdef cppclass RE2:
        RE2(const char*) except +

        @staticmethod
        bint PartialMatchN(
            const char *,
            const RE2&,
            const Arg* const args[],
            int,
        )

--------------------------------------------------------------------------------
/requirements-benchmark.txt:
--------------------------------------------------------------------------------
lxml
requests
html5lib
bs4
BeautifulSoup; python_version < '3.0'
gumbo
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
Cython==0.24
pytest==2.9.2
pytest-repeat==0.3.0
--------------------------------------------------------------------------------
/scripts/git-set-file-times:
--------------------------------------------------------------------------------
#!/usr/bin/perl -w
use strict;

# sets mtime and atime of files to the latest commit time in git
#
# This is useful for serving static content (managed by git)
# from a cluster of identically configured HTTP servers. HTTP
# clients and content delivery networks can get consistent
# Last-Modified headers no matter which HTTP server in the
# cluster they hit. This should improve caching behavior.
#
# This does not take into account merges, but if you're updating
# every machine in the cluster from the same commit (A) to the
# same commit (B), the mtimes will be _consistent_ across all
# machines if not necessarily accurate.
#
# THIS IS NOT INTENDED TO OPTIMIZE BUILD SYSTEMS SUCH AS 'make'
# YOU HAVE BEEN WARNED!

# Files tracked by git that still need their times set (path => path)
my %ls = ();
my $commit_time;

if ($ENV{GIT_DIR}) {
    chdir($ENV{GIT_DIR}) or die $!;
}

# "git ls-files -z" separates paths with NUL bytes
$/ = "\0";
open FH, 'git ls-files -z|' or die $!;
while (<FH>) {
    chomp;
    $ls{$_} = $_;
}
close FH;


$/ = "\n";
open FH, "git log -m -r --name-only --no-color --pretty=raw -z @ARGV |" or die $!;
while (<FH>) {
    chomp;
    if (/^committer .*? (\d+) (?:[\-\+]\d+)$/) {
        # Remember the timestamp of the commit whose file list follows
        $commit_time = $1;
    } elsif (s/\0\0commit [a-f0-9]{40}( \(from [a-f0-9]{40}\))?$// or s/\0$//) {
        # delete() returns only the paths still pending, so each file is
        # stamped once, by the first commit in the log output that lists it
        my @files = delete @ls{split(/\0/, $_)};
        @files = grep { defined $_ } @files;
        next unless @files;
        utime $commit_time, $commit_time, @files;
    }
    # Stop reading the log as soon as every tracked file has been handled
    last unless %ls;

}
close FH;
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
from distutils.core import setup
from distutils.extension import Extension
import os

# gumbocy.c will be present when installing from the source distribution on PyPI
if os.path.isfile("gumbocy.cpp"):

    # Use "make cythonize" to build the c file from the .pyx source
    ext_modules = [
        Extension("gumbocy",
                  ["gumbocy.cpp"],
                  libraries=["gumbo", "re2"],
                  language="c++",
                  extra_compile_args=["-std=c++11", '-O3', '-static-libstdc++'],
                  extra_link_args=["-std=c++11"])  # , "-static"

    ]

else:
    raise Exception("Must run 'make cythonize' first!")

# # If the .c file is missing, we must be in local or installing from GitHub.
# # In this case, we need Cython to be already installed.
24 | # else: 25 | # from Cython.Build import cythonize 26 | 27 | # ext_modules = cythonize([ 28 | # Extension("gumbocy", 29 | # ["gumbocy.pyx"], 30 | # libraries=["gumbo"], 31 | # language="c++", 32 | # extra_compile_args=["-std=c++11"], 33 | # extra_link_args=["-std=c++11"]) 34 | # ]) 35 | 36 | 37 | setup( 38 | name="gumbocy", 39 | version="0.2.0", 40 | description="Python binding for gumbo-parser (an HTML5-compliant parser) using Cython", 41 | author="Common Search contributors", 42 | license="Apache License, Version 2.0", 43 | url="https://github.com/commonsearch/gumbocy", 44 | ext_modules=ext_modules, 45 | keywords=["gumbo", "gumbo-parser", "gumbo-cython", "gumbocy", "cython", "htmlparser", "html5", "html5lib"], 46 | classifiers=[ 47 | "Programming Language :: Python", 48 | "Programming Language :: Python :: 2.7", 49 | # 'Development Status :: 1 - Planning', 50 | # 'Development Status :: 2 - Pre-Alpha', 51 | # 'Development Status :: 3 - Alpha', 52 | 'Development Status :: 4 - Beta', 53 | # 'Development Status :: 5 - Production/Stable', 54 | # 'Development Status :: 6 - Mature', 55 | # 'Development Status :: 7 - Inactive', 56 | "Programming Language :: Python :: Implementation :: CPython", 57 | "Programming Language :: Python :: Implementation :: PyPy", 58 | "Environment :: Other Environment", 59 | "Intended Audience :: Developers", 60 | "License :: OSI Approved :: Apache Software License", 61 | "Operating System :: OS Independent", 62 | "Topic :: Software Development :: Libraries" 63 | ] 64 | ) 65 | -------------------------------------------------------------------------------- /tests/benchmark_parsers.py: -------------------------------------------------------------------------------- 1 | # Usage: python -m cProfile -s cumtime tests/benchmark_parsers.py 2 | 3 | import os 4 | import sys 5 | sys.path.insert(-1, os.getcwd()) 6 | 7 | import requests 8 | import timeit 9 | import html5lib 10 | import lxml.html 11 | import gumbocy 12 | import gumbo 13 | import bs4 
14 | 15 | if not os.path.isfile("tests/_benchmark_fixture.html"): 16 | url = 'https://raw.githubusercontent.com/whatwg/html/d8717d8831c276ca65d2d44bbf2ce4ce673997b9/source' 17 | html = requests.get(url).content 18 | with open("tests/_benchmark_fixture.html", "w") as f: 19 | f.write(html) 20 | 21 | with open("tests/_benchmark_fixture.html", "r") as f: 22 | html = f.read() 23 | html_unicode = html.decode("utf-8") 24 | 25 | 26 | def bench(name, func): 27 | print('{}: {:.3f} seconds'.format(name, min(timeit.repeat(func, number=1, repeat=3)))) 28 | 29 | 30 | def benchmark_gumbocy(): 31 | parser = gumbocy.HTMLParser(options={ 32 | "attributes_whitelist": ["id", "class", "style"] 33 | }) 34 | parser.parse(html) 35 | nodes = parser.listnodes() 36 | 37 | divs_count = 0 38 | for node in nodes: 39 | if node[1] == "div": 40 | divs_count += 1 41 | print "Gumbocy: ", divs_count 42 | 43 | 44 | def benchmark_gumbo_bs3(): 45 | parser = gumbo.soup_parse(html_unicode) 46 | divs = parser.findAll("div") 47 | print "gumbo bs3", len(divs) 48 | 49 | 50 | def benchmark_lxml_raw(): 51 | parsed = lxml.html.fromstring(html) 52 | divs = parsed.findall(".//div") 53 | print "lxml raw", len(divs) 54 | 55 | 56 | def benchmark_html5lib_bs4(): 57 | parser = bs4.BeautifulSoup(html, "html5lib") 58 | divs = parser.find_all("div") 59 | print "html5lib bs4", len(divs) 60 | 61 | 62 | def benchmark_htmlparser_bs4(): 63 | parser = bs4.BeautifulSoup(html, "html.parser") 64 | divs = parser.find_all("div") 65 | print "html.parser bs4", len(divs) 66 | 67 | 68 | bench("benchmark_gumbocy", benchmark_gumbocy) 69 | bench("benchmark_gumbo_bs3", benchmark_gumbo_bs3) 70 | bench("benchmark_lxml_raw", benchmark_lxml_raw) 71 | bench("benchmark_html5lib_bs4", benchmark_html5lib_bs4) 72 | bench("benchmark_htmlparser_bs4", benchmark_htmlparser_bs4) 73 | -------------------------------------------------------------------------------- /tests/conftest.py: 
-------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | sys.path.append(os.getcwd()) 5 | 6 | import gumbocy 7 | -------------------------------------------------------------------------------- /tests/test_analyze.py: -------------------------------------------------------------------------------- 1 | import gumbocy 2 | from test_word_groups import TAGS_SEPARATORS 3 | 4 | 5 | def analyze(html, options=None): 6 | parser = gumbocy.HTMLParser(options=options) 7 | parser.parse(html) 8 | return parser.analyze() 9 | 10 | 11 | def test_separators(): 12 | html = """ 13 |

text

14 |

text 2

15 |

pre

inner

16 | """ 17 | 18 | analyzed = analyze(html, options={ 19 | "tags_separators": ["p"] 20 | }) 21 | 22 | assert analyzed["word_groups"] == [ 23 | ("text", "p"), 24 | ("text 2", "p"), 25 | ("pre", "p"), 26 | ("inner", "p") 27 | ] 28 | 29 | # More word group tests in test_word_groups.py 30 | 31 | 32 | def test_hidden_text(): 33 | 34 | html = """ 35 | 36 | text 37 |
textp
38 |
hidden by display
39 |
ignored by class_noindex
40 |
ignored by class_noindex 2
41 | 42 | 43 |
not_aria
44 |
hidden by visibility
45 | """ 46 | 47 | analyzed = analyze(html, options={ 48 | "attributes_whitelist": ["style", "hidden", "aria-hidden"], 49 | "classes_hidden": ["_class_hidden"], 50 | "ids_hidden": ["_id_hidden"], 51 | "tags_separators": ["div"], 52 | "classes_ignore": ["_class_noindex"] 53 | }) 54 | 55 | assert analyzed["word_groups"] == [ 56 | ("text", "body"), 57 | ("textp", "div"), 58 | ("not_aria", "div") 59 | ] 60 | 61 | 62 | def test_hidden_siblings(): 63 | 64 | html = """ 65 | 68 | 69 | """ 70 | 71 | analyzed = analyze(html, options={ 72 | "classes_boilerplate": ["login"] 73 | }) 74 | 75 | assert analyzed["word_groups"] == [] 76 | 77 | 78 | def test_boilerplate_text(): 79 | 80 | html = """ 81 | 82 |
83 | Boilerplate 84 |

Title

85 |
86 | 87 |
x
88 |
y
89 |
z
90 | 91 |

Title 2

92 | """ 93 | 94 | analyzed = analyze(html, options={ 95 | "attributes_whitelist": ["id", "class", "role"], 96 | "tags_boilerplate": ["header"], 97 | "tags_boilerplate_bypass": ["h2"], 98 | "classes_boilerplate": ["classboil"], 99 | "ids_boilerplate": ["idboil"], 100 | "roles_boilerplate": ["roleboil"], 101 | "tags_separators": TAGS_SEPARATORS 102 | }) 103 | 104 | assert analyzed["word_groups"] == [ 105 | ("Title", "h2"), 106 | ("Title 2", "h2") 107 | ] 108 | 109 | 110 | def test_title(): 111 | 112 | html = """ test 1 test 2 """ 113 | 114 | analyzed = analyze(html, options={ 115 | }) 116 | 117 | assert analyzed["title"] == "test 1" 118 | assert len(analyzed["word_groups"]) == 0 119 | 120 | 121 | def test_head_metas(): 122 | 123 | html = """ 124 | 125 | 126 | 127 | 128 | This is <body> text 129 | """ 130 | 131 | analyzed = analyze(html, options={ 132 | "metas_whitelist": ["description"] 133 | }) 134 | 135 | assert analyzed["head_metas"] == {"description": "This is a !"} 136 | -------------------------------------------------------------------------------- /tests/test_hyperlinks.py: -------------------------------------------------------------------------------- 1 | import gumbocy 2 | from test_word_groups import TAGS_SEPARATORS 3 | 4 | 5 | def _links(html, url=None): 6 | parser = gumbocy.HTMLParser(options={ 7 | "tags_separators": TAGS_SEPARATORS 8 | }) 9 | parser.parse(html) 10 | ret = parser.analyze(url=url) 11 | return { 12 | "all": ret["internal_hyperlinks"] + ret["external_hyperlinks"], 13 | "internal": ret["internal_hyperlinks"], 14 | "external": ret["external_hyperlinks"] 15 | } 16 | 17 | 18 | def test_get_hyperlinks(): 19 | links = _links("""Test titlex""") 20 | assert len(links["all"]) == 0 21 | 22 | links = _links("""Test title 23 | Y 24 | """) 25 | assert len(links["all"]) == 0 26 | 27 | links = _links("""Test title 28 | Y 29 | """) 30 | assert len(links["all"]) == 0 31 | 32 | links = _links("""Test title 33 | Y 34 | """) 35 | assert len(links["all"]) == 
0 36 | 37 | links = _links("""Test title 38 | Y 39 | """) 40 | assert len(links["all"]) == 0 41 | 42 | links = _links("""Test title 43 | Y 44 | """) 45 | assert len(links["all"]) == 0 46 | 47 | links = _links("""Test title 48 | Y 49 | """) 50 | assert len(links["all"]) == 1 51 | assert links["external"][0][0] == "http://sub.test.com/page1?q=2&a=b#xxx" 52 | assert links["external"][0][1] == "Y" 53 | assert links["external"][0][2] == "nofollow" 54 | 55 | links = _links("""Test title 56 | Y X 57 | """, url="http://sub.test.com/page2") 58 | assert len(links["all"]) == 1 59 | assert links["internal"][0][0] == "/page1?q=2&a=b#xxx" 60 | assert links["internal"][0][1] == "Y X" 61 | assert links["internal"][0][2] is None 62 | 63 | links = _links("""Test title 64 | Y Z 65 | """, url="http://sub.test.com/page2/x.html") 66 | assert len(links["all"]) == 1 67 | assert links["internal"][0][0] == "../page1?q=2&a=b#xxx" 68 | assert links["internal"][0][1] == "Y Z" 69 | 70 | # Absolute links to the same netloc are still internal 71 | links = _links("""Test title 72 | Y Z 73 | """, url="http://sub.test.com/page2/x.html") 74 | assert len(links["all"]) == 1 75 | assert len(links["external"]) == 0 76 | assert links["internal"][0][0] == "/page1?q=2&a=b#xxx" 77 | assert links["internal"][0][1] == "Y Z" 78 | 79 | # Cross-scheme links are still considered internal 80 | links = _links("""Test title 81 | Y Z 82 | """, url="http://sub.test.com/page2/x.html") 83 | assert len(links["all"]) == 1 84 | assert links["internal"][0][0] == "/page1?q=2&a=b#xxx" 85 | assert links["internal"][0][1] == "Y Z" 86 | 87 | links = _links("""Test title 88 | Y Z 89 | """, url="https://sub.test.com/page2/x.html") 90 | assert len(links["all"]) == 1 91 | assert links["internal"][0][0] == "/page1?q=2&a=b#xxx" 92 | assert links["internal"][0][1] == "Y Z" 93 | 94 | links = _links("""Test title 95 | Y Z 96 | """, url="http://sub.test.com/page2/x.html") 97 | assert len(links["all"]) == 1 98 | assert 
links["internal"][0][0] == "/sub.test.com/page1?q=2&a=b#xxx" 99 | assert links["internal"][0][1] == "Y Z" 100 | 101 | links = _links("""Test title 102 | Y Z 103 | """, url="http://sub.test.com/page2/x.html") 104 | assert len(links["all"]) == 1 105 | assert links["internal"][0][0] == "/page1?q=2&a=b#xxx" 106 | assert links["internal"][0][1] == "Y Z" 107 | 108 | links = _links("""Test title 109 | Y Z 110 | """, url="http://sub.test.com/page2/x.html") 111 | assert len(links["all"]) == 1 112 | assert links["external"][0][0] == "http://sub2.test.com/page1?q=2&a=b#xxx" 113 | assert links["external"][0][1] == "Y Z" 114 | 115 | links = _links("""Test title 116 | Y Z 117 | """, url="http://sub.test.com/page2/x.html") 118 | assert len(links["all"]) == 1 119 | assert links["external"][0][0] == "https://sub2.test.com/page1?q=2&a=b#xxx" 120 | assert links["external"][0][1] == "Y Z" 121 | 122 | # TODO resolution tests 123 | -------------------------------------------------------------------------------- /tests/test_listnodes.py: -------------------------------------------------------------------------------- 1 | import gumbocy 2 | 3 | 4 | def listnodes(html, options=None): 5 | parser = gumbocy.HTMLParser(options=options) 6 | parser.parse(html) 7 | return parser.listnodes() 8 | 9 | 10 | def test_basic(): 11 | html = """ 12 | 13 | HW 14 | Hello world
15 | 16 | """ 17 | 18 | iterations = 1 # 300000 19 | for _ in range(0, iterations): 20 | nodes = listnodes(html, {"attributes_whitelist": ["href"]}) 21 | assert nodes == [ 22 | (0, "html"), 23 | (1, "head"), 24 | (2, "title"), 25 | (3, None, "HW"), 26 | (1, "body"), 27 | (2, None, " Hello "), 28 | (2, "a", {"href": "http://example.com"}), 29 | (3, None, "world"), 30 | (2, "br") 31 | ] 32 | 33 | 34 | def test_classes(): 35 | html = """ 36 | 37 | 38 |

39 | 40 | """ 41 | 42 | nodes = listnodes(html, {"attributes_whitelist": ["class"]}) 43 | assert nodes == [ 44 | (0, "html"), 45 | (1, "head"), 46 | (1, "body"), 47 | (2, "p", {"class": frozenset(["para", "graph"])}) 48 | ] 49 | 50 | 51 | def test_ignore(): 52 | html = """ 53 | 54 | HW 55 | Hello world
56 | 57 | """ 58 | 59 | nodes = listnodes(html, { 60 | "attributes_whitelist": ["class", "id"], 61 | "ids_ignore": ["i"], 62 | "classes_ignore": set(["ign"]), 63 | "tags_ignore": ["title"] 64 | }) 65 | assert nodes == [ 66 | (0, "html"), 67 | (1, "head"), 68 | (1, "body"), 69 | (2, None, " Hello ") 70 | ] 71 | 72 | 73 | def test_head_only(): 74 | html = """ 75 | 76 | HW 77 | Hello world
78 | 79 | """ 80 | 81 | nodes = listnodes(html, { 82 | "head_only": True 83 | }) 84 | assert nodes == [ 85 | (0, "html"), 86 | (1, "head"), 87 | (2, "title"), 88 | (3, None, "HW") 89 | ] 90 | 91 | html = """ 92 | 93 |

test

HW 94 | Hello world
95 | 96 | """ 97 | 98 | nodes = listnodes(html, { 99 | "head_only": True 100 | }) 101 | assert nodes == [ 102 | (0, "html"), 103 | (1, "head") 104 | ] 105 | 106 | 107 | def test_unknown_tags(): 108 | html = """ 109 | 110 | 111 | inline text 112 | 113 | """ 114 | 115 | nodes = listnodes(html, { 116 | "attributes_whitelist": ["class"], 117 | "tags_ignore": "new_tag" # We can't ignore unknown tags at the Gumbocy level (for now?) 118 | }) 119 | 120 | assert nodes == [ 121 | (0, "html"), 122 | (1, "head"), 123 | (1, "body"), 124 | (2, "new_tag", {'class': frozenset(['xx'])}), 125 | (3, None, "inline text"), 126 | (2, "new_tag_2") 127 | ] 128 | -------------------------------------------------------------------------------- /tests/test_word_groups.py: -------------------------------------------------------------------------------- 1 | import gumbocy 2 | import pytest 3 | 4 | TAGS_SEPARATORS = frozenset([ 5 | "body", 6 | 7 | # http://www.w3.org/TR/html5/grouping-content.html#grouping-content 8 | "p", "pre", "blockquote", "ul", "ol", "li", "dl", "dt", "dd", "figure", "figcaption", 9 | 10 | "br", "img", 11 | 12 | "h1", "h2", "h3", "h4", "h5", "h6" 13 | ]) 14 | 15 | 16 | SAMPLES = [ 17 | { 18 | "html": """

hello

""", 19 | "groups": [ 20 | ("hello", "p") 21 | ] 22 | }, 23 | 24 | # A <body> is automatically added 25 | { 26 | "html": """ nobody """, 27 | "groups": [ 28 | ("nobody", "body") 29 | ] 30 | }, 31 | 32 | # span 33 | { 34 | "html": """

pre link post

""", 35 | "groups": [ 36 | ("pre link post", "p") 37 | ] 38 | }, 39 | 40 | # a 41 | { 42 | "html": """

pre link post

""", 43 | "groups": [ 44 | ("pre link post", "p") 45 | ] 46 | }, 47 | 48 | # mid p 49 | { 50 | "html": """

pre

  • li1 x
mid

post

""", 51 | "groups": [ 52 | ("pre", "p"), 53 | ("li1 x", "li"), 54 | ("mid", "body"), 55 | ("post", "p") 56 | ] 57 | }, 58 | 59 | # Lists 60 | { 61 | "html": """ pre
  • li1
  • li2
post """, 62 | "groups": [ 63 | ("pre", "body"), 64 | ("li1", "li"), 65 | ("li2", "li"), 66 | ("post", "body") 67 | ] 68 | }, 69 | 70 | # HR inside a <p> is illegal: the <hr> closes the paragraph, so "post" is actually part of <body>. 71 | { 72 | "html": """

pre


post

""", 73 | "groups": [ 74 | ("pre", "p"), 75 | ("post", "body") 76 | ] 77 | }, 78 | 79 | # Non-closed p tag. 80 | { 81 | "html": """ pre

post""", 82 | "groups": [ 83 | ("pre", "body"), 84 | ("post", "p") 85 | ] 86 | }, 87 | 88 | # BR 89 | { 90 | "html": """

pre
post

""", 91 | "groups": [ 92 | ("pre", "p"), 93 | ("post", "p") 94 | ] 95 | }, 96 | 97 | # IMG filename + alt 98 | { 99 | "html": """

pre james brown post

""", 100 | "groups": [ 101 | ("pre", "p"), 102 | ("james brown", "img"), 103 | # ("maceo parker", "img"), 104 | ("post", "p") 105 | ] 106 | }, 107 | 108 | # IMG src given as a data URI is ignored; only the alt text is kept 109 | { 110 | "html": """

pre Red dot post

""", 111 | "groups": [ 112 | ("pre", "p"), 113 | ("Red dot", "img"), 114 | ("post", "p") 115 | ] 116 | }, 117 | ] 118 | 119 | 120 | # TODO: good coverage of http://www.w3.org/html/wg/drafts/html/master/syntax.html 121 | @pytest.mark.parametrize(("sample"), SAMPLES) 122 | def test_get_word_groups(sample): 123 | 124 | parser = gumbocy.HTMLParser(options={ 125 | "tags_separators": TAGS_SEPARATORS, 126 | "attributes_whitelist": ["src", "alt"] 127 | }) 128 | parser.parse(sample["html"]) 129 | parsed = parser.analyze() 130 | 131 | for i, group in enumerate(parsed["word_groups"]): 132 | assert group == sample["groups"][i] 133 | 134 | assert len(parsed["word_groups"]) == len(sample["groups"]) 135 | --------------------------------------------------------------------------------