├── .dockerignore
├── .gitignore
├── .travis.yml
├── Dockerfile
├── LICENSE
├── MANIFEST
├── MANIFEST.in
├── Makefile
├── README.md
├── gumbocy.cpp
├── gumbocy.pxd
├── gumbocy.pyx
├── re2cy.pxd
├── requirements-benchmark.txt
├── requirements.txt
├── scripts
    └── git-set-file-times
├── setup.py
└── tests
    ├── benchmark_parsers.py
    ├── conftest.py
    ├── test_analyze.py
    ├── test_hyperlinks.py
    ├── test_listnodes.py
    └── test_word_groups.py


/.dockerignore:
--------------------------------------------------------------------------------
1 | venv
2 | .git
3 | .cache


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | # Byte-compiled / optimized / DLL files
 2 | __pycache__/
 3 | *.py[cod]
 4 | *$py.class
 5 | 
 6 | # C extensions
 7 | *.so
 8 | 
 9 | # Distribution / packaging
10 | .Python
11 | env/
12 | build/
13 | develop-eggs/
14 | dist/
15 | downloads/
16 | eggs/
17 | .eggs/
18 | lib/
19 | lib64/
20 | parts/
21 | sdist/
22 | var/
23 | *.egg-info/
24 | .installed.cfg
25 | *.egg
26 | 
27 | # PyInstaller
28 | #  Usually these files are written by a python script from a template
29 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
30 | *.manifest
31 | *.spec
32 | 
33 | # Installer logs
34 | pip-log.txt
35 | pip-delete-this-directory.txt
36 | 
37 | # Unit test / coverage reports
38 | htmlcov/
39 | .tox/
40 | .coverage
41 | .coverage.*
42 | .cache
43 | nosetests.xml
44 | coverage.xml
45 | *,cover
46 | .hypothesis/
47 | 
48 | # Translations
49 | *.mo
50 | *.pot
51 | 
52 | # Django stuff:
53 | *.log
54 | 
55 | # Sphinx documentation
56 | docs/_build/
57 | 
58 | # PyBuilder
59 | target/
60 | 
61 | #Ipython Notebook
62 | .ipynb_checkpoints
63 | 
64 | # Gumbocy-specific
65 | gumbocy.c
66 | gumbocy.html
67 | venv/
68 | *.rst
69 | gumbo-parser
70 | /tests/_benchmark_fixture.html
71 | 


--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
 1 | sudo: required
 2 | 
 3 | services:
 4 |   - docker
 5 | 
 6 | env:
 7 |   - GUMBOCY_PYTHON_VERSION=py27
 8 |   - GUMBOCY_PYTHON_VERSION=pypy
 9 | 
10 | before_install:
11 |   - docker ps
12 |   - docker info
13 |   - docker version
14 |   - ./scripts/git-set-file-times
15 |   - docker pull commonsearch/gumbocy
16 |   - make docker_build
17 | 
18 | script:
19 |   - make docker_test
20 | 
21 | notifications:
22 |   irc: "chat.freenode.net#commonsearch"
23 | 


--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
 1 | FROM debian:jessie
 2 | 
 3 | RUN apt-get update && apt-get upgrade -y && apt-get install -y --no-install-recommends \
 4 | 	curl \
 5 | 	automake \
 6 | 	gcc \
 7 | 	g++ \
 8 | 	make \
 9 | 	libtool \
10 | 	ca-certificates \
11 | #	python3-pip \
12 | #	python3-dev \
13 | 	python-pip \
14 | 	python-dev \
15 | 	bzip2
16 | 
17 | # Upgrade pip
18 | # RUN pip3 install --upgrade --ignore-installed pip
19 | RUN pip install --upgrade --ignore-installed pip
20 | 
21 | # Install Gumbo
22 | ENV GUMBO_VERSION 0.10.1
23 | RUN curl -sL https://github.com/google/gumbo-parser/archive/v$GUMBO_VERSION.tar.gz > gumbo.tgz && \
24 | 	rm -rf gumbo-parser-$GUMBO_VERSION gumbo-parser && \
25 | 	tar zxf gumbo.tgz && \
26 | 	mv gumbo-parser-$GUMBO_VERSION gumbo-parser && \
27 | 	cd gumbo-parser && ./autogen.sh && ./configure && make && \
28 | 	make install && ldconfig && cd .. && \
29 | 	rm -rf gumbo.tgz gumbo-parser
30 | 
31 | 
32 | # Optional dependencies for benchmarking (apt-get update runs in this same layer so the install never relies on a stale cached package index)
33 | RUN apt-get update && apt-get install -y --no-install-recommends \
34 | 	libxml2-dev \
35 | 	libxslt1-dev \
36 | 	zlib1g-dev
37 | 
38 | # RUN ln -s /usr/local/lib/libgumbo.so /usr/lib/python2.7/dist-packages/gumbo/libgumbo.so
39 | 
40 | # Install PyPy
41 | RUN curl -L 'https://bitbucket.org/squeaky/portable-pypy/downloads/pypy-5.3.1-linux_x86_64-portable.tar.bz2' -o /pypy.tar.bz2 && \
42 |   mkdir -p /opt/pypy/ && tar jxvf /pypy.tar.bz2 -C /opt/pypy/  --strip-components=1 && \
43 |   rm /pypy.tar.bz2
44 | 
45 | RUN /opt/pypy/bin/pypy -m ensurepip
46 | RUN /opt/pypy/bin/pip install --upgrade --ignore-installed pip
47 | 
48 | # Install RE2
49 | RUN mkdir -p /tmp/re2 && \
50 | 	curl -L 'https://github.com/google/re2/archive/636bc71728b7488c43f9441ecfc80bdb1905b3f0.tar.gz' -o /tmp/re2/re2.tar.gz && \
51 | 	cd /tmp/re2 && tar zxvf re2.tar.gz --strip-components=1 && \
52 | 	make && make install && \
53 | 	rm -rf /tmp/re2 && \
54 | 	ldconfig
55 | 
56 | # Install Python dependencies (absolute paths, so these steps do not depend on the working directory)
57 | 
58 | ADD requirements-benchmark.txt /requirements-benchmark.txt
59 | ADD requirements.txt /requirements.txt
60 | # RUN pip3 install -r requirements.txt
61 | # RUN pip3 install -r requirements-benchmark.txt
62 | RUN pip install -r /requirements.txt
63 | RUN pip install -r /requirements-benchmark.txt
64 | RUN /opt/pypy/bin/pip install -r /requirements.txt
65 | RUN /opt/pypy/bin/pip install setuptools==18.5  # Because of html5lib
66 | RUN /opt/pypy/bin/pip install -r /requirements-benchmark.txt
67 | 
68 | RUN mkdir -p /cosr/gumbocy
69 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
  1 |                                  Apache License
  2 |                            Version 2.0, January 2004
  3 |                         http://www.apache.org/licenses/
  4 | 
  5 |    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
  6 | 
  7 |    1. Definitions.
  8 | 
  9 |       "License" shall mean the terms and conditions for use, reproduction,
 10 |       and distribution as defined by Sections 1 through 9 of this document.
 11 | 
 12 |       "Licensor" shall mean the copyright owner or entity authorized by
 13 |       the copyright owner that is granting the License.
 14 | 
 15 |       "Legal Entity" shall mean the union of the acting entity and all
 16 |       other entities that control, are controlled by, or are under common
 17 |       control with that entity. For the purposes of this definition,
 18 |       "control" means (i) the power, direct or indirect, to cause the
 19 |       direction or management of such entity, whether by contract or
 20 |       otherwise, or (ii) ownership of fifty percent (50%) or more of the
 21 |       outstanding shares, or (iii) beneficial ownership of such entity.
 22 | 
 23 |       "You" (or "Your") shall mean an individual or Legal Entity
 24 |       exercising permissions granted by this License.
 25 | 
 26 |       "Source" form shall mean the preferred form for making modifications,
 27 |       including but not limited to software source code, documentation
 28 |       source, and configuration files.
 29 | 
 30 |       "Object" form shall mean any form resulting from mechanical
 31 |       transformation or translation of a Source form, including but
 32 |       not limited to compiled object code, generated documentation,
 33 |       and conversions to other media types.
 34 | 
 35 |       "Work" shall mean the work of authorship, whether in Source or
 36 |       Object form, made available under the License, as indicated by a
 37 |       copyright notice that is included in or attached to the work
 38 |       (an example is provided in the Appendix below).
 39 | 
 40 |       "Derivative Works" shall mean any work, whether in Source or Object
 41 |       form, that is based on (or derived from) the Work and for which the
 42 |       editorial revisions, annotations, elaborations, or other modifications
 43 |       represent, as a whole, an original work of authorship. For the purposes
 44 |       of this License, Derivative Works shall not include works that remain
 45 |       separable from, or merely link (or bind by name) to the interfaces of,
 46 |       the Work and Derivative Works thereof.
 47 | 
 48 |       "Contribution" shall mean any work of authorship, including
 49 |       the original version of the Work and any modifications or additions
 50 |       to that Work or Derivative Works thereof, that is intentionally
 51 |       submitted to Licensor for inclusion in the Work by the copyright owner
 52 |       or by an individual or Legal Entity authorized to submit on behalf of
 53 |       the copyright owner. For the purposes of this definition, "submitted"
 54 |       means any form of electronic, verbal, or written communication sent
 55 |       to the Licensor or its representatives, including but not limited to
 56 |       communication on electronic mailing lists, source code control systems,
 57 |       and issue tracking systems that are managed by, or on behalf of, the
 58 |       Licensor for the purpose of discussing and improving the Work, but
 59 |       excluding communication that is conspicuously marked or otherwise
 60 |       designated in writing by the copyright owner as "Not a Contribution."
 61 | 
 62 |       "Contributor" shall mean Licensor and any individual or Legal Entity
 63 |       on behalf of whom a Contribution has been received by Licensor and
 64 |       subsequently incorporated within the Work.
 65 | 
 66 |    2. Grant of Copyright License. Subject to the terms and conditions of
 67 |       this License, each Contributor hereby grants to You a perpetual,
 68 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 69 |       copyright license to reproduce, prepare Derivative Works of,
 70 |       publicly display, publicly perform, sublicense, and distribute the
 71 |       Work and such Derivative Works in Source or Object form.
 72 | 
 73 |    3. Grant of Patent License. Subject to the terms and conditions of
 74 |       this License, each Contributor hereby grants to You a perpetual,
 75 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 76 |       (except as stated in this section) patent license to make, have made,
 77 |       use, offer to sell, sell, import, and otherwise transfer the Work,
 78 |       where such license applies only to those patent claims licensable
 79 |       by such Contributor that are necessarily infringed by their
 80 |       Contribution(s) alone or by combination of their Contribution(s)
 81 |       with the Work to which such Contribution(s) was submitted. If You
 82 |       institute patent litigation against any entity (including a
 83 |       cross-claim or counterclaim in a lawsuit) alleging that the Work
 84 |       or a Contribution incorporated within the Work constitutes direct
 85 |       or contributory patent infringement, then any patent licenses
 86 |       granted to You under this License for that Work shall terminate
 87 |       as of the date such litigation is filed.
 88 | 
 89 |    4. Redistribution. You may reproduce and distribute copies of the
 90 |       Work or Derivative Works thereof in any medium, with or without
 91 |       modifications, and in Source or Object form, provided that You
 92 |       meet the following conditions:
 93 | 
 94 |       (a) You must give any other recipients of the Work or
 95 |           Derivative Works a copy of this License; and
 96 | 
 97 |       (b) You must cause any modified files to carry prominent notices
 98 |           stating that You changed the files; and
 99 | 
100 |       (c) You must retain, in the Source form of any Derivative Works
101 |           that You distribute, all copyright, patent, trademark, and
102 |           attribution notices from the Source form of the Work,
103 |           excluding those notices that do not pertain to any part of
104 |           the Derivative Works; and
105 | 
106 |       (d) If the Work includes a "NOTICE" text file as part of its
107 |           distribution, then any Derivative Works that You distribute must
108 |           include a readable copy of the attribution notices contained
109 |           within such NOTICE file, excluding those notices that do not
110 |           pertain to any part of the Derivative Works, in at least one
111 |           of the following places: within a NOTICE text file distributed
112 |           as part of the Derivative Works; within the Source form or
113 |           documentation, if provided along with the Derivative Works; or,
114 |           within a display generated by the Derivative Works, if and
115 |           wherever such third-party notices normally appear. The contents
116 |           of the NOTICE file are for informational purposes only and
117 |           do not modify the License. You may add Your own attribution
118 |           notices within Derivative Works that You distribute, alongside
119 |           or as an addendum to the NOTICE text from the Work, provided
120 |           that such additional attribution notices cannot be construed
121 |           as modifying the License.
122 | 
123 |       You may add Your own copyright statement to Your modifications and
124 |       may provide additional or different license terms and conditions
125 |       for use, reproduction, or distribution of Your modifications, or
126 |       for any such Derivative Works as a whole, provided Your use,
127 |       reproduction, and distribution of the Work otherwise complies with
128 |       the conditions stated in this License.
129 | 
130 |    5. Submission of Contributions. Unless You explicitly state otherwise,
131 |       any Contribution intentionally submitted for inclusion in the Work
132 |       by You to the Licensor shall be under the terms and conditions of
133 |       this License, without any additional terms or conditions.
134 |       Notwithstanding the above, nothing herein shall supersede or modify
135 |       the terms of any separate license agreement you may have executed
136 |       with Licensor regarding such Contributions.
137 | 
138 |    6. Trademarks. This License does not grant permission to use the trade
139 |       names, trademarks, service marks, or product names of the Licensor,
140 |       except as required for reasonable and customary use in describing the
141 |       origin of the Work and reproducing the content of the NOTICE file.
142 | 
143 |    7. Disclaimer of Warranty. Unless required by applicable law or
144 |       agreed to in writing, Licensor provides the Work (and each
145 |       Contributor provides its Contributions) on an "AS IS" BASIS,
146 |       WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 |       implied, including, without limitation, any warranties or conditions
148 |       of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 |       PARTICULAR PURPOSE. You are solely responsible for determining the
150 |       appropriateness of using or redistributing the Work and assume any
151 |       risks associated with Your exercise of permissions under this License.
152 | 
153 |    8. Limitation of Liability. In no event and under no legal theory,
154 |       whether in tort (including negligence), contract, or otherwise,
155 |       unless required by applicable law (such as deliberate and grossly
156 |       negligent acts) or agreed to in writing, shall any Contributor be
157 |       liable to You for damages, including any direct, indirect, special,
158 |       incidental, or consequential damages of any character arising as a
159 |       result of this License or out of the use or inability to use the
160 |       Work (including but not limited to damages for loss of goodwill,
161 |       work stoppage, computer failure or malfunction, or any and all
162 |       other commercial damages or losses), even if such Contributor
163 |       has been advised of the possibility of such damages.
164 | 
165 |    9. Accepting Warranty or Additional Liability. While redistributing
166 |       the Work or Derivative Works thereof, You may choose to offer,
167 |       and charge a fee for, acceptance of support, warranty, indemnity,
168 |       or other liability obligations and/or rights consistent with this
169 |       License. However, in accepting such obligations, You may act only
170 |       on Your own behalf and on Your sole responsibility, not on behalf
171 |       of any other Contributor, and only if You agree to indemnify,
172 |       defend, and hold each Contributor harmless for any liability
173 |       incurred by, or claims asserted against, such Contributor by reason
174 |       of your accepting any such warranty or additional liability.
175 | 
176 |    END OF TERMS AND CONDITIONS
177 | 
178 |    APPENDIX: How to apply the Apache License to your work.
179 | 
180 |       To apply the Apache License to your work, attach the following
181 |       boilerplate notice, with the fields enclosed by brackets "{}"
182 |       replaced with your own identifying information. (Don't include
183 |       the brackets!)  The text should be enclosed in the appropriate
184 |       comment syntax for the file format. We also recommend that a
185 |       file or class name and description of purpose be included on the
186 |       same "printed page" as the copyright notice for easier
187 |       identification within third-party archives.
188 | 
189 |    Copyright 2016 Common Search contributors
190 |    
191 |    Licensed under the Apache License, Version 2.0 (the "License");
192 |    you may not use this file except in compliance with the License.
193 |    You may obtain a copy of the License at
194 | 
195 |        http://www.apache.org/licenses/LICENSE-2.0
196 | 
197 |    Unless required by applicable law or agreed to in writing, software
198 |    distributed under the License is distributed on an "AS IS" BASIS,
199 |    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 |    See the License for the specific language governing permissions and
201 |    limitations under the License.
202 | 


--------------------------------------------------------------------------------
/MANIFEST:
--------------------------------------------------------------------------------
1 | # file GENERATED by distutils, do NOT edit
2 | LICENSE
3 | Makefile
4 | README.md
5 | gumbocy.c
6 | gumbocy.pyx
7 | setup.py
8 | 


--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | # Files shipped in the source distribution: docs, Cython sources and declarations, generated C/C++ and build helpers.
2 | include *.md
3 | include *.c
4 | include *.cpp
5 | include *.pyx
6 | include *.pxd
7 | include LICENSE
8 | include Makefile


--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
 1 | # Every target below is a command, not a file: declare them phony so a stray
 2 | # file or directory with the same name can never make a target look up to date.
 3 | .PHONY: clean cythonize build_ext rst virtualenv test docker_build docker_shell docker_test
 4 | 
 5 | # Remove build products, generated C/C++ sources, HTML reports and caches.
 6 | clean:
 7 | 	rm -rf *.so build *.c *.cpp *.html dist .cache tests/__pycache__ *.rst
 8 | 
 9 | # Translate gumbocy.pyx to C++ and write an annotated HTML report next to it.
10 | cythonize:
11 | 	cython --cplus -2 --warning-extra --annotate gumbocy.pyx
12 | 
13 | # Compile the extension in place with the interpreter selected by GUMBOCY_PYTHON_VERSION.
14 | build_ext: clean cythonize
15 | ifeq ($(GUMBOCY_PYTHON_VERSION), pypy)
16 | 	/opt/pypy/bin/pypy setup.py build_ext --inplace -Igumbo-parser/src -Lgumbo-parser/.libs -Rgumbo-parser/.libs
17 | else
18 | 	python setup.py build_ext --inplace -Igumbo-parser/src -Lgumbo-parser/.libs -Rgumbo-parser/.libs
19 | endif
20 | 
21 | # Convert README.md to README.rst with pandoc.
22 | rst:
23 | 	pandoc --from=markdown --to=rst --output=README.rst README.md
24 | 
25 | # Recreate ./venv from scratch and install requirements.txt into it.
26 | virtualenv:
27 | 	rm -rf venv
28 | 	virtualenv venv
29 | 	venv/bin/pip install -r requirements.txt
30 | 
31 | # Rebuild the extension, then run the test suite with the selected interpreter.
32 | test: build_ext
33 | ifeq ($(GUMBOCY_PYTHON_VERSION), pypy)
34 | 	/opt/pypy/bin/py.test tests/ -vs
35 | else
36 | 	py.test tests/ -vs
37 | endif
38 | 
39 | # Build the commonsearch/gumbocy image from the Dockerfile in this directory.
40 | docker_build:
41 | 	docker build -t commonsearch/gumbocy .
42 | 
43 | # Open an interactive shell in the image with the working tree mounted at /cosr/gumbocy.
44 | docker_shell:
45 | 	docker run -e GUMBOCY_PYTHON_VERSION -v "$(PWD):/cosr/gumbocy:rw" -w /cosr/gumbocy -i -t commonsearch/gumbocy bash
46 | 
47 | # Run `make test` inside the image with the working tree mounted at /cosr/gumbocy.
48 | docker_test:
49 | 	docker run -e GUMBOCY_PYTHON_VERSION -v "$(PWD):/cosr/gumbocy:rw" -w /cosr/gumbocy -i -t commonsearch/gumbocy make test
50 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # gumbocy
  2 | 
  3 | [![Build Status](https://travis-ci.org/commonsearch/gumbocy.svg?branch=master)](https://travis-ci.org/commonsearch/gumbocy) [![Apache License 2.0](https://img.shields.io/github/license/commonsearch/gumbocy.svg)](LICENSE)
  4 | 
  5 | **gumbocy** is an alternative Python binding for the excellent [Gumbo](https://github.com/google/gumbo-parser) HTML5 parser, originally written for [Common Search](http://about.commonsearch.org).
  6 | 
  7 | It differs from the [official Python binding](https://github.com/google/gumbo-parser/tree/master/python/gumbo) in a few ways:
  8 | 
  9 |  - It is optimized for performance by using [Cython](http://cython.org/).
 10 |  - It has a smaller feature set and doesn't aim to be a general-purpose binding.
 11 |  - Its `listnodes()` API just returns nodes as a flat list of tuples.
 12 |  - Its `analyze()` API traverses the HTML tree and returns high-level data like groups of words and lists of hyperlinks.
 13 |  - It is generally restrictive. For instance, attributes have to be whitelisted.
 14 | 
 15 | ## Installation
 16 | 
 17 | The only dependency is [Gumbo](https://github.com/google/gumbo-parser). You need to install it yourself (the `Dockerfile` shows the build steps used for the Docker image) if you are not using the Docker method below.
 18 | 
 19 | ### From PyPI
 20 | 
 21 | ```
 22 | pip install gumbocy
 23 | ```
 24 | 
 25 | ### From source with Docker
 26 | 
 27 | Clone this repository, then:
 28 | 
 29 | ```
 30 | make docker_build
 31 | make docker_shell
 32 | ```
 33 | 
 34 | You will end up in a container with Gumbo and Gumbocy already installed.
 35 | 
 36 | You can then run the tests for Python 2.7 and PyPy:
 37 | 
 38 | ```
 39 | make docker_test
 40 | GUMBOCY_PYTHON_VERSION=pypy make docker_test
 41 | ```
 42 | 
 43 | ### From source without Docker
 44 | 
 45 | This is an unsupported method.
 46 | 
 47 | ```
 48 | make virtualenv
 49 | source venv/bin/activate
 50 | make build_ext
 51 | ```
 52 | 
 53 | ## Running the tests
 54 | 
 55 | ```
 56 | make test
 57 | ```
 58 | 
 59 | ## Quickstart
 60 | 
 61 | ```
 62 | import gumbocy
 63 | 
 64 | parser = gumbocy.HTMLParser(options={})
 65 | parser.parse("""<html><head><title>Hello</title></head><body>world!</body></html>""")
 66 | print parser.listnodes()
 67 | 
 68 | => [(0, "html"), (1, "head"), (2, "title"), (3, None, "Hello"), (1, "body"), (2, None, "world!")]
 69 | 
 70 | print parser.analyze()
 71 | 
 72 | => {'word_groups': [('world!', 'body')], 'external_hyperlinks': [], 'internal_hyperlinks': [], 'title': 'Hello'}
 73 | 
 74 | ```
 75 | 
 76 | For more usage examples, see the [tests](https://github.com/commonsearch/gumbocy/blob/master/tests/).
 77 | 
 78 | ## Options reference
 79 | 
 80 |  - **attributes_whitelist**: a set of attributes which, if present, will be returned in a dict as the 3rd element of a node tuple by `listnodes()`. Note that "class" is returned as a frozenset. Defaults to `set()`.
 81 |  - **nesting_limit**: an integer to specify the maximum nesting level that will be returned. Defaults to `999`.
 82 |  - **head_only**: a boolean that will make gumbocy return only the elements in the `<head>` of the document. Useful for parsing only `<meta>` tags for instance. Defaults to `False`.
 83 |  - **tags_ignore**: a list of tag names that won't be returned (as well as their children).
 84 |  - **ids_ignore**: a list of IDs for which matching elements (and their children) won't be returned.
 85 |  - **classes_ignore**: a list of classes for which matching elements (and their children) won't be returned.
 86 | 
 87 | 
 88 | ## Contributing
 89 | 
 90 | If you are using Sublime Text, we recommend installing [Cython support](https://github.com/NotSqrt/sublime-cython).
 91 | 
 92 | All contributions are welcome! Feel free to use the [Issues tab](https://github.com/commonsearch/gumbocy/issues) or send us your Pull Requests.
 93 | 
 94 | ## Changelog
 95 | 
 96 | ### 0.2
 97 |  - New `analyze()` API, moving most of the tree traversal that was happening in `cosr-back` to Cython, resulting in a ~3x speedup in indexing speed.
 98 |  - More tests
 99 | 
100 | ### 0.1
101 |  - Initial public release
102 | 


--------------------------------------------------------------------------------
/gumbocy.pxd:
--------------------------------------------------------------------------------
  1 | # https://github.com/google/gumbo-parser/blob/master/src/gumbo.h
  2 | 
  3 | 
  4 | cdef extern from "gumbo.h":
  5 | 
  6 |     ctypedef enum GumboNamespaceEnum:
  7 |         GUMBO_NAMESPACE_HTML
  8 |         GUMBO_NAMESPACE_SVG
  9 |         GUMBO_NAMESPACE_MATHML
 10 | 
 11 |     ctypedef enum GumboAttributeNamespaceEnum:
 12 |         GUMBO_ATTR_NAMESPACE_NONE,
 13 |         GUMBO_ATTR_NAMESPACE_XLINK,
 14 |         GUMBO_ATTR_NAMESPACE_XML,
 15 |         GUMBO_ATTR_NAMESPACE_XMLNS,
 16 | 
 17 |     ctypedef enum GumboQuirksModeEnum:
 18 |         GUMBO_DOCTYPE_NO_QUIRKS,
 19 |         GUMBO_DOCTYPE_QUIRKS,
 20 |         GUMBO_DOCTYPE_LIMITED_QUIRKS
 21 | 
 22 |     ctypedef enum GumboTag:
 23 |         GUMBO_TAG_HTML,
 24 |         GUMBO_TAG_HEAD,
 25 |         GUMBO_TAG_TITLE,
 26 |         GUMBO_TAG_BASE,
 27 |         GUMBO_TAG_LINK,
 28 |         GUMBO_TAG_META,
 29 |         GUMBO_TAG_STYLE,
 30 |         GUMBO_TAG_SCRIPT,
 31 |         GUMBO_TAG_NOSCRIPT,
 32 |         GUMBO_TAG_TEMPLATE,
 33 |         GUMBO_TAG_BODY,
 34 |         GUMBO_TAG_ARTICLE,
 35 |         GUMBO_TAG_SECTION,
 36 |         GUMBO_TAG_NAV,
 37 |         GUMBO_TAG_ASIDE,
 38 |         GUMBO_TAG_H1,
 39 |         GUMBO_TAG_H2,
 40 |         GUMBO_TAG_H3,
 41 |         GUMBO_TAG_H4,
 42 |         GUMBO_TAG_H5,
 43 |         GUMBO_TAG_H6,
 44 |         GUMBO_TAG_HGROUP,
 45 |         GUMBO_TAG_HEADER,
 46 |         GUMBO_TAG_FOOTER,
 47 |         GUMBO_TAG_ADDRESS,
 48 |         GUMBO_TAG_P,
 49 |         GUMBO_TAG_HR,
 50 |         GUMBO_TAG_PRE,
 51 |         GUMBO_TAG_BLOCKQUOTE,
 52 |         GUMBO_TAG_OL,
 53 |         GUMBO_TAG_UL,
 54 |         GUMBO_TAG_LI,
 55 |         GUMBO_TAG_DL,
 56 |         GUMBO_TAG_DT,
 57 |         GUMBO_TAG_DD,
 58 |         GUMBO_TAG_FIGURE,
 59 |         GUMBO_TAG_FIGCAPTION,
 60 |         GUMBO_TAG_MAIN,
 61 |         GUMBO_TAG_DIV,
 62 |         GUMBO_TAG_A,
 63 |         GUMBO_TAG_EM,
 64 |         GUMBO_TAG_STRONG,
 65 |         GUMBO_TAG_SMALL,
 66 |         GUMBO_TAG_S,
 67 |         GUMBO_TAG_CITE,
 68 |         GUMBO_TAG_Q,
 69 |         GUMBO_TAG_DFN,
 70 |         GUMBO_TAG_ABBR,
 71 |         GUMBO_TAG_DATA,
 72 |         GUMBO_TAG_TIME,
 73 |         GUMBO_TAG_CODE,
 74 |         GUMBO_TAG_VAR,
 75 |         GUMBO_TAG_SAMP,
 76 |         GUMBO_TAG_KBD,
 77 |         GUMBO_TAG_SUB,
 78 |         GUMBO_TAG_SUP,
 79 |         GUMBO_TAG_I,
 80 |         GUMBO_TAG_B,
 81 |         GUMBO_TAG_U,
 82 |         GUMBO_TAG_MARK,
 83 |         GUMBO_TAG_RUBY,
 84 |         GUMBO_TAG_RT,
 85 |         GUMBO_TAG_RP,
 86 |         GUMBO_TAG_BDI,
 87 |         GUMBO_TAG_BDO,
 88 |         GUMBO_TAG_SPAN,
 89 |         GUMBO_TAG_BR,
 90 |         GUMBO_TAG_WBR,
 91 |         GUMBO_TAG_INS,
 92 |         GUMBO_TAG_DEL,
 93 |         GUMBO_TAG_IMAGE,
 94 |         GUMBO_TAG_IMG,
 95 |         GUMBO_TAG_IFRAME,
 96 |         GUMBO_TAG_EMBED,
 97 |         GUMBO_TAG_OBJECT,
 98 |         GUMBO_TAG_PARAM,
 99 |         GUMBO_TAG_VIDEO,
100 |         GUMBO_TAG_AUDIO,
101 |         GUMBO_TAG_SOURCE,
102 |         GUMBO_TAG_TRACK,
103 |         GUMBO_TAG_CANVAS,
104 |         GUMBO_TAG_MAP,
105 |         GUMBO_TAG_AREA,
106 |         GUMBO_TAG_MATH,
107 |         GUMBO_TAG_MI,
108 |         GUMBO_TAG_MO,
109 |         GUMBO_TAG_MN,
110 |         GUMBO_TAG_MS,
111 |         GUMBO_TAG_MTEXT,
112 |         GUMBO_TAG_MGLYPH,
113 |         GUMBO_TAG_MALIGNMARK,
114 |         GUMBO_TAG_ANNOTATION_XML,
115 |         GUMBO_TAG_SVG,
116 |         GUMBO_TAG_FOREIGNOBJECT,
117 |         GUMBO_TAG_DESC,
118 |         GUMBO_TAG_TABLE,
119 |         GUMBO_TAG_CAPTION,
120 |         GUMBO_TAG_COLGROUP,
121 |         GUMBO_TAG_COL,
122 |         GUMBO_TAG_TBODY,
123 |         GUMBO_TAG_THEAD,
124 |         GUMBO_TAG_TFOOT,
125 |         GUMBO_TAG_TR,
126 |         GUMBO_TAG_TD,
127 |         GUMBO_TAG_TH,
128 |         GUMBO_TAG_FORM,
129 |         GUMBO_TAG_FIELDSET,
130 |         GUMBO_TAG_LEGEND,
131 |         GUMBO_TAG_LABEL,
132 |         GUMBO_TAG_INPUT,
133 |         GUMBO_TAG_BUTTON,
134 |         GUMBO_TAG_SELECT,
135 |         GUMBO_TAG_DATALIST,
136 |         GUMBO_TAG_OPTGROUP,
137 |         GUMBO_TAG_OPTION,
138 |         GUMBO_TAG_TEXTAREA,
139 |         GUMBO_TAG_KEYGEN,
140 |         GUMBO_TAG_OUTPUT,
141 |         GUMBO_TAG_PROGRESS,
142 |         GUMBO_TAG_METER,
143 |         GUMBO_TAG_DETAILS,
144 |         GUMBO_TAG_SUMMARY,
145 |         GUMBO_TAG_MENU,
146 |         GUMBO_TAG_MENUITEM,
147 |         GUMBO_TAG_APPLET,
148 |         GUMBO_TAG_ACRONYM,
149 |         GUMBO_TAG_BGSOUND,
150 |         GUMBO_TAG_DIR,
151 |         GUMBO_TAG_FRAME,
152 |         GUMBO_TAG_FRAMESET,
153 |         GUMBO_TAG_NOFRAMES,
154 |         GUMBO_TAG_ISINDEX,
155 |         GUMBO_TAG_LISTING,
156 |         GUMBO_TAG_XMP,
157 |         GUMBO_TAG_NEXTID,
158 |         GUMBO_TAG_NOEMBED,
159 |         GUMBO_TAG_PLAINTEXT,
160 |         GUMBO_TAG_RB,
161 |         GUMBO_TAG_STRIKE,
162 |         GUMBO_TAG_BASEFONT,
163 |         GUMBO_TAG_BIG,
164 |         GUMBO_TAG_BLINK,
165 |         GUMBO_TAG_CENTER,
166 |         GUMBO_TAG_FONT,
167 |         GUMBO_TAG_MARQUEE,
168 |         GUMBO_TAG_MULTICOL,
169 |         GUMBO_TAG_NOBR,
170 |         GUMBO_TAG_SPACER,
171 |         GUMBO_TAG_TT,
172 |         GUMBO_TAG_RTC,
173 |         GUMBO_TAG_UNKNOWN,
174 |         GUMBO_TAG_LAST
175 | 
176 |     ctypedef enum GumboParseFlags:
177 | 
178 |         GUMBO_INSERTION_NORMAL = 0,
179 | 
180 |         GUMBO_INSERTION_BY_PARSER = 1 << 0,
181 | 
182 |         GUMBO_INSERTION_IMPLICIT_END_TAG = 1 << 1,
183 | 
184 |         GUMBO_INSERTION_IMPLIED = 1 << 3,
185 | 
186 |         GUMBO_INSERTION_CONVERTED_FROM_END_TAG = 1 << 4,
187 | 
188 |         GUMBO_INSERTION_FROM_ISINDEX = 1 << 5,
189 | 
190 |         GUMBO_INSERTION_FROM_IMAGE = 1 << 6,
191 | 
192 |         GUMBO_INSERTION_RECONSTRUCTED_FORMATTING_ELEMENT = 1 << 7,
193 | 
194 |         GUMBO_INSERTION_ADOPTION_AGENCY_CLONED = 1 << 8,
195 | 
196 |         GUMBO_INSERTION_ADOPTION_AGENCY_MOVED = 1 << 9,
197 | 
198 |         GUMBO_INSERTION_FOSTER_PARENTED = 1 << 10,
199 | 
200 | 
201 |     ctypedef struct GumboVector:
202 |         void** data
203 |         unsigned int length
204 |         unsigned int capacity
205 | 
206 |     # View of a string as a (pointer, byte length) pair. Per the comments in
207 |     # gumbo.h, `data` points into a buffer owned by other code (often the
208 |     # original input), so consumers should read exactly `length` bytes
209 |     # instead of expecting a NUL terminator.
210 |     ctypedef struct GumboStringPiece:
211 |         const char* data
212 |         size_t length
213 | 
214 | 
215 |     ctypedef struct GumboAttribute:
216 |         GumboAttributeNamespaceEnum attr_namespace
217 |         const char* name
218 |         GumboStringPiece original_name
219 |         const char* value
220 |         GumboStringPiece original_value
221 | 
222 |     ctypedef struct GumboDocument:
223 |         GumboVector children
224 |         bint has_doctype
225 |         const char* name
226 |         const char* public_identifier
227 |         const char* system_identifier
228 |         GumboQuirksModeEnum doc_type_quirks_mode
229 | 
230 |     ctypedef struct GumboElement:
231 |         GumboVector children
232 |         GumboTag tag
233 |         GumboNamespaceEnum tag_namespace
234 |         GumboStringPiece original_tag
235 |         GumboStringPiece original_end_tag
236 |         GumboVector attributes
237 | 
238 |     ctypedef struct GumboText:
239 |         const char* text
240 |         GumboStringPiece original_text
241 | 
242 |     ctypedef enum GumboNodeType:  # stored in GumboNode.type
243 |         GUMBO_NODE_DOCUMENT
244 |         GUMBO_NODE_ELEMENT
245 |         GUMBO_NODE_TEXT
246 |         GUMBO_NODE_CDATA
247 |         GUMBO_NODE_COMMENT
248 |         GUMBO_NODE_WHITESPACE
249 |         GUMBO_NODE_TEMPLATE
250 | 
251 |     # GumboNode itself is declared further down, after the GumboNodeData
252 |     # union that it embeds as its `v` member.
253 | 
254 |     ctypedef union GumboNodeData:  # gumbocy.pyx reads `text` for GUMBO_NODE_TEXT and `element` for GUMBO_NODE_ELEMENT
255 |         GumboDocument document
256 |         GumboElement element
257 |         GumboText text
258 | 
259 |     ctypedef struct GumboNode:
260 |         GumboNodeType type
261 |         GumboNode* parent
262 |         size_t index_within_parent
263 |         GumboParseFlags parse_flags
264 |         GumboNodeData v
265 | 
266 |     ctypedef struct GumboOutput:  # returned by gumbo_parse()
267 |         GumboNode* document
268 |         GumboNode* root  # starting node of both traversals in gumbocy.pyx
269 |         GumboVector errors
270 | 
271 |     ctypedef struct GumboOptions:  # opaque here: no fields are declared
272 |         pass
273 | 
274 |     extern const GumboOptions kGumboDefaultOptions
275 | 
276 |     GumboOutput* gumbo_parse(const char* buffer)  # gumbocy.pyx releases the result with gumbo_destroy_output()
277 | 
278 |     void gumbo_destroy_output(const GumboOptions* options, GumboOutput* output)
279 | 
280 |     bint gumbo_string_equals(const GumboStringPiece* str1, const GumboStringPiece* str2)
281 |     bint gumbo_string_equals_ignore_case(const GumboStringPiece* str1, const GumboStringPiece* str2)
282 | 
283 |     int gumbo_vector_index_of(GumboVector* vector, const void* element)
284 | 
285 |     const char* gumbo_normalized_tagname(GumboTag tag)  # gumbocy.pyx treats an empty result as "unknown tag"
286 | 
287 |     void gumbo_tag_from_original_text(GumboStringPiece* text)  # modifies `text` in place; gumbocy.pyx reads the tag name from it afterwards
288 | 
289 |     GumboTag gumbo_tag_enum(const char* tagname)  # gumbocy.pyx skips names that map to GUMBO_TAG_UNKNOWN
290 |     GumboTag gumbo_tagn_enum(const char* tagname, unsigned int length)
291 | 
292 |     GumboAttribute* gumbo_get_attribute(const GumboVector* attrs, const char* name)
293 | 


--------------------------------------------------------------------------------
/gumbocy.pyx:
--------------------------------------------------------------------------------
  1 | import re
  2 | import urlparse
  3 | cimport gumbocy
  4 | cimport re2cy
  5 | from libcpp.unordered_set cimport unordered_set
  6 | from cython.operator cimport dereference as deref
  7 | from libcpp.vector cimport vector
  8 | from libcpp.map cimport map
  9 | 
 10 | 
 11 | cdef extern from "stdio.h":  # C printf; not called anywhere in this file
 12 |     int printf(const char* format, ...);
 13 | 
 14 | cdef vector[re2cy.ArgPtr] *argp = new vector[re2cy.ArgPtr](1)  # one (NULL) slot, so that element 0 exists
 15 | cdef re2cy.ArgPtr *empty_args = &(deref(argp)[0])  # args array handed to RE2 when nothing is captured
 16 | # re2_search(): true when `pattern` matches somewhere in the C string `s`; no capture groups are extracted.
 17 | cdef bint re2_search(const char* s, re2cy.RE2 &pattern):
 18 |     return re2cy.RE2.PartialMatchN(s, pattern, empty_args, 0)
 19 | 
 20 | cdef re2cy.RE2 *_RE2_SEARCH_STYLE_HIDDEN = new re2cy.RE2(r"(display\s*\:\s*none)|(visibility\s*\:\s*hidden)")  # inline style that hides an element (used by guess_node_hidden)
 21 | cdef re2cy.RE2 *_RE2_ABSOLUTE_HREF = new re2cy.RE2(r"^(?:[A-Za-z0-9\+\.\-]+\:)?\/\/")  # href starting with an optional "scheme:" followed by "//"
 22 | cdef re2cy.RE2 *_RE2_IGNORED_HREF = new re2cy.RE2(r"^(?:javascript|mailto|ftp|about)\:")  # href schemes that open_hyperlink() skips
 23 | 
 24 | _RE_SPLIT_WHITESPACE = re.compile(r"\s+")  # splits a class attribute value into class names
 25 | 
 26 | ctypedef enum AttributeNames:  # keys of the Attributes.values dict
 27 |     ATTR_ID,
 28 |     ATTR_ROLE,
 29 |     ATTR_HREF,
 30 |     ATTR_STYLE,
 31 |     ATTR_REL,
 32 |     ATTR_SRC,
 33 |     ATTR_ALT,
 34 |     ATTR_NAME,
 35 |     ATTR_PROPERTY,
 36 |     ATTR_CONTENT
 37 | 
 38 | # ATTR_ID = 0
 39 | # ATTR_ROLE = 1
 40 | # ATTR_HREF = 2
 41 | # ATTR_STYLE = 3
 42 | # ATTR_REL = 4
 43 | # ATTR_SRC = 5
 44 | # ATTR_ALT = 6
 45 | # ATTR_NAME = 7
 46 | # ATTR_PROPERTY = 8
 47 | # ATTR_CONTENT = 9
 48 | 
 49 | # cdef struct Attributes:
 50 | #     int size_classes
 51 | #     vector[char*] classes
 52 | #     bint has_hidden
 53 | #     map[AttributeNames, const char*] values
 54 | 
 55 | cdef class Attributes:  # per-element result of HTMLParser.get_attributes()
 56 |     cdef int size_classes  # number of distinct class names (0 when there is no class attribute)
 57 |     cdef dict values  # AttributeNames member -> attribute value
 58 |     # Alternative C-level layouts, kept for reference (not compiled):
 59 |     #   map[AttributeNames, const char*] values  /  const char* values[10]
 60 |     #   vector[char*] classes
 61 |     cdef list classes  # class names; only assigned when size_classes > 0
 62 |     cdef bint has_hidden  # set for a `hidden` attribute or aria-hidden="true"
 63 | 
 64 | # ctypedef sAttributes Attributes
 65 | 
 66 | cdef class HTMLParser:
 67 | 
 68 |     # Global parser variables
 69 |     cdef int nesting_limit  # maximum traversal depth (option "nesting_limit", default 999)
 70 |     cdef bint head_only  # option "head_only"
 71 |     cdef bint has_ids_ignore  # the has_* flags are set in __cinit__ when their option yields a non-empty list
 72 |     cdef bint has_classes_ignore
 73 |     cdef bint has_ids_hidden
 74 |     cdef bint has_classes_hidden
 75 |     cdef bint has_attributes_whitelist
 76 |     cdef bint has_classes_boilerplate
 77 |     cdef bint has_ids_boilerplate
 78 |     cdef bint has_roles_boilerplate
 79 |     cdef bint has_metas_whitelist
 80 | 
 81 |     cdef unordered_set[int] tags_ignore  # the tag sets hold GumboTag values cast to int
 82 |     cdef unordered_set[int] tags_ignore_head_only
 83 |     cdef unordered_set[int] tags_boilerplate
 84 |     cdef unordered_set[int] tags_boilerplate_bypass
 85 |     cdef unordered_set[int] tags_separators
 86 | 
 87 |     cdef re2cy.RE2* attributes_whitelist  # each RE2* below is only assigned in __cinit__ when its has_* flag is set
 88 |     cdef re2cy.RE2* metas_whitelist
 89 |     cdef re2cy.RE2* classes_ignore
 90 |     cdef re2cy.RE2* ids_ignore
 91 |     cdef re2cy.RE2* classes_hidden
 92 |     cdef re2cy.RE2* ids_hidden
 93 |     cdef re2cy.RE2* classes_boilerplate
 94 |     cdef re2cy.RE2* ids_boilerplate
 95 |     cdef re2cy.RE2* roles_boilerplate
 96 | 
 97 |     cdef bint analyze_internal_hyperlinks
 98 |     cdef bint analyze_external_hyperlinks
 99 |     cdef bint analyze_word_groups
100 | 
101 |     # Variables reinitialized at each analyze()
102 |     cdef list current_stack  # tag names of the elements currently being traversed
103 | 
104 |     cdef bint has_url  # set in analyze() when a url is given and hyperlinks are analyzed
105 |     cdef char* url  # url / netloc / scheme are assigned from Python strings in analyze()
106 |     cdef char* netloc
107 |     cdef char* scheme
108 |     cdef re2cy.RE2* internal_netloc_search  # built in analyze() from the netloc of the url
109 | 
110 |     cdef dict analysis  # result dict returned by analyze()
111 | 
112 |     cdef object current_word_group  # None or [text, tag_name]
113 |     cdef object current_hyperlink  # None or [href, text, rel]
114 | 
115 |     cdef bint has_output  # true while `output` holds a tree produced by parse()
116 |     cdef gumbocy.GumboOutput* output
117 |     cdef list nodes  # result list of listnodes()
118 | 
119 |     def __cinit__(self, dict options=None):
120 |         """ Read the options dict, compile the RE2 filters and fill the tag sets """
121 |         options = options or {}
122 | 
123 |         self.nesting_limit = options.get("nesting_limit", 999)
124 |         self.head_only = bool(options.get("head_only"))
125 | 
126 |         self.analyze_external_hyperlinks = bool(options.get("analyze_external_hyperlinks", True))
127 |         self.analyze_internal_hyperlinks = bool(options.get("analyze_internal_hyperlinks", True))
128 |         self.analyze_word_groups = bool(options.get("analyze_word_groups", True))
129 | 
130 |         attributes_whitelist = set(options.get("attributes_whitelist") or [])
131 |         # Each list below becomes one anchored RE2 alternation; entries are inserted unescaped
132 |         classes_ignore = frozenset(options.get("classes_ignore") or [])
133 |         if len(classes_ignore) > 0:
134 |             self.has_classes_ignore = True
135 |             self.classes_ignore = new re2cy.RE2("^(?:" + "|".join(classes_ignore) + ")$")
136 |             attributes_whitelist.add("class")
137 | 
138 |         ids_ignore = frozenset(options.get("ids_ignore") or [])
139 |         if len(ids_ignore) > 0:
140 |             self.has_ids_ignore = True
141 |             self.ids_ignore = new re2cy.RE2("^(?:" + "|".join(ids_ignore) + ")$")
142 |             attributes_whitelist.add("id")
143 | 
144 |         classes_hidden = frozenset(options.get("classes_hidden") or [])
145 |         if len(classes_hidden) > 0:
146 |             self.has_classes_hidden = True
147 |             self.classes_hidden = new re2cy.RE2("^(?:" + "|".join(classes_hidden) + ")$")
148 |             attributes_whitelist.add("class")
149 | 
150 |         ids_hidden = frozenset(options.get("ids_hidden") or [])
151 |         if len(ids_hidden) > 0:
152 |             self.has_ids_hidden = True
153 |             self.ids_hidden = new re2cy.RE2("^(?:" + "|".join(ids_hidden) + ")$")
154 |             attributes_whitelist.add("id")
155 | 
156 |         classes_boilerplate = frozenset(options.get("classes_boilerplate") or [])
157 |         if len(classes_boilerplate) > 0:
158 |             self.has_classes_boilerplate = True
159 |             self.classes_boilerplate = new re2cy.RE2("^(?:" + "|".join(classes_boilerplate) + ")$")
160 |             attributes_whitelist.add("class")
161 | 
162 |         ids_boilerplate = frozenset(options.get("ids_boilerplate") or [])
163 |         if len(ids_boilerplate) > 0:
164 |             self.has_ids_boilerplate = True
165 |             self.ids_boilerplate = new re2cy.RE2("^(?:" + "|".join(ids_boilerplate) + ")$")
166 |             attributes_whitelist.add("id")
167 | 
168 |         roles_boilerplate = frozenset(options.get("roles_boilerplate") or [])
169 |         if len(roles_boilerplate) > 0:
170 |             self.has_roles_boilerplate = True
171 |             self.roles_boilerplate = new re2cy.RE2("^(?:" + "|".join(roles_boilerplate) + ")$")
172 |             attributes_whitelist.add("role")
173 | 
174 |         metas_whitelist = frozenset(options.get("metas_whitelist") or [])
175 |         if len(metas_whitelist) > 0:
176 |             self.has_metas_whitelist = True
177 |             self.metas_whitelist = new re2cy.RE2("^(?:" + "|".join(metas_whitelist) + ")$")
178 |             attributes_whitelist.add("name")
179 |             attributes_whitelist.add("property")
180 |             attributes_whitelist.add("content")
181 | 
182 |         # Some options add attributes to the whitelist
183 |         if self.analyze_external_hyperlinks or self.analyze_internal_hyperlinks:
184 |             attributes_whitelist.add("href")
185 |             attributes_whitelist.add("rel")
186 | 
187 |         # Finally, freeze the attributes whitelist
188 |         self.has_attributes_whitelist = len(attributes_whitelist) > 0
189 |         if self.has_attributes_whitelist:
190 |             self.attributes_whitelist = new re2cy.RE2("^(?:" + "|".join(attributes_whitelist) + ")$")
191 |         # Tags that end the traversal of their level when head_only is set
192 |         self.tags_ignore_head_only.insert(gumbocy.GUMBO_TAG_BODY)
193 |         self.tags_ignore_head_only.insert(gumbocy.GUMBO_TAG_P)
194 |         self.tags_ignore_head_only.insert(gumbocy.GUMBO_TAG_DIV)
195 |         self.tags_ignore_head_only.insert(gumbocy.GUMBO_TAG_SPAN)
196 |         # tags_* options: None counts as an empty list, unknown tag names are skipped
197 |         for tag_name in options.get("tags_ignore") or []:
198 |             tag = gumbocy.gumbo_tag_enum(tag_name)
199 |             if tag != gumbocy.GUMBO_TAG_UNKNOWN:
200 |                 self.tags_ignore.insert(<int> tag)
201 | 
202 |         for tag_name in options.get("tags_boilerplate") or []:
203 |             tag = gumbocy.gumbo_tag_enum(tag_name)
204 |             if tag != gumbocy.GUMBO_TAG_UNKNOWN:
205 |                 self.tags_boilerplate.insert(<int> tag)
206 | 
207 |         for tag_name in options.get("tags_boilerplate_bypass") or []:
208 |             tag = gumbocy.gumbo_tag_enum(tag_name)
209 |             if tag != gumbocy.GUMBO_TAG_UNKNOWN:
210 |                 self.tags_boilerplate_bypass.insert(<int> tag)
211 | 
212 |         for tag_name in options.get("tags_separators") or []:
213 |             tag = gumbocy.gumbo_tag_enum(tag_name)
214 |             if tag != gumbocy.GUMBO_TAG_UNKNOWN:
215 |                 self.tags_separators.insert(<int> tag)
216 |         # <body> always separates word groups
217 |         self.tags_separators.insert(gumbocy.GUMBO_TAG_BODY)
218 | 
219 |     cdef bint guess_node_hidden(self, gumbocy.GumboNode* node, Attributes attrs):
220 |         """ Heuristic: does this element declare itself as hidden?
221 | 
222 |             Only a rough guess, this is not an anti-spam measure.
223 |         """
224 | 
225 |         if not self.has_attributes_whitelist:
226 |             return False
227 | 
228 |         # `hidden` (HTML5) or aria-hidden="true" was seen on the element
229 |         if attrs.has_hidden:
230 |             return True
231 | 
232 |         node_id = attrs.values.get(ATTR_ID)
233 |         if self.has_ids_hidden and node_id and re2_search(node_id, deref(self.ids_hidden)):
234 |             return True
235 | 
236 |         if self.has_classes_hidden and attrs.size_classes > 0:
237 |             for class_name in attrs.classes:
238 |                 if re2_search(class_name, deref(self.classes_hidden)):
239 |                     return True
240 | 
241 |         inline_style = attrs.values.get(ATTR_STYLE)
242 |         if inline_style and re2_search(inline_style, deref(_RE2_SEARCH_STYLE_HIDDEN)):
243 |             return True
244 | 
245 |         return False
246 | 
247 | 
248 |     cdef bint guess_node_boilerplate(self, gumbocy.GumboNode* node, Attributes attrs):
249 |         """ Heuristic: is this element boilerplate, judging by its tag, classes, id or role? """
250 | 
251 |         # Tags configured as boilerplate always qualify
252 |         if self.tags_boilerplate.count(<int> node.v.element.tag):
253 |             return True
254 | 
255 |         # An <aside> outside of any <article>: http://html5doctor.com/understanding-aside/
256 |         if node.v.element.tag == gumbocy.GUMBO_TAG_ASIDE and "article" not in self.current_stack:
257 |             return True
258 | 
259 |         if self.has_classes_boilerplate and attrs.size_classes > 0:
260 |             for class_name in attrs.classes:
261 |                 if re2_search(class_name, deref(self.classes_boilerplate)):
262 |                     return True
263 | 
264 |         node_id = attrs.values.get(ATTR_ID)
265 |         if self.has_ids_boilerplate and node_id and re2_search(node_id, deref(self.ids_boilerplate)):
266 |             return True
267 | 
268 |         node_role = attrs.values.get(ATTR_ROLE)
269 |         if self.has_roles_boilerplate and node_role and re2_search(node_role, deref(self.roles_boilerplate)):
270 |             return True
271 | 
272 |         return False
273 | 
274 |     cdef Attributes get_attributes(self, gumbocy.GumboNode* node):
275 |         """ Collect the whitelisted attributes of an element node.
276 | 
277 |             Returns an Attributes object: `values` maps AttributeNames members to
278 |             attribute values (id, role, rel, name and property are lowercased),
279 |             `classes` holds the distinct lowercased class names and `has_hidden`
280 |             is set for `hidden` or aria-hidden="true".
281 |         """
282 | 
283 |         attrs = Attributes()
284 |         attrs.size_classes = 0
285 |         attrs.has_hidden = 0
286 |         attrs.values = {}
287 | 
288 |         # Without a whitelist no pattern was compiled: self.attributes_whitelist
289 |         # is NULL then and must not be dereferenced, so nothing is collected.
290 |         if not self.has_attributes_whitelist:
291 |             return attrs
292 | 
293 |         for i in range(node.v.element.attributes.length):
294 |             attr = <gumbocy.GumboAttribute *> node.v.element.attributes.data[i]
295 |             if re2_search(attr.name, deref(self.attributes_whitelist)):
296 |                 if attr.name == b"class":
297 |                     multiple_value = frozenset(_RE_SPLIT_WHITESPACE.split(attr.value.strip().lower()))
298 |                     attrs.size_classes = len(multiple_value)
299 |                     if attrs.size_classes > 0:
300 |                         attrs.classes = list(multiple_value)
301 | 
302 |                 elif attr.name == b"id":
303 |                     pystr = str(attr.value).lower()
304 |                     attrs.values[ATTR_ID] = pystr
305 | 
306 |                 elif attr.name == b"style":
307 |                     attrs.values[ATTR_STYLE] = attr.value
308 | 
309 |                 elif attr.name == b"href":
310 |                     attrs.values[ATTR_HREF] = attr.value
311 | 
312 |                 elif attr.name == b"role":
313 |                     pystr = str(attr.value).lower()
314 |                     attrs.values[ATTR_ROLE] = pystr
315 | 
316 |                 elif attr.name == b"rel":
317 |                     pystr = str(attr.value).lower()
318 |                     attrs.values[ATTR_REL] =  pystr
319 | 
320 |                 elif attr.name == b"aria-hidden" and attr.value == b"true":
321 |                     attrs.has_hidden = 1
322 | 
323 |                 elif attr.name == b"hidden":
324 |                     attrs.has_hidden = 1
325 | 
326 |                 elif attr.name == b"alt":
327 |                     attrs.values[ATTR_ALT] = attr.value
328 | 
329 |                 elif attr.name == b"src":
330 |                     attrs.values[ATTR_SRC] = attr.value
331 | 
332 |                 elif attr.name == b"name":
333 |                     pystr = str(attr.value).lower()
334 |                     attrs.values[ATTR_NAME] = pystr
335 | 
336 |                 elif attr.name == b"property":
337 |                     pystr = str(attr.value).lower()
338 |                     attrs.values[ATTR_PROPERTY] = pystr
339 | 
340 |                 elif attr.name == b"content":
341 |                     attrs.values[ATTR_CONTENT] = attr.value
342 | 
343 |         return attrs
344 | 
345 |     cdef void close_word_group(self):
346 |         """ Flush the pending word group, if there is one, into analysis["word_groups"] """
347 |         if not self.current_word_group:
348 |             return
349 |         self.analysis["word_groups"].append(tuple(self.current_word_group))
350 |         self.current_word_group = None
351 | 
352 | 
353 |     cdef void add_text(self, text):
354 |         """ Append stripped text to the open word group, or open one tagged with the current element """
355 |         stripped = text.strip()
356 |         if self.current_word_group:
357 |             self.current_word_group[0] += " " + stripped
358 |         else:
359 |             self.current_word_group = [stripped, self.current_stack[-1]]
360 | 
361 |     cdef void add_hyperlink_text(self, text):
362 |         """ Accumulate anchor text on the open hyperlink; does nothing when none is open """
363 |         if not self.current_hyperlink:
364 |             return
365 |         self.current_hyperlink[1] += text
366 | 
367 |     cdef void open_hyperlink(self, Attributes attrs):
368 |         """ Start tracking a new hyperlink described by the given attributes """
369 | 
370 |         # Nothing to collect when both hyperlink analyses are switched off
371 |         if not (self.analyze_external_hyperlinks or self.analyze_internal_hyperlinks):
372 |             return
373 | 
374 |         # A missing or empty href is not a link
375 |         href = attrs.values.get(ATTR_HREF)
376 |         if not href or len(href) == 0:
377 |             return
378 | 
379 |         # javascript:, mailto:, ftp: and about: targets are skipped
380 |         if re2_search(href, deref(_RE2_IGNORED_HREF)):
381 |             return
382 | 
383 |         self.close_hyperlink()
384 |         # Layout of the record: href, text, rel
385 |         self.current_hyperlink = [href, "", attrs.values.get(ATTR_REL)]
386 | 
387 |     cdef void close_hyperlink(self):
388 |         """ Store the open hyperlink, if any, as an internal or external link and forget it """
389 | 
390 |         cdef bint is_external = 0
391 | 
392 |         if not (self.analyze_external_hyperlinks or self.analyze_internal_hyperlinks):
393 |             return
394 | 
395 |         if not self.current_hyperlink:
396 |             return
397 | 
398 |         href = self.current_hyperlink[0]
399 | 
400 |         if re2_search(href, deref(_RE2_ABSOLUTE_HREF)):
401 |             is_external = 1
402 | 
403 |             if self.has_url:
404 |                 # Scheme-less "//host/..." link: prepend the scheme of the page URL
405 |                 if href.startswith("//"):
406 |                     href = self.scheme + ":" + href
407 | 
408 |                 # An absolute link may still point to the page's own domain
409 |                 if re2_search(href, deref(self.internal_netloc_search)):
410 |                     is_external = 0
411 |                     href = href.split(self.netloc, 1)[1]
412 | 
413 |         link = (href, self.current_hyperlink[1], self.current_hyperlink[2])
414 | 
415 |         if is_external:
416 |             if self.analyze_external_hyperlinks:
417 |                 self.analysis["external_hyperlinks"].append(link)
418 | 
419 |         elif self.analyze_internal_hyperlinks:
420 |             self.analysis["internal_hyperlinks"].append(link)
421 | 
422 |         self.current_hyperlink = None
423 | 
424 |     cdef bint _traverse_node(self, int level, gumbocy.GumboNode* node, bint is_head, bint is_hidden, bint is_boilerplate, bint is_boilerplate_bypassed, bint is_hyperlink):
425 |         """ Traverses the node tree and fills self.analysis. Return 1 to stop at this level.
426 |             The is_* flags carry the state inherited from the ancestor elements. """
427 |         cdef GumboStringPiece gsp
428 |         cdef const char* tag_name
429 |         cdef int tag_n
430 |         # Nodes deeper than nesting_limit are skipped; their siblings are still visited
431 |         if level > self.nesting_limit:
432 |             return 0
433 | 
434 |         if node.type == gumbocy.GUMBO_NODE_TEXT:
435 | 
436 |             if (self.analyze_internal_hyperlinks or self.analyze_external_hyperlinks) and is_hyperlink:
437 |                 self.add_hyperlink_text(node.v.text.text)
438 | 
439 |             if self.analyze_word_groups and not is_head and not is_hidden and (not is_boilerplate or is_boilerplate_bypassed):
440 |                 self.add_text(node.v.text.text)
441 | 
442 |         elif node.type == gumbocy.GUMBO_NODE_ELEMENT:
443 | 
444 |             tag_n = <int> node.v.element.tag
445 |             # In head_only mode these tags make the caller stop iterating over this level
446 |             if self.head_only and self.tags_ignore_head_only.count(tag_n):
447 |                 return 1
448 |             # Ignored tags are skipped together with their whole subtree
449 |             if self.tags_ignore.count(tag_n):
450 |                 return 0
451 | 
452 |             tag_name = gumbocy.gumbo_normalized_tagname(node.v.element.tag)
453 | 
454 |             # When we find an unknown tag, find its tag_name in the buffer
455 |             if tag_name == b"":
456 |                 gsp = node.v.element.original_tag
457 |                 gumbo_tag_from_original_text(&gsp)
458 |                 py_tag_name = gsp.data[:gsp.length].lower()  # length-bounded slice: copies only the tag name, not the rest of the buffer
459 |                 tag_name = <const char *> py_tag_name
460 | 
461 |             # Whitelisted attributes of this element
462 | 
463 |             attrs = self.get_attributes(node)
464 | 
465 |             if self.has_classes_ignore and attrs.size_classes > 0:
466 |                 for v in attrs.classes:
467 |                     if re2_search(v, deref(self.classes_ignore)):
468 |                         return 0
469 | 
470 |             if self.has_ids_ignore and attrs.values.get(ATTR_ID):
471 |                 if re2_search(attrs.values[ATTR_ID], deref(self.ids_ignore)):
472 |                     return 0
473 |             # <title>: keep the first text child of the first title seen, never descend into it
474 |             if node.v.element.tag == gumbocy.GUMBO_TAG_TITLE:
475 |                 if not self.analysis.get("title"):
476 |                     if node.v.element.children.length > 0:
477 |                         first_child = <gumbocy.GumboNode *> node.v.element.children.data[0]
478 |                         if first_child.type == gumbocy.GUMBO_NODE_TEXT:
479 |                             self.analysis["title"] = first_child.v.text.text
480 |                 return 0
481 |             # From here on every exit path pops the stack again
482 |             self.current_stack.append(tag_name)
483 | 
484 |             if node.v.element.tag == gumbocy.GUMBO_TAG_HEAD:
485 |                 is_head = 1
486 | 
487 |             elif node.v.element.tag == gumbocy.GUMBO_TAG_A:
488 |                 self.open_hyperlink(attrs)
489 |                 is_hyperlink = 1
490 | 
491 |             elif node.v.element.tag == gumbocy.GUMBO_TAG_IMG:
492 |                 self.close_word_group()
493 |                 if attrs.values.get(ATTR_ALT):
494 |                     self.add_text(attrs.values[ATTR_ALT])
495 |                     self.close_word_group()
496 | 
497 |                 # Text extraction from image filenames disabled for now
498 |                 # if attrs.get("src"):
499 |                 #     if not attrs["src"].startswith("data:"):
500 |                 #         self.add_text(self._split_filename_words(attrs["src"]))
501 |                 #         self.close_word_group()
502 | 
503 | 
504 |             if is_head:
505 |                 if node.v.element.tag == gumbocy.GUMBO_TAG_LINK:
506 | 
507 |                     # TODO: more properties
508 |                     if attrs.values.get(ATTR_REL) and attrs.values.get(ATTR_HREF):
509 |                         self.analysis.setdefault("head_links", [])
510 |                         self.analysis["head_links"].append({"rel": attrs.values[ATTR_REL], "href": attrs.values[ATTR_HREF]})
511 | 
512 |                 elif self.has_metas_whitelist and node.v.element.tag == gumbocy.GUMBO_TAG_META:
513 | 
514 |                     if attrs.values.get(ATTR_CONTENT):
515 |                         # `name` takes precedence over `property` as the key of the meta
516 |                         if attrs.values.get(ATTR_NAME):
517 |                             if re2_search(attrs.values[ATTR_NAME], deref(self.metas_whitelist)):
518 |                                 self.analysis.setdefault("head_metas", {})
519 |                                 self.analysis["head_metas"][attrs.values[ATTR_NAME]] = str(attrs.values[ATTR_CONTENT]).strip()
520 | 
521 |                         elif attrs.values.get(ATTR_PROPERTY):
522 |                             if re2_search(attrs.values[ATTR_PROPERTY], deref(self.metas_whitelist)):
523 |                                 self.analysis.setdefault("head_metas", {})
524 |                                 self.analysis["head_metas"][attrs.values[ATTR_PROPERTY]] = str(attrs.values[ATTR_CONTENT]).strip()
525 | 
526 |                 elif node.v.element.tag == gumbocy.GUMBO_TAG_BASE:
527 |                     if attrs.values.get(ATTR_HREF) and "base_url" not in self.analysis:
528 |                         self.analysis["base_url"] = attrs.values[ATTR_HREF]
529 | 
530 |             # TODO is_article
531 |             # Once set by an ancestor, is_hidden / is_boilerplate stay set for the whole subtree
532 |             if not is_hidden:
533 |                 is_hidden = self.guess_node_hidden(node, attrs)
534 | 
535 |             if is_boilerplate and not is_boilerplate_bypassed:
536 |                 if self.tags_boilerplate_bypass.count(tag_n):
537 |                     is_boilerplate_bypassed = True
538 | 
539 |             if not is_boilerplate:
540 |                 is_boilerplate = self.guess_node_boilerplate(node, attrs)
541 | 
542 |             # print " " * level, "BOILER", tag_name, is_boilerplate, dict(attrs.values), attrs.classes
543 | 
544 |             # Close the word group
545 |             if self.tags_separators.count(tag_n):
546 |                 self.close_word_group()
547 | 
548 |             # Call _traverse_node() recursively for each of the children
549 |             for i in range(node.v.element.children.length):
550 |                 child = <gumbocy.GumboNode *>node.v.element.children.data[i]
551 |                 if self._traverse_node(level + 1, child, is_head, is_hidden, is_boilerplate, is_boilerplate_bypassed, is_hyperlink) == 1:
552 |                     break
553 | 
554 |             # Close the word group
555 |             if self.tags_separators.count(tag_n):
556 |                 self.close_word_group()
557 | 
558 |             self.current_stack.pop()
559 | 
560 |             if node.v.element.tag == gumbocy.GUMBO_TAG_A:
561 |                 self.close_hyperlink()
562 |             # In head_only mode nothing after </head> is of interest
563 |             if node.v.element.tag == gumbocy.GUMBO_TAG_HEAD:
564 |                 if self.head_only:
565 |                     return 1
566 | 
567 |         return 0
568 | 
569 |     def parse(self, char* html):
570 |         """ Do the actual parsing of the HTML with gumbo """
571 |         # Release the tree of a previous parse(), if any, before building a new one
572 |         self.free()
573 |         self.output = gumbocy.gumbo_parse(html)  # NOTE(review): the original_* string pieces presumably point into `html`; keep it alive until analyze()/listnodes() are done - confirm
574 |         self.has_output = 1
575 | 
576 |     def analyze(self, url=None):
577 |         """ Traverse the tree built by parse() and return the results; `url` is the page URL used to recognize internal links """
578 |         # self.output is NULL until parse() has run: raise instead of dereferencing it
579 |         if not self.has_output:
580 |             raise RuntimeError("analyze() requires parse() to be called first")
581 | 
582 |         self.analysis = {}
583 |         self.has_url = 0
584 | 
585 |         if self.analyze_internal_hyperlinks or self.analyze_external_hyperlinks:
586 |             if url:
587 |                 self.has_url = 1
588 |                 self.url = url
589 |                 parsed = urlparse.urlparse(url)
590 |                 netloc = parsed.netloc.lower()
591 |                 self.netloc = netloc
592 |                 self.scheme = parsed.scheme
593 |                 pattern = "^http(?:s)?://%s" % re.escape(self.netloc)
594 |                 # Free the matcher of a previous analyze() call (deleting NULL is a no-op)
595 |                 del self.internal_netloc_search
596 |                 self.internal_netloc_search = new re2cy.RE2(pattern)
597 |             if self.analyze_internal_hyperlinks:
598 |                 self.analysis["internal_hyperlinks"] = []
599 |             if self.analyze_external_hyperlinks:
600 |                 self.analysis["external_hyperlinks"] = []
601 |         if self.analyze_word_groups:
602 |             self.analysis["word_groups"] = []
603 | 
604 |         self.current_stack = []
605 |         self.current_word_group = None
606 |         self.current_hyperlink = None
607 |         self._traverse_node(0, self.output.root, 0, 0, 0, 0, 0)
608 |         return self.analysis
609 | 
610 |     #
611 |     # Older listnodes() API support
612 |     #
613 | 
614 |     def listnodes(self):
615 |         """ Return the nodes as a flat list of tuples; raises RuntimeError when parse() was not called first """
616 |         if not self.has_output:
617 |             raise RuntimeError("listnodes() requires parse() to be called first")
618 | 
619 |         self.nodes = []
620 |         self._traverse_node_simple(0, self.output.root)
621 |         return self.nodes
622 | 
623 |     cdef bint _traverse_node_simple(self, int level, gumbocy.GumboNode* node):
624 |         """ Traverses the node tree for listnodes(). Return 1 to stop at this level.
625 |             Appends (level, None, text) for text nodes and (level, tag_name[, attrs]) for elements. """
626 |         cdef GumboStringPiece gsp
627 | 
628 |         if level > self.nesting_limit:
629 |             return 0
630 | 
631 |         if node.type == gumbocy.GUMBO_NODE_TEXT:
632 |             self.nodes.append((level, None, node.v.text.text))
633 | 
634 |         elif node.type == gumbocy.GUMBO_NODE_ELEMENT:
635 | 
636 |             tag_n = <int> node.v.element.tag
637 |             # In head_only mode these tags make the caller stop iterating over this level
638 |             if self.head_only and self.tags_ignore_head_only.count(tag_n):
639 |                 return 1
640 |             # Ignored tags are skipped together with their whole subtree
641 |             if self.tags_ignore.count(tag_n):
642 |                 return 0
643 | 
644 |             tag_name = gumbocy.gumbo_normalized_tagname(node.v.element.tag)
645 | 
646 |             # When we find an unknown tag, find its tag_name in the buffer
647 |             if tag_name == b"":
648 |                 gsp = node.v.element.original_tag
649 |                 gumbo_tag_from_original_text(&gsp)
650 |                 py_tag_name = gsp.data[:gsp.length].lower()  # length-bounded slice: copies only the tag name, not the rest of the buffer
651 |                 tag_name = <const char *> py_tag_name
652 | 
653 |             if self.has_attributes_whitelist:
654 | 
655 |                 # Build a dict with all the whitelisted attributes
656 |                 has_attrs = False
657 |                 attrs = False
658 |                 for i in range(node.v.element.attributes.length):
659 |                     attr = <gumbocy.GumboAttribute *> node.v.element.attributes.data[i]
660 |                     attr_name = str(attr.name)
661 |                     if re2_search(attr_name, deref(self.attributes_whitelist)):
662 |                         if attr_name == b"class":
663 |                             multiple_value = frozenset(_RE_SPLIT_WHITESPACE.split(attr.value.strip().lower()))
664 |                             if len(multiple_value):
665 |                                 if self.has_classes_ignore:
666 |                                     for v in multiple_value:
667 |                                         if re2_search(v, deref(self.classes_ignore)):
668 |                                             return 0
669 | 
670 |                                 if not has_attrs:
671 |                                     attrs = {}
672 |                                     has_attrs = True
673 |                                 attrs[attr_name] = multiple_value
674 | 
675 |                         else:
676 | 
677 |                             if not has_attrs:
678 |                                 attrs = {}
679 |                                 has_attrs = True
680 |                             attrs[attr_name] = attr.value
681 | 
682 |                 if not has_attrs:
683 |                     self.nodes.append((level, tag_name))
684 | 
685 |                 else:
686 |                     # The id check happens after all attributes were read, just before the node is recorded
687 |                     if self.has_ids_ignore:
688 |                         if attrs.get("id") and re2_search(attrs["id"].lower(), deref(self.ids_ignore)):
689 |                             return 0
690 | 
691 |                     self.nodes.append((level, tag_name, attrs))
692 | 
693 |             else:
694 |                 self.nodes.append((level, tag_name))
695 | 
696 |             # Call _traverse_node_simple() recursively for each of the children
697 |             for i in range(node.v.element.children.length):
698 |                 child = <gumbocy.GumboNode *>node.v.element.children.data[i]
699 |                 if self._traverse_node_simple(level + 1, child) == 1:
700 |                     break
701 | 
702 |             if node.v.element.tag == gumbocy.GUMBO_TAG_HEAD and self.head_only:
703 |                 return 1
704 | 
705 |         return 0
706 | 
707 |     def __dealloc__(self):
708 |         """ Release the Gumbo output, if any, when Python deallocates the parser """
709 |         self.free()
710 | 
711 |     cdef free(self):
712 |         """ Destroy the Gumbo output currently held, if any, and clear has_output """
713 |         if not self.has_output:
714 |             return
715 |         gumbocy.gumbo_destroy_output(&gumbocy.kGumboDefaultOptions, self.output)
716 |         self.has_output = 0


--------------------------------------------------------------------------------
/re2cy.pxd:
--------------------------------------------------------------------------------
 1 | from libcpp.string cimport string
 2 | 
 3 | ctypedef Arg* ArgPtr
 4 | 
 5 | # Only the StringPiece members listed below are exposed to Cython.
 6 | cdef extern from "re2/stringpiece.h" namespace "re2":
 7 |     cdef cppclass StringPiece:
 8 |         # Eliding some constructors on purpose.
 9 |         StringPiece(const char*) except +
10 |         StringPiece(const string&) except +
11 | 
12 |         const char* data()
13 |         int length()
14 | 
15 | # Declarations taken from re2.h: the RE2 class and its RE2::Arg helper type.
16 | cdef extern from "re2/re2.h" namespace "re2":
17 | 
18 |     cdef cppclass Arg "RE2::Arg":
19 |         Arg()
20 |     # RE2 is declared with one constructor (from a C string) and one static matcher.
21 |     cdef cppclass RE2:
22 |         RE2(const char*) except +
23 |         # Static matcher; its parameters are unnamed here (see RE2's own documentation).
24 |         @staticmethod
25 |         bint PartialMatchN(
26 |             const char *,
27 |             const RE2&,
28 |             const Arg* const args[],
29 |             int,
30 |         )
31 | 


--------------------------------------------------------------------------------
/requirements-benchmark.txt:
--------------------------------------------------------------------------------
1 | lxml
2 | requests
3 | html5lib
4 | bs4
5 | BeautifulSoup; python_version < '3.0'
6 | gumbo


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | Cython==0.24
2 | pytest==2.9.2
3 | pytest-repeat==0.3.0


--------------------------------------------------------------------------------
/scripts/git-set-file-times:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/perl -w
 2 | use strict;
 3 | 
 4 | # sets mtime and atime of files to the latest commit time in git
 5 | #
 6 | # This is useful for serving static content (managed by git)
 7 | # from a cluster of identically configured HTTP servers.  HTTP
 8 | # clients and content delivery networks can get consistent
 9 | # Last-Modified headers no matter which HTTP server in the
10 | # cluster they hit.  This should improve caching behavior.
11 | #
12 | # This does not take into account merges, but if you're updating
13 | # every machine in the cluster from the same commit (A) to the
14 | # same commit (B), the mtimes will be _consistent_ across all
15 | # machines if not necessarily accurate.
16 | #
17 | # THIS IS NOT INTENDED TO OPTIMIZE BUILD SYSTEMS SUCH AS 'make'
18 | # YOU HAVE BEEN WARNED!
19 | 
20 | my %ls = ();
21 | my $commit_time;
22 | 
23 | if ($ENV{GIT_DIR}) {
24 |   chdir($ENV{GIT_DIR}) or die $!;
25 | }
26 | 
27 | # Record every tracked path. "git ls-files -z" separates paths with NUL bytes,
28 | # so NUL is used as the input record separator while reading them.
29 | $/ = "\0";
30 | open(my $ls_fh, '-|', 'git', 'ls-files', '-z') or die $!;
31 | while (<$ls_fh>) {
32 |   chomp;
33 |   $ls{$_} = $_;
34 | }
35 | close $ls_fh;
36 | 
37 | # Read the history line by line. The list form of open runs git without a
38 | # shell, so every argument in @ARGV reaches "git log" exactly as it was given.
39 | $/ = "\n";
40 | open(my $log_fh, '-|', 'git', 'log', '-m', '-r', '--name-only', '--no-color',
41 |      '--pretty=raw', '-z', @ARGV) or die $!;
42 | while (<$log_fh>) {
43 |   chomp;
44 |   if (/^committer .*? (\d+) (?:[\-\+]\d+)$/) {
45 |     # Remember the timestamp captured from this committer line.
46 |     $commit_time = $1;
47 |   } elsif (s/\0\0commit [a-f0-9]{40}( \(from [a-f0-9]{40}\))?$// or s/\0$//) {
48 |     # What remains is a NUL-separated file list. delete() hands back only the
49 |     # paths still pending, so each file is stamped once, by the first commit seen.
50 |     my @files = delete @ls{split(/\0/, $_)};
51 |     @files = grep { defined $_ } @files;
52 |     next unless @files;
53 |     utime $commit_time, $commit_time, @files;
54 |   }
55 |   # Stop reading the log as soon as every tracked file has been handled.
56 |   last unless %ls;
57 | 
58 | }
59 | close $log_fh;


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | from distutils.core import setup
 2 | from distutils.extension import Extension
 3 | import os
 4 | 
 5 | # gumbocy.c will be present when installing from the source distribution on PyPI
 6 | if os.path.isfile("gumbocy.cpp"):
 7 | 
 8 |   # Use "make cythonize" to build the c file from the .pyx source
 9 |   ext_modules = [
10 |       Extension("gumbocy",
11 |                 ["gumbocy.cpp"],
12 |                 libraries=["gumbo", "re2"],
13 |                 language="c++",
14 |                 extra_compile_args=["-std=c++11", '-O3', '-static-libstdc++'],
15 |                 extra_link_args=["-std=c++11"])  # , "-static"
16 | 
17 |   ]
18 | 
19 | else:
20 |   raise Exception("Must run 'make cythonize' first!")
21 | 
22 | # # If the .c file is missing, we must be in local or installing from GitHub.
23 | # # In this case, we need Cython to be already installed.
24 | # else:
25 | #   from Cython.Build import cythonize
26 | 
27 | #   ext_modules = cythonize([
28 | #       Extension("gumbocy",
29 | #                 ["gumbocy.pyx"],
30 | #                 libraries=["gumbo"],
31 | #                 language="c++",
32 | #                 extra_compile_args=["-std=c++11"],
33 | #                 extra_link_args=["-std=c++11"])
34 | #   ])
35 | 
36 | 
37 | setup(
38 |   name="gumbocy",
39 |   version="0.2.0",
40 |   description="Python binding for gumbo-parser (an HTML5-compliant parser) using Cython",
41 |   author="Common Search contributors",
42 |   license="Apache License, Version 2.0",
43 |   url="https://github.com/commonsearch/gumbocy",
44 |   ext_modules=ext_modules,
45 |   keywords=["gumbo", "gumbo-parser", "gumbo-cython", "gumbocy", "cython", "htmlparser", "html5", "html5lib"],
46 |   classifiers=[
47 |     "Programming Language :: Python",
48 |     "Programming Language :: Python :: 2.7",
49 |     # 'Development Status :: 1 - Planning',
50 |     # 'Development Status :: 2 - Pre-Alpha',
51 |     # 'Development Status :: 3 - Alpha',
52 |     'Development Status :: 4 - Beta',
53 |     # 'Development Status :: 5 - Production/Stable',
54 |     # 'Development Status :: 6 - Mature',
55 |     # 'Development Status :: 7 - Inactive',
56 |     "Programming Language :: Python :: Implementation :: CPython",
57 |     "Programming Language :: Python :: Implementation :: PyPy",
58 |     "Environment :: Other Environment",
59 |     "Intended Audience :: Developers",
60 |     "License :: OSI Approved :: Apache Software License",
61 |     "Operating System :: OS Independent",
62 |     "Topic :: Software Development :: Libraries"
63 |   ]
64 | )
65 | 


--------------------------------------------------------------------------------
/tests/benchmark_parsers.py:
--------------------------------------------------------------------------------
 1 | # Usage: python -m cProfile -s cumtime tests/benchmark_parsers.py
 2 | 
 3 | import os
 4 | import sys
 5 | sys.path.insert(-1, os.getcwd())
 6 | 
 7 | import requests
 8 | import timeit
 9 | import html5lib
10 | import lxml.html
11 | import gumbocy
12 | import gumbo
13 | import bs4
14 | 
15 | if not os.path.isfile("tests/_benchmark_fixture.html"):
16 |     url = 'https://raw.githubusercontent.com/whatwg/html/d8717d8831c276ca65d2d44bbf2ce4ce673997b9/source'
17 |     html = requests.get(url).content
18 |     with open("tests/_benchmark_fixture.html", "w") as f:
19 |         f.write(html)
20 | 
21 | with open("tests/_benchmark_fixture.html", "r") as f:
22 |     html = f.read()
23 |     html_unicode = html.decode("utf-8")
24 | 
25 | 
26 | def bench(name, func):  # print the best time out of three single runs of func
27 |     print('{}: {:.3f} seconds'.format(name, min(timeit.repeat(func, number=1, repeat=3))))
28 | 
29 | 
30 | def benchmark_gumbocy():
31 |     parser = gumbocy.HTMLParser(options={
32 |         "attributes_whitelist": ["id", "class", "style"]
33 |     })
34 |     parser.parse(html)
35 |     nodes = parser.listnodes()
36 | 
37 |     divs_count = 0
38 |     for node in nodes:
39 |         if node[1] == "div":
40 |             divs_count += 1
41 |     print "Gumbocy: ", divs_count
42 | 
43 | 
44 | def benchmark_gumbo_bs3():
45 |     parser = gumbo.soup_parse(html_unicode)
46 |     divs = parser.findAll("div")
47 |     print "gumbo bs3", len(divs)
48 | 
49 | 
50 | def benchmark_lxml_raw():
51 |     parsed = lxml.html.fromstring(html)
52 |     divs = parsed.findall(".//div")
53 |     print "lxml raw", len(divs)
54 | 
55 | 
56 | def benchmark_html5lib_bs4():
57 |     parser = bs4.BeautifulSoup(html, "html5lib")
58 |     divs = parser.find_all("div")
59 |     print "html5lib bs4", len(divs)
60 | 
61 | 
62 | def benchmark_htmlparser_bs4():
63 |     parser = bs4.BeautifulSoup(html, "html.parser")
64 |     divs = parser.find_all("div")
65 |     print "html.parser bs4", len(divs)
66 | 
67 | 
68 | bench("benchmark_gumbocy", benchmark_gumbocy)
69 | bench("benchmark_gumbo_bs3", benchmark_gumbo_bs3)
70 | bench("benchmark_lxml_raw", benchmark_lxml_raw)
71 | bench("benchmark_html5lib_bs4", benchmark_html5lib_bs4)
72 | bench("benchmark_htmlparser_bs4", benchmark_htmlparser_bs4)
73 | 


--------------------------------------------------------------------------------
/tests/conftest.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | 
4 | sys.path.append(os.getcwd())
5 | 
6 | import gumbocy
7 | 


--------------------------------------------------------------------------------
/tests/test_analyze.py:
--------------------------------------------------------------------------------
  1 | import gumbocy
  2 | from test_word_groups import TAGS_SEPARATORS
  3 | 
  4 | 
  5 | def analyze(html, options=None):  # helper: parse `html`, return parser.analyze()
  6 |     parser = gumbocy.HTMLParser(options=options)
  7 |     parser.parse(html)
  8 |     return parser.analyze()
  9 | 
 10 | 
 11 | def test_separators():
 12 |     html = """
 13 |         <p>text</p>
 14 |         <p>text 2</p>
 15 |         <p>pre<p>inner</p></p>
 16 |     """
 17 | 
 18 |     analyzed = analyze(html, options={
 19 |         "tags_separators": ["p"]
 20 |     })
 21 | 
 22 |     assert analyzed["word_groups"] == [
 23 |         ("text", "p"),
 24 |         ("text 2", "p"),
 25 |         ("pre", "p"),
 26 |         ("inner", "p")
 27 |     ]
 28 | 
 29 |     # More word group tests in test_word_groups.py
 30 | 
 31 | 
 32 | def test_hidden_text():
 33 | 
 34 |     html = """<html><head></head><body>
 35 |         <!-- comment -->
 36 |         text
 37 |         <div>textp</div>
 38 |         <div style='display: none;'>hidden by display</div>
 39 |         <div class='_class_noindex'>ignored by class_noindex</div>
 40 |         <div class='_class_noindex class2'>ignored by class_noindex 2</div>
 41 |         <div hidden>hidden by html5 attribute</div>
 42 |         <div aria-hidden="true">hidden by aria</div>
 43 |         <div aria-hidden="false">not_aria</div>
 44 |         <div style='visibility: hidden;'>hidden by visibility</div>
 45 |     </body></html>"""
 46 | 
 47 |     analyzed = analyze(html, options={
 48 |         "attributes_whitelist": ["style", "hidden", "aria-hidden"],
 49 |         "classes_hidden": ["_class_hidden"],
 50 |         "ids_hidden": ["_id_hidden"],
 51 |         "tags_separators": ["div"],
 52 |         "classes_ignore": ["_class_noindex"]
 53 |     })
 54 | 
 55 |     assert analyzed["word_groups"] == [
 56 |         ("text", "body"),
 57 |         ("textp", "div"),
 58 |         ("not_aria", "div")
 59 |     ]
 60 | 
 61 | 
 62 | def test_hidden_siblings():
 63 | 
 64 |     html = """
 65 | <span class='login facebook'>
 66 | Sign in with Facebook
 67 | </span>
 68 | <span class='login'>Or use your Businessweek account</span>
 69 | """
 70 | 
 71 |     analyzed = analyze(html, options={
 72 |         "classes_boilerplate": ["login"]
 73 |     })
 74 | 
 75 |     assert analyzed["word_groups"] == []
 76 | 
 77 | 
 78 | def test_boilerplate_text():
 79 | 
 80 |     html = """<html><head></head><body>
 81 | 
 82 |         <header>
 83 |             Boilerplate
 84 |             <h2>Title</h2>
 85 |         </header>
 86 | 
 87 |         <div class="classboil">x</div>
 88 |         <div id="idboil">y</div>
 89 |         <div role="roleboil">z</div>
 90 | 
 91 |         <h2>Title 2</h2>
 92 |     </body></html>"""
 93 | 
 94 |     analyzed = analyze(html, options={
 95 |         "attributes_whitelist": ["id", "class", "role"],
 96 |         "tags_boilerplate": ["header"],
 97 |         "tags_boilerplate_bypass": ["h2"],
 98 |         "classes_boilerplate": ["classboil"],
 99 |         "ids_boilerplate": ["idboil"],
100 |         "roles_boilerplate": ["roleboil"],
101 |         "tags_separators": TAGS_SEPARATORS
102 |     })
103 | 
104 |     assert analyzed["word_groups"] == [
105 |         ("Title", "h2"),
106 |         ("Title 2", "h2")
107 |     ]
108 | 
109 | 
110 | def test_title():
111 | 
112 |     html = """ <title>test 1</title> <title>test 2</title> """
113 | 
114 |     analyzed = analyze(html, options={
115 |     })
116 | 
117 |     assert analyzed["title"] == "test 1"
118 |     assert len(analyzed["word_groups"]) == 0
119 | 
120 | 
121 | def test_head_metas():
122 | 
123 |     html = """<html>
124 |         <head>
125 |             <meta name="Description" content=" This   is a &lt;summary&gt;!" />
126 |             <meta name="Description2" content=" This2   is a &lt;summary&gt;!" />
127 |         </head>
128 |         <body>This is &lt;body&gt; text</body>
129 |     </html>"""
130 | 
131 |     analyzed = analyze(html, options={
132 |         "metas_whitelist": ["description"]
133 |     })
134 | 
135 |     assert analyzed["head_metas"] == {"description": "This   is a <summary>!"}
136 | 


--------------------------------------------------------------------------------
/tests/test_hyperlinks.py:
--------------------------------------------------------------------------------
  1 | import gumbocy
  2 | from test_word_groups import TAGS_SEPARATORS
  3 | 
  4 | 
  5 | def _links(html, url=None):
  6 |     parser = gumbocy.HTMLParser(options={
  7 |         "tags_separators": TAGS_SEPARATORS
  8 |     })
  9 |     parser.parse(html)
 10 |     ret = parser.analyze(url=url)
 11 |     return {
 12 |         "all": ret["internal_hyperlinks"] + ret["external_hyperlinks"],
 13 |         "internal": ret["internal_hyperlinks"],
 14 |         "external": ret["external_hyperlinks"]
 15 |     }
 16 | 
 17 | 
 18 | def test_get_hyperlinks():
 19 |     links = _links("""<html><head><title>Test title</title></head><body>x</body></html>""")
 20 |     assert len(links["all"]) == 0
 21 |     # An anchor with only a name attribute is not a hyperlink
 22 |     links = _links("""<html><head><title>Test title</title></head><body>
 23 |         <a name="x">Y</a>
 24 |     </body></html>""")
 25 |     assert len(links["all"]) == 0
 26 |     # An empty href is skipped
 27 |     links = _links("""<html><head><title>Test title</title></head><body>
 28 |         <a href="">Y</a>
 29 |     </body></html>""")
 30 |     assert len(links["all"]) == 0
 31 |     # ftp: links are skipped
 32 |     links = _links("""<html><head><title>Test title</title></head><body>
 33 |         <a href="ftp://test.com">Y</a>
 34 |     </body></html>""")
 35 |     assert len(links["all"]) == 0
 36 |     # javascript: links are skipped
 37 |     links = _links("""<html><head><title>Test title</title></head><body>
 38 |         <a href="javascript:hello()">Y</a>
 39 |     </body></html>""")
 40 |     assert len(links["all"]) == 0
 41 |     # mailto: links are skipped
 42 |     links = _links("""<html><head><title>Test title</title></head><body>
 43 |         <a href="mailto:contact@example.com">Y</a>
 44 |     </body></html>""")
 45 |     assert len(links["all"]) == 0
 46 |     # Without a page URL an absolute http link is external; items are (href, text, rel)
 47 |     links = _links("""<html><head><title>Test title</title></head><body>
 48 |         <a href="http://sub.test.com/page1?q=2&a=b#xxx" rel="nofollow">Y</a>
 49 |     </body></html>""")
 50 |     assert len(links["all"]) == 1
 51 |     assert links["external"][0][0] == "http://sub.test.com/page1?q=2&a=b#xxx"
 52 |     assert links["external"][0][1] == "Y"
 53 |     assert links["external"][0][2] == "nofollow"
 54 |     # Root-relative links are internal and kept as written; rel is None when absent
 55 |     links = _links("""<html><head><title>Test title</title></head><body>
 56 |         <a href="/page1?q=2&a=b#xxx">Y X</a>
 57 |     </body></html>""", url="http://sub.test.com/page2")
 58 |     assert len(links["all"]) == 1
 59 |     assert links["internal"][0][0] == "/page1?q=2&a=b#xxx"
 60 |     assert links["internal"][0][1] == "Y X"
 61 |     assert links["internal"][0][2] is None
 62 |     # Parent-relative links are internal and kept as written
 63 |     links = _links("""<html><head><title>Test title</title></head><body>
 64 |         <a href="../page1?q=2&a=b#xxx">Y Z</a>
 65 |     </body></html>""", url="http://sub.test.com/page2/x.html")
 66 |     assert len(links["all"]) == 1
 67 |     assert links["internal"][0][0] == "../page1?q=2&a=b#xxx"
 68 |     assert links["internal"][0][1] == "Y Z"
 69 | 
 70 |     # Absolute links to the same netloc are still internal
 71 |     links = _links("""<html><head><title>Test title</title></head><body>
 72 |         <a href="http://sub.test.com/page1?q=2&a=b#xxx">Y Z</a>
 73 |     </body></html>""", url="http://sub.test.com/page2/x.html")
 74 |     assert len(links["all"]) == 1
 75 |     assert len(links["external"]) == 0
 76 |     assert links["internal"][0][0] == "/page1?q=2&a=b#xxx"
 77 |     assert links["internal"][0][1] == "Y Z"
 78 | 
 79 |     # Cross-scheme links are still considered internal
 80 |     links = _links("""<html><head><title>Test title</title></head><body>
 81 |         <a href="https://sub.test.com/page1?q=2&a=b#xxx">Y Z</a>
 82 |     </body></html>""", url="http://sub.test.com/page2/x.html")
 83 |     assert len(links["all"]) == 1
 84 |     assert links["internal"][0][0] == "/page1?q=2&a=b#xxx"
 85 |     assert links["internal"][0][1] == "Y Z"
 86 |     # Same cross-scheme check, with the page on https and the link on http
 87 |     links = _links("""<html><head><title>Test title</title></head><body>
 88 |         <a href="http://sub.test.com/page1?q=2&a=b#xxx">Y Z</a>
 89 |     </body></html>""", url="https://sub.test.com/page2/x.html")
 90 |     assert len(links["all"]) == 1
 91 |     assert links["internal"][0][0] == "/page1?q=2&a=b#xxx"
 92 |     assert links["internal"][0][1] == "Y Z"
 93 |     # A path that repeats the host name is left intact
 94 |     links = _links("""<html><head><title>Test title</title></head><body>
 95 |         <a href="http://sub.test.com/sub.test.com/page1?q=2&a=b#xxx">Y Z</a>
 96 |     </body></html>""", url="http://sub.test.com/page2/x.html")
 97 |     assert len(links["all"]) == 1
 98 |     assert links["internal"][0][0] == "/sub.test.com/page1?q=2&a=b#xxx"
 99 |     assert links["internal"][0][1] == "Y Z"
100 |     # A protocol-relative link to the same host is internal
101 |     links = _links("""<html><head><title>Test title</title></head><body>
102 |         <a href="//sub.test.com/page1?q=2&a=b#xxx">Y Z</a>
103 |     </body></html>""", url="http://sub.test.com/page2/x.html")
104 |     assert len(links["all"]) == 1
105 |     assert links["internal"][0][0] == "/page1?q=2&a=b#xxx"
106 |     assert links["internal"][0][1] == "Y Z"
107 |     # A protocol-relative link to another host is external and comes back with http://
108 |     links = _links("""<html><head><title>Test title</title></head><body>
109 |         <a href="//sub2.test.com/page1?q=2&a=b#xxx">Y Z</a>
110 |     </body></html>""", url="http://sub.test.com/page2/x.html")
111 |     assert len(links["all"]) == 1
112 |     assert links["external"][0][0] == "http://sub2.test.com/page1?q=2&a=b#xxx"
113 |     assert links["external"][0][1] == "Y Z"
114 |     # An absolute link to another host is external and kept as written
115 |     links = _links("""<html><head><title>Test title</title></head><body>
116 |         <a href="https://sub2.test.com/page1?q=2&a=b#xxx">Y Z</a>
117 |     </body></html>""", url="http://sub.test.com/page2/x.html")
118 |     assert len(links["all"]) == 1
119 |     assert links["external"][0][0] == "https://sub2.test.com/page1?q=2&a=b#xxx"
120 |     assert links["external"][0][1] == "Y Z"
121 | 
122 |     # TODO resolution tests
123 | 


--------------------------------------------------------------------------------
/tests/test_listnodes.py:
--------------------------------------------------------------------------------
  1 | import gumbocy
  2 | 
  3 | 
  4 | def listnodes(html, options=None):  # helper: parse `html`, return parser.listnodes()
  5 |     parser = gumbocy.HTMLParser(options=options)
  6 |     parser.parse(html)
  7 |     return parser.listnodes()
  8 | 
  9 | 
 10 | def test_basic():
 11 |     html = """
 12 |         <html>
 13 |             <HEAD><title>HW</title></head>
 14 |             <body> Hello <a href="http://example.com" id="i" class="c">world</a><br/></body>
 15 |         </html >
 16 |     """
 17 | 
 18 |     iterations = 1  # 300000
 19 |     for _ in range(0, iterations):
 20 |         nodes = listnodes(html, {"attributes_whitelist": ["href"]})
 21 |     assert nodes == [
 22 |         (0, "html"),
 23 |             (1, "head"),
 24 |                 (2, "title"),
 25 |                     (3, None, "HW"),
 26 |             (1, "body"),
 27 |                 (2, None, " Hello "),
 28 |                 (2, "a", {"href": "http://example.com"}),
 29 |                     (3, None, "world"),
 30 |                 (2, "br")
 31 |     ]
 32 | 
 33 | 
 34 | def test_classes():
 35 |     html = """
 36 |         <html>
 37 |             <head></head>
 38 |             <body><p class="para graph  "></p></body>
 39 |         </html >
 40 |     """
 41 | 
 42 |     nodes = listnodes(html, {"attributes_whitelist": ["class"]})
 43 |     assert nodes == [
 44 |         (0, "html"),
 45 |             (1, "head"),
 46 |             (1, "body"),
 47 |                 (2, "p", {"class": frozenset(["para", "graph"])})
 48 |     ]
 49 | 
 50 | 
 51 | def test_ignore():
 52 |     html = """
 53 |         <html>
 54 |             <HEAD><title>HW</title></head>
 55 |             <body> Hello <a href="http://example.com" id="i">world</a><br class="c ign"/></body>
 56 |         </html >
 57 |     """
 58 | 
 59 |     nodes = listnodes(html, {
 60 |         "attributes_whitelist": ["class", "id"],
 61 |         "ids_ignore": ["i"],
 62 |         "classes_ignore": set(["ign"]),
 63 |         "tags_ignore": ["title"]
 64 |     })
 65 |     assert nodes == [
 66 |         (0, "html"),
 67 |             (1, "head"),
 68 |             (1, "body"),
 69 |                 (2, None, " Hello ")
 70 |     ]
 71 | 
 72 | 
 73 | def test_head_only():
 74 |     html = """
 75 |         <html>
 76 |             <HEAD><title>HW</title></head>
 77 |             <body> Hello <a href="http://example.com" id="i">world</a><br class="c ign"/></body>
 78 |         </html >
 79 |     """
 80 | 
 81 |     nodes = listnodes(html, {
 82 |         "head_only": True
 83 |     })
 84 |     assert nodes == [
 85 |         (0, "html"),
 86 |             (1, "head"),
 87 |                 (2, "title"),
 88 |                     (3, None, "HW")
 89 |     ]
 90 | 
 91 |     html = """
 92 |         <html>
 93 |             <p>test</p><title>HW</title>
 94 |             <body> Hello <a href="http://example.com" id="i">world</a><br class="c ign"/></body>
 95 |         </html >
 96 |     """
 97 | 
 98 |     nodes = listnodes(html, {
 99 |         "head_only": True
100 |     })
101 |     assert nodes == [
102 |         (0, "html"),
103 |             (1, "head")
104 |     ]
105 | 
106 | 
107 | def test_unknown_tags():
108 |     html = """
109 |         <html>
110 |             <head></head>
111 |             <body><NEW_TAG class='xx'>inline text</NEW_TAG><new_tag_2 /></body>
112 |         </html >
113 |     """
114 | 
115 |     nodes = listnodes(html, {
116 |         "attributes_whitelist": ["class"],
117 |         "tags_ignore": "new_tag"  # We can't ignore unknown tags at the Gumbocy level (for now?)
118 |     })
119 | 
120 |     assert nodes == [
121 |         (0, "html"),
122 |             (1, "head"),
123 |             (1, "body"),
124 |                 (2, "new_tag", {'class': frozenset(['xx'])}),
125 |                     (3, None, "inline text"),
126 |                 (2, "new_tag_2")
127 |     ]
128 | 


--------------------------------------------------------------------------------
/tests/test_word_groups.py:
--------------------------------------------------------------------------------
  1 | import gumbocy
  2 | import pytest
  3 | 
  4 | TAGS_SEPARATORS = frozenset([
  5 |     "body",
  6 | 
  7 |     # http://www.w3.org/TR/html5/grouping-content.html#grouping-content
  8 |     "p", "pre", "blockquote", "ul", "ol", "li", "dl", "dt", "dd", "figure", "figcaption",
  9 | 
 10 |     "br", "img",
 11 | 
 12 |     "h1", "h2", "h3", "h4", "h5", "h6"
 13 | ])
 14 | 
 15 | 
 16 | SAMPLES = [
 17 |     {
 18 |         "html": """ <p>hello</p> """,
 19 |         "groups": [
 20 |             ("hello", "p")
 21 |         ]
 22 |     },
 23 | 
 24 |     # A <body> is automatically added
 25 |     {
 26 |         "html": """ nobody """,
 27 |         "groups": [
 28 |             ("nobody", "body")
 29 |         ]
 30 |     },
 31 | 
 32 |     # span
 33 |     {
 34 |         "html": """ <p>pre <span>link</span> post</p> """,
 35 |         "groups": [
 36 |             ("pre link post", "p")
 37 |         ]
 38 |     },
 39 | 
 40 |     # a
 41 |     {
 42 |         "html": """ <p>pre <a href="#">link</a> post</p> """,
 43 |         "groups": [
 44 |             ("pre link post", "p")
 45 |         ]
 46 |     },
 47 | 
 48 |     # mid p
 49 |     {
 50 |         "html": """ <p>pre </p><ul><li>li1 x</li></ul> mid <p> post </p> """,
 51 |         "groups": [
 52 |             ("pre", "p"),
 53 |             ("li1 x", "li"),
 54 |             ("mid", "body"),
 55 |             ("post", "p")
 56 |         ]
 57 |     },
 58 | 
 59 |     # Lists
 60 |     {
 61 |         "html": """ pre <ul><li>li1</li><li>li2</li></ul> post """,
 62 |         "groups": [
 63 |             ("pre", "body"),
 64 |             ("li1", "li"),
 65 |             ("li2", "li"),
 66 |             ("post", "body")
 67 |         ]
 68 |     },
 69 | 
 70 |     # HR with illegal <p>. "post" is actually part of <body>.
 71 |     {
 72 |         "html": """ <p>pre <hr/> post</p>""",
 73 |         "groups": [
 74 |             ("pre", "p"),
 75 |             ("post", "body")
 76 |         ]
 77 |     },
 78 | 
 79 |     # Non-closed p tag.
 80 |     {
 81 |         "html": """ pre <p> post""",
 82 |         "groups": [
 83 |             ("pre", "body"),
 84 |             ("post", "p")
 85 |         ]
 86 |     },
 87 | 
 88 |     # BR
 89 |     {
 90 |         "html": """ <p>pre <br/> post </p>""",
 91 |         "groups": [
 92 |             ("pre", "p"),
 93 |             ("post", "p")
 94 |         ]
 95 |     },
 96 | 
 97 |     # IMG filename + alt
 98 |     {
 99 |         "html": """ <p> pre <img src="/test/dir/maceo_parker.jpg" alt="james brown"> post </p>""",
100 |         "groups": [
101 |             ("pre", "p"),
102 |             ("james brown", "img"),
103 |             # ("maceo parker", "img"),
104 |             ("post", "p")
105 |         ]
106 |     },
107 | 
108 |     # IMG whose src is a data URI: the alt text is still expected as a word group
109 |     {
110 |         "html": """<p> pre <img src="data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAUAAAAFCAYAAACNbyblAAAAHElEQVQI12P4//8/w38GIAXDIBKE0DHxgljNBAAO9TXL0Y4OHwAAAABJRU5ErkJggg==" alt="Red dot" /> post </p>""",
111 |         "groups": [
112 |             ("pre", "p"),
113 |             ("Red dot", "img"),
114 |             ("post", "p")
115 |         ]
116 |     },
117 | ]
118 | 
119 | 
120 | # TODO: good coverage of http://www.w3.org/html/wg/drafts/html/master/syntax.html
121 | @pytest.mark.parametrize(("sample"), SAMPLES)
122 | def test_get_word_groups(sample):
123 | 
124 |     parser = gumbocy.HTMLParser(options={
125 |         "tags_separators": TAGS_SEPARATORS,
126 |         "attributes_whitelist": ["src", "alt"]
127 |     })
128 |     parser.parse(sample["html"])
129 |     parsed = parser.analyze()
130 | 
131 |     for i, group in enumerate(parsed["word_groups"]):
132 |         assert group == sample["groups"][i]
133 | 
134 |     assert len(parsed["word_groups"]) == len(sample["groups"])
135 | 


--------------------------------------------------------------------------------