├── .dockerignore
├── .gitignore
├── .travis.yml
├── Dockerfile
├── LICENSE
├── MANIFEST
├── MANIFEST.in
├── Makefile
├── README.md
├── gumbocy.cpp
├── gumbocy.pxd
├── gumbocy.pyx
├── re2cy.pxd
├── requirements-benchmark.txt
├── requirements.txt
├── scripts
│   └── git-set-file-times
├── setup.py
└── tests
    ├── benchmark_parsers.py
    ├── conftest.py
    ├── test_analyze.py
    ├── test_hyperlinks.py
    ├── test_listnodes.py
    └── test_word_groups.py
/.dockerignore:
--------------------------------------------------------------------------------
1 | venv
2 | .git
3 | .cache
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | env/
12 | build/
13 | develop-eggs/
14 | dist/
15 | downloads/
16 | eggs/
17 | .eggs/
18 | lib/
19 | lib64/
20 | parts/
21 | sdist/
22 | var/
23 | *.egg-info/
24 | .installed.cfg
25 | *.egg
26 |
27 | # PyInstaller
28 | # Usually these files are written by a python script from a template
29 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
30 | *.manifest
31 | *.spec
32 |
33 | # Installer logs
34 | pip-log.txt
35 | pip-delete-this-directory.txt
36 |
37 | # Unit test / coverage reports
38 | htmlcov/
39 | .tox/
40 | .coverage
41 | .coverage.*
42 | .cache
43 | nosetests.xml
44 | coverage.xml
45 | *,cover
46 | .hypothesis/
47 |
48 | # Translations
49 | *.mo
50 | *.pot
51 |
52 | # Django stuff:
53 | *.log
54 |
55 | # Sphinx documentation
56 | docs/_build/
57 |
58 | # PyBuilder
59 | target/
60 |
61 | # IPython Notebook
62 | .ipynb_checkpoints
63 |
64 | # Gumbocy-specific
65 | gumbocy.c
66 | gumbocy.html
67 | venv/
68 | *.rst
69 | gumbo-parser
70 | /tests/_benchmark_fixture.html
71 |
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | sudo: required
2 | # The build and the tests both run through Docker (make docker_build / make docker_test).
3 | services:
4 | - docker
5 | # One job per value; the Makefile compares GUMBOCY_PYTHON_VERSION with "pypy" to choose the interpreter.
6 | env:
7 | - GUMBOCY_PYTHON_VERSION=py27
8 | - GUMBOCY_PYTHON_VERSION=pypy
9 | # Print Docker diagnostics, run git-set-file-times, pull the published image, then build. NOTE(review): the pull and the file-time reset presumably exist to get Docker layer-cache hits - confirm.
10 | before_install:
11 | - docker ps
12 | - docker info
13 | - docker version
14 | - ./scripts/git-set-file-times
15 | - docker pull commonsearch/gumbocy
16 | - make docker_build
17 | # Runs `make test` inside the image (see the docker_test target of the Makefile).
18 | script:
19 | - make docker_test
20 |
21 | notifications:
22 | irc: "chat.freenode.net#commonsearch"
23 |
--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM debian:jessie
2 | # Build toolchain, curl and the system Python 2 headers/pip used by every later step.
3 | RUN apt-get update && apt-get upgrade -y && apt-get install -y --no-install-recommends \
4 | curl \
5 | automake \
6 | gcc \
7 | g++ \
8 | make \
9 | libtool \
10 | ca-certificates \
11 | # python3-pip \
12 | # python3-dev \
13 | python-pip \
14 | python-dev \
15 | bzip2
16 |
17 | # Upgrade pip
18 | # RUN pip3 install --upgrade --ignore-installed pip
19 | RUN pip install --upgrade --ignore-installed pip
20 |
21 | # Install Gumbo
22 | ENV GUMBO_VERSION 0.10.1
23 | RUN curl -sL https://github.com/google/gumbo-parser/archive/v$GUMBO_VERSION.tar.gz > gumbo.tgz && \
24 | rm -rf gumbo-parser-$GUMBO_VERSION gumbo-parser && \
25 | tar zxf gumbo.tgz && \
26 | mv gumbo-parser-$GUMBO_VERSION gumbo-parser && \
27 | cd gumbo-parser && ./autogen.sh && ./configure && make && \
28 | make install && ldconfig && cd .. && \
29 | rm -rf gumbo.tgz gumbo-parser
30 |
31 | # The package index is refreshed in this same RUN: the index fetched by the first RUN may be stale when that layer comes from the build cache.
32 | # Optional dependencies for benchmarking
33 | RUN apt-get update && apt-get install -y --no-install-recommends \
34 | libxml2-dev \
35 | libxslt1-dev \
36 | zlib1g-dev
37 |
38 | # RUN ln -s /usr/local/lib/libgumbo.so /usr/lib/python2.7/dist-packages/gumbo/libgumbo.so
39 |
40 | # Install PyPy
41 | RUN curl -L 'https://bitbucket.org/squeaky/portable-pypy/downloads/pypy-5.3.1-linux_x86_64-portable.tar.bz2' -o /pypy.tar.bz2 && \
42 | mkdir -p /opt/pypy/ && tar jxvf /pypy.tar.bz2 -C /opt/pypy/ --strip-components=1 && \
43 | rm /pypy.tar.bz2
44 |
45 | RUN /opt/pypy/bin/pypy -m ensurepip
46 | RUN /opt/pypy/bin/pip install --upgrade --ignore-installed pip
47 |
48 | # Install RE2
49 | RUN mkdir -p /tmp/re2 && \
50 | curl -L 'https://github.com/google/re2/archive/636bc71728b7488c43f9441ecfc80bdb1905b3f0.tar.gz' -o /tmp/re2/re2.tar.gz && \
51 | cd /tmp/re2 && tar zxvf re2.tar.gz --strip-components=1 && \
52 | make && make install && \
53 | rm -rf /tmp/re2 && \
54 | ldconfig
55 |
56 | # Install Python dependencies
57 | # Requirement files are added at / and referenced by absolute path for both interpreters.
58 | ADD requirements-benchmark.txt /requirements-benchmark.txt
59 | ADD requirements.txt /requirements.txt
60 | # RUN pip3 install -r requirements.txt
61 | # RUN pip3 install -r requirements-benchmark.txt
62 | RUN pip install -r /requirements.txt
63 | RUN pip install -r /requirements-benchmark.txt
64 | RUN /opt/pypy/bin/pip install -r /requirements.txt
65 | RUN /opt/pypy/bin/pip install setuptools==18.5 # Because of html5lib
66 | RUN /opt/pypy/bin/pip install -r /requirements-benchmark.txt
67 |
68 | RUN mkdir -p /cosr/gumbocy
69 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 |
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 |
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 |
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 |
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 |
48 | "Contribution" shall mean any work of authorship, including
49 | the original version of the Work and any modifications or additions
50 | to that Work or Derivative Works thereof, that is intentionally
51 | submitted to Licensor for inclusion in the Work by the copyright owner
52 | or by an individual or Legal Entity authorized to submit on behalf of
53 | the copyright owner. For the purposes of this definition, "submitted"
54 | means any form of electronic, verbal, or written communication sent
55 | to the Licensor or its representatives, including but not limited to
56 | communication on electronic mailing lists, source code control systems,
57 | and issue tracking systems that are managed by, or on behalf of, the
58 | Licensor for the purpose of discussing and improving the Work, but
59 | excluding communication that is conspicuously marked or otherwise
60 | designated in writing by the copyright owner as "Not a Contribution."
61 |
62 | "Contributor" shall mean Licensor and any individual or Legal Entity
63 | on behalf of whom a Contribution has been received by Licensor and
64 | subsequently incorporated within the Work.
65 |
66 | 2. Grant of Copyright License. Subject to the terms and conditions of
67 | this License, each Contributor hereby grants to You a perpetual,
68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 | copyright license to reproduce, prepare Derivative Works of,
70 | publicly display, publicly perform, sublicense, and distribute the
71 | Work and such Derivative Works in Source or Object form.
72 |
73 | 3. Grant of Patent License. Subject to the terms and conditions of
74 | this License, each Contributor hereby grants to You a perpetual,
75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 | (except as stated in this section) patent license to make, have made,
77 | use, offer to sell, sell, import, and otherwise transfer the Work,
78 | where such license applies only to those patent claims licensable
79 | by such Contributor that are necessarily infringed by their
80 | Contribution(s) alone or by combination of their Contribution(s)
81 | with the Work to which such Contribution(s) was submitted. If You
82 | institute patent litigation against any entity (including a
83 | cross-claim or counterclaim in a lawsuit) alleging that the Work
84 | or a Contribution incorporated within the Work constitutes direct
85 | or contributory patent infringement, then any patent licenses
86 | granted to You under this License for that Work shall terminate
87 | as of the date such litigation is filed.
88 |
89 | 4. Redistribution. You may reproduce and distribute copies of the
90 | Work or Derivative Works thereof in any medium, with or without
91 | modifications, and in Source or Object form, provided that You
92 | meet the following conditions:
93 |
94 | (a) You must give any other recipients of the Work or
95 | Derivative Works a copy of this License; and
96 |
97 | (b) You must cause any modified files to carry prominent notices
98 | stating that You changed the files; and
99 |
100 | (c) You must retain, in the Source form of any Derivative Works
101 | that You distribute, all copyright, patent, trademark, and
102 | attribution notices from the Source form of the Work,
103 | excluding those notices that do not pertain to any part of
104 | the Derivative Works; and
105 |
106 | (d) If the Work includes a "NOTICE" text file as part of its
107 | distribution, then any Derivative Works that You distribute must
108 | include a readable copy of the attribution notices contained
109 | within such NOTICE file, excluding those notices that do not
110 | pertain to any part of the Derivative Works, in at least one
111 | of the following places: within a NOTICE text file distributed
112 | as part of the Derivative Works; within the Source form or
113 | documentation, if provided along with the Derivative Works; or,
114 | within a display generated by the Derivative Works, if and
115 | wherever such third-party notices normally appear. The contents
116 | of the NOTICE file are for informational purposes only and
117 | do not modify the License. You may add Your own attribution
118 | notices within Derivative Works that You distribute, alongside
119 | or as an addendum to the NOTICE text from the Work, provided
120 | that such additional attribution notices cannot be construed
121 | as modifying the License.
122 |
123 | You may add Your own copyright statement to Your modifications and
124 | may provide additional or different license terms and conditions
125 | for use, reproduction, or distribution of Your modifications, or
126 | for any such Derivative Works as a whole, provided Your use,
127 | reproduction, and distribution of the Work otherwise complies with
128 | the conditions stated in this License.
129 |
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 | any Contribution intentionally submitted for inclusion in the Work
132 | by You to the Licensor shall be under the terms and conditions of
133 | this License, without any additional terms or conditions.
134 | Notwithstanding the above, nothing herein shall supersede or modify
135 | the terms of any separate license agreement you may have executed
136 | with Licensor regarding such Contributions.
137 |
138 | 6. Trademarks. This License does not grant permission to use the trade
139 | names, trademarks, service marks, or product names of the Licensor,
140 | except as required for reasonable and customary use in describing the
141 | origin of the Work and reproducing the content of the NOTICE file.
142 |
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 | agreed to in writing, Licensor provides the Work (and each
145 | Contributor provides its Contributions) on an "AS IS" BASIS,
146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 | implied, including, without limitation, any warranties or conditions
148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 | PARTICULAR PURPOSE. You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
178 | APPENDIX: How to apply the Apache License to your work.
179 |
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "{}"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 |
189 | Copyright 2016 Common Search contributors
190 |
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 |
195 | http://www.apache.org/licenses/LICENSE-2.0
196 |
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 |
--------------------------------------------------------------------------------
/MANIFEST:
--------------------------------------------------------------------------------
1 | # file GENERATED by distutils, do NOT edit
2 | LICENSE
3 | Makefile
4 | README.md
5 | gumbocy.c
6 | gumbocy.pyx
7 | setup.py
8 |
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | # Files shipped in the source distribution.
2 | include *.md
3 | # Generated sources: Cython is invoked with --cplus (see Makefile), so the output is .cpp; *.c is kept for older artefacts.
4 | include *.c
5 | include *.cpp
6 | # Cython sources and the .pxd declaration files they cimport (gumbocy.pxd, re2cy.pxd).
7 | include *.pyx
8 | include *.pxd
9 | include LICENSE
10 | include Makefile
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
1 | clean:
2 | rm -rf *.so build *.c *.cpp *.html dist .cache tests/__pycache__ *.rst
3 | # Translate gumbocy.pyx to C++ (--cplus) with Python 2 semantics (-2) and an annotation report (--annotate).
4 | cythonize:
5 | cython --cplus -2 --warning-extra --annotate gumbocy.pyx
6 | # Clean, regenerate the C++ source, then build the extension in place; PyPy is used when GUMBOCY_PYTHON_VERSION is "pypy".
7 | build_ext: clean cythonize
8 | ifeq ($(GUMBOCY_PYTHON_VERSION), pypy)
9 | /opt/pypy/bin/pypy setup.py build_ext --inplace -Igumbo-parser/src -Lgumbo-parser/.libs -Rgumbo-parser/.libs
10 | else
11 | python setup.py build_ext --inplace -Igumbo-parser/src -Lgumbo-parser/.libs -Rgumbo-parser/.libs
12 | endif
13 | # Convert README.md to README.rst with pandoc.
14 | rst:
15 | pandoc --from=markdown --to=rst --output=README.rst README.md
16 | # Recreate ./venv from scratch and install requirements.txt into it.
17 | virtualenv:
18 | rm -rf venv
19 | virtualenv venv
20 | venv/bin/pip install -r requirements.txt
21 | # Rebuild, then run the tests with the py.test of the interpreter selected by GUMBOCY_PYTHON_VERSION.
22 | test: build_ext
23 | ifeq ($(GUMBOCY_PYTHON_VERSION), pypy)
24 | /opt/pypy/bin/py.test tests/ -vs
25 | else
26 | py.test tests/ -vs
27 | endif
28 | # Build the commonsearch/gumbocy image from the Dockerfile in this directory.
29 | docker_build:
30 | docker build -t commonsearch/gumbocy .
31 | # Interactive shell in the image, with the working tree mounted read-write at /cosr/gumbocy.
32 | docker_shell:
33 | docker run -e GUMBOCY_PYTHON_VERSION -v "$(PWD):/cosr/gumbocy:rw" -w /cosr/gumbocy -i -t commonsearch/gumbocy bash
34 | # Run `make test` in the image; GUMBOCY_PYTHON_VERSION is forwarded from the calling environment.
35 | docker_test:
36 | docker run -e GUMBOCY_PYTHON_VERSION -v "$(PWD):/cosr/gumbocy:rw" -w /cosr/gumbocy -i -t commonsearch/gumbocy make test
37 | .PHONY: clean cythonize build_ext rst virtualenv test docker_build docker_shell docker_test
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # gumbocy
2 |
3 | [Build status](https://travis-ci.org/commonsearch/gumbocy) [Apache License 2.0](LICENSE)
4 |
5 | **gumbocy** is an alternative Python binding for the excellent [Gumbo](https://github.com/google/gumbo-parser) HTML5 parser, originally written for [Common Search](http://about.commonsearch.org).
6 |
7 | It differs from the [official Python binding](https://github.com/google/gumbo-parser/tree/master/python/gumbo) in a few ways:
8 |
9 | - It is optimized for performance by using [Cython](http://cython.org/).
10 | - It has a smaller feature set and doesn't aim to be a general-purpose binding.
11 | - Its `listnodes()` API just returns nodes as a flat list of tuples.
12 | - Its `analyze()` API traverses the HTML tree and returns high-level data like groups of words and lists of hyperlinks.
13 | - It is generally restrictive. For instance, attributes have to be whitelisted.
14 |
15 | ## Installation
16 |
17 | The only dependency is [Gumbo](https://github.com/google/gumbo-parser). You need to install it (possibly with `make gumbo_build`) if you are not using the Docker method below.
18 |
19 | ### From PyPI
20 |
21 | ```
22 | pip install gumbocy
23 | ```
24 |
25 | ### From source with Docker
26 |
27 | Clone this repository, then:
28 |
29 | ```
30 | make docker_build
31 | make docker_shell
32 | ```
33 |
34 | You will end up in a container with Gumbo and Gumbocy already installed.
35 |
36 | You can then run the tests for Python 2.7 and PyPy:
37 |
38 | ```
39 | make docker_test
40 | GUMBOCY_PYTHON_VERSION=pypy make docker_test
41 | ```
42 |
43 | ### From source without Docker
44 |
45 | This is an unsupported method.
46 |
47 | ```
48 | make virtualenv
49 | source venv/bin/activate
50 | make build_ext
51 | ```
52 |
53 | ## Running the tests
54 |
55 | ```
56 | make test
57 | ```
58 |
59 | ## Quickstart
60 |
61 | ```
62 | import gumbocy
63 |
64 | parser = gumbocy.HTMLParser(options={})
65 | parser.parse("""<html><head><title>Hello</title></head><body>world!</body></html>""")
66 | print parser.listnodes()
67 |
68 | => [(0, "html"), (1, "head"), (2, "title"), (3, None, "Hello"), (1, "body"), (2, None, "world!")]
69 |
70 | print parser.analyze()
71 |
72 | => {'word_groups': [('world!', 'body')], 'external_hyperlinks': [], 'internal_hyperlinks': [], 'title': 'Hello'}
73 |
74 | ```
75 |
76 | For more usage examples, see the [tests](https://github.com/commonsearch/gumbocy/blob/master/tests/).
77 |
78 | ## Options reference
79 |
80 | - **attributes_whitelist**: a set of attributes which, if present, will be returned in a dict as the 3rd element of a node tuple by `listnodes()`. Note that "class" is returned as a frozenset. Defaults to `set()`.
81 | - **nesting_limit**: an integer to specify the maximum nesting level that will be returned. Defaults to `999`.
82 | - **head_only**: a boolean that will make gumbocy return only the elements in the `<head>` of the document. Useful for parsing only `<meta>` tags for instance. Defaults to `False`.
83 | - **tags_ignore**: a list of tag names that won't be returned (as well as their children).
84 | - **ids_ignore**: a list of IDs for which matching elements (and their children) won't be returned.
85 | - **classes_ignore**: a list of classes for which matching elements (and their children) won't be returned.
86 |
87 |
88 | ## Contributing
89 |
90 | If you are using Sublime Text, we recommend installing [Cython support](https://github.com/NotSqrt/sublime-cython).
91 |
92 | All contributions are welcome! Feel free to use the [Issues tab](https://github.com/commonsearch/gumbocy/issues) or send us your Pull Requests.
93 |
94 | ## Changelog
95 |
96 | ### 0.2
97 | - New `analyze()` API, moving most of the tree traversal that was happening in `cosr-back` to Cython, resulting in a ~3x speedup in indexing speed.
98 | - More tests
99 |
100 | ### 0.1
101 | - Initial public release
102 |
--------------------------------------------------------------------------------
/gumbocy.pxd:
--------------------------------------------------------------------------------
1 | # https://github.com/google/gumbo-parser/blob/master/src/gumbo.h
2 | # Cython declarations mirroring part of the Gumbo C API declared in gumbo.h
3 | # (enums, structs and functions). All of them live in the extern block below.
4 | cdef extern from "gumbo.h":
5 | # Namespace of an element's tag; type of GumboElement.tag_namespace.
6 | ctypedef enum GumboNamespaceEnum:
7 | GUMBO_NAMESPACE_HTML
8 | GUMBO_NAMESPACE_SVG
9 | GUMBO_NAMESPACE_MATHML
10 | # Namespace of an attribute; type of GumboAttribute.attr_namespace.
11 | ctypedef enum GumboAttributeNamespaceEnum:
12 | GUMBO_ATTR_NAMESPACE_NONE,
13 | GUMBO_ATTR_NAMESPACE_XLINK,
14 | GUMBO_ATTR_NAMESPACE_XML,
15 | GUMBO_ATTR_NAMESPACE_XMLNS,
16 | # Quirks mode of a document; type of GumboDocument.doc_type_quirks_mode.
17 | ctypedef enum GumboQuirksModeEnum:
18 | GUMBO_DOCTYPE_NO_QUIRKS,
19 | GUMBO_DOCTYPE_QUIRKS,
20 | GUMBO_DOCTYPE_LIMITED_QUIRKS
21 | # Tag identifiers; type of GumboElement.tag and argument/result of the tag helper functions declared at the end of this block.
22 | ctypedef enum GumboTag:
23 | GUMBO_TAG_HTML,
24 | GUMBO_TAG_HEAD,
25 | GUMBO_TAG_TITLE,
26 | GUMBO_TAG_BASE,
27 | GUMBO_TAG_LINK,
28 | GUMBO_TAG_META,
29 | GUMBO_TAG_STYLE,
30 | GUMBO_TAG_SCRIPT,
31 | GUMBO_TAG_NOSCRIPT,
32 | GUMBO_TAG_TEMPLATE,
33 | GUMBO_TAG_BODY,
34 | GUMBO_TAG_ARTICLE,
35 | GUMBO_TAG_SECTION,
36 | GUMBO_TAG_NAV,
37 | GUMBO_TAG_ASIDE,
38 | GUMBO_TAG_H1,
39 | GUMBO_TAG_H2,
40 | GUMBO_TAG_H3,
41 | GUMBO_TAG_H4,
42 | GUMBO_TAG_H5,
43 | GUMBO_TAG_H6,
44 | GUMBO_TAG_HGROUP,
45 | GUMBO_TAG_HEADER,
46 | GUMBO_TAG_FOOTER,
47 | GUMBO_TAG_ADDRESS,
48 | GUMBO_TAG_P,
49 | GUMBO_TAG_HR,
50 | GUMBO_TAG_PRE,
51 | GUMBO_TAG_BLOCKQUOTE,
52 | GUMBO_TAG_OL,
53 | GUMBO_TAG_UL,
54 | GUMBO_TAG_LI,
55 | GUMBO_TAG_DL,
56 | GUMBO_TAG_DT,
57 | GUMBO_TAG_DD,
58 | GUMBO_TAG_FIGURE,
59 | GUMBO_TAG_FIGCAPTION,
60 | GUMBO_TAG_MAIN,
61 | GUMBO_TAG_DIV,
62 | GUMBO_TAG_A,
63 | GUMBO_TAG_EM,
64 | GUMBO_TAG_STRONG,
65 | GUMBO_TAG_SMALL,
66 | GUMBO_TAG_S,
67 | GUMBO_TAG_CITE,
68 | GUMBO_TAG_Q,
69 | GUMBO_TAG_DFN,
70 | GUMBO_TAG_ABBR,
71 | GUMBO_TAG_DATA,
72 | GUMBO_TAG_TIME,
73 | GUMBO_TAG_CODE,
74 | GUMBO_TAG_VAR,
75 | GUMBO_TAG_SAMP,
76 | GUMBO_TAG_KBD,
77 | GUMBO_TAG_SUB,
78 | GUMBO_TAG_SUP,
79 | GUMBO_TAG_I,
80 | GUMBO_TAG_B,
81 | GUMBO_TAG_U,
82 | GUMBO_TAG_MARK,
83 | GUMBO_TAG_RUBY,
84 | GUMBO_TAG_RT,
85 | GUMBO_TAG_RP,
86 | GUMBO_TAG_BDI,
87 | GUMBO_TAG_BDO,
88 | GUMBO_TAG_SPAN,
89 | GUMBO_TAG_BR,
90 | GUMBO_TAG_WBR,
91 | GUMBO_TAG_INS,
92 | GUMBO_TAG_DEL,
93 | GUMBO_TAG_IMAGE,
94 | GUMBO_TAG_IMG,
95 | GUMBO_TAG_IFRAME,
96 | GUMBO_TAG_EMBED,
97 | GUMBO_TAG_OBJECT,
98 | GUMBO_TAG_PARAM,
99 | GUMBO_TAG_VIDEO,
100 | GUMBO_TAG_AUDIO,
101 | GUMBO_TAG_SOURCE,
102 | GUMBO_TAG_TRACK,
103 | GUMBO_TAG_CANVAS,
104 | GUMBO_TAG_MAP,
105 | GUMBO_TAG_AREA,
106 | GUMBO_TAG_MATH,
107 | GUMBO_TAG_MI,
108 | GUMBO_TAG_MO,
109 | GUMBO_TAG_MN,
110 | GUMBO_TAG_MS,
111 | GUMBO_TAG_MTEXT,
112 | GUMBO_TAG_MGLYPH,
113 | GUMBO_TAG_MALIGNMARK,
114 | GUMBO_TAG_ANNOTATION_XML,
115 | GUMBO_TAG_SVG,
116 | GUMBO_TAG_FOREIGNOBJECT,
117 | GUMBO_TAG_DESC,
118 | GUMBO_TAG_TABLE,
119 | GUMBO_TAG_CAPTION,
120 | GUMBO_TAG_COLGROUP,
121 | GUMBO_TAG_COL,
122 | GUMBO_TAG_TBODY,
123 | GUMBO_TAG_THEAD,
124 | GUMBO_TAG_TFOOT,
125 | GUMBO_TAG_TR,
126 | GUMBO_TAG_TD,
127 | GUMBO_TAG_TH,
128 | GUMBO_TAG_FORM,
129 | GUMBO_TAG_FIELDSET,
130 | GUMBO_TAG_LEGEND,
131 | GUMBO_TAG_LABEL,
132 | GUMBO_TAG_INPUT,
133 | GUMBO_TAG_BUTTON,
134 | GUMBO_TAG_SELECT,
135 | GUMBO_TAG_DATALIST,
136 | GUMBO_TAG_OPTGROUP,
137 | GUMBO_TAG_OPTION,
138 | GUMBO_TAG_TEXTAREA,
139 | GUMBO_TAG_KEYGEN,
140 | GUMBO_TAG_OUTPUT,
141 | GUMBO_TAG_PROGRESS,
142 | GUMBO_TAG_METER,
143 | GUMBO_TAG_DETAILS,
144 | GUMBO_TAG_SUMMARY,
145 | GUMBO_TAG_MENU,
146 | GUMBO_TAG_MENUITEM,
147 | GUMBO_TAG_APPLET,
148 | GUMBO_TAG_ACRONYM,
149 | GUMBO_TAG_BGSOUND,
150 | GUMBO_TAG_DIR,
151 | GUMBO_TAG_FRAME,
152 | GUMBO_TAG_FRAMESET,
153 | GUMBO_TAG_NOFRAMES,
154 | GUMBO_TAG_ISINDEX,
155 | GUMBO_TAG_LISTING,
156 | GUMBO_TAG_XMP,
157 | GUMBO_TAG_NEXTID,
158 | GUMBO_TAG_NOEMBED,
159 | GUMBO_TAG_PLAINTEXT,
160 | GUMBO_TAG_RB,
161 | GUMBO_TAG_STRIKE,
162 | GUMBO_TAG_BASEFONT,
163 | GUMBO_TAG_BIG,
164 | GUMBO_TAG_BLINK,
165 | GUMBO_TAG_CENTER,
166 | GUMBO_TAG_FONT,
167 | GUMBO_TAG_MARQUEE,
168 | GUMBO_TAG_MULTICOL,
169 | GUMBO_TAG_NOBR,
170 | GUMBO_TAG_SPACER,
171 | GUMBO_TAG_TT,
172 | GUMBO_TAG_RTC,
173 | GUMBO_TAG_UNKNOWN,
174 | GUMBO_TAG_LAST
175 | # Bit flags; type of GumboNode.parse_flags. No member uses bit 2 (1 << 2). NOTE(review): presumably matches upstream gumbo.h - confirm before "fixing" the gap.
176 | ctypedef enum GumboParseFlags:
177 |
178 | GUMBO_INSERTION_NORMAL = 0,
179 |
180 | GUMBO_INSERTION_BY_PARSER = 1 << 0,
181 |
182 | GUMBO_INSERTION_IMPLICIT_END_TAG = 1 << 1,
183 |
184 | GUMBO_INSERTION_IMPLIED = 1 << 3,
185 |
186 | GUMBO_INSERTION_CONVERTED_FROM_END_TAG = 1 << 4,
187 |
188 | GUMBO_INSERTION_FROM_ISINDEX = 1 << 5,
189 |
190 | GUMBO_INSERTION_FROM_IMAGE = 1 << 6,
191 |
192 | GUMBO_INSERTION_RECONSTRUCTED_FORMATTING_ELEMENT = 1 << 7,
193 |
194 | GUMBO_INSERTION_ADOPTION_AGENCY_CLONED = 1 << 8,
195 |
196 | GUMBO_INSERTION_ADOPTION_AGENCY_MOVED = 1 << 9,
197 |
198 | GUMBO_INSERTION_FOSTER_PARENTED = 1 << 10,
199 |
200 | # Array of untyped pointers together with its element count and capacity.
201 | ctypedef struct GumboVector:
202 | void** data
203 | unsigned int length
204 | unsigned int capacity
205 | # A character pointer plus a length. NOTE(review): presumably not NUL-terminated - confirm against gumbo.h.
206 | ctypedef struct GumboStringPiece:
207 | const char* data
208 | size_t length
209 |
210 |
211 | # One attribute: its namespace, plus name and value each exposed twice -
212 | # as a C string (`name`, `value`) and as a GumboStringPiece
213 | # (`original_name`, `original_value`). Instances are returned by
214 | # gumbo_get_attribute(), declared at the end of this block.
215 | ctypedef struct GumboAttribute:
216 | GumboAttributeNamespaceEnum attr_namespace
217 | const char* name
218 | GumboStringPiece original_name
219 | const char* value
220 | GumboStringPiece original_value
221 | # Document payload of a node (member `document` of GumboNodeData).
222 | ctypedef struct GumboDocument:
223 | GumboVector children
224 | bint has_doctype
225 | const char* name
226 | const char* public_identifier
227 | const char* system_identifier
228 | GumboQuirksModeEnum doc_type_quirks_mode
229 | # Element payload of a node (member `element` of GumboNodeData): children, tag, namespace, original tag text and attributes.
230 | ctypedef struct GumboElement:
231 | GumboVector children
232 | GumboTag tag
233 | GumboNamespaceEnum tag_namespace
234 | GumboStringPiece original_tag
235 | GumboStringPiece original_end_tag
236 | GumboVector attributes
237 | # Text payload of a node (member `text` of GumboNodeData).
238 | ctypedef struct GumboText:
239 | const char* text
240 | GumboStringPiece original_text
241 | # Kind of a node; type of GumboNode.type.
242 | ctypedef enum GumboNodeType:
243 | GUMBO_NODE_DOCUMENT
244 | GUMBO_NODE_ELEMENT
245 | GUMBO_NODE_TEXT
246 | GUMBO_NODE_CDATA
247 | GUMBO_NODE_COMMENT
248 | GUMBO_NODE_WHITESPACE
249 | GUMBO_NODE_TEMPLATE
250 | # Commented-out stub declaration of GumboNode (the full struct is declared further down):
251 | # ctypedef struct GumboNode:
252 | # pass
253 | # Union of the three payload structs; reached through GumboNode.v.
254 | ctypedef union GumboNodeData:
255 | GumboDocument document
256 | GumboElement element
257 | GumboText text
258 | # A node: its type, parent pointer, index within the parent, parse flags and payload union.
259 | ctypedef struct GumboNode:
260 | GumboNodeType type
261 | GumboNode* parent
262 | size_t index_within_parent
263 | GumboParseFlags parse_flags
264 | GumboNodeData v
265 | # Value returned (by pointer) from gumbo_parse(): document node, root node and an errors vector.
266 | ctypedef struct GumboOutput:
267 | GumboNode* document
268 | GumboNode* root
269 | GumboVector errors
270 | # Declared opaque: no GumboOptions field is exposed to Cython.
271 | ctypedef struct GumboOptions:
272 | pass
273 | # Options constant declared in gumbo.h. NOTE(review): presumably the value to pass to gumbo_destroy_output() - confirm.
274 | extern const GumboOptions kGumboDefaultOptions
275 | # Parses `buffer` and returns a GumboOutput*. NOTE(review): buffer lifetime and ownership rules are defined by gumbo.h, not visible here.
276 | GumboOutput* gumbo_parse(const char* buffer)
277 | # Takes the options and an output pointer; presumably frees a gumbo_parse() result - confirm against gumbo.h.
278 | void gumbo_destroy_output(const GumboOptions* options, GumboOutput* output)
279 | # GumboStringPiece comparisons: exact and case-insensitive.
280 | bint gumbo_string_equals(const GumboStringPiece* str1, const GumboStringPiece* str2)
281 | bint gumbo_string_equals_ignore_case(const GumboStringPiece* str1, const GumboStringPiece* str2)
282 | # Index of `element` in `vector`. NOTE(review): the value returned when absent is not visible here - check gumbo.h.
283 | int gumbo_vector_index_of(GumboVector* vector, const void* element)
284 | # Maps a GumboTag to a C string.
285 | const char* gumbo_normalized_tagname(GumboTag tag)
286 | # Operates on `text` in place (returns void); exact effect is defined by gumbo.h.
287 | void gumbo_tag_from_original_text(GumboStringPiece* text)
288 | # Map a tag name to a GumboTag; the second form takes an explicit length.
289 | GumboTag gumbo_tag_enum(const char* tagname)
290 | GumboTag gumbo_tagn_enum(const char* tagname, unsigned int length)
291 | # Looks up an attribute by name in an attribute vector. NOTE(review): presumably NULL when not found - confirm.
292 | GumboAttribute* gumbo_get_attribute(const GumboVector* attrs, const char* name)
293 |
--------------------------------------------------------------------------------
/gumbocy.pyx:
--------------------------------------------------------------------------------
1 | import re
2 | import urlparse
3 | cimport gumbocy
4 | cimport re2cy
5 | from libcpp.unordered_set cimport unordered_set
6 | from cython.operator cimport dereference as deref
7 | from libcpp.vector cimport vector
8 | from libcpp.map cimport map
9 |
10 | # C printf() from stdio.h, declared so Cython code in this module can call it directly.
11 | cdef extern from "stdio.h":
12 | int printf(const char* format, ...);
13 | # Module-level, heap-allocated empty vector of RE2 argument pointers, and a pointer derived from it that re2_search() passes with an argument count of 0.
14 | cdef vector[re2cy.ArgPtr] *argp = new vector[re2cy.ArgPtr]()
15 | cdef re2cy.ArgPtr *empty_args = &(deref(argp)[0])  # NOTE(review): indexes element 0 of an empty vector (only its address is taken) - confirm this is safe on the target toolchain
16 | # Returns the result of RE2.PartialMatchN for C string `s` against `pattern`, requesting zero captures (empty_args, count 0).
17 | cdef bint re2_search(const char* s, re2cy.RE2 &pattern):
18 | return re2cy.RE2.PartialMatchN(s, pattern, empty_args, 0)
19 | # Patterns compiled once at module import. The RE2 objects are allocated with `new`; no matching `del` is visible in this part of the file.
20 | cdef re2cy.RE2 *_RE2_SEARCH_STYLE_HIDDEN = new re2cy.RE2(r"(display\s*\:\s*none)|(visibility\s*\:\s*hidden)")  # "display: none" or "visibility: hidden", any spacing around the colon
21 | cdef re2cy.RE2 *_RE2_ABSOLUTE_HREF = new re2cy.RE2(r"^(?:[A-Za-z0-9\+\.\-]+\:)?\/\/")  # starts with an optional "scheme:" followed by "//"
22 | cdef re2cy.RE2 *_RE2_IGNORED_HREF = new re2cy.RE2(r"^(?:javascript|mailto|ftp|about)\:")  # starts with javascript:, mailto:, ftp: or about:
23 | # Python-level (re module) pattern matching one or more whitespace characters.
24 | _RE_SPLIT_WHITESPACE = re.compile(r"\s+")
25 | # Enumerated identifiers for ten attribute names (id, role, href, style, rel, src, alt, name, property, content). NOTE(review): their users are outside this excerpt.
26 | ctypedef enum AttributeNames:
27 | ATTR_ID,
28 | ATTR_ROLE,
29 | ATTR_HREF,
30 | ATTR_STYLE,
31 | ATTR_REL,
32 | ATTR_SRC,
33 | ATTR_ALT,
34 | ATTR_NAME,
35 | ATTR_PROPERTY,
36 | ATTR_CONTENT
37 | # Commented-out alternative: the same ten names as plain constants with explicit values 0-9.
38 | # ATTR_ID = 0
39 | # ATTR_ROLE = 1
40 | # ATTR_HREF = 2
41 | # ATTR_STYLE = 3
42 | # ATTR_REL = 4
43 | # ATTR_SRC = 5
44 | # ATTR_ALT = 6
45 | # ATTR_NAME = 7
46 | # ATTR_PROPERTY = 8
47 | # ATTR_CONTENT = 9
48 |
49 | # cdef struct Attributes:
50 | # int size_classes
51 | # vector[char*] classes
52 | # bint has_hidden
53 | # map[AttributeNames, const char*] values
54 | # Extension type bundling attribute data: a dict of values, a list of classes with a separate size counter, and a "hidden" flag. The commented-out lines are C-level alternatives for the same fields.
55 | cdef class Attributes:
56 | cdef int size_classes  # NOTE(review): presumably len(classes) - confirm where it is assigned
57 | cdef dict values  # NOTE(review): presumably keyed by AttributeNames members - confirm against the code that fills it
58 | # cdef map[AttributeNames, const char*] values
59 | # cdef const char* values[10]
60 | # cdef vector[char*] classes
61 | cdef list classes
62 | cdef bint has_hidden
63 |
64 | # ctypedef sAttributes Attributes
65 |
66 | cdef class HTMLParser:
67 |
68 | # Global parser variables
69 | cdef int nesting_limit
70 | cdef bint head_only
71 | cdef bint has_ids_ignore
72 | cdef bint has_classes_ignore
73 | cdef bint has_ids_hidden
74 | cdef bint has_classes_hidden
75 | cdef bint has_attributes_whitelist
76 | cdef bint has_classes_boilerplate
77 | cdef bint has_ids_boilerplate
78 | cdef bint has_roles_boilerplate
79 | cdef bint has_metas_whitelist
80 |
81 | cdef unordered_set[int] tags_ignore
82 | cdef unordered_set[int] tags_ignore_head_only
83 | cdef unordered_set[int] tags_boilerplate
84 | cdef unordered_set[int] tags_boilerplate_bypass
85 | cdef unordered_set[int] tags_separators
86 |
87 | cdef re2cy.RE2* attributes_whitelist
88 | cdef re2cy.RE2* metas_whitelist
89 | cdef re2cy.RE2* classes_ignore
90 | cdef re2cy.RE2* ids_ignore
91 | cdef re2cy.RE2* classes_hidden
92 | cdef re2cy.RE2* ids_hidden
93 | cdef re2cy.RE2* classes_boilerplate
94 | cdef re2cy.RE2* ids_boilerplate
95 | cdef re2cy.RE2* roles_boilerplate
96 |
97 | cdef bint analyze_internal_hyperlinks
98 | cdef bint analyze_external_hyperlinks
99 | cdef bint analyze_word_groups
100 |
101 | # Variables reinitialized at each parse()
102 | cdef list current_stack
103 |
104 | cdef bint has_url
105 | cdef char* url
106 | cdef char* netloc
107 | cdef char* scheme
108 | cdef re2cy.RE2* internal_netloc_search
109 |
110 | cdef dict analysis
111 |
112 | cdef object current_word_group
113 | cdef object current_hyperlink
114 |
115 | cdef bint has_output
116 | cdef gumbocy.GumboOutput* output
117 | cdef list nodes
118 |
119 | def __cinit__(self, dict options=None):
120 | """ Configure the parser from an optional dict of options; a missing or None value disables that option """
121 | options = options or {}
122 |
123 | self.nesting_limit = options.get("nesting_limit", 999)
124 | self.head_only = options.get("head_only")
125 |
126 | self.analyze_external_hyperlinks = bool(options.get("analyze_external_hyperlinks", True))
127 | self.analyze_internal_hyperlinks = bool(options.get("analyze_internal_hyperlinks", True))
128 | self.analyze_word_groups = bool(options.get("analyze_word_groups", True))
129 |
130 | attributes_whitelist = set(options.get("attributes_whitelist") or [])
131 |
132 | classes_ignore = frozenset(options.get("classes_ignore") or [])
133 | if len(classes_ignore) > 0:
134 | self.has_classes_ignore = True
135 | self.classes_ignore = new re2cy.RE2("^(?:" + "|".join(classes_ignore) + ")$")
136 | attributes_whitelist.add("class")
137 |
138 | ids_ignore = frozenset(options.get("ids_ignore") or [])
139 | if len(ids_ignore) > 0:
140 | self.has_ids_ignore = True
141 | self.ids_ignore = new re2cy.RE2("^(?:" + "|".join(ids_ignore) + ")$")
142 | attributes_whitelist.add("id")
143 |
144 | classes_hidden = frozenset(options.get("classes_hidden") or [])
145 | if len(classes_hidden) > 0:
146 | self.has_classes_hidden = True
147 | self.classes_hidden = new re2cy.RE2("^(?:" + "|".join(classes_hidden) + ")$")
148 | attributes_whitelist.add("class")
149 |
150 | ids_hidden = frozenset(options.get("ids_hidden") or [])
151 | if len(ids_hidden) > 0:
152 | self.has_ids_hidden = True
153 | self.ids_hidden = new re2cy.RE2("^(?:" + "|".join(ids_hidden) + ")$")
154 | attributes_whitelist.add("id")
155 |
156 | classes_boilerplate = frozenset(options.get("classes_boilerplate") or [])
157 | if len(classes_boilerplate) > 0:
158 | self.has_classes_boilerplate = True
159 | self.classes_boilerplate = new re2cy.RE2("^(?:" + "|".join(classes_boilerplate) + ")$")
160 | attributes_whitelist.add("class")
161 |
162 | ids_boilerplate = frozenset(options.get("ids_boilerplate") or [])
163 | if len(ids_boilerplate) > 0:
164 | self.has_ids_boilerplate = True
165 | self.ids_boilerplate = new re2cy.RE2("^(?:" + "|".join(ids_boilerplate) + ")$")
166 | attributes_whitelist.add("id")
167 |
168 | roles_boilerplate = frozenset(options.get("roles_boilerplate") or [])
169 | if len(roles_boilerplate) > 0:
170 | self.has_roles_boilerplate = True
171 | self.roles_boilerplate = new re2cy.RE2("^(?:" + "|".join(roles_boilerplate) + ")$")
172 | attributes_whitelist.add("role")
173 |
174 | metas_whitelist = frozenset(options.get("metas_whitelist") or [])
175 | if len(metas_whitelist) > 0:
176 | self.has_metas_whitelist = True
177 | self.metas_whitelist = new re2cy.RE2("^(?:" + "|".join(metas_whitelist) + ")$")
178 | attributes_whitelist.add("name")
179 | attributes_whitelist.add("property")
180 | attributes_whitelist.add("content")
181 |
182 | # Some options add attributes to the whitelist
183 | if self.analyze_external_hyperlinks or self.analyze_internal_hyperlinks:
184 | attributes_whitelist.add("href")
185 | attributes_whitelist.add("rel")
186 |
187 | # Finally, freeze the attributes whitelist into a single anchored regex
188 | self.has_attributes_whitelist = len(attributes_whitelist) > 0
189 | if self.has_attributes_whitelist:
190 | self.attributes_whitelist = new re2cy.RE2("^(?:" + "|".join(attributes_whitelist) + ")$")
191 |
192 | self.tags_ignore_head_only.insert(gumbocy.GUMBO_TAG_BODY)
193 | self.tags_ignore_head_only.insert(gumbocy.GUMBO_TAG_P)
194 | self.tags_ignore_head_only.insert(gumbocy.GUMBO_TAG_DIV)
195 | self.tags_ignore_head_only.insert(gumbocy.GUMBO_TAG_SPAN)
196 |
197 | for tag_name in options.get("tags_ignore") or []:  # "or []" tolerates an explicit None, like the options above
198 | tag = gumbocy.gumbo_tag_enum(tag_name)
199 | if tag != gumbocy.GUMBO_TAG_UNKNOWN:
200 | self.tags_ignore.insert(tag)
201 |
202 | for tag_name in options.get("tags_boilerplate") or []:
203 | tag = gumbocy.gumbo_tag_enum(tag_name)
204 | if tag != gumbocy.GUMBO_TAG_UNKNOWN:
205 | self.tags_boilerplate.insert(tag)
206 |
207 | for tag_name in options.get("tags_boilerplate_bypass") or []:
208 | tag = gumbocy.gumbo_tag_enum(tag_name)
209 | if tag != gumbocy.GUMBO_TAG_UNKNOWN:
210 | self.tags_boilerplate_bypass.insert(tag)
211 |
212 | for tag_name in options.get("tags_separators") or []:
213 | tag = gumbocy.gumbo_tag_enum(tag_name)
214 | if tag != gumbocy.GUMBO_TAG_UNKNOWN:
215 | self.tags_separators.insert(tag)
216 |
217 | self.tags_separators.insert(gumbocy.GUMBO_TAG_BODY)
218 |
219 | cdef bint guess_node_hidden(self, gumbocy.GumboNode* node, Attributes attrs):
220 | """ Rough guess to check if the element is explicitly hidden.
221 |
222 | Not intended to combat spam!
223 | """
224 |
225 | if not self.has_attributes_whitelist:
226 | return False
227 |
228 | # From the HTML5 spec
229 | if attrs.has_hidden:
230 | return True
231 |
232 | if self.has_ids_hidden and attrs.values.get(ATTR_ID):
233 | if re2_search(attrs.values[ATTR_ID], deref(self.ids_hidden)):
234 | return True
235 |
236 | if self.has_classes_hidden and attrs.size_classes > 0:
237 | for k in attrs.classes:
238 | if re2_search(k, deref(self.classes_hidden)):
239 | return True
240 |
241 | if attrs.values.get(ATTR_STYLE):
242 | if re2_search(attrs.values[ATTR_STYLE], deref(_RE2_SEARCH_STYLE_HIDDEN)):
243 | return True
244 |
245 | return False
246 |
247 |
248 | cdef bint guess_node_boilerplate(self, gumbocy.GumboNode* node, Attributes attrs):
249 | """ Rough guess to check if the element is boilerplate """
250 |
251 | if self.tags_boilerplate.count( node.v.element.tag):
252 | return True
253 |
254 | # http://html5doctor.com/understanding-aside/
255 | if node.v.element.tag == gumbocy.GUMBO_TAG_ASIDE:
256 | if "article" not in self.current_stack:
257 | return True
258 |
259 | if self.has_classes_boilerplate and attrs.size_classes > 0:
260 | for k in attrs.classes:
261 | if re2_search(k, deref(self.classes_boilerplate)):
262 | return True
263 |
264 | if self.has_ids_boilerplate and attrs.values.get(ATTR_ID):
265 | if re2_search(attrs.values[ATTR_ID], deref(self.ids_boilerplate)):
266 | return True
267 |
268 | if self.has_roles_boilerplate and attrs.values.get(ATTR_ROLE):
269 | if re2_search(attrs.values[ATTR_ROLE], deref(self.roles_boilerplate)):
270 | return True
271 |
272 | return False
273 |
274 | cdef Attributes get_attributes(self, gumbocy.GumboNode* node):
275 | """ Collect the whitelisted attributes of an element node into an Attributes object (empty when no whitelist is configured) """
276 |
277 | attrs = Attributes()
278 | # cdef Attributes attrs
279 | attrs.size_classes = 0
280 | attrs.has_hidden = 0
281 | # attrs.values = [""] * 10
282 | # attrs.classes = []
283 | attrs.values = {} # deref(new map[AttributeNames, const char*]())
284 | # attrs.values[ATTR_ID] = "x"
285 | # print dict(attrs.values)
286 |
287 | for i in range(node.v.element.attributes.length):
288 |
289 | attr = node.v.element.attributes.data[i]
290 |
291 | if self.has_attributes_whitelist and re2_search(attr.name, deref(self.attributes_whitelist)):  # the regex pointer is only allocated when the whitelist is non-empty
292 |
293 | if attr.name == b"class":
294 | multiple_value = frozenset(_RE_SPLIT_WHITESPACE.split(attr.value.strip().lower()))
295 | attrs.size_classes = len(multiple_value)
296 | if attrs.size_classes > 0:
297 | attrs.classes = list(multiple_value)
298 | # for k in multiple_value:
299 | # ck = k
300 | # attrs.classes.push_back(ck) # = list(multiple_value)
301 |
302 | elif attr.name == b"id":
303 | pystr = str(attr.value).lower()
304 | attrs.values[ATTR_ID] = pystr
305 |
306 | elif attr.name == b"style":
307 | attrs.values[ATTR_STYLE] = attr.value
308 |
309 | elif attr.name == b"href":
310 | attrs.values[ATTR_HREF] = attr.value
311 |
312 | elif attr.name == b"role":
313 | pystr = str(attr.value).lower()
314 | attrs.values[ATTR_ROLE] = pystr
315 |
316 | elif attr.name == b"rel":
317 | pystr = str(attr.value).lower()
318 | attrs.values[ATTR_REL] = pystr
319 |
320 | elif attr.name == b"aria-hidden" and attr.value == b"true":
321 | attrs.has_hidden = 1
322 |
323 | elif attr.name == b"hidden":
324 | attrs.has_hidden = 1
325 |
326 | elif attr.name == b"alt":
327 | attrs.values[ATTR_ALT] = attr.value
328 |
329 | elif attr.name == b"src":
330 | attrs.values[ATTR_SRC] = attr.value
331 |
332 | elif attr.name == b"name":
333 | pystr = str(attr.value).lower()
334 | attrs.values[ATTR_NAME] = pystr
335 |
336 | elif attr.name == b"property":
337 | pystr = str(attr.value).lower()
338 | attrs.values[ATTR_PROPERTY] = pystr
339 |
340 | elif attr.name == b"content":
341 | attrs.values[ATTR_CONTENT] = attr.value
342 |
343 | return attrs
344 |
345 | cdef void close_word_group(self):
346 | """ Close the current word group """
347 |
348 | if self.current_word_group:
349 | self.analysis["word_groups"].append(tuple(self.current_word_group))
350 | self.current_word_group = None
351 |
352 |
353 | cdef void add_text(self, text):
354 | """ Adds inner text to the current word group """
355 |
356 | if not self.current_word_group:
357 | self.current_word_group = [text.strip(), self.current_stack[-1]]
358 | else:
359 | self.current_word_group[0] += " " + text.strip()
360 |
361 | cdef void add_hyperlink_text(self, text):
362 | """ Adds inner text to the currently open hyperlink """
363 |
364 | if self.current_hyperlink:
365 | self.current_hyperlink[1] += text
366 |
367 | cdef void open_hyperlink(self, Attributes attrs):
368 | """ Opens a new hyperlink """
369 |
370 | if not self.analyze_external_hyperlinks and not self.analyze_internal_hyperlinks:
371 | return
372 |
373 | if not attrs.values.get(ATTR_HREF):
374 | return
375 |
376 | if len(attrs.values[ATTR_HREF]) == 0:
377 | return
378 |
379 | if re2_search(attrs.values[ATTR_HREF], deref(_RE2_IGNORED_HREF)):
380 | return
381 |
382 | self.close_hyperlink()
383 |
384 | # href, text, rel
385 | self.current_hyperlink = [attrs.values[ATTR_HREF], "", attrs.values.get(ATTR_REL)]
386 |
387 | cdef void close_hyperlink(self):
388 | """ Closes the current hyperlink if any, and decides if it's an external or internal link """
389 |
390 | cdef bint is_external = 0
391 |
392 | if not self.analyze_external_hyperlinks and not self.analyze_internal_hyperlinks:
393 | return
394 |
395 | if self.current_hyperlink:
396 | href = self.current_hyperlink[0]
397 |
398 | if re2_search(href, deref(_RE2_ABSOLUTE_HREF)):
399 | is_external = 1
400 |
401 | if self.has_url:
402 |
403 | if href.startswith("//"):
404 | href = self.scheme + ":" + href
405 |
406 | # This may be an absolute link but to the same domain
407 | if re2_search(href, deref(self.internal_netloc_search)):
408 | is_external = 0
409 | href = href.split(self.netloc, 1)[1]
410 |
411 | if is_external:
412 | if self.analyze_external_hyperlinks:
413 | self.analysis["external_hyperlinks"].append(
414 | (href, self.current_hyperlink[1], self.current_hyperlink[2])
415 | )
416 |
417 | elif self.analyze_internal_hyperlinks:
418 | self.analysis["internal_hyperlinks"].append(
419 | (href, self.current_hyperlink[1], self.current_hyperlink[2])
420 | )
421 |
422 | self.current_hyperlink = None
423 |
424 | cdef bint _traverse_node(self, int level, gumbocy.GumboNode* node, bint is_head, bint is_hidden, bint is_boilerplate, bint is_boilerplate_bypassed, bint is_hyperlink):
425 | """ Traverses the node tree. Return 1 to stop at this level """
426 |
427 | cdef GumboStringPiece gsp
428 | cdef const char* tag_name
429 | cdef int tag_n
430 |
431 | if level > self.nesting_limit:
432 | return 0
433 |
434 | if node.type == gumbocy.GUMBO_NODE_TEXT:
435 |
436 | if (self.analyze_internal_hyperlinks or self.analyze_external_hyperlinks) and is_hyperlink:
437 | self.add_hyperlink_text(node.v.text.text)
438 |
439 | if self.analyze_word_groups and not is_head and not is_hidden and (not is_boilerplate or is_boilerplate_bypassed):
440 | self.add_text(node.v.text.text)
441 |
442 | elif node.type == gumbocy.GUMBO_NODE_ELEMENT:
443 |
444 | tag_n = node.v.element.tag
445 |
446 | if self.head_only and self.tags_ignore_head_only.count(tag_n):
447 | return 1
448 |
449 | if self.tags_ignore.count(tag_n):
450 | return 0
451 |
452 | tag_name = gumbocy.gumbo_normalized_tagname(node.v.element.tag)
453 |
454 | # When we find an unknown tag, find its tag_name in the buffer
455 | if tag_name == b"":
456 | gsp = node.v.element.original_tag
457 | gumbo_tag_from_original_text(&gsp)
458 | py_tag_name = str(gsp.data)[0:gsp.length].lower() # TODO try to do that only in C!
459 | tag_name = py_tag_name
460 |
461 | # if self.has_attributes_whitelist:
462 |
463 | attrs = self.get_attributes(node)
464 |
465 | if self.has_classes_ignore and attrs.size_classes > 0:
466 | for v in attrs.classes:
467 | if re2_search(v, deref(self.classes_ignore)):
468 | return 0
469 |
470 | if self.has_ids_ignore and attrs.values.get(ATTR_ID):
471 | if re2_search(attrs.values[ATTR_ID], deref(self.ids_ignore)):
472 | return 0
473 |
474 | if node.v.element.tag == gumbocy.GUMBO_TAG_TITLE:
475 | if not self.analysis.get("title"):
476 | if node.v.element.children.length > 0:
477 | first_child = node.v.element.children.data[0]
478 | if first_child.type == gumbocy.GUMBO_NODE_TEXT:
479 | self.analysis["title"] = first_child.v.text.text
480 | return 0
481 |
482 | self.current_stack.append(tag_name)
483 |
484 | if node.v.element.tag == gumbocy.GUMBO_TAG_HEAD:
485 | is_head = 1
486 |
487 | elif node.v.element.tag == gumbocy.GUMBO_TAG_A:
488 | self.open_hyperlink(attrs)
489 | is_hyperlink = 1
490 |
491 | elif node.v.element.tag == gumbocy.GUMBO_TAG_IMG:
492 | self.close_word_group()
493 | if attrs.values.get(ATTR_ALT):
494 | self.add_text(attrs.values[ATTR_ALT])
495 | self.close_word_group()
496 |
497 | # Text extraction from image filenames disabled for now
498 | # if attrs.get("src"):
499 | # if not attrs["src"].startswith("data:"):
500 | # self.add_text(self._split_filename_words(attrs["src"]))
501 | # self.close_word_group()
502 |
503 |
504 | if is_head:
505 | if node.v.element.tag == gumbocy.GUMBO_TAG_LINK:
506 |
507 | # TODO: more properties
508 | if attrs.values.get(ATTR_REL) and attrs.values.get(ATTR_HREF):
509 | self.analysis.setdefault("head_links", [])
510 | self.analysis["head_links"].append({"rel": attrs.values[ATTR_REL], "href": attrs.values[ATTR_HREF]})
511 |
512 | elif self.has_metas_whitelist and node.v.element.tag == gumbocy.GUMBO_TAG_META:
513 |
514 | if attrs.values.get(ATTR_CONTENT):
515 |
516 | if attrs.values.get(ATTR_NAME):
517 | if re2_search(attrs.values[ATTR_NAME], deref(self.metas_whitelist)):
518 | self.analysis.setdefault("head_metas", {})
519 | self.analysis["head_metas"][attrs.values[ATTR_NAME]] = str(attrs.values[ATTR_CONTENT]).strip()
520 |
521 | elif attrs.values.get(ATTR_PROPERTY):
522 | if re2_search(attrs.values[ATTR_PROPERTY], deref(self.metas_whitelist)):
523 | self.analysis.setdefault("head_metas", {})
524 | self.analysis["head_metas"][attrs.values[ATTR_PROPERTY]] = str(attrs.values[ATTR_CONTENT]).strip()
525 |
526 | elif node.v.element.tag == gumbocy.GUMBO_TAG_BASE:
527 | if attrs.values.get(ATTR_HREF) and "base_url" not in self.analysis:
528 | self.analysis["base_url"] = attrs.values[ATTR_HREF]
529 |
530 | # TODO is_article
531 |
532 | if not is_hidden:
533 | is_hidden = self.guess_node_hidden(node, attrs)
534 |
535 | if is_boilerplate and not is_boilerplate_bypassed:
536 | if self.tags_boilerplate_bypass.count(tag_n):
537 | is_boilerplate_bypassed = True
538 |
539 | if not is_boilerplate:
540 | is_boilerplate = self.guess_node_boilerplate(node, attrs)
541 |
542 | # print " " * level, "BOILER", tag_name, is_boilerplate, dict(attrs.values), attrs.classes
543 |
544 | # Close the word group
545 | if self.tags_separators.count(tag_n):
546 | self.close_word_group()
547 |
548 | # Call _traverse_node() recursively for each of the children
549 | for i in range(node.v.element.children.length):
550 | child = node.v.element.children.data[i]
551 | if self._traverse_node(level + 1, child, is_head, is_hidden, is_boilerplate, is_boilerplate_bypassed, is_hyperlink) == 1:
552 | break
553 |
554 | # Close the word group
555 | if self.tags_separators.count(tag_n):
556 | self.close_word_group()
557 |
558 | self.current_stack.pop()
559 |
560 | if node.v.element.tag == gumbocy.GUMBO_TAG_A:
561 | self.close_hyperlink()
562 |
563 | if node.v.element.tag == gumbocy.GUMBO_TAG_HEAD:
564 | if self.head_only:
565 | return 1
566 |
567 | return 0
568 |
569 | def parse(self, char* html):
570 | """ Parse `html` with gumbo and keep the output tree on the parser, after freeing any previous output """
571 | # NOTE(review): the gumbo output presumably keeps pointers into `html` (original_tag is read later in _traverse_node) - confirm callers keep the buffer alive until analyze()/listnodes() are done
572 | self.free()
573 | self.output = gumbocy.gumbo_parse(html)
574 | self.has_output = 1
575 |
576 | def analyze(self, url=None):
577 | """ Traverse the parsed tree and return the analysis dict; `url` (optional) is used to tell internal links from external ones. Raises RuntimeError if parse() was not called first """
578 | if not self.has_output: raise RuntimeError("parse() must be called before analyze()")  # self.output would be a NULL pointer
579 | self.analysis = {}
580 | self.has_url = 0
581 |
582 | if self.analyze_internal_hyperlinks or self.analyze_external_hyperlinks:
583 |
584 | if url:
585 | self.has_url = 1
586 | self.url = url
587 | parsed = urlparse.urlparse(url)
588 | netloc = parsed.netloc.lower()
589 | self.netloc = netloc
590 | self.scheme = parsed.scheme
591 | self.internal_netloc_search = new re2cy.RE2("^http(?:s)?://%s" % re.escape(self.netloc))  # NOTE(review): the RE2 object from a previous analyze() call is never deleted
592 |
593 | if self.analyze_internal_hyperlinks:
594 | self.analysis["internal_hyperlinks"] = []
595 |
596 | if self.analyze_external_hyperlinks:
597 | self.analysis["external_hyperlinks"] = []
598 |
599 | if self.analyze_word_groups:
600 | self.analysis["word_groups"] = []
601 |
602 | self.current_stack = []
603 | self.current_word_group = None
604 | self.current_hyperlink = None
605 |
606 | self._traverse_node(0, self.output.root, 0, 0, 0, 0, 0)
607 |
608 | return self.analysis
609 |
610 | #
611 | # Older listnodes() API support
612 | #
613 |
614 | def listnodes(self):
615 | """ Return the nodes as a flat list of tuples: (level, None, text) for text nodes, (level, tag_name[, attrs]) for elements. Raises RuntimeError if parse() was not called first """
616 | if not self.has_output: raise RuntimeError("parse() must be called before listnodes()")  # self.output would be a NULL pointer
617 | self.nodes = []
618 |
619 | self._traverse_node_simple(0, self.output.root)
620 |
621 | return self.nodes
622 |
623 | cdef bint _traverse_node_simple(self, int level, gumbocy.GumboNode* node):
624 | """ Traverses the node tree. Return 1 to stop at this level """
625 |
626 | cdef GumboStringPiece gsp
627 |
628 | if level > self.nesting_limit:
629 | return 0
630 |
631 | if node.type == gumbocy.GUMBO_NODE_TEXT:
632 | self.nodes.append((level, None, node.v.text.text))
633 |
634 | elif node.type == gumbocy.GUMBO_NODE_ELEMENT:
635 |
636 | tag_n = node.v.element.tag
637 |
638 | if self.head_only and self.tags_ignore_head_only.count(tag_n):
639 | return 1
640 |
641 | if self.tags_ignore.count(tag_n):
642 | return 0
643 |
644 | tag_name = gumbocy.gumbo_normalized_tagname(node.v.element.tag)
645 |
646 | # When we find an unknown tag, find its tag_name in the buffer
647 | if tag_name == b"":
648 | gsp = node.v.element.original_tag
649 | gumbo_tag_from_original_text(&gsp)
650 | py_tag_name = str(gsp.data)[0:gsp.length].lower() # TODO try to do that only in C!
651 | tag_name = py_tag_name
652 |
653 | if self.has_attributes_whitelist:
654 |
655 | # Build a dict with all the whitelisted attributes
656 | has_attrs = False
657 | attrs = False
658 | for i in range(node.v.element.attributes.length):
659 | attr = node.v.element.attributes.data[i]
660 | attr_name = str(attr.name)
661 | if re2_search(attr_name, deref(self.attributes_whitelist)):
662 | if attr_name == b"class":
663 | multiple_value = frozenset(_RE_SPLIT_WHITESPACE.split(attr.value.strip().lower()))
664 | if len(multiple_value):
665 | if self.has_classes_ignore:
666 | for v in multiple_value:
667 | if re2_search(v, deref(self.classes_ignore)):
668 | return 0
669 |
670 | if not has_attrs:
671 | attrs = {}
672 | has_attrs = True
673 | attrs[attr_name] = multiple_value
674 |
675 | else:
676 |
677 | if not has_attrs:
678 | attrs = {}
679 | has_attrs = True
680 | attrs[attr_name] = attr.value
681 |
682 | if not has_attrs:
683 | self.nodes.append((level, tag_name))
684 |
685 | else:
686 |
687 | if self.has_ids_ignore:
688 | if attrs.get("id") and re2_search(attrs["id"].lower(), deref(self.ids_ignore)):
689 | return 0
690 |
691 | self.nodes.append((level, tag_name, attrs))
692 |
693 | else:
694 | self.nodes.append((level, tag_name))
695 |
696 | # Call _iternode() recursively for each of the children
697 | for i in range(node.v.element.children.length):
698 | child = node.v.element.children.data[i]
699 | if self._traverse_node_simple(level + 1, child) == 1:
700 | break
701 |
702 | if node.v.element.tag == gumbocy.GUMBO_TAG_HEAD and self.head_only:
703 | return 1
704 |
705 | return 0
706 |
707 | def __dealloc__(self):
708 | """ Cleanup gumbo memory when the parser is deallocated by Python """
709 | self.free()
710 |
711 | cdef free(self):
712 | if self.has_output:
713 | gumbocy.gumbo_destroy_output(&gumbocy.kGumboDefaultOptions, self.output)
714 | self.has_output = 0
715 |
--------------------------------------------------------------------------------
/re2cy.pxd:
--------------------------------------------------------------------------------
1 | from libcpp.string cimport string
2 |
3 | ctypedef Arg* ArgPtr
4 |
5 |
6 | cdef extern from "re2/stringpiece.h" namespace "re2":
7 | cdef cppclass StringPiece:
8 | # Eliding some constructors on purpose.
9 | StringPiece(const char*) except +
10 | StringPiece(const string&) except +
11 |
12 | const char* data()
13 | int length()
14 |
15 |
16 | cdef extern from "re2/re2.h" namespace "re2":
17 |
18 | cdef cppclass Arg "RE2::Arg":
19 | Arg()
20 |
21 | cdef cppclass RE2:
22 | RE2(const char*) except +
23 |
24 | @staticmethod
25 | bint PartialMatchN(
26 | const char *,
27 | const RE2&,
28 | const Arg* const args[],
29 | int,
30 | )
31 |
--------------------------------------------------------------------------------
/requirements-benchmark.txt:
--------------------------------------------------------------------------------
1 | lxml
2 | requests
3 | html5lib
4 | bs4
5 | BeautifulSoup; python_version < '3.0'
6 | gumbo
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | Cython==0.24
2 | pytest==2.9.2
3 | pytest-repeat==0.3.0
--------------------------------------------------------------------------------
/scripts/git-set-file-times:
--------------------------------------------------------------------------------
1 | #!/usr/bin/perl -w
2 | use strict;
3 |
4 | # sets mtime and atime of files to the latest commit time in git
5 | #
6 | # This is useful for serving static content (managed by git)
7 | # from a cluster of identically configured HTTP servers. HTTP
8 | # clients and content delivery networks can get consistent
9 | # Last-Modified headers no matter which HTTP server in the
10 | # cluster they hit. This should improve caching behavior.
11 | #
12 | # This does not take into account merges, but if you're updating
13 | # every machine in the cluster from the same commit (A) to the
14 | # same commit (B), the mtimes will be _consistent_ across all
15 | # machines if not necessarily accurate.
16 | #
17 | # THIS IS NOT INTENDED TO OPTIMIZE BUILD SYSTEMS SUCH AS 'make'
18 | # YOU HAVE BEEN WARNED!
19 |
20 | my %ls = ();
21 | my $commit_time;
22 |
23 | if ($ENV{GIT_DIR}) {
24 | chdir($ENV{GIT_DIR}) or die $!;
25 | }
26 |
27 | $/ = "\0";
28 | open FH, 'git ls-files -z|' or die $!;
29 | while (<FH>) {
30 | chomp;
31 | $ls{$_} = $_;
32 | }
33 | close FH;
34 |
35 |
36 | $/ = "\n";
37 | open FH, "git log -m -r --name-only --no-color --pretty=raw -z @ARGV |" or die $!;
38 | while (<FH>) {
39 | chomp;
40 | if (/^committer .*? (\d+) (?:[\-\+]\d+)$/) {
41 | $commit_time = $1;
42 | } elsif (s/\0\0commit [a-f0-9]{40}( \(from [a-f0-9]{40}\))?$// or s/\0$//) {
43 | my @files = delete @ls{split(/\0/, $_)};
44 | @files = grep { defined $_ } @files;
45 | next unless @files;
46 | utime $commit_time, $commit_time, @files;
47 | }
48 | last unless %ls;
49 |
50 | }
51 | close FH;
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | from distutils.core import setup
2 | from distutils.extension import Extension
3 | import os
4 |
5 | # gumbocy.cpp will be present when installing from the source distribution on PyPI
6 | if os.path.isfile("gumbocy.cpp"):
7 |
8 | # Use "make cythonize" to build the C++ file (gumbocy.cpp) from the .pyx source
9 | ext_modules = [
10 | Extension("gumbocy",
11 | ["gumbocy.cpp"],
12 | libraries=["gumbo", "re2"],
13 | language="c++",
14 | extra_compile_args=["-std=c++11", '-O3', '-static-libstdc++'],
15 | extra_link_args=["-std=c++11"]) # , "-static"
16 |
17 | ]
18 |
19 | else:
20 | raise Exception("Must run 'make cythonize' first!")
21 |
22 | # # If the .c file is missing, we must be in local or installing from GitHub.
23 | # # In this case, we need Cython to be already installed.
24 | # else:
25 | # from Cython.Build import cythonize
26 |
27 | # ext_modules = cythonize([
28 | # Extension("gumbocy",
29 | # ["gumbocy.pyx"],
30 | # libraries=["gumbo"],
31 | # language="c++",
32 | # extra_compile_args=["-std=c++11"],
33 | # extra_link_args=["-std=c++11"])
34 | # ])
35 |
36 |
37 | setup(
38 | name="gumbocy",
39 | version="0.2.0",
40 | description="Python binding for gumbo-parser (an HTML5-compliant parser) using Cython",
41 | author="Common Search contributors",
42 | license="Apache License, Version 2.0",
43 | url="https://github.com/commonsearch/gumbocy",
44 | ext_modules=ext_modules,
45 | keywords=["gumbo", "gumbo-parser", "gumbo-cython", "gumbocy", "cython", "htmlparser", "html5", "html5lib"],
46 | classifiers=[
47 | "Programming Language :: Python",
48 | "Programming Language :: Python :: 2.7",
49 | # 'Development Status :: 1 - Planning',
50 | # 'Development Status :: 2 - Pre-Alpha',
51 | # 'Development Status :: 3 - Alpha',
52 | 'Development Status :: 4 - Beta',
53 | # 'Development Status :: 5 - Production/Stable',
54 | # 'Development Status :: 6 - Mature',
55 | # 'Development Status :: 7 - Inactive',
56 | "Programming Language :: Python :: Implementation :: CPython",
57 | "Programming Language :: Python :: Implementation :: PyPy",
58 | "Environment :: Other Environment",
59 | "Intended Audience :: Developers",
60 | "License :: OSI Approved :: Apache Software License",
61 | "Operating System :: OS Independent",
62 | "Topic :: Software Development :: Libraries"
63 | ]
64 | )
65 |
--------------------------------------------------------------------------------
/tests/benchmark_parsers.py:
--------------------------------------------------------------------------------
1 | # Usage: python -m cProfile -s cumtime tests/benchmark_parsers.py
2 |
3 | import os
4 | import sys
5 | sys.path.insert(-1, os.getcwd())
6 |
7 | import requests
8 | import timeit
9 | import html5lib
10 | import lxml.html
11 | import gumbocy
12 | import gumbo
13 | import bs4
14 |
15 | if not os.path.isfile("tests/_benchmark_fixture.html"):
16 | url = 'https://raw.githubusercontent.com/whatwg/html/d8717d8831c276ca65d2d44bbf2ce4ce673997b9/source'
17 | html = requests.get(url).content
18 | with open("tests/_benchmark_fixture.html", "w") as f:
19 | f.write(html)
20 |
21 | with open("tests/_benchmark_fixture.html", "r") as f:
22 | html = f.read()
23 | html_unicode = html.decode("utf-8")
24 |
25 |
26 | def bench(name, func):
27 | print('{}: {:.3f} seconds'.format(name, min(timeit.repeat(func, number=1, repeat=3))))
28 |
29 |
30 | def benchmark_gumbocy():
31 | parser = gumbocy.HTMLParser(options={
32 | "attributes_whitelist": ["id", "class", "style"]
33 | })
34 | parser.parse(html)
35 | nodes = parser.listnodes()
36 |
37 | divs_count = 0
38 | for node in nodes:
39 | if node[1] == "div":
40 | divs_count += 1
41 | print "Gumbocy: ", divs_count
42 |
43 |
44 | def benchmark_gumbo_bs3():
45 | parser = gumbo.soup_parse(html_unicode)
46 | divs = parser.findAll("div")
47 | print "gumbo bs3", len(divs)
48 |
49 |
50 | def benchmark_lxml_raw():
51 | parsed = lxml.html.fromstring(html)
52 | divs = parsed.findall(".//div")
53 | print "lxml raw", len(divs)
54 |
55 |
56 | def benchmark_html5lib_bs4():
57 | parser = bs4.BeautifulSoup(html, "html5lib")
58 | divs = parser.find_all("div")
59 | print "html5lib bs4", len(divs)
60 |
61 |
62 | def benchmark_htmlparser_bs4():
63 | parser = bs4.BeautifulSoup(html, "html.parser")
64 | divs = parser.find_all("div")
65 | print "html.parser bs4", len(divs)
66 |
67 |
68 | bench("benchmark_gumbocy", benchmark_gumbocy)
69 | bench("benchmark_gumbo_bs3", benchmark_gumbo_bs3)
70 | bench("benchmark_lxml_raw", benchmark_lxml_raw)
71 | bench("benchmark_html5lib_bs4", benchmark_html5lib_bs4)
72 | bench("benchmark_htmlparser_bs4", benchmark_htmlparser_bs4)
73 |
--------------------------------------------------------------------------------
/tests/conftest.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 |
4 | sys.path.append(os.getcwd())
5 |
6 | import gumbocy
7 |
--------------------------------------------------------------------------------
/tests/test_analyze.py:
--------------------------------------------------------------------------------
1 | import gumbocy
2 | from test_word_groups import TAGS_SEPARATORS
3 |
4 |
5 | def analyze(html, options=None):
6 | parser = gumbocy.HTMLParser(options=options)
7 | parser.parse(html)
8 | return parser.analyze()
9 |
10 |
11 | def test_separators():
12 | html = """
13 | text
14 | text 2
15 | pre
inner
16 | """
17 |
18 | analyzed = analyze(html, options={
19 | "tags_separators": ["p"]
20 | })
21 |
22 | assert analyzed["word_groups"] == [
23 | ("text", "p"),
24 | ("text 2", "p"),
25 | ("pre", "p"),
26 | ("inner", "p")
27 | ]
28 |
29 | # More word group tests in test_word_groups.py
30 |
31 |
32 | def test_hidden_text():
33 |
34 | html = """
35 |
36 | text
37 | textp
38 | hidden by display
39 | ignored by class_noindex
40 | ignored by class_noindex 2
41 | hidden by html5 attribute
42 | hidden by aria
43 | not_aria
44 | hidden by visibility
45 | """
46 |
47 | analyzed = analyze(html, options={
48 | "attributes_whitelist": ["style", "hidden", "aria-hidden"],
49 | "classes_hidden": ["_class_hidden"],
50 | "ids_hidden": ["_id_hidden"],
51 | "tags_separators": ["div"],
52 | "classes_ignore": ["_class_noindex"]
53 | })
54 |
55 | assert analyzed["word_groups"] == [
56 | ("text", "body"),
57 | ("textp", "div"),
58 | ("not_aria", "div")
59 | ]
60 |
61 |
62 | def test_hidden_siblings():
63 |
64 | html = """
65 |
66 | Sign in with Facebook
67 |
68 | Or use your Businessweek account
69 | """
70 |
71 | analyzed = analyze(html, options={
72 | "classes_boilerplate": ["login"]
73 | })
74 |
75 | assert analyzed["word_groups"] == []
76 |
77 |
78 | def test_boilerplate_text():  # Only the two titles, both tagged "h2", are expected to survive boilerplate filtering.
79 |
80 | html = """
81 |
82 |
83 | Boilerplate
84 | Title
85 |
86 |
87 | x
88 | y
89 | z
90 |
91 | Title 2
92 | """
93 |
94 | analyzed = analyze(html, options={
95 | "attributes_whitelist": ["id", "class", "role"],
96 | "tags_boilerplate": ["header"],
97 | "tags_boilerplate_bypass": ["h2"],  # "h2" is the tag of both expected groups below
98 | "classes_boilerplate": ["classboil"],
99 | "ids_boilerplate": ["idboil"],
100 | "roles_boilerplate": ["roleboil"],
101 | "tags_separators": TAGS_SEPARATORS
102 | })
103 |
104 | assert analyzed["word_groups"] == [
105 | ("Title", "h2"),
106 | ("Title 2", "h2")
107 | ]
108 |
109 |
110 | def test_title():  # analyze() must report "test 1" under "title" and produce no word groups for this input.
111 |
112 | html = """ test 1 test 2 """
113 |
114 | analyzed = analyze(html, options={
115 | })
116 |
117 | assert analyzed["title"] == "test 1"
118 | assert len(analyzed["word_groups"]) == 0
119 |
120 |
121 | def test_head_metas():  # With "description" in metas_whitelist, head_metas must contain exactly that one entry.
122 |
123 | html = """
124 |
125 |
126 |
127 |
128 | This is <body> text
129 | """
130 |
131 | analyzed = analyze(html, options={
132 | "metas_whitelist": ["description"]
133 | })
134 |
135 | assert analyzed["head_metas"] == {"description": "This is a !"}
136 |
--------------------------------------------------------------------------------
/tests/test_hyperlinks.py:
--------------------------------------------------------------------------------
1 | import gumbocy
2 | from test_word_groups import TAGS_SEPARATORS
3 |
4 |
5 | def _links(html, url=None):
6 | parser = gumbocy.HTMLParser(options={
7 | "tags_separators": TAGS_SEPARATORS
8 | })
9 | parser.parse(html)
10 | ret = parser.analyze(url=url)
11 | return {
12 | "all": ret["internal_hyperlinks"] + ret["external_hyperlinks"],
13 | "internal": ret["internal_hyperlinks"],
14 | "external": ret["external_hyperlinks"]
15 | }
16 |
17 |
18 | def test_get_hyperlinks():  # Checks which links analyze() reports as internal vs external for several inputs and page URLs.
19 | links = _links("""Test titlex""")
20 | assert len(links["all"]) == 0
21 |
22 | links = _links("""Test title
23 | Y
24 | """)
25 | assert len(links["all"]) == 0
26 |
27 | links = _links("""Test title
28 | Y
29 | """)
30 | assert len(links["all"]) == 0
31 |
32 | links = _links("""Test title
33 | Y
34 | """)
35 | assert len(links["all"]) == 0
36 |
37 | links = _links("""Test title
38 | Y
39 | """)
40 | assert len(links["all"]) == 0
41 |
42 | links = _links("""Test title
43 | Y
44 | """)
45 | assert len(links["all"]) == 0
46 |
47 | links = _links("""Test title
48 | Y
49 | """)
50 | assert len(links["all"]) == 1
51 | assert links["external"][0][0] == "http://sub.test.com/page1?q=2&a=b#xxx"
52 | assert links["external"][0][1] == "Y"
53 | assert links["external"][0][2] == "nofollow"  # entry layout as asserted: [0] href, [1] link text, [2] presumably the rel value (None when absent)
54 |
55 | links = _links("""Test title
56 | Y X
57 | """, url="http://sub.test.com/page2")
58 | assert len(links["all"]) == 1
59 | assert links["internal"][0][0] == "/page1?q=2&a=b#xxx"
60 | assert links["internal"][0][1] == "Y X"
61 | assert links["internal"][0][2] is None
62 |
63 | links = _links("""Test title
64 | Y Z
65 | """, url="http://sub.test.com/page2/x.html")
66 | assert len(links["all"]) == 1
67 | assert links["internal"][0][0] == "../page1?q=2&a=b#xxx"
68 | assert links["internal"][0][1] == "Y Z"
69 |
70 | # Absolute links to the same netloc are still internal
71 | links = _links("""Test title
72 | Y Z
73 | """, url="http://sub.test.com/page2/x.html")
74 | assert len(links["all"]) == 1
75 | assert len(links["external"]) == 0
76 | assert links["internal"][0][0] == "/page1?q=2&a=b#xxx"
77 | assert links["internal"][0][1] == "Y Z"
78 |
79 | # Cross-scheme links are still considered internal
80 | links = _links("""Test title
81 | Y Z
82 | """, url="http://sub.test.com/page2/x.html")
83 | assert len(links["all"]) == 1
84 | assert links["internal"][0][0] == "/page1?q=2&a=b#xxx"
85 | assert links["internal"][0][1] == "Y Z"
86 |
87 | links = _links("""Test title
88 | Y Z
89 | """, url="https://sub.test.com/page2/x.html")
90 | assert len(links["all"]) == 1
91 | assert links["internal"][0][0] == "/page1?q=2&a=b#xxx"
92 | assert links["internal"][0][1] == "Y Z"
93 |
94 | links = _links("""Test title
95 | Y Z
96 | """, url="http://sub.test.com/page2/x.html")
97 | assert len(links["all"]) == 1
98 | assert links["internal"][0][0] == "/sub.test.com/page1?q=2&a=b#xxx"
99 | assert links["internal"][0][1] == "Y Z"
100 |
101 | links = _links("""Test title
102 | Y Z
103 | """, url="http://sub.test.com/page2/x.html")
104 | assert len(links["all"]) == 1
105 | assert links["internal"][0][0] == "/page1?q=2&a=b#xxx"
106 | assert links["internal"][0][1] == "Y Z"
107 |
108 | links = _links("""Test title
109 | Y Z
110 | """, url="http://sub.test.com/page2/x.html")
111 | assert len(links["all"]) == 1
112 | assert links["external"][0][0] == "http://sub2.test.com/page1?q=2&a=b#xxx"
113 | assert links["external"][0][1] == "Y Z"
114 |
115 | links = _links("""Test title
116 | Y Z
117 | """, url="http://sub.test.com/page2/x.html")
118 | assert len(links["all"]) == 1
119 | assert links["external"][0][0] == "https://sub2.test.com/page1?q=2&a=b#xxx"
120 | assert links["external"][0][1] == "Y Z"
121 |
122 | # TODO resolution tests
123 |
--------------------------------------------------------------------------------
/tests/test_listnodes.py:
--------------------------------------------------------------------------------
1 | import gumbocy
2 |
3 |
4 | def listnodes(html, options=None):
5 | parser = gumbocy.HTMLParser(options=options)
6 | parser.parse(html)
7 | return parser.listnodes()
8 |
9 |
10 | def test_basic():  # Pins the exact node listing returned by listnodes() for a small document.
11 | html = """
12 |
13 | HW
14 | Hello world
15 |
16 | """
17 |
18 | iterations = 1 # 300000
19 | for _ in range(0, iterations):
20 | nodes = listnodes(html, {"attributes_whitelist": ["href"]})
21 | assert nodes == [  # as asserted: elements are (n, tag[, attrs]), text nodes are (n, None, text); n looks like nesting depth
22 | (0, "html"),
23 | (1, "head"),
24 | (2, "title"),
25 | (3, None, "HW"),
26 | (1, "body"),
27 | (2, None, " Hello "),
28 | (2, "a", {"href": "http://example.com"}),
29 | (3, None, "world"),
30 | (2, "br")
31 | ]
32 |
33 |
34 | def test_classes():  # The whitelisted "class" attribute is expected as a frozenset of class names.
35 | html = """
36 |
37 |
38 |
39 |
40 | """
41 |
42 | nodes = listnodes(html, {"attributes_whitelist": ["class"]})
43 | assert nodes == [
44 | (0, "html"),
45 | (1, "head"),
46 | (1, "body"),
47 | (2, "p", {"class": frozenset(["para", "graph"])})
48 | ]
49 |
50 |
51 | def test_ignore():  # With ids/classes/tags ignore options set, only html, head, body and one text node are expected.
52 | html = """
53 |
54 | HW
55 | Hello world
56 |
57 | """
58 |
59 | nodes = listnodes(html, {
60 | "attributes_whitelist": ["class", "id"],
61 | "ids_ignore": ["i"],
62 | "classes_ignore": set(["ign"]),  # a set here, a list for the other options
63 | "tags_ignore": ["title"]  # no "title" node appears in the expected listing below
64 | })
65 | assert nodes == [
66 | (0, "html"),
67 | (1, "head"),
68 | (1, "body"),
69 | (2, None, " Hello ")
70 | ]
71 |
72 |
73 | def test_head_only():  # With head_only set, the expected listings contain no "body" node.
74 | html = """
75 |
76 | HW
77 | Hello world
78 |
79 | """
80 |
81 | nodes = listnodes(html, {
82 | "head_only": True
83 | })
84 | assert nodes == [
85 | (0, "html"),
86 | (1, "head"),
87 | (2, "title"),
88 | (3, None, "HW")
89 | ]
90 |
91 | html = """
92 |
93 | test
HW
94 | Hello world
95 |
96 | """
97 |
98 | nodes = listnodes(html, {
99 | "head_only": True
100 | })
101 | assert nodes == [  # second fixture: only html and head are expected, without a title node
102 | (0, "html"),
103 | (1, "head")
104 | ]
105 |
106 |
107 | def test_unknown_tags():  # Unknown tag names are expected to be listed under their own names, with attributes.
108 | html = """
109 |
110 |
111 | inline text
112 |
113 | """
114 |
115 | nodes = listnodes(html, {
116 | "attributes_whitelist": ["class"],
117 | "tags_ignore": "new_tag" # We can't ignore unknown tags at the Gumbocy level (for now?) NOTE(review): bare string, while other tests pass a list or set — confirm how the parser iterates it
118 | })
119 |
120 | assert nodes == [
121 | (0, "html"),
122 | (1, "head"),
123 | (1, "body"),
124 | (2, "new_tag", {'class': frozenset(['xx'])}),
125 | (3, None, "inline text"),
126 | (2, "new_tag_2")
127 | ]
128 |
--------------------------------------------------------------------------------
/tests/test_word_groups.py:
--------------------------------------------------------------------------------
1 | import gumbocy
2 | import pytest
3 |
4 | TAGS_SEPARATORS = frozenset([  # tag names passed as the "tags_separators" parser option; also imported by test_analyze.py and test_hyperlinks.py
5 | "body",
6 |
7 | # http://www.w3.org/TR/html5/grouping-content.html#grouping-content
8 | "p", "pre", "blockquote", "ul", "ol", "li", "dl", "dt", "dd", "figure", "figcaption",
9 |
10 | "br", "img",
11 |
12 | "h1", "h2", "h3", "h4", "h5", "h6"
13 | ])
14 |
15 |
16 | SAMPLES = [
17 | {
18 | "html": """ hello
""",
19 | "groups": [
20 | ("hello", "p")
21 | ]
22 | },
23 |
24 | # A <body> is automatically added
25 | {
26 | "html": """ nobody """,
27 | "groups": [
28 | ("nobody", "body")
29 | ]
30 | },
31 |
32 | # span
33 | {
34 | "html": """ pre link post
""",
35 | "groups": [
36 | ("pre link post", "p")
37 | ]
38 | },
39 |
40 | # a
41 | {
42 | "html": """ pre link post
""",
43 | "groups": [
44 | ("pre link post", "p")
45 | ]
46 | },
47 |
48 | # mid p
49 | {
50 | "html": """ pre
mid post
""",
51 | "groups": [
52 | ("pre", "p"),
53 | ("li1 x", "li"),
54 | ("mid", "body"),
55 | ("post", "p")
56 | ]
57 | },
58 |
59 | # Lists
60 | {
61 | "html": """ pre post """,
62 | "groups": [
63 | ("pre", "body"),
64 | ("li1", "li"),
65 | ("li2", "li"),
66 | ("post", "body")
67 | ]
68 | },
69 |
70 | # HR with illegal </hr>. "post" is actually part of <body>.
71 | {
72 | "html": """ pre
post""",
73 | "groups": [
74 | ("pre", "p"),
75 | ("post", "body")
76 | ]
77 | },
78 |
79 | # Non-closed p tag.
80 | {
81 | "html": """ pre post""",
82 | "groups": [
83 | ("pre", "body"),
84 | ("post", "p")
85 | ]
86 | },
87 |
88 | # BR
89 | {
90 | "html": """
pre
post
""",
91 | "groups": [
92 | ("pre", "p"),
93 | ("post", "p")
94 | ]
95 | },
96 |
97 | # IMG filename + alt
98 | {
99 | "html": """ pre
post
""",
100 | "groups": [
101 | ("pre", "p"),
102 | ("james brown", "img"),
103 | # ("maceo parker", "img"),
104 | ("post", "p")
105 | ]
106 | },
107 |
108 | # IMG with dataURIs are ignored
109 | {
110 | "html": """ pre
post
""",
111 | "groups": [
112 | ("pre", "p"),
113 | ("Red dot", "img"),
114 | ("post", "p")
115 | ]
116 | },
117 | ]
118 |
119 |
120 | # TODO: good coverage of http://www.w3.org/html/wg/drafts/html/master/syntax.html
121 | @pytest.mark.parametrize(("sample"), SAMPLES)
122 | def test_get_word_groups(sample):
123 |
124 | parser = gumbocy.HTMLParser(options={
125 | "tags_separators": TAGS_SEPARATORS,
126 | "attributes_whitelist": ["src", "alt"]
127 | })
128 | parser.parse(sample["html"])
129 | parsed = parser.analyze()
130 |
131 | for i, group in enumerate(parsed["word_groups"]):
132 | assert group == sample["groups"][i]
133 |
134 | assert len(parsed["word_groups"]) == len(sample["groups"])
135 |
--------------------------------------------------------------------------------