├── tests ├── __init__.py ├── test.html ├── geckodriver.sh ├── invalid.xml ├── selenium.sh ├── test_browser.py ├── doctests.rst ├── apps.py ├── browser_base.py ├── test_real_browser.py └── test_pyquery.py ├── docs ├── changes.rst ├── api.rst ├── testing.rst ├── future.rst ├── conftest.py ├── attributes.rst ├── index.rst ├── scrap.rst ├── css.rst ├── traversing.rst ├── tips.rst ├── manipulating.rst ├── Makefile ├── pseudo_classes.rst └── conf.py ├── pyquery ├── __init__.py ├── openers.py ├── text.py ├── cssselectpatch.py └── pyquery.py ├── .hgignore ├── MANIFEST.in ├── pytest.ini ├── .gitignore ├── conftest.py ├── README_fixt.py ├── tox.ini ├── LICENSE.txt ├── .github └── workflows │ └── tox.yml ├── setup.py ├── README.rst └── CHANGES.rst /tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /docs/changes.rst: -------------------------------------------------------------------------------- 1 | News 2 | ===== 3 | 4 | .. include:: ../CHANGES.rst 5 | -------------------------------------------------------------------------------- /pyquery/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2008 - Olivier Lauzanne 2 | # 3 | # Distributed under the BSD license, see LICENSE.txt 4 | 5 | from .pyquery import PyQuery # NOQA 6 | -------------------------------------------------------------------------------- /tests/test.html: -------------------------------------------------------------------------------- 1 | 2 | 3 |

Hello world !

4 | 5 |

hello python !

6 | 7 | 8 | -------------------------------------------------------------------------------- /.hgignore: -------------------------------------------------------------------------------- 1 | # use glob syntax. 2 | syntax: glob 3 | 4 | develop-eggs/ 5 | bin/ 6 | dist/ 7 | build/ 8 | parts/ 9 | docs/_build/ 10 | .tox/ 11 | .installed.cfg 12 | *.egg-info 13 | *.pyc 14 | *.swp 15 | *~ 16 | -------------------------------------------------------------------------------- /tests/geckodriver.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | driver="https://github.com/mozilla/geckodriver/releases/download/v0.26.0/geckodriver-v0.26.0-linux64.tar.gz" 4 | 5 | [ -f geckodriver ] || wget -cqO- $driver | tar xvzf - 6 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | graft docs 2 | prune docs/_build 3 | graft pyquery 4 | graft tests 5 | include *.py 6 | include *.txt 7 | include *_fixt.py *.rst *.cfg *.ini 8 | global-exclude *.pyc 9 | global-exclude __pycache__ 10 | -------------------------------------------------------------------------------- /tests/invalid.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 |

Hello world !

4 | 5 |

6 | hello python ! 7 |

8 | 9 |

10 | 11 | 12 | -------------------------------------------------------------------------------- /tests/selenium.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # script to run selenium tests 3 | 4 | # get geckodriver 5 | ./tests/geckodriver.sh 6 | 7 | # run tox with py3.7 8 | MOZ_HEADLESS=1 PATH=$PATH:$PWD tox -e py37 tests/test_real_browser.py 9 | -------------------------------------------------------------------------------- /pytest.ini: -------------------------------------------------------------------------------- 1 | 2 | [pytest] 3 | filterwarnings = 4 | ignore::DeprecationWarning 5 | doctest_optionflags = ELLIPSIS NORMALIZE_WHITESPACE IGNORE_EXCEPTION_DETAIL 6 | addopts = --doctest-modules --doctest-glob="*.rst" --ignore=docs/conf.py 7 | -------------------------------------------------------------------------------- /docs/api.rst: -------------------------------------------------------------------------------- 1 | ================================================ 2 | :mod:`~pyquery.pyquery` -- PyQuery complete API 3 | ================================================ 4 | 5 | .. automodule:: pyquery.pyquery 6 | 7 | .. autoclass:: PyQuery 8 | :members: 9 | 10 | 11 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | 5 | # Distribution / packaging 6 | develop-eggs/ 7 | bin/ 8 | dist/ 9 | build/ 10 | parts/ 11 | .tox/ 12 | .installed.cfg 13 | *.egg-info 14 | *.swp 15 | 16 | # Temporary files 17 | *~ 18 | geckodriver 19 | 20 | # Log files 21 | geckodriver.log 22 | 23 | # Sphinx documentation 24 | docs/_build/ 25 | -------------------------------------------------------------------------------- /docs/testing.rst: -------------------------------------------------------------------------------- 1 | Testing 2 | ------- 3 | 4 | If you want to run the tests that you can see above you should do:: 5 | 6 | $ git clone git://github.com/gawel/pyquery.git 7 | $ cd pyquery 8 | $ python bootstrap.py 9 | $ bin/buildout install tox 10 | $ bin/tox 11 | 12 | You can build the Sphinx documentation by doing:: 13 | 14 | $ cd docs 15 | $ make html 16 | -------------------------------------------------------------------------------- /docs/future.rst: -------------------------------------------------------------------------------- 1 | Future 2 | ------- 3 | 4 | - SELECTORS: done 5 | 6 | - ATTRIBUTES: done 7 | 8 | - CSS: done 9 | 10 | - HTML: done 11 | 12 | - MANIPULATING: missing the wrapInner method 13 | 14 | - TRAVERSING: about half done 15 | 16 | - EVENTS: nothing to do with server side might be used later for automatic ajax 17 | 18 | - CORE UI EFFECTS: did hide and show the rest doesn't really makes sense on 19 | server side 20 | 21 | - AJAX: some with wsgi app 22 | 23 | -------------------------------------------------------------------------------- /conftest.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pytest 3 | from webtest import http 4 | from webtest.debugapp import debug_app 5 | from urllib.request import urlopen 6 | 7 | 8 | @pytest.fixture 9 | def readme_fixt(): 10 | server = http.StopableWSGIServer.create(debug_app) 11 | server.wait() 12 | path_to_html_file = os.path.join('tests', 'test.html') 13 | yield ( 14 | urlopen, 15 | server.application_url, 16 | path_to_html_file, 17 | ) 
18 | server.shutdown() 19 | -------------------------------------------------------------------------------- /tests/test_browser.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | from pyquery.pyquery import PyQuery 4 | from .browser_base import TextExtractionMixin 5 | 6 | 7 | class TestInnerText(unittest.TestCase, TextExtractionMixin): 8 | def _prepare_dom(self, html): 9 | super()._prepare_dom(html) 10 | self.pq = PyQuery(self.last_html) 11 | 12 | def _simple_test(self, html, expected_sq, expected_nosq, **kwargs): 13 | self._prepare_dom(html) 14 | text_sq = self.pq.text(squash_space=True, **kwargs) 15 | text_nosq = self.pq.text(squash_space=False, **kwargs) 16 | self.assertEqual(text_sq, expected_sq) 17 | self.assertEqual(text_nosq, expected_nosq) 18 | -------------------------------------------------------------------------------- /docs/conftest.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import pytest 4 | from webtest import http 5 | from webtest.debugapp import debug_app 6 | 7 | 8 | @pytest.fixture 9 | def scrap_url(): 10 | sys.path.insert(0, os.path.dirname(os.path.dirname(__file__))) 11 | from tests.apps import input_app 12 | server = http.StopableWSGIServer.create(input_app) 13 | server.wait() 14 | yield server.application_url.rstrip('/') + '/html' 15 | server.shutdown() 16 | 17 | 18 | @pytest.fixture 19 | def tips_url(): 20 | server = http.StopableWSGIServer.create(debug_app) 21 | server.wait() 22 | yield server.application_url.rstrip('/') + '/form.html' 23 | server.shutdown() 24 | -------------------------------------------------------------------------------- /README_fixt.py: -------------------------------------------------------------------------------- 1 | import os 2 | from webtest import http 3 | from webtest.debugapp import debug_app 4 | 5 | try: 6 | from urllib import urlopen 7 | except ImportError: 8 | from urllib.request import urlopen 9 | 10 | 11 | def setup_test(test): 12 | server = http.StopableWSGIServer.create(debug_app) 13 | server.wait() 14 | path_to_html_file = os.path.join('tests', 'test.html') 15 | test.globs.update( 16 | urlopen=urlopen, 17 | server=server, 18 | your_url=server.application_url, 19 | path_to_html_file=path_to_html_file, 20 | ) 21 | setup_test.__test__ = False 22 | 23 | 24 | def teardown_test(test): 25 | test.globs['server'].shutdown() 26 | teardown_test.__test__ = False 27 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | [tox] 2 | envlist=py38,py39,py310,py311,py312 3 | 4 | [testenv] 5 | whitelist_externals= 6 | rm 7 | passenv= 8 | MOZ_HEADLESS 9 | commands = 10 | pytest [] 11 | deps = 12 | py38: selenium 13 | -e .[test] 14 | 15 | [testenv:lint] 16 | skipsdist=true 17 | skip_install=true 18 | basepython = python3.11 19 | commands = 20 | ruff check 21 | deps = 22 | ruff 23 | 24 | [testenv:docs] 25 | skip_install=false 26 | skipsdist=true 27 | basepython = python3.11 28 | changedir = docs 29 | deps = 30 | sphinx 31 | Pygments 32 | allowlist_externals = 33 | rm 34 | commands = 35 | rm -Rf {envtmpdir}/doctrees {envtmpdir}/html 36 | sphinx-build -b html -d {envtmpdir}/doctrees . 
{envtmpdir}/html 37 | 38 | # [testenv:selenium] 39 | # basepython = python3.5 40 | # deps = 41 | # selenium 42 | # commands = 43 | # {envbindir}/python -m unittest seleniumtests.offline 44 | # {envbindir}/python -m unittest seleniumtests.browser 45 | -------------------------------------------------------------------------------- /tests/doctests.rst: -------------------------------------------------------------------------------- 1 | Import:: 2 | 3 | >>> from pyquery import PyQuery as pq 4 | 5 | 6 | Assume spaces normalization:: 7 | 8 | >>> pq('

').text() 9 | '' 10 | 11 | >>> print(pq('').text()) 12 | toto tata 13 | 14 | Complex wrapping:: 15 | 16 | >>> d = pq('
youhou
') 17 | >>> s = d('span') 18 | >>> s is d 19 | False 20 | >>> s.wrap('
') 21 | [
] 22 | 23 | We get the original doc with new node:: 24 | 25 | >>> print(d) 26 |
youhou
27 | 28 | Complex wrapAll:: 29 | 30 | >>> doc = pq('
Heyyou !
') 31 | >>> s = doc('span') 32 | >>> s.wrapAll('
') 33 | [] 34 | 35 | >>> print(doc) 36 |
Heyyou !
37 | -------------------------------------------------------------------------------- /docs/attributes.rst: -------------------------------------------------------------------------------- 1 | Attributes 2 | ---------- 3 | 4 | .. 5 | >>> from pyquery import PyQuery as pq 6 | 7 | Using attribute to select specific tag 8 | In attribute selectors, the value should be a valid CSS identifier or quoted as string:: 9 | 10 | >>> d = pq("
') 53 | >>> d.remove('p#id') 54 | [] 55 | >>> d('p#id') 56 | [] 57 | 58 | Remove what's inside the selection:: 59 | 60 | >>> d('p').empty() 61 | [

] 62 | 63 | And you can get back the modified html:: 64 | 65 | >>> print(d) 66 |

67 | 68 | You can generate html stuff:: 69 | 70 | >>> from pyquery import PyQuery as pq 71 | >>> print(pq('

Yeah !
').addClass('myclass') + pq('cool')) 72 |
Yeah !
cool 73 | 74 | Remove all namespaces:: 75 | 76 | >>> d = pq('') 77 | >>> d 78 | [<{http://example.com/foo}foo>] 79 | >>> d.remove_namespaces() 80 | [] 81 | 82 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | pyquery: a jquery-like library for python 2 | ========================================= 3 | 4 | .. image:: https://github.com/gawel/pyquery/actions/workflows/tox.yml/badge.svg 5 | :alt: Build Status 6 | :target: https://github.com/gawel/pyquery/actions/workflows/tox.yml 7 | 8 | pyquery allows you to make jquery queries on xml documents. 9 | The API is as much as possible similar to jquery. pyquery uses lxml for fast 10 | xml and html manipulation. 11 | 12 | This is not (or at least not yet) a library to produce or interact with 13 | javascript code. I just liked the jquery API and I missed it in python so I 14 | told myself "Hey let's make jquery in python". This is the result. 15 | 16 | The `project`_ is being actively developed on a git repository on Github. I 17 | have the policy of giving push access to anyone who wants it and then reviewing 18 | what they do. So if you want to contribute just email me. 19 | 20 | Please report bugs on the `github 21 | `_ issue 22 | tracker. 23 | 24 | .. _deliverance: http://www.gawel.org/weblog/en/2008/12/skinning-with-pyquery-and-deliverance 25 | .. _project: https://github.com/gawel/pyquery/ 26 | 27 | .. 28 | >>> (urlopen, your_url, path_to_html_file) = getfixture('readme_fixt') 29 | 30 | Quickstart 31 | ========== 32 | 33 | You can use the PyQuery class to load an xml document from a string, a lxml 34 | document, from a file or from an url:: 35 | 36 | >>> from pyquery import PyQuery as pq 37 | >>> from lxml import etree 38 | >>> import urllib 39 | >>> d = pq("") 40 | >>> d = pq(etree.fromstring("")) 41 | >>> d = pq(url=your_url) 42 | >>> d = pq(url=your_url, 43 | ... opener=lambda url, **kw: urlopen(url).read()) 44 | >>> d = pq(filename=path_to_html_file) 45 | 46 | Now d is like the $ in jquery:: 47 | 48 | >>> d("#hello") 49 | [] 50 | >>> p = d("#hello") 51 | >>> print(p.html()) 52 | Hello world ! 
53 | >>> p.html("you know Python rocks") 54 | [] 55 | >>> print(p.html()) 56 | you know Python rocks 57 | >>> print(p.text()) 58 | you know Python rocks 59 | 60 | You can use some of the pseudo classes that are available in jQuery but that 61 | are not standard in css such as :first :last :even :odd :eq :lt :gt :checked 62 | :selected :file:: 63 | 64 | >>> d('p:first') 65 | [] 66 | 67 | -------------------------------------------------------------------------------- /tests/browser_base.py: -------------------------------------------------------------------------------- 1 | class TextExtractionMixin(): 2 | def _prepare_dom(self, html): 3 | self.last_html = '' + html + '' 4 | 5 | def _simple_test(self, html, expected_sq, expected_nosq, **kwargs): 6 | raise NotImplementedError 7 | 8 | def test_inline_tags(self): 9 | self._simple_test( 10 | 'Phasellus eget sem facilisis justo', 11 | 'Phasellus eget sem facilisis justo', 12 | 'Phasellus eget sem facilisis justo', 13 | ) 14 | self._simple_test( 15 | 'Phasellus eget sem facilisis\n justo', 16 | 'Phasellus eget sem facilisis justo', 17 | 'Phasellus eget sem facilisis\n justo', 18 | ) 19 | self._simple_test( 20 | ('Phasellus \n eget\n ' 21 | 'sem\n\tfacilisis justo'), 22 | 'Phasellus eget sem facilisis justo', 23 | 'Phasellus \n eget\n sem\n\tfacilisis justo' 24 | ) 25 | 26 | def test_block_tags(self): 27 | self._simple_test( 28 | 'Phas

ell

us
eget
sem

facilisis

justo', 29 | 'Phas\nell\nus\neget\nsem\nfacilisis\njusto', 30 | 'Phas\nell\nus\n eget \nsem \nfacilisis\n justo', 31 | ) 32 | self._simple_test( 33 | '

In sagittis

rutrum

condimentum

', 34 | 'In sagittis\nrutrum\ncondimentum', 35 | 'In sagittis\n \nrutrum\n\ncondimentum', 36 | ) 37 | self._simple_test( 38 | 'In

\nultricies

\n erat et

\n\n\nmaximus\n\n

mollis', 39 | 'In\nultricies\nerat et\nmaximus\nmollis', 40 | 'In \n\nultricies\n\n erat et \n\n\n\nmaximus\n\n\n mollis', 41 | ) 42 | self._simple_test( 43 | ('Integer
\n
quis commodo
' 44 | '
libero'), 45 | 'Integer\nquis commodo\nlibero', 46 | 'Integer \n\n\n \nquis commodo\n\n \n libero', 47 | ) 48 | self._simple_test( 49 | 'Heading
  • one
  • two
  • three
', 50 | 'Heading\none\ntwo\nthree', 51 | 'Heading\n\none\n\ntwo\n\nthree', 52 | ) 53 | 54 | def test_separators(self): 55 | self._simple_test( 56 | 'Some words
test. Another word


test.', 57 | 'Some words\ntest. Another word\n\n\ntest.', 58 | 'Some words\ntest. Another word\n\n \n test.', 59 | ) 60 | self._simple_test( 61 | 'Inline split by\nbr
tag
test', 62 | 'Inline split by br\ntag test', 63 | 'Inline split by\nbr\ntag test', 64 | ) 65 | self._simple_test( 66 | 'Some words
test. Another word


test.', 67 | 'Some words\ntest. Another word\ntest.', 68 | 'Some words\n\ntest. Another word\n\n\n\n \n\n test.', 69 | ) 70 | 71 | def test_strip(self): 72 | self._simple_test( 73 | ' text\n', 74 | 'text', 75 | ' text\n', 76 | ) 77 | 78 | def test_ul_li(self): 79 | self._simple_test( 80 | '
', 81 | '', 82 | ' \n \n ' 83 | ) 84 | -------------------------------------------------------------------------------- /pyquery/text.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | 4 | # https://developer.mozilla.org/en-US/docs/Web/HTML/Inline_elements#Elements 5 | INLINE_TAGS = { 6 | 'a', 'abbr', 'acronym', 'b', 'bdo', 'big', 'br', 'button', 'cite', 7 | 'code', 'dfn', 'em', 'i', 'img', 'input', 'kbd', 'label', 'map', 8 | 'object', 'q', 'samp', 'script', 'select', 'small', 'span', 'strong', 9 | 'sub', 'sup', 'textarea', 'time', 'tt', 'var' 10 | } 11 | 12 | SEPARATORS = {'br'} 13 | 14 | 15 | # Definition of whitespace in HTML: 16 | # https://www.w3.org/TR/html4/struct/text.html#h-9.1 17 | WHITESPACE_RE = re.compile('[\x20\x09\x0C\u200B\x0A\x0D]+') 18 | 19 | 20 | def squash_html_whitespace(text): 21 | # use raw extract_text for preformatted content (like
<pre> content or set
 22 |     # by CSS rules)
 23 |     # apply this function on top of
 24 |     return WHITESPACE_RE.sub(' ', text)
 25 | 
 26 | 
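# Collapse runs of adjacent None sentinels (artificial newlines emitted for
# block-level tags) into a single sentinel so nested blocks do not stack up
# blank lines.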
 27 | def _squash_artifical_nl(parts):
 28 |     output, last_nl = [], False
 29 |     for x in parts:
 30 |         if x is not None:
 31 |             output.append(x)
 32 |             last_nl = False
 33 |         elif not last_nl:
 34 |             output.append(None)
 35 |             last_nl = True
 36 |     return output
 37 | 
 38 | 
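# Trim sentinel values (anything that is not a string) from both ends of the
# parts list, so block-level wrappers at the edges of a fragment do not turn
# into leading or trailing newlines.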
 39 | def _strip_artifical_nl(parts):
 40 |     if not parts:
 41 |         return parts
 42 |     for start_idx, pt in enumerate(parts):
 43 |         if isinstance(pt, str):
 44 |             # 0, 1, 2, index of first string [start_idx:...
 45 |             break
 46 |     iterator = enumerate(parts[:start_idx - 1 if start_idx > 0 else None:-1])
 47 |     for end_idx, pt in iterator:
 48 |         if isinstance(pt, str):  # 0=None, 1=-1, 2=-2, index of last string
 49 |             break
 50 |     return parts[start_idx:-end_idx if end_idx > 0 else None]
 51 | 
 52 | 
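# Join consecutive raw string fragments that sit between sentinels, collapsing
# their HTML whitespace and dropping fragments that are whitespace-only.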
 53 | def _merge_original_parts(parts):
 54 |     output, orp_buf = [], []
 55 | 
 56 |     def flush():
 57 |         if orp_buf:
 58 |             item = squash_html_whitespace(''.join(orp_buf)).strip()
 59 |             if item:
 60 |                 output.append(item)
 61 |             orp_buf[:] = []
 62 | 
 63 |     for x in parts:
 64 |         if not isinstance(x, str):
 65 |             flush()
 66 |             output.append(x)
 67 |         else:
 68 |             orp_buf.append(x)
 69 |     flush()
 70 |     return output
 71 | 
 72 | 
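# Walk the element tree and return a flat list mixing text fragments with two
# kinds of sentinels: None marks an artificial newline around a block-level
# tag, True marks an explicit separator such as <br>.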
 73 | def extract_text_array(dom, squash_artifical_nl=True, strip_artifical_nl=True):
 74 |     if callable(dom.tag):
 75 |         return ''
 76 |     r = []
 77 |     if dom.tag in SEPARATORS:
 78 |         r.append(True)  # equivalent of '\n' used to designate separators
 79 |     elif dom.tag not in INLINE_TAGS:
 80 |         # equivalent of '\n' used to designate artificially inserted newlines
 81 |         r.append(None)
 82 |     if dom.text is not None:
 83 |         r.append(dom.text)
 84 |     for child in dom.getchildren():
 85 |         r.extend(extract_text_array(child, squash_artifical_nl=False,
 86 |                                     strip_artifical_nl=False))
 87 |         if child.tail is not None:
 88 |             r.append(child.tail)
 89 |     if dom.tag not in INLINE_TAGS and dom.tag not in SEPARATORS:
 90 |         # equivalent of '\n' used to designate artificially inserted newlines
 91 |         r.append(None)
 92 |     if squash_artifical_nl:
 93 |         r = _squash_artifical_nl(r)
 94 |     if strip_artifical_nl:
 95 |         r = _strip_artifical_nl(r)
 96 |     return r
 97 | 
 98 | 
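# Render the sentinel list produced by extract_text_array: None becomes
# block_symbol, True becomes sep_symbol, plain strings are kept. With
# squash_space=True, runs of HTML whitespace are collapsed and the result is
# stripped, approximating how a browser renders visible text.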
 99 | def extract_text(dom, block_symbol='\n', sep_symbol='\n', squash_space=True):
100 |     a = extract_text_array(dom, squash_artifical_nl=squash_space)
101 |     if squash_space:
102 |         a = _strip_artifical_nl(_squash_artifical_nl(_merge_original_parts(a)))
103 |     result = ''.join(
104 |         block_symbol if x is None else (
105 |             sep_symbol if x is True else x
106 |         )
107 |         for x in a
108 |     )
109 |     if squash_space:
110 |         result = result.strip()
111 |     return result
112 | 
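# A minimal usage sketch (an assumption for illustration, not part of the
# original module): PyQuery.text() ultimately delegates to extract_text once
# the markup has been parsed by lxml.
#
#     >>> import lxml.html
#     >>> extract_text(lxml.html.fromstring('<div>Hello<br/>world <b>!</b></div>'))
#     'Hello\nworld !'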


--------------------------------------------------------------------------------
/tests/test_real_browser.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import unittest
  3 | from threading import Thread
  4 | from time import sleep
  5 | 
  6 | from .browser_base import TextExtractionMixin
  7 | 
  8 | SELENIUM = 'MOZ_HEADLESS' in os.environ
  9 | 
 10 | try:
 11 |     from selenium import webdriver
 12 |     from selenium.webdriver.firefox.options import Options
 13 | except ImportError:
 14 |     SELENIUM = False
 15 | 
 16 | if SELENIUM:
 17 |     from urllib.parse import urlunsplit
 18 |     from http.server import HTTPServer, BaseHTTPRequestHandler
 19 |     from queue import Queue
 20 | 
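    # The test case and the request handler communicate over two queues set up
    # in BaseBrowserTest.setUpClass: snippets pushed via in_queue are served on
    # GET / so the Selenium-driven browser renders the fragment under test,
    # while out_queue lets a handler report values back to the test.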
 21 |     class BaseTestRequestHandler(BaseHTTPRequestHandler):
 22 |         _last_html = ''
 23 | 
 24 |         def _get_last_html(self):
 25 |             q = self.server.html_queue
 26 |             while not q.empty():
 27 |                 self._last_html = q.get_nowait()
 28 |             return self._last_html
 29 | 
 30 |         def log_request(self, code='-', size='-'):
 31 |             pass
 32 | 
 33 |         def recv_from_testsuite(self, non_blocking=False):
 34 |             q = self.server.in_queue
 35 |             if non_blocking:
 36 |                 return None if q.empty() else q.get_nowait()
 37 |             return q.get()
 38 | 
 39 |         def send_to_testsuite(self, value):
 40 |             self.server.out_queue.put(value)
 41 | 
 42 |     class HTMLSnippetSender(BaseTestRequestHandler):
 43 |         last_html = ''
 44 | 
 45 |         def get_last_html(self):
 46 |             while True:
 47 |                 value = self.recv_from_testsuite(non_blocking=True)
 48 |                 if value is None:
 49 |                     break
 50 |                 self.last_html = value
 51 |             return self.last_html
 52 | 
 53 |         def do_GET(self):
 54 |             if self.path == '/':
 55 |                 self.send_response(200)
 56 |                 self.send_header('Content-Type', 'text/html; charset=utf-8')
 57 |                 self.end_headers()
 58 |                 self.wfile.write(self.get_last_html().encode('utf-8'))
 59 |             else:
 60 |                 self.send_response(404)
 61 |                 self.end_headers()
 62 | 
 63 |     class BaseBrowserTest(unittest.TestCase):
 64 |         LOCAL_IP = '127.0.0.1'
 65 |         PORT = 28546
 66 |         # descendant of BaseTestRequestHandler
 67 |         REQUEST_HANDLER_CLASS = None
 68 | 
 69 |         @classmethod
 70 |         def setUpClass(cls):
 71 |             cls.to_server_queue = Queue()
 72 |             cls.from_server_queue = Queue()
 73 |             cls.server = HTTPServer((cls.LOCAL_IP, cls.PORT),
 74 |                                     cls.REQUEST_HANDLER_CLASS)
 75 |             cls.server.in_queue = cls.to_server_queue
 76 |             cls.server.out_queue = cls.from_server_queue
 77 |             cls.server_thread = Thread(target=cls.server.serve_forever)
 78 |             cls.server_thread.daemon = True
 79 |             cls.server_thread.start()
 80 |             options = Options()
 81 |             options.add_argument('-headless')
 82 |             cls.driver = webdriver.Firefox(options=options)
 83 |             sleep(1)
 84 | 
 85 |         @classmethod
 86 |         def tearDownClass(cls):
 87 |             cls.driver.quit()
 88 |             cls.server.shutdown()
 89 |             cls.server.server_close()
 90 | 
 91 |         def send_to_server(self, value):
 92 |             self.to_server_queue.put(value)
 93 | 
 94 |         def recv_from_server(self, non_blocking=False):
 95 |             q = self.from_server_queue
 96 |             if non_blocking:
 97 |                 return None if q.empty() else q.get_nowait()
 98 |             return q.get()
 99 | 
100 |         def open_url(self, path):
101 |             self.driver.get(urlunsplit(
102 |                 ('http', '{}:{}'.format(
103 |                     self.LOCAL_IP, self.PORT), path, '', '')))
104 | 
105 |     class TestInnerText(BaseBrowserTest, TextExtractionMixin):
106 |         REQUEST_HANDLER_CLASS = HTMLSnippetSender
107 | 
108 |         def _simple_test(self, html, expected_sq, expected_nosq, **kwargs):
109 |             self.send_to_server(html)
110 |             self.open_url('/')
111 | 
112 |             selenium_text = self.driver.find_element_by_tag_name('body').text
113 |             self.assertEqual(selenium_text, expected_sq)
114 | 
115 |             #  inner_text = self.driver.execute_script(
116 |             #    'return document.body.innerText')
117 |             #  text_content = self.driver.execute_script(
118 |             #    'return document.body.textContent')
119 | 


--------------------------------------------------------------------------------
/docs/Makefile:
--------------------------------------------------------------------------------
  1 | # Makefile for Sphinx documentation
  2 | #
  3 | 
  4 | # You can set these variables from the command line.
  5 | SPHINXOPTS    =
  6 | SPHINXBUILD   = ../bin/sphinx-build
  7 | PAPER         =
  8 | BUILDDIR      = _build
  9 | 
 10 | # Internal variables.
 11 | PAPEROPT_a4     = -D latex_paper_size=a4
 12 | PAPEROPT_letter = -D latex_paper_size=letter
 13 | ALLSPHINXOPTS   = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
 14 | # the i18n builder cannot share the environment and doctrees with the others
 15 | I18NSPHINXOPTS  = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
 16 | 
 17 | .PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest gettext
 18 | 
 19 | help:
 20 | 	@echo "Please use \`make ' where  is one of"
 21 | 	@echo "  html       to make standalone HTML files"
 22 | 	@echo "  dirhtml    to make HTML files named index.html in directories"
 23 | 	@echo "  singlehtml to make a single large HTML file"
 24 | 	@echo "  pickle     to make pickle files"
 25 | 	@echo "  json       to make JSON files"
 26 | 	@echo "  htmlhelp   to make HTML files and a HTML help project"
 27 | 	@echo "  qthelp     to make HTML files and a qthelp project"
 28 | 	@echo "  devhelp    to make HTML files and a Devhelp project"
 29 | 	@echo "  epub       to make an epub"
 30 | 	@echo "  latex      to make LaTeX files, you can set PAPER=a4 or PAPER=letter"
 31 | 	@echo "  latexpdf   to make LaTeX files and run them through pdflatex"
 32 | 	@echo "  text       to make text files"
 33 | 	@echo "  man        to make manual pages"
 34 | 	@echo "  texinfo    to make Texinfo files"
 35 | 	@echo "  info       to make Texinfo files and run them through makeinfo"
 36 | 	@echo "  gettext    to make PO message catalogs"
 37 | 	@echo "  changes    to make an overview of all changed/added/deprecated items"
 38 | 	@echo "  linkcheck  to check all external links for integrity"
 39 | 	@echo "  doctest    to run all doctests embedded in the documentation (if enabled)"
 40 | 
 41 | clean:
 42 | 	-rm -rf $(BUILDDIR)/*
 43 | 
 44 | html:
 45 | 	$(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html
 46 | 	@echo
 47 | 	@echo "Build finished. The HTML pages are in $(BUILDDIR)/html."
 48 | 
 49 | dirhtml:
 50 | 	$(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml
 51 | 	@echo
 52 | 	@echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml."
 53 | 
 54 | singlehtml:
 55 | 	$(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml
 56 | 	@echo
 57 | 	@echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml."
 58 | 
 59 | pickle:
 60 | 	$(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle
 61 | 	@echo
 62 | 	@echo "Build finished; now you can process the pickle files."
 63 | 
 64 | json:
 65 | 	$(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json
 66 | 	@echo
 67 | 	@echo "Build finished; now you can process the JSON files."
 68 | 
 69 | htmlhelp:
 70 | 	$(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp
 71 | 	@echo
 72 | 	@echo "Build finished; now you can run HTML Help Workshop with the" \
 73 | 	      ".hhp project file in $(BUILDDIR)/htmlhelp."
 74 | 
 75 | qthelp:
 76 | 	$(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp
 77 | 	@echo
 78 | 	@echo "Build finished; now you can run "qcollectiongenerator" with the" \
 79 | 	      ".qhcp project file in $(BUILDDIR)/qthelp, like this:"
 80 | 	@echo "# qcollectiongenerator $(BUILDDIR)/qthelp/chut.qhcp"
 81 | 	@echo "To view the help file:"
 82 | 	@echo "# assistant -collectionFile $(BUILDDIR)/qthelp/chut.qhc"
 83 | 
 84 | devhelp:
 85 | 	$(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp
 86 | 	@echo
 87 | 	@echo "Build finished."
 88 | 	@echo "To view the help file:"
 89 | 	@echo "# mkdir -p $$HOME/.local/share/devhelp/chut"
 90 | 	@echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/chut"
 91 | 	@echo "# devhelp"
 92 | 
 93 | epub:
 94 | 	$(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub
 95 | 	@echo
 96 | 	@echo "Build finished. The epub file is in $(BUILDDIR)/epub."
 97 | 
 98 | latex:
 99 | 	$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
100 | 	@echo
101 | 	@echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex."
102 | 	@echo "Run \`make' in that directory to run these through (pdf)latex" \
103 | 	      "(use \`make latexpdf' here to do that automatically)."
104 | 
105 | latexpdf:
106 | 	$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
107 | 	@echo "Running LaTeX files through pdflatex..."
108 | 	$(MAKE) -C $(BUILDDIR)/latex all-pdf
109 | 	@echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex."
110 | 
111 | text:
112 | 	$(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text
113 | 	@echo
114 | 	@echo "Build finished. The text files are in $(BUILDDIR)/text."
115 | 
116 | man:
117 | 	$(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man
118 | 	@echo
119 | 	@echo "Build finished. The manual pages are in $(BUILDDIR)/man."
120 | 
121 | texinfo:
122 | 	$(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
123 | 	@echo
124 | 	@echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo."
125 | 	@echo "Run \`make' in that directory to run these through makeinfo" \
126 | 	      "(use \`make info' here to do that automatically)."
127 | 
128 | info:
129 | 	$(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
130 | 	@echo "Running Texinfo files through makeinfo..."
131 | 	make -C $(BUILDDIR)/texinfo info
132 | 	@echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo."
133 | 
134 | gettext:
135 | 	$(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale
136 | 	@echo
137 | 	@echo "Build finished. The message catalogs are in $(BUILDDIR)/locale."
138 | 
139 | changes:
140 | 	$(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes
141 | 	@echo
142 | 	@echo "The overview file is in $(BUILDDIR)/changes."
143 | 
144 | linkcheck:
145 | 	$(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck
146 | 	@echo
147 | 	@echo "Link check complete; look for any errors in the above output " \
148 | 	      "or in $(BUILDDIR)/linkcheck/output.txt."
149 | 
150 | doctest:
151 | 	$(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest
152 | 	@echo "Testing of doctests in the sources finished, look at the " \
153 | 	      "results in $(BUILDDIR)/doctest/output.txt."
154 | 


--------------------------------------------------------------------------------
/CHANGES.rst:
--------------------------------------------------------------------------------
  1 | 2.0.2 (unreleased)
  2 | ------------------
  3 | 
  4 | - Nothing changed yet.
  5 | 
  6 | 
  7 | 2.0.1 (2024-08-30)
  8 | ------------------
  9 | 
 10 | - Breaking change: it seems it is no longer possible to use the html parser with an xml file, so it is no longer tested
 11 | 
 12 | - Drop support for python 3.7
 13 | 
 14 | 2.0.0 (2022-12-28)
 15 | ------------------
 16 | 
 17 | - Breaking change: inputs starting with ``"http://"`` or ``"https://"`` like
 18 |   ``PyQuery("http://example.com")`` will no longer fetch the contents of the URL.
 19 |   Users desiring the old behavior should switch to ``PyQuery(url="http://example.com")``.
 20 | 
 21 | - Add nextUntil method
 22 | 
 23 | - ``.remove()`` no longer inserts a space in place of the removed element
 24 | 
 25 | - Fix escaping of top-level element text in ``.html()`` output
 26 | 
 27 | - Support (and require) cssselect 1.2+
 28 | 
 29 | - Drop support for python 3.5/3.6
 30 | 
 31 | 
 32 | 1.4.3 (2020-11-21)
 33 | ------------------
 34 | 
 35 | - No longer use a universal wheel
 36 | 
 37 | 
 38 | 1.4.2 (2020-11-21)
 39 | ------------------
 40 | 
 41 | - Fix exception raised when calling `PyQuery("").text()`
 42 | 
 43 | - python2 is no longer supported
 44 | 
 45 | 1.4.1 (2019-10-26)
 46 | ------------------
 47 | 
 48 | - This is the latest release with py2 support
 49 | 
 50 | - Remove py33, py34 support
 51 | 
 52 | - web scraping improvements: default timeout and session support
 53 | 
 54 | - Add API methods to serialize form-related elements according to spec
 55 | 
 56 | - Include HTML markup when querying textarea text/value
 57 | 
 58 | 
 59 | 1.4.0 (2018-01-11)
 60 | ------------------
 61 | 
 62 | - Refactoring of `.text()` to match firefox behavior.
 63 | 
 64 | 
 65 | 1.3.0 (2017-10-21)
 66 | ------------------
 67 | 
 68 | - Remove some unmaintained modules: ``pyquery.ajax`` and ``pyquery.rules``
 69 | 
 70 | - Code cleanup. No longer use ugly hacks required by python2.6/python3.2.
 71 | 
 72 | - Run tests with python3.6 on CI
 73 | 
 74 | - Add a ``method`` argument to ``.outer_html()``
 75 | 
 76 | 
 77 | 1.2.17 (2016-10-14)
 78 | -------------------
 79 | 
 80 | - ``PyQuery('<input value="">').val()`` is ``''``
 81 | - ``PyQuery('<input>').val()`` is ``''``
 82 | 
 83 | 
 84 | 1.2.16 (2016-10-14)
 85 | -------------------
 86 | 
 87 | - ``.attr('value', '')`` no longer removes the ``value`` attribute
 88 | 
 89 | - ``<input type="checkbox">`` without ``value="..."`` have a ``.val()`` of
 90 |   ``'on'``
 91 | 
 92 | - ``<input type="radio">`` without ``value="..."`` have a ``.val()`` of
 93 |   ``'on'``
 94 | 
 95 | - ``'))
226 |         >>> d(':input')
227 |         [, '))
196 |             >>> d(':input')
197 |             [, 
434 |         
437 |     '''
438 | 
439 |     html4 = '''
440 |         
444 |         
449 |         
451 |         
456 |     '''
457 | 
458 |     html6 = '''
459 |         
464 |         
469 |         
474 |     '''
475 | 
476 |     html5 = '''
477 |         
478 | 479 | 480 | 481 |
482 | ''' 483 | 484 | def test_attr_empty_string(self): 485 | d = pq('
') 486 | d.attr('value', '') 487 | self.assertEqual(d.outer_html(), '
') 488 | self.assertEqual(d.outer_html(method="xml"), '
') 489 | 490 | def test_remove(self): 491 | d = pq(self.html) 492 | d('img').remove() 493 | val = d('a:first').html() 494 | assert val == 'TestMy link text', repr(val) 495 | val = d('a:last').html() 496 | assert val == 'My link text 2', repr(val) 497 | 498 | def test_class(self): 499 | d = pq('
') 500 | d.removeClass('xx') 501 | assert 'class' not in str(d), str(d) 502 | 503 | def test_val_for_inputs(self): 504 | d = pq(self.html2) 505 | self.assertIsNone(d('input[name="none"]').val()) 506 | self.assertEqual(d('input[name="spam"]').val(), 'Spam') 507 | self.assertEqual(d('input[name="eggs"]').val(), 'Eggs') 508 | self.assertEqual(d('input:checkbox').val(), 'Bacon') 509 | self.assertEqual(d('input:radio').val(), 'Ham') 510 | d('input[name="spam"]').val('42') 511 | d('input[name="eggs"]').val('43') 512 | d('input:checkbox').val('44') 513 | d('input:radio').val('45') 514 | self.assertEqual(d('input[name="spam"]').val(), '42') 515 | self.assertEqual(d('input[name="eggs"]').val(), '43') 516 | self.assertEqual(d('input:checkbox').val(), '44') 517 | self.assertEqual(d('input:radio').val(), '45') 518 | 519 | def test_val_for_inputs_with_newline(self): 520 | d = pq(self.html2_newline) 521 | self.assertEqual(d('#newline-text').val(), 'Spam') 522 | self.assertEqual(d('#newline-radio').val(), 'S\npam') 523 | 524 | def test_val_for_textarea(self): 525 | d = pq(self.html3) 526 | self.assertEqual(d('#textarea-single').val(), 'Spam') 527 | self.assertEqual(d('#textarea-single').text(), 'Spam') 528 | d('#textarea-single').val('42') 529 | self.assertEqual(d('#textarea-single').val(), '42') 530 | # Note: jQuery still returns 'Spam' here. 531 | self.assertEqual(d('#textarea-single').text(), '42') 532 | 533 | multi_expected = '''Spam\nEggs\nBacon''' 534 | self.assertEqual(d('#textarea-multi').val(), multi_expected) 535 | self.assertEqual(d('#textarea-multi').text(), multi_expected) 536 | multi_new = '''Bacon\nEggs\nSpam''' 537 | multi_new_expected = '''Bacon\n<b>Eggs</b>\nSpam''' 538 | d('#textarea-multi').val(multi_new) 539 | self.assertEqual(d('#textarea-multi').val(), multi_new_expected) 540 | self.assertEqual(d('#textarea-multi').text(), multi_new_expected) 541 | 542 | def test_val_for_select(self): 543 | d = pq(self.html4) 544 | self.assertEqual(d('#first').val(), 'spam') 545 | self.assertEqual(d('#second').val(), 'eggs') 546 | self.assertIsNone(d('#third').val()) 547 | d('#first').val('eggs') 548 | d('#second').val('bacon') 549 | d('#third').val('eggs') # Selecting non-existing option. 550 | self.assertEqual(d('#first').val(), 'eggs') 551 | self.assertEqual(d('#second').val(), 'bacon') 552 | self.assertIsNone(d('#third').val()) 553 | d('#first').val('bacon') # Selecting non-existing option. 554 | self.assertEqual(d('#first').val(), 'spam') 555 | # Value set based on option order, not value order 556 | d('#second').val(['bacon', 'eggs']) 557 | self.assertEqual(d('#second').val(), 'eggs') 558 | d('#fourth').val(['spam']) 559 | self.assertEqual(d('#fourth').val(), 'spam') 560 | # Sets first option with matching value 561 | self.assertEqual(d('#fourth option[selected]').length, 1) 562 | self.assertEqual(d('#fourth option[selected]').text(), 'Spam') 563 | 564 | def test_val_for_select_multiple(self): 565 | d = pq(self.html6) 566 | self.assertEqual(d('#first').val(), ['spam', 'eggs']) 567 | # Selecting non-existing option. 
568 | d('#first').val(['eggs', 'sausage', 'bacon']) 569 | self.assertEqual(d('#first').val(), ['eggs', 'bacon']) 570 | self.assertEqual(d('#second').val(), []) 571 | d('#second').val('eggs') 572 | self.assertEqual(d('#second').val(), ['eggs']) 573 | d('#second').val(['not spam', 'not eggs']) 574 | self.assertEqual(d('#second').val(), []) 575 | d('#third').val(['spam']) 576 | self.assertEqual(d('#third').val(), ['spam', 'spam', 'spam']) 577 | 578 | def test_val_for_input_and_textarea_given_array_value(self): 579 | d = pq('') 580 | d('input').val(['spam', 'eggs']) 581 | self.assertEqual(d('input').val(), 'spam,eggs') 582 | d = pq('') 583 | d('textarea').val(['spam', 'eggs']) 584 | self.assertEqual(d('textarea').val(), 'spam,eggs') 585 | 586 | def test_val_for_multiple_elements(self): 587 | d = pq(self.html5) 588 | # "Get" returns *first* value. 589 | self.assertEqual(d('div > *').val(), 'spam') 590 | # "Set" updates *every* value. 591 | d('div > *').val('42') 592 | self.assertEqual(d('#first').val(), '42') 593 | self.assertEqual(d('#second').val(), '42') 594 | self.assertEqual(d('#third').val(), '42') 595 | 596 | def test_val_checkbox_no_value_attribute(self): 597 | d = pq('') 598 | self.assertEqual(d.val(), 'on') 599 | d = pq('') 600 | self.assertEqual(d.val(), '') 601 | 602 | def test_val_radio_no_value_attribute(self): 603 | d = pq('') 604 | self.assertEqual(d.val(), 'on') 605 | 606 | def test_val_value_is_empty_string(self): 607 | d = pq('') 608 | self.assertEqual(d.val(), '') 609 | 610 | def test_val_input_has_no_value_attr(self): 611 | d = pq('') 612 | self.assertEqual(d.val(), '') 613 | 614 | def test_html_replacement(self): 615 | html = '
Not MeReplace MeNot Me
' 616 | replacement = 'New Contents New' 617 | expected = html.replace('Replace Me', replacement) 618 | 619 | d = pq(html) 620 | d.find('span').html(replacement) 621 | 622 | new_html = d.outerHtml() 623 | self.assertEqual(new_html, expected) 624 | self.assertIn(replacement, new_html) 625 | 626 | def test_html_escape(self): 627 | inner_html = 'encoded <script> tag with "quotes".' \ 628 | 'nested <tag>' 629 | html = '
' + inner_html + '
' 630 | d = pq(html) 631 | self.assertEqual(d.html(), inner_html) 632 | 633 | 634 | class TestAjax(TestCase): 635 | 636 | html = ''' 637 |
638 | 639 |
640 |
641 |
642 | 643 | 644 | 645 |
646 |
647 | 648 |
649 |
650 | 651 | 652 |
653 | ''' 654 | 655 | html2 = ''' 656 |
657 | 658 |
659 | 660 | 661 |
662 |
663 |
664 |
665 | 666 | 667 |
668 | ''' 669 | 670 | html3 = ''' 671 |
672 | 673 | 674 |
675 | 676 |
677 | 678 | 679 | 680 | 681 | 682 | 683 | 684 | 685 | 686 |
687 | ''' 688 | 689 | html4 = ''' 690 |
691 | 693 | 699 | 702 |
703 | ''' 704 | 705 | def test_serialize_pairs_form_id(self): 706 | d = pq(self.html) 707 | self.assertEqual(d('#div').serialize_pairs(), []) 708 | self.assertEqual(d('#dispersed').serialize_pairs(), [ 709 | ('order', 'spam'), ('order', 'eggs'), ('order', 'ham'), 710 | ('order', 'tomato'), ('order', 'baked beans'), 711 | ]) 712 | self.assertEqual(d('.no-id').serialize_pairs(), [ 713 | ('spam', 'Spam'), 714 | ]) 715 | 716 | def test_serialize_pairs_form_controls(self): 717 | d = pq(self.html2) 718 | self.assertEqual(d('fieldset').serialize_pairs(), [ 719 | ('fieldset', 'eggs'), ('fieldset', 'ham'), 720 | ]) 721 | self.assertEqual(d('#input, fieldset, #first').serialize_pairs(), [ 722 | ('order', 'spam'), ('fieldset', 'eggs'), ('fieldset', 'ham'), 723 | ('fieldset', 'eggs'), ('fieldset', 'ham'), ('fieldset', 'ham'), 724 | ]) 725 | self.assertEqual(d('#datalist').serialize_pairs(), [ 726 | ('datalist', 'eggs'), ('checkbox', 'on'), ('radio', 'on'), 727 | ]) 728 | 729 | def test_serialize_pairs_filter_controls(self): 730 | d = pq(self.html3) 731 | self.assertEqual(d('form').serialize_pairs(), [ 732 | ('order', 'spam') 733 | ]) 734 | 735 | def test_serialize_pairs_form_values(self): 736 | d = pq(self.html4) 737 | self.assertEqual(d('form').serialize_pairs(), [ 738 | ('spam', 'Spam/spam'), ('order', 'baked\r\nbeans'), 739 | ('order', 'tomato'), ('multiline', 'multiple\r\nlines\r\nof text'), 740 | ]) 741 | 742 | def test_serialize_array(self): 743 | d = pq(self.html4) 744 | self.assertEqual(d('form').serialize_array(), [ 745 | {'name': 'spam', 'value': 'Spam/spam'}, 746 | {'name': 'order', 'value': 'baked\r\nbeans'}, 747 | {'name': 'order', 'value': 'tomato'}, 748 | {'name': 'multiline', 'value': 'multiple\r\nlines\r\nof text'}, 749 | ]) 750 | 751 | def test_serialize(self): 752 | d = pq(self.html4) 753 | self.assertEqual( 754 | d('form').serialize(), 755 | 'spam=Spam%2Fspam&order=baked%0D%0Abeans&order=tomato&' 756 | 'multiline=multiple%0D%0Alines%0D%0Aof%20text' 757 | ) 758 | 759 | def test_serialize_dict(self): 760 | d = pq(self.html4) 761 | self.assertEqual(d('form').serialize_dict(), { 762 | 'spam': 'Spam/spam', 763 | 'order': ['baked\r\nbeans', 'tomato'], 764 | 'multiline': 'multiple\r\nlines\r\nof text', 765 | }) 766 | 767 | 768 | class TestMakeLinks(TestCase): 769 | 770 | html = ''' 771 | 772 |
773 | with href 774 | without href 775 |
776 | 777 | ''' 778 | 779 | def test_make_link(self): 780 | d = pq(self.html, parser='xml') 781 | d.make_links_absolute(base_url='http://example.com') 782 | self.assertTrue(len(d('a[href]')), 1) 783 | self.assertEqual(d('a[href]').attr('href'), 784 | 'http://example.com/path_info') 785 | 786 | 787 | class TestHTMLParser(TestCase): 788 | xml = "
I'm valid XML
" 789 | html = '''
790 | TestimageMy link text 791 | imageMy link text 2 792 | Behind you, a three-headed HTML‐Entity! 793 |
''' 794 | 795 | def test_parser_persistance(self): 796 | d = pq(self.xml, parser='xml') 797 | self.assertRaises(etree.XMLSyntaxError, lambda: d.after(self.html)) 798 | d = pq(self.xml, parser='html') 799 | d.after(self.html) # this should not fail 800 | 801 | def test_replaceWith(self): 802 | expected = '''
803 | TestimageMy link text 804 | imageMy link text 2 805 | Behind you, a three-headed HTML&dash;Entity! 806 |
''' 807 | d = pq(self.html) 808 | d('img').replace_with('image') 809 | val = d.__html__() 810 | assert val == expected, (repr(val), repr(expected)) 811 | 812 | def test_replaceWith_with_function(self): 813 | expected = '''
814 | TestimageMy link text 815 | imageMy link text 2 816 | Behind you, a three-headed HTML&dash;Entity! 817 |
''' 818 | d = pq(self.html) 819 | d('a').replace_with(lambda i, e: pq(e).html()) 820 | val = d.__html__() 821 | assert val == expected, (repr(val), repr(expected)) 822 | 823 | 824 | class TestXMLNamespace(TestCase): 825 | xml = ''' 826 | 827 | What 828 | 123 829 | 830 | 831 | 832 | ''' 833 | 834 | xhtml = ''' 835 | 836 | 837 |
What
838 | 839 | ''' 840 | 841 | namespaces = {'bar': 'http://example.com/bar', 842 | 'baz': 'http://example.com/baz'} 843 | 844 | def test_selector(self): 845 | expected = 'What' 846 | d = pq(self.xml.encode('utf8'), parser='xml') 847 | val = d('bar|blah', 848 | namespaces=self.namespaces).text() 849 | self.assertEqual(repr(val), repr(expected)) 850 | 851 | def test_selector_with_xml(self): 852 | expected = 'What' 853 | d = pq('bar|blah', self.xml.encode('utf8'), parser='xml', 854 | namespaces=self.namespaces) 855 | val = d.text() 856 | self.assertEqual(repr(val), repr(expected)) 857 | 858 | def test_xhtml_namespace(self): 859 | expected = 'What' 860 | d = pq(self.xhtml.encode('utf8'), parser='xml') 861 | d.xhtml_to_html() 862 | val = d('div').text() 863 | self.assertEqual(repr(val), repr(expected)) 864 | 865 | def test_xhtml_namespace_html_parser(self): 866 | expected = 'What' 867 | d = pq(self.xhtml, parser='html') 868 | d.xhtml_to_html() 869 | val = d('div').text() 870 | self.assertEqual(repr(val), repr(expected)) 871 | 872 | def test_remove_namespaces(self): 873 | expected = 'What' 874 | d = pq(self.xml.encode('utf8'), parser='xml').remove_namespaces() 875 | val = d('blah').text() 876 | self.assertEqual(repr(val), repr(expected)) 877 | 878 | def test_persistent_namespaces(self): 879 | d = pq(self.xml.encode('utf8'), parser='xml', 880 | namespaces=self.namespaces) 881 | val = d('bar|blah').text() 882 | self.assertEqual(repr(val), repr('What')) 883 | 884 | def test_namespace_traversal(self): 885 | d = pq(self.xml.encode('utf8'), parser='xml', 886 | namespaces=self.namespaces) 887 | val = d('baz|subbaz').closest('baz|baz').attr('a') 888 | self.assertEqual(repr(val), repr('b')) 889 | 890 | 891 | class TestWebScrapping(TestCase): 892 | 893 | def setUp(self): 894 | self.s = http.StopableWSGIServer.create(debug_app) 895 | self.s.wait() 896 | self.application_url = self.s.application_url.rstrip('/') 897 | 898 | def test_get(self): 899 | d = pq(url=self.application_url, data={'q': 'foo'}, 900 | method='get') 901 | print(d) 902 | self.assertIn('REQUEST_METHOD: GET', d('p').text()) 903 | self.assertIn('q=foo', d('p').text()) 904 | 905 | def test_post(self): 906 | d = pq(url=self.application_url, data={'q': 'foo'}, 907 | method='post') 908 | self.assertIn('REQUEST_METHOD: POST', d('p').text()) 909 | self.assertIn('q=foo', d('p').text()) 910 | 911 | def test_session(self): 912 | if HAS_REQUEST: 913 | import requests 914 | session = requests.Session() 915 | session.headers.update({'X-FOO': 'bar'}) 916 | d = pq(url=self.application_url, data={'q': 'foo'}, 917 | method='get', session=session) 918 | self.assertIn('HTTP_X_FOO: bar', d('p').text()) 919 | else: 920 | self.skipTest('no requests library') 921 | 922 | def tearDown(self): 923 | self.s.shutdown() 924 | 925 | 926 | class TestWebScrappingEncoding(TestCase): 927 | 928 | def test_get(self): 929 | d = pq(url='http://ru.wikipedia.org/wiki/Заглавная_страница', 930 | method='get') 931 | print(d) 932 | self.assertEqual(d('#pt-login').text(), 'Войти') 933 | 934 | 935 | class TestWebScrappingTimeouts(TestCase): 936 | 937 | def setUp(self): 938 | def app(environ, start_response): 939 | start_response('200 OK', [('Content-Type', 'text/plain')]) 940 | time.sleep(2) 941 | return [b'foobar\n'] 942 | self.s = http.StopableWSGIServer.create(app) 943 | self.s.wait() 944 | self.application_url = self.s.application_url.rstrip('/') 945 | 946 | def test_get(self): 947 | pq(url=self.application_url) 948 | with self.assertRaises(Exception): 949 | 
pq(url=self.application_url, timeout=1) 950 | 951 | def tearDown(self): 952 | self.s.shutdown() 953 | -------------------------------------------------------------------------------- /pyquery/pyquery.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2008 - Olivier Lauzanne 2 | # 3 | # Distributed under the BSD license, see LICENSE.txt 4 | from .cssselectpatch import JQueryTranslator 5 | from reprlib import recursive_repr 6 | from urllib.parse import urlencode 7 | from urllib.parse import urljoin 8 | from .openers import url_opener 9 | from .text import extract_text 10 | from copy import deepcopy 11 | from html import escape 12 | from lxml import etree 13 | import lxml.html 14 | import inspect 15 | import itertools 16 | import types 17 | import sys 18 | 19 | if sys.version_info >= (3, 12, 0): 20 | from collections import OrderedDict 21 | else: 22 | # backward compat. to be able to run doctest with 3.7+. see: 23 | # https://github.com/gawel/pyquery/issues/249 24 | # and: 25 | # https://github.com/python/cpython/blob/3.12/Lib/collections/__init__.py#L272 26 | from collections import OrderedDict as BaseOrderedDict 27 | 28 | class OrderedDict(BaseOrderedDict): 29 | @recursive_repr() 30 | def __repr__(self): 31 | 'od.__repr__() <==> repr(od)' 32 | if not self: 33 | return '%s()' % (self.__class__.__name__,) 34 | return '%s(%r)' % (self.__class__.__name__, dict(self.items())) 35 | 36 | basestring = (str, bytes) 37 | 38 | 39 | def getargspec(func): 40 | args = inspect.signature(func).parameters.values() 41 | return [p.name for p in args 42 | if p.kind == p.POSITIONAL_OR_KEYWORD] 43 | 44 | 45 | def with_camel_case_alias(func): 46 | """decorator for methods who required a camelcase alias""" 47 | _camel_case_aliases.add(func.__name__) 48 | return func 49 | 50 | 51 | _camel_case_aliases = set() 52 | 53 | 54 | def build_camel_case_aliases(PyQuery): 55 | """add camelcase aliases to PyQuery""" 56 | for alias in _camel_case_aliases: 57 | parts = list(alias.split('_')) 58 | name = parts[0] + ''.join([p.title() for p in parts[1:]]) 59 | func = getattr(PyQuery, alias) 60 | f = types.FunctionType(func.__code__, func.__globals__, 61 | name, func.__defaults__) 62 | f.__doc__ = ( 63 | 'Alias for :func:`~pyquery.pyquery.PyQuery.%s`') % func.__name__ 64 | setattr(PyQuery, name, f.__get__(None, PyQuery)) 65 | 66 | 67 | def fromstring(context, parser=None, custom_parser=None): 68 | """use html parser if we don't have clean xml 69 | """ 70 | if hasattr(context, 'read') and hasattr(context.read, '__call__'): 71 | meth = 'parse' 72 | else: 73 | meth = 'fromstring' 74 | if custom_parser is None: 75 | if parser is None: 76 | try: 77 | result = getattr(etree, meth)(context) 78 | except etree.XMLSyntaxError: 79 | if hasattr(context, 'seek'): 80 | context.seek(0) 81 | result = getattr(lxml.html, meth)(context) 82 | if isinstance(result, etree._ElementTree): 83 | return [result.getroot()] 84 | else: 85 | return [result] 86 | elif parser == 'xml': 87 | custom_parser = getattr(etree, meth) 88 | elif parser == 'html': 89 | custom_parser = getattr(lxml.html, meth) 90 | elif parser == 'html5': 91 | from lxml.html import html5parser 92 | custom_parser = getattr(html5parser, meth) 93 | elif parser == 'soup': 94 | from lxml.html import soupparser 95 | custom_parser = getattr(soupparser, meth) 96 | elif parser == 'html_fragments': 97 | custom_parser = lxml.html.fragments_fromstring 98 | else: 99 | raise ValueError('No such parser: "%s"' % parser) 100 | 101 | result = 
custom_parser(context) 102 | if isinstance(result, list): 103 | return result 104 | elif isinstance(result, etree._ElementTree): 105 | return [result.getroot()] 106 | elif result is not None: 107 | return [result] 108 | else: 109 | return [] 110 | 111 | 112 | def callback(func, *args): 113 | return func(*args[:func.__code__.co_argcount]) 114 | 115 | 116 | class NoDefault(object): 117 | def __repr__(self): 118 | """clean representation in Sphinx""" 119 | return '' 120 | 121 | 122 | no_default = NoDefault() 123 | del NoDefault 124 | 125 | 126 | class FlexibleElement(object): 127 | """property to allow a flexible api""" 128 | def __init__(self, pget, pset=no_default, pdel=no_default): 129 | self.pget = pget 130 | self.pset = pset 131 | self.pdel = pdel 132 | 133 | def __get__(self, instance, klass): 134 | class _element(object): 135 | """real element to support set/get/del attr and item and js call 136 | style""" 137 | def __call__(prop, *args, **kwargs): 138 | return self.pget(instance, *args, **kwargs) 139 | __getattr__ = __getitem__ = __setattr__ = __setitem__ = __call__ 140 | 141 | def __delitem__(prop, name): 142 | if self.pdel is not no_default: 143 | return self.pdel(instance, name) 144 | else: 145 | raise NotImplementedError() 146 | __delattr__ = __delitem__ 147 | 148 | def __repr__(prop): 149 | return '' % self.pget.__name__ 150 | return _element() 151 | 152 | def __set__(self, instance, value): 153 | if self.pset is not no_default: 154 | self.pset(instance, value) 155 | else: 156 | raise NotImplementedError() 157 | 158 | 159 | class PyQuery(list): 160 | """The main class 161 | """ 162 | 163 | _translator_class = JQueryTranslator 164 | 165 | def __init__(self, *args, **kwargs): 166 | html = None 167 | elements = [] 168 | self._base_url = None 169 | self.parser = kwargs.pop('parser', None) 170 | 171 | if 'parent' in kwargs: 172 | self._parent = kwargs.pop('parent') 173 | else: 174 | self._parent = no_default 175 | 176 | if 'css_translator' in kwargs: 177 | self._translator = kwargs.pop('css_translator') 178 | elif self.parser in ('xml',): 179 | self._translator = self._translator_class(xhtml=True) 180 | elif self._parent is not no_default: 181 | self._translator = self._parent._translator 182 | else: 183 | self._translator = self._translator_class(xhtml=False) 184 | 185 | self.namespaces = kwargs.pop('namespaces', None) 186 | 187 | if kwargs: 188 | # specific case to get the dom 189 | if 'filename' in kwargs: 190 | html = open(kwargs['filename'], 191 | encoding=kwargs.get('encoding')) 192 | elif 'url' in kwargs: 193 | url = kwargs.pop('url') 194 | if 'opener' in kwargs: 195 | opener = kwargs.pop('opener') 196 | html = opener(url, **kwargs) 197 | else: 198 | html = url_opener(url, kwargs) 199 | if not self.parser: 200 | self.parser = 'html' 201 | self._base_url = url 202 | else: 203 | raise ValueError('Invalid keyword arguments %s' % kwargs) 204 | 205 | elements = fromstring(html, self.parser) 206 | # close open descriptor if possible 207 | if hasattr(html, 'close'): 208 | try: 209 | html.close() 210 | except Exception: 211 | pass 212 | 213 | else: 214 | # get nodes 215 | 216 | # determine context and selector if any 217 | selector = context = no_default 218 | length = len(args) 219 | if length == 1: 220 | context = args[0] 221 | elif length == 2: 222 | selector, context = args 223 | else: 224 | raise ValueError( 225 | "You can't do that. 
Please, provide arguments") 226 | 227 | # get context 228 | if isinstance(context, basestring): 229 | try: 230 | elements = fromstring(context, self.parser) 231 | except Exception: 232 | raise 233 | elif isinstance(context, self.__class__): 234 | # copy 235 | elements = context[:] 236 | elif isinstance(context, list): 237 | elements = context 238 | elif isinstance(context, etree._Element): 239 | elements = [context] 240 | else: 241 | raise TypeError(context) 242 | 243 | # select nodes 244 | if elements and selector is not no_default: 245 | xpath = self._css_to_xpath(selector) 246 | results = [] 247 | for tag in elements: 248 | results.extend( 249 | tag.xpath(xpath, namespaces=self.namespaces)) 250 | elements = results 251 | 252 | list.__init__(self, elements) 253 | 254 | def _css_to_xpath(self, selector, prefix='descendant-or-self::'): 255 | selector = selector.replace('[@', '[') 256 | return self._translator.css_to_xpath(selector, prefix) 257 | 258 | def _copy(self, *args, **kwargs): 259 | kwargs.setdefault('namespaces', self.namespaces) 260 | return self.__class__(*args, **kwargs) 261 | 262 | def __call__(self, *args, **kwargs): 263 | """return a new PyQuery instance 264 | """ 265 | length = len(args) 266 | if length == 0: 267 | raise ValueError('You must provide at least a selector') 268 | if args[0] == '': 269 | return self._copy([]) 270 | if (len(args) == 1 and 271 | isinstance(args[0], str) and 272 | not args[0].startswith('<')): 273 | args += (self,) 274 | result = self._copy(*args, parent=self, **kwargs) 275 | return result 276 | 277 | # keep original list api prefixed with _ 278 | _append = list.append 279 | _extend = list.extend 280 | 281 | # improve pythonic api 282 | def __add__(self, other): 283 | assert isinstance(other, self.__class__) 284 | return self._copy(self[:] + other[:]) 285 | 286 | def extend(self, other): 287 | """Extend with another PyQuery object""" 288 | assert isinstance(other, self.__class__) 289 | self._extend(other[:]) 290 | return self 291 | 292 | def items(self, selector=None): 293 | """Iter over elements. Return PyQuery objects: 294 | 295 | >>> d = PyQuery('
292 |     def items(self, selector=None):
293 |         """Iter over elements. Return PyQuery objects:
294 | 
295 |         >>> d = PyQuery('<div><span>foo</span><span>bar</span></div>')
296 |         >>> [i.text() for i in d.items('span')]
297 |         ['foo', 'bar']
298 |         >>> [i.text() for i in d('span').items()]
299 |         ['foo', 'bar']
300 |         >>> list(d.items('a')) == list(d('a').items())
301 |         True
302 |         """
303 |         if selector:
304 |             elems = self(selector) or []
305 |         else:
306 |             elems = self
307 |         for elem in elems:
308 |             yield self._copy(elem, parent=self)
309 | 
310 |     def xhtml_to_html(self):
311 |         """Remove xhtml namespace:
312 | 
313 |         >>> doc = PyQuery(
314 |         ...     '<html xmlns="http://www.w3.org/1999/xhtml"></html>')
315 |         >>> doc
316 |         [<{http://www.w3.org/1999/xhtml}html>]
317 |         >>> doc.xhtml_to_html()
318 |         [<html>]
319 |         """
320 |         try:
321 |             root = self[0].getroottree()
322 |         except IndexError:
323 |             pass
324 |         else:
325 |             lxml.html.xhtml_to_html(root)
326 |         return self
327 | 
328 |     def remove_namespaces(self):
329 |         """Remove all namespaces:
330 | 
331 |         >>> doc = PyQuery('<foo xmlns="http://example.com/foo"></foo>')
332 |         >>> doc
333 |         [<{http://example.com/foo}foo>]
334 |         >>> doc.remove_namespaces()
335 |         [<foo>]
336 |         """
337 |         try:
338 |             root = self[0].getroottree()
339 |         except IndexError:
340 |             pass
341 |         else:
342 |             for el in root.iter('{*}*'):
343 |                 if el.tag.startswith('{'):
344 |                     el.tag = el.tag.split('}', 1)[1]
345 |         return self
346 | 
347 |     def __str__(self):
348 |         """xml representation of current nodes::
349 | 
350 |             >>> xml = PyQuery(
351 |             ...   '<script><![[CDATA[ ]></script>', parser='html_fragments')
352 |             >>> print(str(xml))
353 |             <script>&lt;![[CDATA[ ]&gt;</script>
354 | 
355 |         """
356 |         return ''.join([etree.tostring(e, encoding=str) for e in self])
357 | 
358 |     def __unicode__(self):
359 |         """xml representation of current nodes"""
360 |         return u''.join([etree.tostring(e, encoding=str)
361 |                          for e in self])
362 | 
363 |     def __html__(self):
364 |         """html representation of current nodes::
365 | 
366 |             >>> html = PyQuery(
367 |             ...   '<script><![[CDATA[ ]></script>', parser='html_fragments')
368 |             >>> print(html.__html__())
369 |             <script><![[CDATA[ ]></script>
370 | 
371 |         """
372 |         return u''.join([lxml.html.tostring(e, encoding=str)
373 |                          for e in self])
374 | 
375 |     def __repr__(self):
376 |         r = []
377 |         try:
378 |             for el in self:
379 |                 c = el.get('class')
380 |                 c = c and '.' + '.'.join(c.split(' ')) or ''
381 |                 id = el.get('id')
382 |                 id = id and '#' + id or ''
383 |                 r.append('<%s%s%s>' % (el.tag, id, c))
384 |             return '[' + (', '.join(r)) + ']'
385 |         except AttributeError:
386 |             return list.__repr__(self)
387 | 
388 |     @property
389 |     def root(self):
390 |         """return the xml root element
391 |         """
392 |         if self._parent is not no_default:
393 |             return self._parent[0].getroottree()
394 |         return self[0].getroottree()
395 | 
396 |     @property
397 |     def encoding(self):
398 |         """return the xml encoding of the root element
399 |         """
400 |         root = self.root
401 |         if root is not None:
402 |             return self.root.docinfo.encoding
403 | 
404 |     ##############
405 |     # Traversing #
406 |     ##############
407 | 
408 |     def _filter_only(self, selector, elements, reverse=False, unique=False):
409 |         """Filters the selection set only, as opposed to also including
410 |         descendants.
411 |         """
412 |         if selector is None:
413 |             results = elements
414 |         else:
415 |             xpath = self._css_to_xpath(selector, 'self::')
416 |             results = []
417 |             for tag in elements:
418 |                 results.extend(tag.xpath(xpath, namespaces=self.namespaces))
419 |         if reverse:
420 |             results.reverse()
421 |         if unique:
422 |             result_list = results
423 |             results = []
424 |             for item in result_list:
425 |                 if item not in results:
426 |                     results.append(item)
427 |         return self._copy(results, parent=self)
428 | 
429 |     def parent(self, selector=None):
430 |         return self._filter_only(
431 |             selector,
432 |             [e.getparent() for e in self if e.getparent() is not None],
433 |             unique=True)
434 | 
435 |     def prev(self, selector=None):
436 |         return self._filter_only(
437 |             selector,
438 |             [e.getprevious() for e in self if e.getprevious() is not None])
439 | 
440 |     def next(self, selector=None):
441 |         return self._filter_only(
442 |             selector,
443 |             [e.getnext() for e in self if e.getnext() is not None])
444 | 
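    # ``parent``, ``prev`` and ``next`` above all defer to ``_filter_only``,
    # so each accepts the same optional CSS selector as the other traversal
    # helpers. A minimal sketch (markup invented for illustration):
    #
    #     >>> d = PyQuery('<div><p class="a">x</p><p class="b">y</p></div>')
    #     >>> d('.b').prev()
    #     [<p.a>]
    #     >>> d('.b').parent('div')
    #     [<div>]
    #     >>> d('.b').next()
    #     []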

445 |     def _traverse(self, method):
446 |         for e in self:
447 |             current = getattr(e, method)()
448 |             while current is not None:
449 |                 yield current
450 |                 current = getattr(current, method)()
451 | 
452 |     def _traverse_parent_topdown(self):
453 |         for e in self:
454 |             this_list = []
455 |             current = e.getparent()
456 |             while current is not None:
457 |                 this_list.append(current)
458 |                 current = current.getparent()
459 |             this_list.reverse()
460 |             for j in this_list:
461 |                 yield j
462 | 
463 |     def _next_all(self):
464 |         return [e for e in self._traverse('getnext')]
465 | 
466 |     @with_camel_case_alias
467 |     def next_all(self, selector=None):
468 |         """
469 |         >>> h = '<span><p class="hello">Hi</p><p>Bye</p><img scr=""/></span>'
470 |         >>> d = PyQuery(h)
471 |         >>> d('p:last').next_all()
472 |         [<img>]
473 |         >>> d('p:last').nextAll()
474 |         [<img>]
475 |         """
476 |         return self._filter_only(selector, self._next_all())
477 | 
478 |     @with_camel_case_alias
479 |     def next_until(self, selector, filter_=None):
480 |         """
481 |         >>> h = '''
482 |         ... <h2>Greeting 1</h2>
483 |         ... <p>Hello!</p><p>World!</p>
484 |         ... <h2>Greeting 2</h2><p>Bye!</p>
485 |         ... '''
486 |         >>> d = PyQuery(h)
487 |         >>> d('h2:first').nextUntil('h2')
488 |         [<p>, <p>]
489 |         """
490 |         return self._filter_only(
491 |             filter_, [
492 |                 e
493 |                 for q in itertools.takewhile(
494 |                     lambda q: not q.is_(selector), self.next_all().items())
495 |                 for e in q
496 |             ]
497 |         )
498 | 
499 |     def _prev_all(self):
500 |         return [e for e in self._traverse('getprevious')]
501 | 
502 |     @with_camel_case_alias
503 |     def prev_all(self, selector=None):
504 |         """
505 |         >>> h = '<span><p class="hello">Hi</p><p>Bye</p><img scr=""/></span>'
506 |         >>> d = PyQuery(h)
507 |         >>> d('p:last').prev_all()
508 |         [<p.hello>]
509 |         >>> d('p:last').prevAll()
510 |         [<p.hello>]
511 |         """
512 |         return self._filter_only(selector, self._prev_all(), reverse=True)
513 | 
514 |     def siblings(self, selector=None):
515 |         """
516 |         >>> h = '<span><p class="hello">Hi</p><p>Bye</p><img scr=""/></span>'
517 |         >>> d = PyQuery(h)
518 |         >>> d('.hello').siblings()
519 |         [<p>, <img>]
520 |         >>> d('.hello').siblings('img')
521 |         [<img>]
522 | 
523 |         """
524 |         return self._filter_only(selector, self._prev_all() + self._next_all())
525 | 
526 |     def parents(self, selector=None):
527 |         """
528 |         >>> d = PyQuery('<span><p class="hello">Hi</p><p>Bye</p></span>')
529 |         >>> d('p').parents()
530 |         [<span>]
531 |         >>> d('.hello').parents('span')
532 |         [<span>]
533 |         >>> d('.hello').parents('p')
534 |         []
535 |         """
536 |         return self._filter_only(
537 |             selector,
538 |             [e for e in self._traverse_parent_topdown()],
539 |             unique=True
540 |         )
541 | 
542 |     def children(self, selector=None):
543 |         """Filter elements that are direct children of self using optional
544 |         selector:
545 | 
546 |         >>> d = PyQuery('<span><p class="hello">Hi</p><p>Bye</p></span>')
547 |         >>> d
548 |         [<span>]
549 |         >>> d.children()
550 |         [<p.hello>, <p>]
551 |         >>> d.children('.hello')
552 |         [<p.hello>]
553 |         """
554 |         elements = [child for tag in self for child in tag.getchildren()]
555 |         return self._filter_only(selector, elements)
556 | 
557 |     def closest(self, selector=None):
558 |         """
559 |         >>> d = PyQuery(
560 |         ...  '<div class="hello"><p>This is a '
561 |         ...  '<strong class="hello">test</strong></p></div>')
562 |         >>> d('strong').closest('div')
563 |         [<div.hello>]
564 |         >>> d('strong').closest('.hello')
565 |         [<strong.hello>]
566 |         >>> d('strong').closest('form')
567 |         []
568 |         """
569 |         result = []
570 |         for current in self:
571 |             while (current is not None and
572 |                     not self._copy(current).is_(selector)):
573 |                 current = current.getparent()
574 |             if current is not None:
575 |                 result.append(current)
576 |         return self._copy(result, parent=self)
577 | 
578 |     def contents(self):
579 |         """
580 |         Return contents (with text nodes):
581 | 
582 |         >>> d = PyQuery('hello <b>bold</b>')
583 |         >>> d.contents()  # doctest: +ELLIPSIS
584 |         ['hello ', <Element b at ...>]
585 |         """
586 |         results = []
587 |         for elem in self:
588 |             results.extend(elem.xpath('child::text()|child::*',
589 |                                       namespaces=self.namespaces))
590 |         return self._copy(results, parent=self)
591 | 
592 |     def filter(self, selector):
593 |         """Filter elements in self using selector (string or function):
594 | 
595 |         >>> d = PyQuery('<p class="hello">Hi</p><p>Bye</p>')
596 |         >>> d('p')
597 |         [<p.hello>, <p>]
598 |         >>> d('p').filter('.hello')
599 |         [<p.hello>]
600 |         >>> d('p').filter(lambda i: i == 1)
601 |         [<p>]
602 |         >>> d('p').filter(lambda i: PyQuery(this).text() == 'Hi')
603 |         [<p.hello>]
604 |         >>> d('p').filter(lambda i, this: PyQuery(this).text() == 'Hi')
605 |         [<p.hello>]
606 |         """
607 |         if not hasattr(selector, '__call__'):
608 |             return self._filter_only(selector, self)
609 |         else:
610 |             elements = []
611 |             args = getargspec(callback)
612 |             try:
613 |                 for i, this in enumerate(self):
614 |                     if len(args) == 1:
615 |                         selector.__globals__['this'] = this
616 |                     if callback(selector, i, this):
617 |                         elements.append(this)
618 |             finally:
619 |                 f_globals = selector.__globals__
620 |                 if 'this' in f_globals:
621 |                     del f_globals['this']
622 |             return self._copy(elements, parent=self)
623 | 
624 |     def not_(self, selector):
625 |         """Return elements that don't match the given selector:
626 | 
627 |         >>> d = PyQuery('<p class="hello">Hi</p><p>Bye</p><div></div>')
628 |         >>> d('p').not_('.hello')
629 |         [<p>]
630 |         """
631 |         exclude = set(self._copy(selector, self))
632 |         return self._copy([e for e in self if e not in exclude],
633 |                           parent=self)
634 | 
635 |     def is_(self, selector):
636 |         """Returns True if selector matches at least one current element, else
637 |         False:
638 | 
639 |         >>> d = PyQuery('<p class="hello"><span>Hi</span></p><p>Bye</p>')
640 |         >>> d('p').eq(0).is_('.hello')
641 |         True
642 | 
643 |         >>> d('p').eq(0).is_('span')
644 |         False
645 | 
646 |         >>> d('p').eq(1).is_('.hello')
647 |         False
648 | 
649 |         ..
650 |         """
651 |         return bool(self._filter_only(selector, self))
652 | 
653 |     def find(self, selector):
654 |         """Find elements using selector traversing down from self:
655 | 
656 |         >>> m = '<p><span><em>Whoah!</em></span></p><p><em> there</em></p>'
657 |         >>> d = PyQuery(m)
658 |         >>> d('p').find('em')
659 |         [<em>, <em>]
660 |         >>> d('p').eq(1).find('em')
661 |         [<em>]
662 |         """
663 |         xpath = self._css_to_xpath(selector)
664 |         results = [child.xpath(xpath, namespaces=self.namespaces)
665 |                    for tag in self
666 |                    for child in tag.getchildren()]
667 |         # Flatten the results
668 |         elements = []
669 |         for r in results:
670 |             elements.extend(r)
671 |         return self._copy(elements, parent=self)
672 | 
673 |     def eq(self, index):
674 |         """Return PyQuery of only the element with the provided index::
675 | 
676 |             >>> d = PyQuery('<p class="hello">Hi</p><p>Bye</p><div></div>')
677 |             >>> d('p').eq(0)
678 |             [<p.hello>]
679 |             >>> d('p').eq(1)
680 |             [<p>]
681 |             >>> d('p').eq(2)
682 |             []
683 | 
684 |         ..
685 |         """
686 |         # Slicing will return empty list when index=-1
687 |         # we should handle out of bound by ourselves
688 |         try:
689 |             items = self[index]
690 |         except IndexError:
691 |             items = []
692 |         return self._copy(items, parent=self)
693 | 
694 |     def each(self, func):
695 |         """apply func on each nodes
696 |         """
697 |         try:
698 |             for i, element in enumerate(self):
699 |                 func.__globals__['this'] = element
700 |                 if callback(func, i, element) is False:
701 |                     break
702 |         finally:
703 |             f_globals = func.__globals__
704 |             if 'this' in f_globals:
705 |                 del f_globals['this']
706 |         return self
707 | 
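    # ``each`` walks the selection, exposing the node both as the second
    # callback argument and as the global ``this``; returning False stops the
    # iteration early, as in jQuery. A small sketch (markup invented):
    #
    #     >>> seen = []
    #     >>> d = PyQuery('<p>a</p><p>b</p>')
    #     >>> d('p').each(lambda i, e: seen.append((i, e.tag)))
    #     [<p>, <p>]
    #     >>> seen
    #     [(0, 'p'), (1, 'p')]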

708 |     def map(self, func):
709 |         """Returns a new PyQuery after transforming current items with func.
710 | 
711 |         func should take two arguments - 'index' and 'element'. Elements can
712 |         also be referred to as 'this' inside of func::
713 | 
714 |             >>> d = PyQuery('<p class="hello">Hi there</p><p>Bye</p><br />')
715 |             >>> d('p').map(lambda i, e: PyQuery(e).text())
716 |             ['Hi there', 'Bye']
717 | 
718 |             >>> d('p').map(lambda i, e: len(PyQuery(this).text()))
719 |             [8, 3]
720 | 
721 |             >>> d('p').map(lambda i, e: PyQuery(this).text().split())
722 |             ['Hi', 'there', 'Bye']
723 | 
724 |         """
725 |         items = []
726 |         try:
727 |             for i, element in enumerate(self):
728 |                 func.__globals__['this'] = element
729 |                 result = callback(func, i, element)
730 |                 if result is not None:
731 |                     if not isinstance(result, list):
732 |                         items.append(result)
733 |                     else:
734 |                         items.extend(result)
735 |         finally:
736 |             f_globals = func.__globals__
737 |             if 'this' in f_globals:
738 |                 del f_globals['this']
739 |         return self._copy(items, parent=self)
740 | 
741 |     @property
742 |     def length(self):
743 |         return len(self)
744 | 
745 |     def size(self):
746 |         return len(self)
747 | 
748 |     def end(self):
749 |         """Break out of a level of traversal and return to the parent level.
750 | 
751 |         >>> m = '<p><span><em>Whoah!</em></span></p><p><em> there</em></p>'
752 |         >>> d = PyQuery(m)
753 |         >>> d('p').eq(1).find('em').end().end()
754 |         [<p>, <p>]
755 |         """
756 |         return self._parent
757 | 
758 |     ##############
759 |     # Attributes #
760 |     ##############
761 |     def attr(self, *args, **kwargs):
762 |         """Attributes manipulation
763 |         """
764 | 
765 |         mapping = {'class_': 'class', 'for_': 'for'}
766 | 
767 |         attr = value = no_default
768 |         length = len(args)
769 |         if length == 1:
770 |             attr = args[0]
771 |             attr = mapping.get(attr, attr)
772 |         elif length == 2:
773 |             attr, value = args
774 |             attr = mapping.get(attr, attr)
775 |         elif kwargs:
776 |             attr = {}
777 |             for k, v in kwargs.items():
778 |                 attr[mapping.get(k, k)] = v
779 |         else:
780 |             raise ValueError('Invalid arguments %s %s' % (args, kwargs))
781 | 
782 |         if not self:
783 |             return None
784 |         elif isinstance(attr, dict):
785 |             for tag in self:
786 |                 for key, value in attr.items():
787 |                     tag.set(key, value)
788 |         elif value is no_default:
789 |             return self[0].get(attr)
790 |         elif value is None:
791 |             return self.remove_attr(attr)
792 |         else:
793 |             for tag in self:
794 |                 tag.set(attr, value)
795 |         return self
796 | 
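    # ``attr`` reads with one argument, writes with two or with keyword
    # arguments (``class_``/``for_`` map to ``class``/``for``), and a ``None``
    # value removes the attribute. A small sketch (markup invented):
    #
    #     >>> d = PyQuery('<a href="#"></a>')
    #     >>> d.attr('href')
    #     '#'
    #     >>> d.attr('title', 'home').attr('title')
    #     'home'
    #     >>> d.attr(class_='nav').attr('class')
    #     'nav'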

797 |     @with_camel_case_alias
798 |     def remove_attr(self, name):
799 |         """Remove an attribute::
800 | 
801 |             >>> d = PyQuery('<div id="myid"></div>')
802 |             >>> d.remove_attr('id')
803 |             [<div>]
804 |             >>> d.removeAttr('id')
805 |             [<div>]
806 | 
807 |         ..
808 |         """
809 |         for tag in self:
810 |             try:
811 |                 del tag.attrib[name]
812 |             except KeyError:
813 |                 pass
814 |         return self
815 | 
816 |     attr = FlexibleElement(pget=attr, pdel=remove_attr)
817 | 
818 |     #######
819 |     # CSS #
820 |     #######
821 |     def height(self, value=no_default):
822 |         """set/get height of element
823 |         """
824 |         return self.attr('height', value)
825 | 
826 |     def width(self, value=no_default):
827 |         """set/get width of element
828 |         """
829 |         return self.attr('width', value)
830 | 
831 |     @with_camel_case_alias
832 |     def has_class(self, name):
833 |         """Return True if element has class::
834 | 
835 |             >>> d = PyQuery('<div class="myclass"></div>')
836 |             >>> d.has_class('myclass')
837 |             True
838 |             >>> d.hasClass('myclass')
839 |             True
840 | 
841 |         ..
842 |         """
843 |         return self.is_('.%s' % name)
844 | 
845 |     @with_camel_case_alias
846 |     def add_class(self, value):
847 |         """Add a css class to elements::
848 | 
849 |             >>> d = PyQuery('<div></div>')
850 |             >>> d.add_class('myclass')
851 |             [<div.myclass>]
852 |             >>> d.addClass('myclass')
853 |             [<div.myclass>]
854 | 
855 |         ..
856 |         """
857 |         for tag in self:
858 |             values = value.split(' ')
859 |             classes = (tag.get('class') or '').split()
860 |             classes += [v for v in values if v not in classes]
861 |             tag.set('class', ' '.join(classes))
862 |         return self
863 | 
864 |     @with_camel_case_alias
865 |     def remove_class(self, value):
866 |         """Remove a css class to elements::
867 | 
868 |             >>> d = PyQuery('<div class="myclass"></div>')
869 |             >>> d.remove_class('myclass')
870 |             [<div>]
871 |             >>> d.removeClass('myclass')
872 |             [<div>]
873 | 
874 |         ..
875 |         """
876 |         for tag in self:
877 |             values = value.split(' ')
878 |             classes = set((tag.get('class') or '').split())
879 |             classes.difference_update(values)
880 |             classes.difference_update([''])
881 |             classes = ' '.join(classes)
882 |             if classes.strip():
883 |                 tag.set('class', classes)
884 |             elif tag.get('class'):
885 |                 tag.set('class', classes)
886 |         return self
887 | 
888 |     @with_camel_case_alias
889 |     def toggle_class(self, value):
890 |         """Toggle a css class to elements
891 | 
892 |             >>> d = PyQuery('<div></div>')
893 |             >>> d.toggle_class('myclass')
894 |             [<div.myclass>]
895 |             >>> d.toggleClass('myclass')
896 |             [<div>]
897 | 
898 |         """
899 |         for tag in self:
900 |             values = value.split(' ')
901 |             classes = (tag.get('class') or '').split()
902 |             values_to_add = [v for v in values if v not in classes]
903 |             values_to_del = [v for v in values if v in classes]
904 |             classes = [v for v in classes if v not in values_to_del]
905 |             classes += values_to_add
906 |             tag.set('class', ' '.join(classes))
907 |         return self
908 | 
909 |     def css(self, *args, **kwargs):
910 |         """css attributes manipulation
911 |         """
912 | 
913 |         attr = value = no_default
914 |         length = len(args)
915 |         if length == 1:
916 |             attr = args[0]
917 |         elif length == 2:
918 |             attr, value = args
919 |         elif kwargs:
920 |             attr = kwargs
921 |         else:
922 |             raise ValueError('Invalid arguments %s %s' % (args, kwargs))
923 | 
924 |         if isinstance(attr, dict):
925 |             for tag in self:
926 |                 stripped_keys = [key.strip().replace('_', '-')
927 |                                  for key in attr.keys()]
928 |                 current = [el.strip()
929 |                            for el in (tag.get('style') or '').split(';')
930 |                            if el.strip()
931 |                            and el.split(':')[0].strip() not in stripped_keys]
932 |                 for key, value in attr.items():
933 |                     key = key.replace('_', '-')
934 |                     current.append('%s: %s' % (key, value))
935 |                 tag.set('style', '; '.join(current))
936 |         elif isinstance(value, basestring):
937 |             attr = attr.replace('_', '-')
938 |             for tag in self:
939 |                 current = [
940 |                     el.strip()
941 |                     for el in (tag.get('style') or '').split(';')
942 |                     if (el.strip() and
943 |                         not el.split(':')[0].strip() == attr.strip())]
944 |                 current.append('%s: %s' % (attr, value))
945 |                 tag.set('style', '; '.join(current))
946 |         return self
947 | 
948 |     css = FlexibleElement(pget=css, pset=css)
949 | 
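    # ``css`` merges declarations into the ``style`` attribute, replacing an
    # existing declaration for the same property and normalising ``foo_bar``
    # keys to ``foo-bar``. A small sketch (markup invented for illustration):
    #
    #     >>> d = PyQuery('<div style="color: red"></div>')
    #     >>> d.css('font_weight', 'bold').attr('style')
    #     'color: red; font-weight: bold'
    #     >>> d.css(color='blue').attr('style')
    #     'font-weight: bold; color: blue'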
950 |     ###################
951 |     # CORE UI EFFECTS #
952 |     ###################
953 |     def hide(self):
954 |         """Add display:none to elements style:
955 | 
956 |         >>> print(PyQuery('<div style="display:none;"/>').hide())
957 |         <div style="display: none"/>
958 | 
959 |         """
960 |         return self.css('display', 'none')
961 | 
962 |     def show(self):
963 |         """Add display:block to elements style:
964 | 
965 |         >>> print(PyQuery('<div style="display:none;"/>').show())
966 |         <div style="display: block"/>
967 | 
968 |         """
969 |         return self.css('display', 'block')
970 | 
971 |     ########
972 |     # HTML #
973 |     ########
974 |     def val(self, value=no_default):
975 |         """Set the attribute value::
976 | 
977 |             >>> d = PyQuery('<input />')
978 |             >>> d.val('Youhou')
979 |             [<input>]
980 | 
981 |         Get the attribute value::
982 | 
983 |             >>> d.val()
984 |             'Youhou'
985 | 
986 |         Set the selected values for a `select` element with the `multiple`
987 |         attribute::
988 | 
989 |             >>> d = PyQuery('''
990 |             ... <select multiple>
991 |             ...     <option value="you"><option value="hou">
992 |             ... </select>
993 |             ... ''')
994 |             >>> d.val(['you', 'hou'])
995 |             [