├── tests ├── __init__.py ├── test.html ├── geckodriver.sh ├── invalid.xml ├── selenium.sh ├── test_browser.py ├── doctests.rst ├── apps.py ├── browser_base.py ├── test_real_browser.py └── test_pyquery.py ├── docs ├── changes.rst ├── api.rst ├── testing.rst ├── future.rst ├── conftest.py ├── attributes.rst ├── index.rst ├── scrap.rst ├── css.rst ├── traversing.rst ├── tips.rst ├── manipulating.rst ├── Makefile ├── pseudo_classes.rst └── conf.py ├── pyquery ├── __init__.py ├── openers.py ├── text.py ├── cssselectpatch.py └── pyquery.py ├── .hgignore ├── MANIFEST.in ├── pytest.ini ├── .gitignore ├── conftest.py ├── README_fixt.py ├── tox.ini ├── LICENSE.txt ├── .github └── workflows │ └── tox.yml ├── setup.py ├── README.rst └── CHANGES.rst /tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /docs/changes.rst: -------------------------------------------------------------------------------- 1 | News 2 | ===== 3 | 4 | .. include:: ../CHANGES.rst 5 | -------------------------------------------------------------------------------- /pyquery/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2008 - Olivier Lauzanne 2 | # 3 | # Distributed under the BSD license, see LICENSE.txt 4 | 5 | from .pyquery import PyQuery # NOQA 6 | -------------------------------------------------------------------------------- /tests/test.html: -------------------------------------------------------------------------------- 1 | 2 | 3 |

Hello world !

4 | 5 |

hello python !

6 | 7 | 8 | -------------------------------------------------------------------------------- /.hgignore: -------------------------------------------------------------------------------- 1 | # use glob syntax. 2 | syntax: glob 3 | 4 | develop-eggs/ 5 | bin/ 6 | dist/ 7 | build/ 8 | parts/ 9 | docs/_build/ 10 | .tox/ 11 | .installed.cfg 12 | *.egg-info 13 | *.pyc 14 | *.swp 15 | *~ 16 | -------------------------------------------------------------------------------- /tests/geckodriver.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | driver="https://github.com/mozilla/geckodriver/releases/download/v0.26.0/geckodriver-v0.26.0-linux64.tar.gz" 4 | 5 | [ -f geckodriver ] || wget -cqO- $driver | tar xvzf - 6 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | graft docs 2 | prune docs/_build 3 | graft pyquery 4 | graft tests 5 | include *.py 6 | include *.txt 7 | include *_fixt.py *.rst *.cfg *.ini 8 | global-exclude *.pyc 9 | global-exclude __pycache__ 10 | -------------------------------------------------------------------------------- /tests/invalid.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 |

Hello world !

4 | 5 |

6 | hello python ! 7 |

8 | 9 |

10 | 11 | 12 | -------------------------------------------------------------------------------- /tests/selenium.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # script to run selenium tests 3 | 4 | # get geckodriver 5 | ./tests/geckodriver.sh 6 | 7 | # run tox with py3.7 8 | MOZ_HEADLESS=1 PATH=$PATH:$PWD tox -e py37 tests/test_real_browser.py 9 | -------------------------------------------------------------------------------- /pytest.ini: -------------------------------------------------------------------------------- 1 | 2 | [pytest] 3 | filterwarnings = 4 | ignore::DeprecationWarning 5 | doctest_optionflags = ELLIPSIS NORMALIZE_WHITESPACE IGNORE_EXCEPTION_DETAIL 6 | addopts = --doctest-modules --doctest-glob="*.rst" --ignore=docs/conf.py 7 | -------------------------------------------------------------------------------- /docs/api.rst: -------------------------------------------------------------------------------- 1 | ================================================ 2 | :mod:`~pyquery.pyquery` -- PyQuery complete API 3 | ================================================ 4 | 5 | .. automodule:: pyquery.pyquery 6 | 7 | .. autoclass:: PyQuery 8 | :members: 9 | 10 | 11 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | 5 | # Distribution / packaging 6 | develop-eggs/ 7 | bin/ 8 | dist/ 9 | build/ 10 | parts/ 11 | .tox/ 12 | .installed.cfg 13 | *.egg-info 14 | *.swp 15 | 16 | # Temporary files 17 | *~ 18 | geckodriver 19 | 20 | # Log files 21 | geckodriver.log 22 | 23 | # Sphinx documentation 24 | docs/_build/ 25 | -------------------------------------------------------------------------------- /docs/testing.rst: -------------------------------------------------------------------------------- 1 | Testing 2 | ------- 3 | 4 | If you want to run the tests that you can see above you should do:: 5 | 6 | $ git clone git://github.com/gawel/pyquery.git 7 | $ cd pyquery 8 | $ python bootstrap.py 9 | $ bin/buildout install tox 10 | $ bin/tox 11 | 12 | You can build the Sphinx documentation by doing:: 13 | 14 | $ cd docs 15 | $ make html 16 | -------------------------------------------------------------------------------- /docs/future.rst: -------------------------------------------------------------------------------- 1 | Future 2 | ------- 3 | 4 | - SELECTORS: done 5 | 6 | - ATTRIBUTES: done 7 | 8 | - CSS: done 9 | 10 | - HTML: done 11 | 12 | - MANIPULATING: missing the wrapInner method 13 | 14 | - TRAVERSING: about half done 15 | 16 | - EVENTS: nothing to do with server side might be used later for automatic ajax 17 | 18 | - CORE UI EFFECTS: did hide and show the rest doesn't really makes sense on 19 | server side 20 | 21 | - AJAX: some with wsgi app 22 | 23 | -------------------------------------------------------------------------------- /conftest.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pytest 3 | from webtest import http 4 | from webtest.debugapp import debug_app 5 | from urllib.request import urlopen 6 | 7 | 8 | @pytest.fixture 9 | def readme_fixt(): 10 | server = http.StopableWSGIServer.create(debug_app) 11 | server.wait() 12 | path_to_html_file = os.path.join('tests', 'test.html') 13 | yield ( 14 | urlopen, 15 | server.application_url, 16 | path_to_html_file, 17 | ) 
18 | server.shutdown() 19 | -------------------------------------------------------------------------------- /tests/test_browser.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | from pyquery.pyquery import PyQuery 4 | from .browser_base import TextExtractionMixin 5 | 6 | 7 | class TestInnerText(unittest.TestCase, TextExtractionMixin): 8 | def _prepare_dom(self, html): 9 | super()._prepare_dom(html) 10 | self.pq = PyQuery(self.last_html) 11 | 12 | def _simple_test(self, html, expected_sq, expected_nosq, **kwargs): 13 | self._prepare_dom(html) 14 | text_sq = self.pq.text(squash_space=True, **kwargs) 15 | text_nosq = self.pq.text(squash_space=False, **kwargs) 16 | self.assertEqual(text_sq, expected_sq) 17 | self.assertEqual(text_nosq, expected_nosq) 18 | -------------------------------------------------------------------------------- /docs/conftest.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import pytest 4 | from webtest import http 5 | from webtest.debugapp import debug_app 6 | 7 | 8 | @pytest.fixture 9 | def scrap_url(): 10 | sys.path.insert(0, os.path.dirname(os.path.dirname(__file__))) 11 | from tests.apps import input_app 12 | server = http.StopableWSGIServer.create(input_app) 13 | server.wait() 14 | yield server.application_url.rstrip('/') + '/html' 15 | server.shutdown() 16 | 17 | 18 | @pytest.fixture 19 | def tips_url(): 20 | server = http.StopableWSGIServer.create(debug_app) 21 | server.wait() 22 | yield server.application_url.rstrip('/') + '/form.html' 23 | server.shutdown() 24 | -------------------------------------------------------------------------------- /README_fixt.py: -------------------------------------------------------------------------------- 1 | import os 2 | from webtest import http 3 | from webtest.debugapp import debug_app 4 | 5 | try: 6 | from urllib import urlopen 7 | except ImportError: 8 | from urllib.request import urlopen 9 | 10 | 11 | def setup_test(test): 12 | server = http.StopableWSGIServer.create(debug_app) 13 | server.wait() 14 | path_to_html_file = os.path.join('tests', 'test.html') 15 | test.globs.update( 16 | urlopen=urlopen, 17 | server=server, 18 | your_url=server.application_url, 19 | path_to_html_file=path_to_html_file, 20 | ) 21 | setup_test.__test__ = False 22 | 23 | 24 | def teardown_test(test): 25 | test.globs['server'].shutdown() 26 | teardown_test.__test__ = False 27 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | [tox] 2 | envlist=py38,py39,py310,py311,py312 3 | 4 | [testenv] 5 | whitelist_externals= 6 | rm 7 | passenv= 8 | MOZ_HEADLESS 9 | commands = 10 | pytest [] 11 | deps = 12 | py38: selenium 13 | -e .[test] 14 | 15 | [testenv:lint] 16 | skipsdist=true 17 | skip_install=true 18 | basepython = python3.11 19 | commands = 20 | ruff check 21 | deps = 22 | ruff 23 | 24 | [testenv:docs] 25 | skip_install=false 26 | skipsdist=true 27 | basepython = python3.11 28 | changedir = docs 29 | deps = 30 | sphinx 31 | Pygments 32 | allowlist_externals = 33 | rm 34 | commands = 35 | rm -Rf {envtmpdir}/doctrees {envtmpdir}/html 36 | sphinx-build -b html -d {envtmpdir}/doctrees . 
{envtmpdir}/html 37 | 38 | # [testenv:selenium] 39 | # basepython = python3.5 40 | # deps = 41 | # selenium 42 | # commands = 43 | # {envbindir}/python -m unittest seleniumtests.offline 44 | # {envbindir}/python -m unittest seleniumtests.browser 45 | -------------------------------------------------------------------------------- /tests/doctests.rst: -------------------------------------------------------------------------------- 1 | Import:: 2 | 3 | >>> from pyquery import PyQuery as pq 4 | 5 | 6 | Assume spaces normalization:: 7 | 8 | >>> pq('

').text() 9 | '' 10 | 11 | >>> print(pq('').text()) 12 | toto tata 13 | 14 | Complex wrapping:: 15 | 16 | >>> d = pq('
youhou
') 17 | >>> s = d('span') 18 | >>> s is d 19 | False 20 | >>> s.wrap('
') 21 | [
] 22 | 23 | We get the original doc with new node:: 24 | 25 | >>> print(d) 26 |
youhou
27 | 28 | Complex wrapAll:: 29 | 30 | >>> doc = pq('
Heyyou !
') 31 | >>> s = doc('span') 32 | >>> s.wrapAll('
') 33 | [] 34 | 35 | >>> print(doc) 36 |
Heyyou !
37 | -------------------------------------------------------------------------------- /docs/attributes.rst: -------------------------------------------------------------------------------- 1 | Attributes 2 | ---------- 3 | 4 | .. 5 | >>> from pyquery import PyQuery as pq 6 | 7 | Using attribute to select specific tag 8 | In attribute selectors, the value should be a valid CSS identifier or quoted as string:: 9 | 10 | >>> d = pq("
') 53 | >>> d.remove('p#id') 54 | [] 55 | >>> d('p#id') 56 | [] 57 | 58 | Remove what's inside the selection:: 59 | 60 | >>> d('p').empty() 61 | [

] 62 | 63 | And you can get back the modified html:: 64 | 65 | >>> print(d) 66 |

67 | 68 | You can generate html stuff:: 69 | 70 | >>> from pyquery import PyQuery as pq 71 | >>> print(pq('

Yeah !
').addClass('myclass') + pq('cool')) 72 |
Yeah !
cool 73 | 74 | Remove all namespaces:: 75 | 76 | >>> d = pq('') 77 | >>> d 78 | [<{http://example.com/foo}foo>] 79 | >>> d.remove_namespaces() 80 | [] 81 | 82 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | pyquery: a jquery-like library for python 2 | ========================================= 3 | 4 | .. image:: https://github.com/gawel/pyquery/actions/workflows/tox.yml/badge.svg 5 | :alt: Build Status 6 | :target: https://github.com/gawel/pyquery/actions/workflows/tox.yml 7 | 8 | pyquery allows you to make jquery queries on xml documents. 9 | The API is as much as possible similar to jquery. pyquery uses lxml for fast 10 | xml and html manipulation. 11 | 12 | This is not (or at least not yet) a library to produce or interact with 13 | javascript code. I just liked the jquery API and I missed it in python so I 14 | told myself "Hey let's make jquery in python". This is the result. 15 | 16 | The `project`_ is being actively developed on a git repository on Github. I 17 | have the policy of giving push access to anyone who wants it and then reviewing 18 | what they do. So if you want to contribute just email me. 19 | 20 | Please report bugs on the `github 21 | `_ issue 22 | tracker. 23 | 24 | .. _deliverance: http://www.gawel.org/weblog/en/2008/12/skinning-with-pyquery-and-deliverance 25 | .. _project: https://github.com/gawel/pyquery/ 26 | 27 | .. 28 | >>> (urlopen, your_url, path_to_html_file) = getfixture('readme_fixt') 29 | 30 | Quickstart 31 | ========== 32 | 33 | You can use the PyQuery class to load an xml document from a string, a lxml 34 | document, from a file or from an url:: 35 | 36 | >>> from pyquery import PyQuery as pq 37 | >>> from lxml import etree 38 | >>> import urllib 39 | >>> d = pq("") 40 | >>> d = pq(etree.fromstring("")) 41 | >>> d = pq(url=your_url) 42 | >>> d = pq(url=your_url, 43 | ... opener=lambda url, **kw: urlopen(url).read()) 44 | >>> d = pq(filename=path_to_html_file) 45 | 46 | Now d is like the $ in jquery:: 47 | 48 | >>> d("#hello") 49 | [] 50 | >>> p = d("#hello") 51 | >>> print(p.html()) 52 | Hello world ! 
53 | >>> p.html("you know Python rocks") 54 | [] 55 | >>> print(p.html()) 56 | you know Python rocks 57 | >>> print(p.text()) 58 | you know Python rocks 59 | 60 | You can use some of the pseudo classes that are available in jQuery but that 61 | are not standard in css such as :first :last :even :odd :eq :lt :gt :checked 62 | :selected :file:: 63 | 64 | >>> d('p:first') 65 | [] 66 | 67 | -------------------------------------------------------------------------------- /tests/browser_base.py: -------------------------------------------------------------------------------- 1 | class TextExtractionMixin(): 2 | def _prepare_dom(self, html): 3 | self.last_html = '' + html + '' 4 | 5 | def _simple_test(self, html, expected_sq, expected_nosq, **kwargs): 6 | raise NotImplementedError 7 | 8 | def test_inline_tags(self): 9 | self._simple_test( 10 | 'Phasellus eget sem facilisis justo', 11 | 'Phasellus eget sem facilisis justo', 12 | 'Phasellus eget sem facilisis justo', 13 | ) 14 | self._simple_test( 15 | 'Phasellus eget sem facilisis\n justo', 16 | 'Phasellus eget sem facilisis justo', 17 | 'Phasellus eget sem facilisis\n justo', 18 | ) 19 | self._simple_test( 20 | ('Phasellus \n eget\n ' 21 | 'sem\n\tfacilisis justo'), 22 | 'Phasellus eget sem facilisis justo', 23 | 'Phasellus \n eget\n sem\n\tfacilisis justo' 24 | ) 25 | 26 | def test_block_tags(self): 27 | self._simple_test( 28 | 'Phas

ell

us
eget
sem

facilisis

justo', 29 | 'Phas\nell\nus\neget\nsem\nfacilisis\njusto', 30 | 'Phas\nell\nus\n eget \nsem \nfacilisis\n justo', 31 | ) 32 | self._simple_test( 33 | '

In sagittis

rutrum

condimentum

', 34 | 'In sagittis\nrutrum\ncondimentum', 35 | 'In sagittis\n \nrutrum\n\ncondimentum', 36 | ) 37 | self._simple_test( 38 | 'In

\nultricies

\n erat et

\n\n\nmaximus\n\n

mollis', 39 | 'In\nultricies\nerat et\nmaximus\nmollis', 40 | 'In \n\nultricies\n\n erat et \n\n\n\nmaximus\n\n\n mollis', 41 | ) 42 | self._simple_test( 43 | ('Integer
\n
quis commodo
' 44 | '
libero'), 45 | 'Integer\nquis commodo\nlibero', 46 | 'Integer \n\n\n \nquis commodo\n\n \n libero', 47 | ) 48 | self._simple_test( 49 | 'Heading
  • one
  • two
  • three
', 50 | 'Heading\none\ntwo\nthree', 51 | 'Heading\n\none\n\ntwo\n\nthree', 52 | ) 53 | 54 | def test_separators(self): 55 | self._simple_test( 56 | 'Some words
test. Another word


test.', 57 | 'Some words\ntest. Another word\n\n\ntest.', 58 | 'Some words\ntest. Another word\n\n \n test.', 59 | ) 60 | self._simple_test( 61 | 'Inline split by\nbr
tag
test', 62 | 'Inline split by br\ntag test', 63 | 'Inline split by\nbr\ntag test', 64 | ) 65 | self._simple_test( 66 | 'Some words
test. Another word


test.', 67 | 'Some words\ntest. Another word\ntest.', 68 | 'Some words\n\ntest. Another word\n\n\n\n \n\n test.', 69 | ) 70 | 71 | def test_strip(self): 72 | self._simple_test( 73 | ' text\n', 74 | 'text', 75 | ' text\n', 76 | ) 77 | 78 | def test_ul_li(self): 79 | self._simple_test( 80 | '
', 81 | '', 82 | ' \n \n ' 83 | ) 84 | -------------------------------------------------------------------------------- /pyquery/text.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | 4 | # https://developer.mozilla.org/en-US/docs/Web/HTML/Inline_elements#Elements 5 | INLINE_TAGS = { 6 | 'a', 'abbr', 'acronym', 'b', 'bdo', 'big', 'br', 'button', 'cite', 7 | 'code', 'dfn', 'em', 'i', 'img', 'input', 'kbd', 'label', 'map', 8 | 'object', 'q', 'samp', 'script', 'select', 'small', 'span', 'strong', 9 | 'sub', 'sup', 'textarea', 'time', 'tt', 'var' 10 | } 11 | 12 | SEPARATORS = {'br'} 13 | 14 | 15 | # Definition of whitespace in HTML: 16 | # https://www.w3.org/TR/html4/struct/text.html#h-9.1 17 | WHITESPACE_RE = re.compile('[\x20\x09\x0C\u200B\x0A\x0D]+') 18 | 19 | 20 | def squash_html_whitespace(text): 21 | # use raw extract_text for preformatted content (like
<pre> content or set
 22 |     # by CSS rules)
 23 |     # apply this function on top of
 24 |     return WHITESPACE_RE.sub(' ', text)
 25 | 
 26 | 
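# Collapse runs of adjacent None sentinels (artificial newlines emitted for
# block-level tags) into a single sentinel so nested blocks do not stack up
# blank lines.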
 27 | def _squash_artifical_nl(parts):
 28 |     output, last_nl = [], False
 29 |     for x in parts:
 30 |         if x is not None:
 31 |             output.append(x)
 32 |             last_nl = False
 33 |         elif not last_nl:
 34 |             output.append(None)
 35 |             last_nl = True
 36 |     return output
 37 | 
 38 | 
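# Trim sentinel values (anything that is not a string) from both ends of the
# parts list, so block-level wrappers at the edges of a fragment do not turn
# into leading or trailing newlines.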
 39 | def _strip_artifical_nl(parts):
 40 |     if not parts:
 41 |         return parts
 42 |     for start_idx, pt in enumerate(parts):
 43 |         if isinstance(pt, str):
 44 |             # 0, 1, 2, index of first string [start_idx:...
 45 |             break
 46 |     iterator = enumerate(parts[:start_idx - 1 if start_idx > 0 else None:-1])
 47 |     for end_idx, pt in iterator:
 48 |         if isinstance(pt, str):  # 0=None, 1=-1, 2=-2, index of last string
 49 |             break
 50 |     return parts[start_idx:-end_idx if end_idx > 0 else None]
 51 | 
 52 | 
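# Join consecutive raw string fragments that sit between sentinels, collapsing
# their HTML whitespace and dropping fragments that are whitespace-only.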
 53 | def _merge_original_parts(parts):
 54 |     output, orp_buf = [], []
 55 | 
 56 |     def flush():
 57 |         if orp_buf:
 58 |             item = squash_html_whitespace(''.join(orp_buf)).strip()
 59 |             if item:
 60 |                 output.append(item)
 61 |             orp_buf[:] = []
 62 | 
 63 |     for x in parts:
 64 |         if not isinstance(x, str):
 65 |             flush()
 66 |             output.append(x)
 67 |         else:
 68 |             orp_buf.append(x)
 69 |     flush()
 70 |     return output
 71 | 
 72 | 
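# Walk the element tree and return a flat list mixing text fragments with two
# kinds of sentinels: None marks an artificial newline around a block-level
# tag, True marks an explicit separator such as <br>.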
 73 | def extract_text_array(dom, squash_artifical_nl=True, strip_artifical_nl=True):
 74 |     if callable(dom.tag):
 75 |         return ''
 76 |     r = []
 77 |     if dom.tag in SEPARATORS:
 78 |         r.append(True)  # equivalent of '\n' used to designate separators
 79 |     elif dom.tag not in INLINE_TAGS:
 80 |         # equivalent of '\n' used to designate artificially inserted newlines
 81 |         r.append(None)
 82 |     if dom.text is not None:
 83 |         r.append(dom.text)
 84 |     for child in dom.getchildren():
 85 |         r.extend(extract_text_array(child, squash_artifical_nl=False,
 86 |                                     strip_artifical_nl=False))
 87 |         if child.tail is not None:
 88 |             r.append(child.tail)
 89 |     if dom.tag not in INLINE_TAGS and dom.tag not in SEPARATORS:
 90 |         # equivalent of '\n' used to designate artificially inserted newlines
 91 |         r.append(None)
 92 |     if squash_artifical_nl:
 93 |         r = _squash_artifical_nl(r)
 94 |     if strip_artifical_nl:
 95 |         r = _strip_artifical_nl(r)
 96 |     return r
 97 | 
 98 | 
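# Render the sentinel list produced by extract_text_array: None becomes
# block_symbol, True becomes sep_symbol, plain strings are kept. With
# squash_space=True, runs of HTML whitespace are collapsed and the result is
# stripped, approximating how a browser renders visible text.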
 99 | def extract_text(dom, block_symbol='\n', sep_symbol='\n', squash_space=True):
100 |     a = extract_text_array(dom, squash_artifical_nl=squash_space)
101 |     if squash_space:
102 |         a = _strip_artifical_nl(_squash_artifical_nl(_merge_original_parts(a)))
103 |     result = ''.join(
104 |         block_symbol if x is None else (
105 |             sep_symbol if x is True else x
106 |         )
107 |         for x in a
108 |     )
109 |     if squash_space:
110 |         result = result.strip()
111 |     return result
112 | 
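# A minimal usage sketch (an assumption for illustration, not part of the
# original module): PyQuery.text() ultimately delegates to extract_text once
# the markup has been parsed by lxml.
#
#     >>> import lxml.html
#     >>> extract_text(lxml.html.fromstring('<div>Hello<br/>world <b>!</b></div>'))
#     'Hello\nworld !'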


--------------------------------------------------------------------------------
/tests/test_real_browser.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import unittest
  3 | from threading import Thread
  4 | from time import sleep
  5 | 
  6 | from .browser_base import TextExtractionMixin
  7 | 
  8 | SELENIUM = 'MOZ_HEADLESS' in os.environ
  9 | 
 10 | try:
 11 |     from selenium import webdriver
 12 |     from selenium.webdriver.firefox.options import Options
 13 | except ImportError:
 14 |     SELENIUM = False
 15 | 
 16 | if SELENIUM:
 17 |     from urllib.parse import urlunsplit
 18 |     from http.server import HTTPServer, BaseHTTPRequestHandler
 19 |     from queue import Queue
 20 | 
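    # The test case and the request handler communicate over two queues set up
    # in BaseBrowserTest.setUpClass: snippets pushed via in_queue are served on
    # GET / so the Selenium-driven browser renders the fragment under test,
    # while out_queue lets a handler report values back to the test.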
 21 |     class BaseTestRequestHandler(BaseHTTPRequestHandler):
 22 |         _last_html = ''
 23 | 
 24 |         def _get_last_html(self):
 25 |             q = self.server.html_queue
 26 |             while not q.empty():
 27 |                 self._last_html = q.get_nowait()
 28 |             return self._last_html
 29 | 
 30 |         def log_request(self, code='-', size='-'):
 31 |             pass
 32 | 
 33 |         def recv_from_testsuite(self, non_blocking=False):
 34 |             q = self.server.in_queue
 35 |             if non_blocking:
 36 |                 return None if q.empty() else q.get_nowait()
 37 |             return q.get()
 38 | 
 39 |         def send_to_testsuite(self, value):
 40 |             self.server.out_queue.put(value)
 41 | 
 42 |     class HTMLSnippetSender(BaseTestRequestHandler):
 43 |         last_html = ''
 44 | 
 45 |         def get_last_html(self):
 46 |             while True:
 47 |                 value = self.recv_from_testsuite(non_blocking=True)
 48 |                 if value is None:
 49 |                     break
 50 |                 self.last_html = value
 51 |             return self.last_html
 52 | 
 53 |         def do_GET(self):
 54 |             if self.path == '/':
 55 |                 self.send_response(200)
 56 |                 self.send_header('Content-Type', 'text/html; charset=utf-8')
 57 |                 self.end_headers()
 58 |                 self.wfile.write(self.get_last_html().encode('utf-8'))
 59 |             else:
 60 |                 self.send_response(404)
 61 |                 self.end_headers()
 62 | 
 63 |     class BaseBrowserTest(unittest.TestCase):
 64 |         LOCAL_IP = '127.0.0.1'
 65 |         PORT = 28546
 66 |         # descendant of BaseTestRequestHandler
 67 |         REQUEST_HANDLER_CLASS = None
 68 | 
 69 |         @classmethod
 70 |         def setUpClass(cls):
 71 |             cls.to_server_queue = Queue()
 72 |             cls.from_server_queue = Queue()
 73 |             cls.server = HTTPServer((cls.LOCAL_IP, cls.PORT),
 74 |                                     cls.REQUEST_HANDLER_CLASS)
 75 |             cls.server.in_queue = cls.to_server_queue
 76 |             cls.server.out_queue = cls.from_server_queue
 77 |             cls.server_thread = Thread(target=cls.server.serve_forever)
 78 |             cls.server_thread.daemon = True
 79 |             cls.server_thread.start()
 80 |             options = Options()
 81 |             options.add_argument('-headless')
 82 |             cls.driver = webdriver.Firefox(options=options)
 83 |             sleep(1)
 84 | 
 85 |         @classmethod
 86 |         def tearDownClass(cls):
 87 |             cls.driver.quit()
 88 |             cls.server.shutdown()
 89 |             cls.server.server_close()
 90 | 
 91 |         def send_to_server(self, value):
 92 |             self.to_server_queue.put(value)
 93 | 
 94 |         def recv_from_server(self, non_blocking=False):
 95 |             q = self.from_server_queue
 96 |             if non_blocking:
 97 |                 return None if q.empty() else q.get_nowait()
 98 |             return q.get()
 99 | 
100 |         def open_url(self, path):
101 |             self.driver.get(urlunsplit(
102 |                 ('http', '{}:{}'.format(
103 |                     self.LOCAL_IP, self.PORT), path, '', '')))
104 | 
105 |     class TestInnerText(BaseBrowserTest, TextExtractionMixin):
106 |         REQUEST_HANDLER_CLASS = HTMLSnippetSender
107 | 
108 |         def _simple_test(self, html, expected_sq, expected_nosq, **kwargs):
109 |             self.send_to_server(html)
110 |             self.open_url('/')
111 | 
112 |             selenium_text = self.driver.find_element_by_tag_name('body').text
113 |             self.assertEqual(selenium_text, expected_sq)
114 | 
115 |             #  inner_text = self.driver.execute_script(
116 |             #    'return document.body.innerText')
117 |             #  text_content = self.driver.execute_script(
118 |             #    'return document.body.textContent')
119 | 


--------------------------------------------------------------------------------
/docs/Makefile:
--------------------------------------------------------------------------------
  1 | # Makefile for Sphinx documentation
  2 | #
  3 | 
  4 | # You can set these variables from the command line.
  5 | SPHINXOPTS    =
  6 | SPHINXBUILD   = ../bin/sphinx-build
  7 | PAPER         =
  8 | BUILDDIR      = _build
  9 | 
 10 | # Internal variables.
 11 | PAPEROPT_a4     = -D latex_paper_size=a4
 12 | PAPEROPT_letter = -D latex_paper_size=letter
 13 | ALLSPHINXOPTS   = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
 14 | # the i18n builder cannot share the environment and doctrees with the others
 15 | I18NSPHINXOPTS  = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
 16 | 
 17 | .PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest gettext
 18 | 
 19 | help:
 20 | 	@echo "Please use \`make ' where  is one of"
 21 | 	@echo "  html       to make standalone HTML files"
 22 | 	@echo "  dirhtml    to make HTML files named index.html in directories"
 23 | 	@echo "  singlehtml to make a single large HTML file"
 24 | 	@echo "  pickle     to make pickle files"
 25 | 	@echo "  json       to make JSON files"
 26 | 	@echo "  htmlhelp   to make HTML files and a HTML help project"
 27 | 	@echo "  qthelp     to make HTML files and a qthelp project"
 28 | 	@echo "  devhelp    to make HTML files and a Devhelp project"
 29 | 	@echo "  epub       to make an epub"
 30 | 	@echo "  latex      to make LaTeX files, you can set PAPER=a4 or PAPER=letter"
 31 | 	@echo "  latexpdf   to make LaTeX files and run them through pdflatex"
 32 | 	@echo "  text       to make text files"
 33 | 	@echo "  man        to make manual pages"
 34 | 	@echo "  texinfo    to make Texinfo files"
 35 | 	@echo "  info       to make Texinfo files and run them through makeinfo"
 36 | 	@echo "  gettext    to make PO message catalogs"
 37 | 	@echo "  changes    to make an overview of all changed/added/deprecated items"
 38 | 	@echo "  linkcheck  to check all external links for integrity"
 39 | 	@echo "  doctest    to run all doctests embedded in the documentation (if enabled)"
 40 | 
 41 | clean:
 42 | 	-rm -rf $(BUILDDIR)/*
 43 | 
 44 | html:
 45 | 	$(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html
 46 | 	@echo
 47 | 	@echo "Build finished. The HTML pages are in $(BUILDDIR)/html."
 48 | 
 49 | dirhtml:
 50 | 	$(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml
 51 | 	@echo
 52 | 	@echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml."
 53 | 
 54 | singlehtml:
 55 | 	$(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml
 56 | 	@echo
 57 | 	@echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml."
 58 | 
 59 | pickle:
 60 | 	$(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle
 61 | 	@echo
 62 | 	@echo "Build finished; now you can process the pickle files."
 63 | 
 64 | json:
 65 | 	$(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json
 66 | 	@echo
 67 | 	@echo "Build finished; now you can process the JSON files."
 68 | 
 69 | htmlhelp:
 70 | 	$(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp
 71 | 	@echo
 72 | 	@echo "Build finished; now you can run HTML Help Workshop with the" \
 73 | 	      ".hhp project file in $(BUILDDIR)/htmlhelp."
 74 | 
 75 | qthelp:
 76 | 	$(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp
 77 | 	@echo
 78 | 	@echo "Build finished; now you can run "qcollectiongenerator" with the" \
 79 | 	      ".qhcp project file in $(BUILDDIR)/qthelp, like this:"
 80 | 	@echo "# qcollectiongenerator $(BUILDDIR)/qthelp/chut.qhcp"
 81 | 	@echo "To view the help file:"
 82 | 	@echo "# assistant -collectionFile $(BUILDDIR)/qthelp/chut.qhc"
 83 | 
 84 | devhelp:
 85 | 	$(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp
 86 | 	@echo
 87 | 	@echo "Build finished."
 88 | 	@echo "To view the help file:"
 89 | 	@echo "# mkdir -p $$HOME/.local/share/devhelp/chut"
 90 | 	@echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/chut"
 91 | 	@echo "# devhelp"
 92 | 
 93 | epub:
 94 | 	$(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub
 95 | 	@echo
 96 | 	@echo "Build finished. The epub file is in $(BUILDDIR)/epub."
 97 | 
 98 | latex:
 99 | 	$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
100 | 	@echo
101 | 	@echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex."
102 | 	@echo "Run \`make' in that directory to run these through (pdf)latex" \
103 | 	      "(use \`make latexpdf' here to do that automatically)."
104 | 
105 | latexpdf:
106 | 	$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
107 | 	@echo "Running LaTeX files through pdflatex..."
108 | 	$(MAKE) -C $(BUILDDIR)/latex all-pdf
109 | 	@echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex."
110 | 
111 | text:
112 | 	$(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text
113 | 	@echo
114 | 	@echo "Build finished. The text files are in $(BUILDDIR)/text."
115 | 
116 | man:
117 | 	$(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man
118 | 	@echo
119 | 	@echo "Build finished. The manual pages are in $(BUILDDIR)/man."
120 | 
121 | texinfo:
122 | 	$(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
123 | 	@echo
124 | 	@echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo."
125 | 	@echo "Run \`make' in that directory to run these through makeinfo" \
126 | 	      "(use \`make info' here to do that automatically)."
127 | 
128 | info:
129 | 	$(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
130 | 	@echo "Running Texinfo files through makeinfo..."
131 | 	make -C $(BUILDDIR)/texinfo info
132 | 	@echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo."
133 | 
134 | gettext:
135 | 	$(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale
136 | 	@echo
137 | 	@echo "Build finished. The message catalogs are in $(BUILDDIR)/locale."
138 | 
139 | changes:
140 | 	$(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes
141 | 	@echo
142 | 	@echo "The overview file is in $(BUILDDIR)/changes."
143 | 
144 | linkcheck:
145 | 	$(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck
146 | 	@echo
147 | 	@echo "Link check complete; look for any errors in the above output " \
148 | 	      "or in $(BUILDDIR)/linkcheck/output.txt."
149 | 
150 | doctest:
151 | 	$(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest
152 | 	@echo "Testing of doctests in the sources finished, look at the " \
153 | 	      "results in $(BUILDDIR)/doctest/output.txt."
154 | 


--------------------------------------------------------------------------------
/CHANGES.rst:
--------------------------------------------------------------------------------
  1 | 2.0.2 (unreleased)
  2 | ------------------
  3 | 
  4 | - Nothing changed yet.
  5 | 
  6 | 
  7 | 2.0.1 (2024-08-30)
  8 | ------------------
  9 | 
 10 | - Breaking change: it seems it is no longer possible to use the html parser with an xml file, so it is no longer tested
 11 | 
 12 | - Drop support for python 3.7
 13 | 
 14 | 2.0.0 (2022-12-28)
 15 | ------------------
 16 | 
 17 | - Breaking change: inputs starting with ``"http://"`` or ``"https://"`` like
 18 |   ``PyQuery("http://example.com")`` will no longer fetch the contents of the URL.
 19 |   Users desiring the old behavior should switch to ``PyQuery(url="http://example.com")``.
 20 | 
 21 | - Add nextUntil method
 22 | 
 23 | - ``.remove()`` no longer inserts a space in place of the removed element
 24 | 
 25 | - Fix escaping of top-level element text in ``.html()`` output
 26 | 
 27 | - Support (and require) cssselect 1.2+
 28 | 
 29 | - Drop support for python 3.5/3.6
 30 | 
 31 | 
 32 | 1.4.3 (2020-11-21)
 33 | ------------------
 34 | 
 35 | - No longer use a universal wheel
 36 | 
 37 | 
 38 | 1.4.2 (2020-11-21)
 39 | ------------------
 40 | 
 41 | - Fix exception raised when calling `PyQuery("").text()`
 42 | 
 43 | - python2 is no longer supported
 44 | 
 45 | 1.4.1 (2019-10-26)
 46 | ------------------
 47 | 
 48 | - This is the latest release with py2 support
 49 | 
 50 | - Remove py33, py34 support
 51 | 
 52 | - web scraping improvements: default timeout and session support
 53 | 
 54 | - Add API methods to serialize form-related elements according to spec
 55 | 
 56 | - Include HTML markup when querying textarea text/value
 57 | 
 58 | 
 59 | 1.4.0 (2018-01-11)
 60 | ------------------
 61 | 
 62 | - Refactoring of `.text()` to match firefox behavior.
 63 | 
 64 | 
 65 | 1.3.0 (2017-10-21)
 66 | ------------------
 67 | 
 68 | - Remove some unmaintained modules: ``pyquery.ajax`` and ``pyquery.rules``
 69 | 
 70 | - Code cleanup. No longer use ugly hacks required by python2.6/python3.2.
 71 | 
 72 | - Run tests with python3.6 on CI
 73 | 
 74 | - Add a ``method`` argument to ``.outer_html()``
 75 | 
 76 | 
 77 | 1.2.17 (2016-10-14)
 78 | -------------------
 79 | 
 80 | - ``PyQuery('<input value="">').val()`` is ``''``
 81 | - ``PyQuery('<input>').val()`` is ``''``
 82 | 
 83 | 
 84 | 1.2.16 (2016-10-14)
 85 | -------------------
 86 | 
 87 | - ``.attr('value', '')`` no longer removes the ``value`` attribute
 88 | 
 89 | - ``<input type="checkbox">`` without ``value="..."`` have a ``.val()`` of
 90 |   ``'on'``
 91 | 
 92 | - ``<input type="radio">`` without ``value="..."`` have a ``.val()`` of
 93 |   ``'on'``
 94 | 
 95 | - ``'))
226 |         >>> d(':input')
227 |         [, '))
196 |             >>> d(':input')
197 |             [, 
434 |         
437 |     '''
438 | 
439 |     html4 = '''
440 |         
444 |         
449 |         
451 |         
456 |     '''
457 | 
458 |     html6 = '''
459 |         
464 |         
469 |         
474 |     '''
475 | 
476 |     html5 = '''
477 |         
478 | 479 | 480 | 481 |
482 | ''' 483 | 484 | def test_attr_empty_string(self): 485 | d = pq('
') 486 | d.attr('value', '') 487 | self.assertEqual(d.outer_html(), '
') 488 | self.assertEqual(d.outer_html(method="xml"), '
') 489 | 490 | def test_remove(self): 491 | d = pq(self.html) 492 | d('img').remove() 493 | val = d('a:first').html() 494 | assert val == 'TestMy link text', repr(val) 495 | val = d('a:last').html() 496 | assert val == 'My link text 2', repr(val) 497 | 498 | def test_class(self): 499 | d = pq('
') 500 | d.removeClass('xx') 501 | assert 'class' not in str(d), str(d) 502 | 503 | def test_val_for_inputs(self): 504 | d = pq(self.html2) 505 | self.assertIsNone(d('input[name="none"]').val()) 506 | self.assertEqual(d('input[name="spam"]').val(), 'Spam') 507 | self.assertEqual(d('input[name="eggs"]').val(), 'Eggs') 508 | self.assertEqual(d('input:checkbox').val(), 'Bacon') 509 | self.assertEqual(d('input:radio').val(), 'Ham') 510 | d('input[name="spam"]').val('42') 511 | d('input[name="eggs"]').val('43') 512 | d('input:checkbox').val('44') 513 | d('input:radio').val('45') 514 | self.assertEqual(d('input[name="spam"]').val(), '42') 515 | self.assertEqual(d('input[name="eggs"]').val(), '43') 516 | self.assertEqual(d('input:checkbox').val(), '44') 517 | self.assertEqual(d('input:radio').val(), '45') 518 | 519 | def test_val_for_inputs_with_newline(self): 520 | d = pq(self.html2_newline) 521 | self.assertEqual(d('#newline-text').val(), 'Spam') 522 | self.assertEqual(d('#newline-radio').val(), 'S\npam') 523 | 524 | def test_val_for_textarea(self): 525 | d = pq(self.html3) 526 | self.assertEqual(d('#textarea-single').val(), 'Spam') 527 | self.assertEqual(d('#textarea-single').text(), 'Spam') 528 | d('#textarea-single').val('42') 529 | self.assertEqual(d('#textarea-single').val(), '42') 530 | # Note: jQuery still returns 'Spam' here. 531 | self.assertEqual(d('#textarea-single').text(), '42') 532 | 533 | multi_expected = '''Spam\nEggs\nBacon''' 534 | self.assertEqual(d('#textarea-multi').val(), multi_expected) 535 | self.assertEqual(d('#textarea-multi').text(), multi_expected) 536 | multi_new = '''Bacon\nEggs\nSpam''' 537 | multi_new_expected = '''Bacon\n<b>Eggs</b>\nSpam''' 538 | d('#textarea-multi').val(multi_new) 539 | self.assertEqual(d('#textarea-multi').val(), multi_new_expected) 540 | self.assertEqual(d('#textarea-multi').text(), multi_new_expected) 541 | 542 | def test_val_for_select(self): 543 | d = pq(self.html4) 544 | self.assertEqual(d('#first').val(), 'spam') 545 | self.assertEqual(d('#second').val(), 'eggs') 546 | self.assertIsNone(d('#third').val()) 547 | d('#first').val('eggs') 548 | d('#second').val('bacon') 549 | d('#third').val('eggs') # Selecting non-existing option. 550 | self.assertEqual(d('#first').val(), 'eggs') 551 | self.assertEqual(d('#second').val(), 'bacon') 552 | self.assertIsNone(d('#third').val()) 553 | d('#first').val('bacon') # Selecting non-existing option. 554 | self.assertEqual(d('#first').val(), 'spam') 555 | # Value set based on option order, not value order 556 | d('#second').val(['bacon', 'eggs']) 557 | self.assertEqual(d('#second').val(), 'eggs') 558 | d('#fourth').val(['spam']) 559 | self.assertEqual(d('#fourth').val(), 'spam') 560 | # Sets first option with matching value 561 | self.assertEqual(d('#fourth option[selected]').length, 1) 562 | self.assertEqual(d('#fourth option[selected]').text(), 'Spam') 563 | 564 | def test_val_for_select_multiple(self): 565 | d = pq(self.html6) 566 | self.assertEqual(d('#first').val(), ['spam', 'eggs']) 567 | # Selecting non-existing option. 
568 | d('#first').val(['eggs', 'sausage', 'bacon']) 569 | self.assertEqual(d('#first').val(), ['eggs', 'bacon']) 570 | self.assertEqual(d('#second').val(), []) 571 | d('#second').val('eggs') 572 | self.assertEqual(d('#second').val(), ['eggs']) 573 | d('#second').val(['not spam', 'not eggs']) 574 | self.assertEqual(d('#second').val(), []) 575 | d('#third').val(['spam']) 576 | self.assertEqual(d('#third').val(), ['spam', 'spam', 'spam']) 577 | 578 | def test_val_for_input_and_textarea_given_array_value(self): 579 | d = pq('') 580 | d('input').val(['spam', 'eggs']) 581 | self.assertEqual(d('input').val(), 'spam,eggs') 582 | d = pq('') 583 | d('textarea').val(['spam', 'eggs']) 584 | self.assertEqual(d('textarea').val(), 'spam,eggs') 585 | 586 | def test_val_for_multiple_elements(self): 587 | d = pq(self.html5) 588 | # "Get" returns *first* value. 589 | self.assertEqual(d('div > *').val(), 'spam') 590 | # "Set" updates *every* value. 591 | d('div > *').val('42') 592 | self.assertEqual(d('#first').val(), '42') 593 | self.assertEqual(d('#second').val(), '42') 594 | self.assertEqual(d('#third').val(), '42') 595 | 596 | def test_val_checkbox_no_value_attribute(self): 597 | d = pq('') 598 | self.assertEqual(d.val(), 'on') 599 | d = pq('') 600 | self.assertEqual(d.val(), '') 601 | 602 | def test_val_radio_no_value_attribute(self): 603 | d = pq('') 604 | self.assertEqual(d.val(), 'on') 605 | 606 | def test_val_value_is_empty_string(self): 607 | d = pq('') 608 | self.assertEqual(d.val(), '') 609 | 610 | def test_val_input_has_no_value_attr(self): 611 | d = pq('') 612 | self.assertEqual(d.val(), '') 613 | 614 | def test_html_replacement(self): 615 | html = '
Not MeReplace MeNot Me
' 616 | replacement = 'New Contents New' 617 | expected = html.replace('Replace Me', replacement) 618 | 619 | d = pq(html) 620 | d.find('span').html(replacement) 621 | 622 | new_html = d.outerHtml() 623 | self.assertEqual(new_html, expected) 624 | self.assertIn(replacement, new_html) 625 | 626 | def test_html_escape(self): 627 | inner_html = 'encoded <script> tag with "quotes".' \ 628 | 'nested <tag>' 629 | html = '
' + inner_html + '
' 630 | d = pq(html) 631 | self.assertEqual(d.html(), inner_html) 632 | 633 | 634 | class TestAjax(TestCase): 635 | 636 | html = ''' 637 |
638 | 639 |
640 |
641 |
642 | 643 | 644 | 645 |
646 |
647 | 648 |
649 |
650 | 651 | 652 |
653 | ''' 654 | 655 | html2 = ''' 656 |
657 | 658 |
659 | 660 | 661 |
662 |
663 |
664 |
665 | 666 | 667 |
668 | ''' 669 | 670 | html3 = ''' 671 |
672 | 673 | 674 |
675 | 676 |
677 | 678 | 679 | 680 | 681 | 682 | 683 | 684 | 685 | 686 |
687 | ''' 688 | 689 | html4 = ''' 690 |
691 | 693 | 699 | 702 |
703 | ''' 704 | 705 | def test_serialize_pairs_form_id(self): 706 | d = pq(self.html) 707 | self.assertEqual(d('#div').serialize_pairs(), []) 708 | self.assertEqual(d('#dispersed').serialize_pairs(), [ 709 | ('order', 'spam'), ('order', 'eggs'), ('order', 'ham'), 710 | ('order', 'tomato'), ('order', 'baked beans'), 711 | ]) 712 | self.assertEqual(d('.no-id').serialize_pairs(), [ 713 | ('spam', 'Spam'), 714 | ]) 715 | 716 | def test_serialize_pairs_form_controls(self): 717 | d = pq(self.html2) 718 | self.assertEqual(d('fieldset').serialize_pairs(), [ 719 | ('fieldset', 'eggs'), ('fieldset', 'ham'), 720 | ]) 721 | self.assertEqual(d('#input, fieldset, #first').serialize_pairs(), [ 722 | ('order', 'spam'), ('fieldset', 'eggs'), ('fieldset', 'ham'), 723 | ('fieldset', 'eggs'), ('fieldset', 'ham'), ('fieldset', 'ham'), 724 | ]) 725 | self.assertEqual(d('#datalist').serialize_pairs(), [ 726 | ('datalist', 'eggs'), ('checkbox', 'on'), ('radio', 'on'), 727 | ]) 728 | 729 | def test_serialize_pairs_filter_controls(self): 730 | d = pq(self.html3) 731 | self.assertEqual(d('form').serialize_pairs(), [ 732 | ('order', 'spam') 733 | ]) 734 | 735 | def test_serialize_pairs_form_values(self): 736 | d = pq(self.html4) 737 | self.assertEqual(d('form').serialize_pairs(), [ 738 | ('spam', 'Spam/spam'), ('order', 'baked\r\nbeans'), 739 | ('order', 'tomato'), ('multiline', 'multiple\r\nlines\r\nof text'), 740 | ]) 741 | 742 | def test_serialize_array(self): 743 | d = pq(self.html4) 744 | self.assertEqual(d('form').serialize_array(), [ 745 | {'name': 'spam', 'value': 'Spam/spam'}, 746 | {'name': 'order', 'value': 'baked\r\nbeans'}, 747 | {'name': 'order', 'value': 'tomato'}, 748 | {'name': 'multiline', 'value': 'multiple\r\nlines\r\nof text'}, 749 | ]) 750 | 751 | def test_serialize(self): 752 | d = pq(self.html4) 753 | self.assertEqual( 754 | d('form').serialize(), 755 | 'spam=Spam%2Fspam&order=baked%0D%0Abeans&order=tomato&' 756 | 'multiline=multiple%0D%0Alines%0D%0Aof%20text' 757 | ) 758 | 759 | def test_serialize_dict(self): 760 | d = pq(self.html4) 761 | self.assertEqual(d('form').serialize_dict(), { 762 | 'spam': 'Spam/spam', 763 | 'order': ['baked\r\nbeans', 'tomato'], 764 | 'multiline': 'multiple\r\nlines\r\nof text', 765 | }) 766 | 767 | 768 | class TestMakeLinks(TestCase): 769 | 770 | html = ''' 771 | 772 |
773 | with href 774 | without href 775 |
776 | 777 | ''' 778 | 779 | def test_make_link(self): 780 | d = pq(self.html, parser='xml') 781 | d.make_links_absolute(base_url='http://example.com') 782 | self.assertTrue(len(d('a[href]')), 1) 783 | self.assertEqual(d('a[href]').attr('href'), 784 | 'http://example.com/path_info') 785 | 786 | 787 | class TestHTMLParser(TestCase): 788 | xml = "
I'm valid XML
" 789 | html = '''
790 | TestimageMy link text 791 | imageMy link text 2 792 | Behind you, a three-headed HTML‐Entity! 793 |
''' 794 | 795 | def test_parser_persistance(self): 796 | d = pq(self.xml, parser='xml') 797 | self.assertRaises(etree.XMLSyntaxError, lambda: d.after(self.html)) 798 | d = pq(self.xml, parser='html') 799 | d.after(self.html) # this should not fail 800 | 801 | def test_replaceWith(self): 802 | expected = '''
803 | TestimageMy link text 804 | imageMy link text 2 805 | Behind you, a three-headed HTML&dash;Entity! 806 |
''' 807 | d = pq(self.html) 808 | d('img').replace_with('image') 809 | val = d.__html__() 810 | assert val == expected, (repr(val), repr(expected)) 811 | 812 | def test_replaceWith_with_function(self): 813 | expected = '''
814 | TestimageMy link text 815 | imageMy link text 2 816 | Behind you, a three-headed HTML&dash;Entity! 817 |
''' 818 | d = pq(self.html) 819 | d('a').replace_with(lambda i, e: pq(e).html()) 820 | val = d.__html__() 821 | assert val == expected, (repr(val), repr(expected)) 822 | 823 | 824 | class TestXMLNamespace(TestCase): 825 | xml = ''' 826 | 827 | What 828 | 123 829 | 830 | 831 | 832 | ''' 833 | 834 | xhtml = ''' 835 | 836 | 837 |
What
838 | 839 | ''' 840 | 841 | namespaces = {'bar': 'http://example.com/bar', 842 | 'baz': 'http://example.com/baz'} 843 | 844 | def test_selector(self): 845 | expected = 'What' 846 | d = pq(self.xml.encode('utf8'), parser='xml') 847 | val = d('bar|blah', 848 | namespaces=self.namespaces).text() 849 | self.assertEqual(repr(val), repr(expected)) 850 | 851 | def test_selector_with_xml(self): 852 | expected = 'What' 853 | d = pq('bar|blah', self.xml.encode('utf8'), parser='xml', 854 | namespaces=self.namespaces) 855 | val = d.text() 856 | self.assertEqual(repr(val), repr(expected)) 857 | 858 | def test_xhtml_namespace(self): 859 | expected = 'What' 860 | d = pq(self.xhtml.encode('utf8'), parser='xml') 861 | d.xhtml_to_html() 862 | val = d('div').text() 863 | self.assertEqual(repr(val), repr(expected)) 864 | 865 | def test_xhtml_namespace_html_parser(self): 866 | expected = 'What' 867 | d = pq(self.xhtml, parser='html') 868 | d.xhtml_to_html() 869 | val = d('div').text() 870 | self.assertEqual(repr(val), repr(expected)) 871 | 872 | def test_remove_namespaces(self): 873 | expected = 'What' 874 | d = pq(self.xml.encode('utf8'), parser='xml').remove_namespaces() 875 | val = d('blah').text() 876 | self.assertEqual(repr(val), repr(expected)) 877 | 878 | def test_persistent_namespaces(self): 879 | d = pq(self.xml.encode('utf8'), parser='xml', 880 | namespaces=self.namespaces) 881 | val = d('bar|blah').text() 882 | self.assertEqual(repr(val), repr('What')) 883 | 884 | def test_namespace_traversal(self): 885 | d = pq(self.xml.encode('utf8'), parser='xml', 886 | namespaces=self.namespaces) 887 | val = d('baz|subbaz').closest('baz|baz').attr('a') 888 | self.assertEqual(repr(val), repr('b')) 889 | 890 | 891 | class TestWebScrapping(TestCase): 892 | 893 | def setUp(self): 894 | self.s = http.StopableWSGIServer.create(debug_app) 895 | self.s.wait() 896 | self.application_url = self.s.application_url.rstrip('/') 897 | 898 | def test_get(self): 899 | d = pq(url=self.application_url, data={'q': 'foo'}, 900 | method='get') 901 | print(d) 902 | self.assertIn('REQUEST_METHOD: GET', d('p').text()) 903 | self.assertIn('q=foo', d('p').text()) 904 | 905 | def test_post(self): 906 | d = pq(url=self.application_url, data={'q': 'foo'}, 907 | method='post') 908 | self.assertIn('REQUEST_METHOD: POST', d('p').text()) 909 | self.assertIn('q=foo', d('p').text()) 910 | 911 | def test_session(self): 912 | if HAS_REQUEST: 913 | import requests 914 | session = requests.Session() 915 | session.headers.update({'X-FOO': 'bar'}) 916 | d = pq(url=self.application_url, data={'q': 'foo'}, 917 | method='get', session=session) 918 | self.assertIn('HTTP_X_FOO: bar', d('p').text()) 919 | else: 920 | self.skipTest('no requests library') 921 | 922 | def tearDown(self): 923 | self.s.shutdown() 924 | 925 | 926 | class TestWebScrappingEncoding(TestCase): 927 | 928 | def test_get(self): 929 | d = pq(url='http://ru.wikipedia.org/wiki/Заглавная_страница', 930 | method='get') 931 | print(d) 932 | self.assertEqual(d('#pt-login').text(), 'Войти') 933 | 934 | 935 | class TestWebScrappingTimeouts(TestCase): 936 | 937 | def setUp(self): 938 | def app(environ, start_response): 939 | start_response('200 OK', [('Content-Type', 'text/plain')]) 940 | time.sleep(2) 941 | return [b'foobar\n'] 942 | self.s = http.StopableWSGIServer.create(app) 943 | self.s.wait() 944 | self.application_url = self.s.application_url.rstrip('/') 945 | 946 | def test_get(self): 947 | pq(url=self.application_url) 948 | with self.assertRaises(Exception): 949 | 
pq(url=self.application_url, timeout=1) 950 | 951 | def tearDown(self): 952 | self.s.shutdown() 953 | -------------------------------------------------------------------------------- /pyquery/pyquery.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2008 - Olivier Lauzanne 2 | # 3 | # Distributed under the BSD license, see LICENSE.txt 4 | from .cssselectpatch import JQueryTranslator 5 | from reprlib import recursive_repr 6 | from urllib.parse import urlencode 7 | from urllib.parse import urljoin 8 | from .openers import url_opener 9 | from .text import extract_text 10 | from copy import deepcopy 11 | from html import escape 12 | from lxml import etree 13 | import lxml.html 14 | import inspect 15 | import itertools 16 | import types 17 | import sys 18 | 19 | if sys.version_info >= (3, 12, 0): 20 | from collections import OrderedDict 21 | else: 22 | # backward compat. to be able to run doctest with 3.7+. see: 23 | # https://github.com/gawel/pyquery/issues/249 24 | # and: 25 | # https://github.com/python/cpython/blob/3.12/Lib/collections/__init__.py#L272 26 | from collections import OrderedDict as BaseOrderedDict 27 | 28 | class OrderedDict(BaseOrderedDict): 29 | @recursive_repr() 30 | def __repr__(self): 31 | 'od.__repr__() <==> repr(od)' 32 | if not self: 33 | return '%s()' % (self.__class__.__name__,) 34 | return '%s(%r)' % (self.__class__.__name__, dict(self.items())) 35 | 36 | basestring = (str, bytes) 37 | 38 | 39 | def getargspec(func): 40 | args = inspect.signature(func).parameters.values() 41 | return [p.name for p in args 42 | if p.kind == p.POSITIONAL_OR_KEYWORD] 43 | 44 | 45 | def with_camel_case_alias(func): 46 | """decorator for methods who required a camelcase alias""" 47 | _camel_case_aliases.add(func.__name__) 48 | return func 49 | 50 | 51 | _camel_case_aliases = set() 52 | 53 | 54 | def build_camel_case_aliases(PyQuery): 55 | """add camelcase aliases to PyQuery""" 56 | for alias in _camel_case_aliases: 57 | parts = list(alias.split('_')) 58 | name = parts[0] + ''.join([p.title() for p in parts[1:]]) 59 | func = getattr(PyQuery, alias) 60 | f = types.FunctionType(func.__code__, func.__globals__, 61 | name, func.__defaults__) 62 | f.__doc__ = ( 63 | 'Alias for :func:`~pyquery.pyquery.PyQuery.%s`') % func.__name__ 64 | setattr(PyQuery, name, f.__get__(None, PyQuery)) 65 | 66 | 67 | def fromstring(context, parser=None, custom_parser=None): 68 | """use html parser if we don't have clean xml 69 | """ 70 | if hasattr(context, 'read') and hasattr(context.read, '__call__'): 71 | meth = 'parse' 72 | else: 73 | meth = 'fromstring' 74 | if custom_parser is None: 75 | if parser is None: 76 | try: 77 | result = getattr(etree, meth)(context) 78 | except etree.XMLSyntaxError: 79 | if hasattr(context, 'seek'): 80 | context.seek(0) 81 | result = getattr(lxml.html, meth)(context) 82 | if isinstance(result, etree._ElementTree): 83 | return [result.getroot()] 84 | else: 85 | return [result] 86 | elif parser == 'xml': 87 | custom_parser = getattr(etree, meth) 88 | elif parser == 'html': 89 | custom_parser = getattr(lxml.html, meth) 90 | elif parser == 'html5': 91 | from lxml.html import html5parser 92 | custom_parser = getattr(html5parser, meth) 93 | elif parser == 'soup': 94 | from lxml.html import soupparser 95 | custom_parser = getattr(soupparser, meth) 96 | elif parser == 'html_fragments': 97 | custom_parser = lxml.html.fragments_fromstring 98 | else: 99 | raise ValueError('No such parser: "%s"' % parser) 100 | 101 | result = 
custom_parser(context) 102 | if isinstance(result, list): 103 | return result 104 | elif isinstance(result, etree._ElementTree): 105 | return [result.getroot()] 106 | elif result is not None: 107 | return [result] 108 | else: 109 | return [] 110 | 111 | 112 | def callback(func, *args): 113 | return func(*args[:func.__code__.co_argcount]) 114 | 115 | 116 | class NoDefault(object): 117 | def __repr__(self): 118 | """clean representation in Sphinx""" 119 | return '' 120 | 121 | 122 | no_default = NoDefault() 123 | del NoDefault 124 | 125 | 126 | class FlexibleElement(object): 127 | """property to allow a flexible api""" 128 | def __init__(self, pget, pset=no_default, pdel=no_default): 129 | self.pget = pget 130 | self.pset = pset 131 | self.pdel = pdel 132 | 133 | def __get__(self, instance, klass): 134 | class _element(object): 135 | """real element to support set/get/del attr and item and js call 136 | style""" 137 | def __call__(prop, *args, **kwargs): 138 | return self.pget(instance, *args, **kwargs) 139 | __getattr__ = __getitem__ = __setattr__ = __setitem__ = __call__ 140 | 141 | def __delitem__(prop, name): 142 | if self.pdel is not no_default: 143 | return self.pdel(instance, name) 144 | else: 145 | raise NotImplementedError() 146 | __delattr__ = __delitem__ 147 | 148 | def __repr__(prop): 149 | return '' % self.pget.__name__ 150 | return _element() 151 | 152 | def __set__(self, instance, value): 153 | if self.pset is not no_default: 154 | self.pset(instance, value) 155 | else: 156 | raise NotImplementedError() 157 | 158 | 159 | class PyQuery(list): 160 | """The main class 161 | """ 162 | 163 | _translator_class = JQueryTranslator 164 | 165 | def __init__(self, *args, **kwargs): 166 | html = None 167 | elements = [] 168 | self._base_url = None 169 | self.parser = kwargs.pop('parser', None) 170 | 171 | if 'parent' in kwargs: 172 | self._parent = kwargs.pop('parent') 173 | else: 174 | self._parent = no_default 175 | 176 | if 'css_translator' in kwargs: 177 | self._translator = kwargs.pop('css_translator') 178 | elif self.parser in ('xml',): 179 | self._translator = self._translator_class(xhtml=True) 180 | elif self._parent is not no_default: 181 | self._translator = self._parent._translator 182 | else: 183 | self._translator = self._translator_class(xhtml=False) 184 | 185 | self.namespaces = kwargs.pop('namespaces', None) 186 | 187 | if kwargs: 188 | # specific case to get the dom 189 | if 'filename' in kwargs: 190 | html = open(kwargs['filename'], 191 | encoding=kwargs.get('encoding')) 192 | elif 'url' in kwargs: 193 | url = kwargs.pop('url') 194 | if 'opener' in kwargs: 195 | opener = kwargs.pop('opener') 196 | html = opener(url, **kwargs) 197 | else: 198 | html = url_opener(url, kwargs) 199 | if not self.parser: 200 | self.parser = 'html' 201 | self._base_url = url 202 | else: 203 | raise ValueError('Invalid keyword arguments %s' % kwargs) 204 | 205 | elements = fromstring(html, self.parser) 206 | # close open descriptor if possible 207 | if hasattr(html, 'close'): 208 | try: 209 | html.close() 210 | except Exception: 211 | pass 212 | 213 | else: 214 | # get nodes 215 | 216 | # determine context and selector if any 217 | selector = context = no_default 218 | length = len(args) 219 | if length == 1: 220 | context = args[0] 221 | elif length == 2: 222 | selector, context = args 223 | else: 224 | raise ValueError( 225 | "You can't do that. 
Please, provide arguments") 226 | 227 | # get context 228 | if isinstance(context, basestring): 229 | try: 230 | elements = fromstring(context, self.parser) 231 | except Exception: 232 | raise 233 | elif isinstance(context, self.__class__): 234 | # copy 235 | elements = context[:] 236 | elif isinstance(context, list): 237 | elements = context 238 | elif isinstance(context, etree._Element): 239 | elements = [context] 240 | else: 241 | raise TypeError(context) 242 | 243 | # select nodes 244 | if elements and selector is not no_default: 245 | xpath = self._css_to_xpath(selector) 246 | results = [] 247 | for tag in elements: 248 | results.extend( 249 | tag.xpath(xpath, namespaces=self.namespaces)) 250 | elements = results 251 | 252 | list.__init__(self, elements) 253 | 254 | def _css_to_xpath(self, selector, prefix='descendant-or-self::'): 255 | selector = selector.replace('[@', '[') 256 | return self._translator.css_to_xpath(selector, prefix) 257 | 258 | def _copy(self, *args, **kwargs): 259 | kwargs.setdefault('namespaces', self.namespaces) 260 | return self.__class__(*args, **kwargs) 261 | 262 | def __call__(self, *args, **kwargs): 263 | """return a new PyQuery instance 264 | """ 265 | length = len(args) 266 | if length == 0: 267 | raise ValueError('You must provide at least a selector') 268 | if args[0] == '': 269 | return self._copy([]) 270 | if (len(args) == 1 and 271 | isinstance(args[0], str) and 272 | not args[0].startswith('<')): 273 | args += (self,) 274 | result = self._copy(*args, parent=self, **kwargs) 275 | return result 276 | 277 | # keep original list api prefixed with _ 278 | _append = list.append 279 | _extend = list.extend 280 | 281 | # improve pythonic api 282 | def __add__(self, other): 283 | assert isinstance(other, self.__class__) 284 | return self._copy(self[:] + other[:]) 285 | 286 | def extend(self, other): 287 | """Extend with another PyQuery object""" 288 | assert isinstance(other, self.__class__) 289 | self._extend(other[:]) 290 | return self 291 | 292 | def items(self, selector=None): 293 | """Iter over elements. Return PyQuery objects: 294 | 295 | >>> d = PyQuery('
292 |     def items(self, selector=None):
293 |         """Iter over elements. Return PyQuery objects:
294 | 
295 |         >>> d = PyQuery('<div><span>foo</span><span>bar</span></div>')
296 |         >>> [i.text() for i in d.items('span')]
297 |         ['foo', 'bar']
298 |         >>> [i.text() for i in d('span').items()]
299 |         ['foo', 'bar']
300 |         >>> list(d.items('a')) == list(d('a').items())
301 |         True
302 |         """
303 |         if selector:
304 |             elems = self(selector) or []
305 |         else:
306 |             elems = self
307 |         for elem in elems:
308 |             yield self._copy(elem, parent=self)
309 | 
310 |     def xhtml_to_html(self):
311 |         """Remove xhtml namespace:
312 | 
313 |         >>> doc = PyQuery(
314 |         ...     '<html xmlns="http://www.w3.org/1999/xhtml"></html>')
315 |         >>> doc
316 |         [<{http://www.w3.org/1999/xhtml}html>]
317 |         >>> doc.xhtml_to_html()
318 |         [<html>]
319 |         """
320 |         try:
321 |             root = self[0].getroottree()
322 |         except IndexError:
323 |             pass
324 |         else:
325 |             lxml.html.xhtml_to_html(root)
326 |         return self
327 | 
328 |     def remove_namespaces(self):
329 |         """Remove all namespaces:
330 | 
331 |         >>> doc = PyQuery('<foo xmlns="http://example.com/foo"></foo>')
332 |         >>> doc
333 |         [<{http://example.com/foo}foo>]
334 |         >>> doc.remove_namespaces()
335 |         [<foo>]
336 |         """
337 |         try:
338 |             root = self[0].getroottree()
339 |         except IndexError:
340 |             pass
341 |         else:
342 |             for el in root.iter('{*}*'):
343 |                 if el.tag.startswith('{'):
344 |                     el.tag = el.tag.split('}', 1)[1]
345 |         return self
346 | 
347 |     def __str__(self):
348 |         """xml representation of current nodes::
349 | 
350 |             >>> xml = PyQuery(
351 |             ...   '<script><![[CDATA[ ]></script>', parser='html_fragments')
352 |             >>> print(str(xml))
353 |             <script>&lt;![[CDATA[ ]&gt;</script>
354 | 
355 |         """
356 |         return ''.join([etree.tostring(e, encoding=str) for e in self])
357 | 
358 |     def __unicode__(self):
359 |         """xml representation of current nodes"""
360 |         return u''.join([etree.tostring(e, encoding=str)
361 |                          for e in self])
362 | 
363 |     def __html__(self):
364 |         """html representation of current nodes::
365 | 
366 |             >>> html = PyQuery(
367 |             ...   '<script><![[CDATA[ ]></script>', parser='html_fragments')
368 |             >>> print(html.__html__())
369 |             <script><![[CDATA[ ]></script>
370 | 
371 |         """
372 |         return u''.join([lxml.html.tostring(e, encoding=str)
373 |                          for e in self])
374 | 
375 |     def __repr__(self):
376 |         r = []
377 |         try:
378 |             for el in self:
379 |                 c = el.get('class')
380 |                 c = c and '.' + '.'.join(c.split(' ')) or ''
381 |                 id = el.get('id')
382 |                 id = id and '#' + id or ''
383 |                 r.append('<%s%s%s>' % (el.tag, id, c))
384 |             return '[' + (', '.join(r)) + ']'
385 |         except AttributeError:
386 |             return list.__repr__(self)
387 | 
388 |     @property
389 |     def root(self):
390 |         """return the xml root element
391 |         """
392 |         if self._parent is not no_default:
393 |             return self._parent[0].getroottree()
394 |         return self[0].getroottree()
395 | 
396 |     @property
397 |     def encoding(self):
398 |         """return the xml encoding of the root element
399 |         """
400 |         root = self.root
401 |         if root is not None:
402 |             return self.root.docinfo.encoding
403 | 
404 |     ##############
405 |     # Traversing #
406 |     ##############
407 | 
408 |     def _filter_only(self, selector, elements, reverse=False, unique=False):
409 |         """Filters the selection set only, as opposed to also including
410 |         descendants.
411 |         """
412 |         if selector is None:
413 |             results = elements
414 |         else:
415 |             xpath = self._css_to_xpath(selector, 'self::')
416 |             results = []
417 |             for tag in elements:
418 |                 results.extend(tag.xpath(xpath, namespaces=self.namespaces))
419 |         if reverse:
420 |             results.reverse()
421 |         if unique:
422 |             result_list = results
423 |             results = []
424 |             for item in result_list:
425 |                 if item not in results:
426 |                     results.append(item)
427 |         return self._copy(results, parent=self)
428 | 
429 |     def parent(self, selector=None):
430 |         return self._filter_only(
431 |             selector,
432 |             [e.getparent() for e in self if e.getparent() is not None],
433 |             unique=True)
434 | 
435 |     def prev(self, selector=None):
436 |         return self._filter_only(
437 |             selector,
438 |             [e.getprevious() for e in self if e.getprevious() is not None])
439 | 
440 |     def next(self, selector=None):
441 |         return self._filter_only(
442 |             selector,
443 |             [e.getnext() for e in self if e.getnext() is not None])
444 | 
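    # ``parent``, ``prev`` and ``next`` above all defer to ``_filter_only``,
    # so each accepts the same optional CSS selector as the other traversal
    # helpers. A minimal sketch (markup invented for illustration):
    #
    #     >>> d = PyQuery('<div><p class="a">x</p><p class="b">y</p></div>')
    #     >>> d('.b').prev()
    #     [<p.a>]
    #     >>> d('.b').parent('div')
    #     [<div>]
    #     >>> d('.b').next()
    #     []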

445 |     def _traverse(self, method):
446 |         for e in self:
447 |             current = getattr(e, method)()
448 |             while current is not None:
449 |                 yield current
450 |                 current = getattr(current, method)()
451 | 
452 |     def _traverse_parent_topdown(self):
453 |         for e in self:
454 |             this_list = []
455 |             current = e.getparent()
456 |             while current is not None:
457 |                 this_list.append(current)
458 |                 current = current.getparent()
459 |             this_list.reverse()
460 |             for j in this_list:
461 |                 yield j
462 | 
463 |     def _next_all(self):
464 |         return [e for e in self._traverse('getnext')]
465 | 
466 |     @with_camel_case_alias
467 |     def next_all(self, selector=None):
468 |         """
469 |         >>> h = '<span><p class="hello">Hi</p><p>Bye</p><img scr=""/></span>'
470 |         >>> d = PyQuery(h)
471 |         >>> d('p:last').next_all()
472 |         [<img>]
473 |         >>> d('p:last').nextAll()
474 |         [<img>]
475 |         """
476 |         return self._filter_only(selector, self._next_all())
477 | 
478 |     @with_camel_case_alias
479 |     def next_until(self, selector, filter_=None):
480 |         """
481 |         >>> h = '''
482 |         ... <h2>Greeting 1</h2>
483 |         ... <p>Hello!</p><p>World!</p>
484 |         ... <h2>Greeting 2</h2><p>Bye!</p>
485 |         ... '''
486 |         >>> d = PyQuery(h)
487 |         >>> d('h2:first').nextUntil('h2')
488 |         [<p>, <p>]
489 |         """
490 |         return self._filter_only(
491 |             filter_, [
492 |                 e
493 |                 for q in itertools.takewhile(
494 |                     lambda q: not q.is_(selector), self.next_all().items())
495 |                 for e in q
496 |             ]
497 |         )
498 | 
499 |     def _prev_all(self):
500 |         return [e for e in self._traverse('getprevious')]
501 | 
502 |     @with_camel_case_alias
503 |     def prev_all(self, selector=None):
504 |         """
505 |         >>> h = '<span><p class="hello">Hi</p><p>Bye</p><img scr=""/></span>'
506 |         >>> d = PyQuery(h)
507 |         >>> d('p:last').prev_all()
508 |         [<p.hello>]
509 |         >>> d('p:last').prevAll()
510 |         [<p.hello>]
511 |         """
512 |         return self._filter_only(selector, self._prev_all(), reverse=True)
513 | 
514 |     def siblings(self, selector=None):
515 |         """
516 |         >>> h = '<span><p class="hello">Hi</p><p>Bye</p><img scr=""/></span>'
517 |         >>> d = PyQuery(h)
518 |         >>> d('.hello').siblings()
519 |         [<p>, <img>]
520 |         >>> d('.hello').siblings('img')
521 |         [<img>]
522 | 
523 |         """
524 |         return self._filter_only(selector, self._prev_all() + self._next_all())
525 | 
526 |     def parents(self, selector=None):
527 |         """
528 |         >>> d = PyQuery('<span><p class="hello">Hi</p><p>Bye</p></span>')
529 |         >>> d('p').parents()
530 |         [<span>]
531 |         >>> d('.hello').parents('span')
532 |         [<span>]
533 |         >>> d('.hello').parents('p')
534 |         []
535 |         """
536 |         return self._filter_only(
537 |             selector,
538 |             [e for e in self._traverse_parent_topdown()],
539 |             unique=True
540 |         )
541 | 
542 |     def children(self, selector=None):
543 |         """Filter elements that are direct children of self using optional
544 |         selector:
545 | 
546 |         >>> d = PyQuery('<span><p class="hello">Hi</p><p>Bye</p></span>')
547 |         >>> d
548 |         [<span>]
549 |         >>> d.children()
550 |         [<p.hello>, <p>]
551 |         >>> d.children('.hello')
552 |         [<p.hello>]
553 |         """
554 |         elements = [child for tag in self for child in tag.getchildren()]
555 |         return self._filter_only(selector, elements)
556 | 
557 |     def closest(self, selector=None):
558 |         """
559 |         >>> d = PyQuery(
560 |         ...  '<div class="hello"><p>This is a '
561 |         ...  '<strong class="hello">test</strong></p></div>')
562 |         >>> d('strong').closest('div')
563 |         [<div.hello>]
564 |         >>> d('strong').closest('.hello')
565 |         [<strong.hello>]
566 |         >>> d('strong').closest('form')
567 |         []
568 |         """
569 |         result = []
570 |         for current in self:
571 |             while (current is not None and
572 |                     not self._copy(current).is_(selector)):
573 |                 current = current.getparent()
574 |             if current is not None:
575 |                 result.append(current)
576 |         return self._copy(result, parent=self)
577 | 
578 |     def contents(self):
579 |         """
580 |         Return contents (with text nodes):
581 | 
582 |         >>> d = PyQuery('hello <b>bold</b>')
583 |         >>> d.contents()  # doctest: +ELLIPSIS
584 |         ['hello ', <Element b at ...>]
585 |         """
586 |         results = []
587 |         for elem in self:
588 |             results.extend(elem.xpath('child::text()|child::*',
589 |                                       namespaces=self.namespaces))
590 |         return self._copy(results, parent=self)
591 | 
592 |     def filter(self, selector):
593 |         """Filter elements in self using selector (string or function):
594 | 
595 |         >>> d = PyQuery('<p class="hello">Hi</p><p>Bye</p>')
596 |         >>> d('p')
597 |         [<p.hello>, <p>]
598 |         >>> d('p').filter('.hello')
599 |         [<p.hello>]
600 |         >>> d('p').filter(lambda i: i == 1)
601 |         [<p>]
602 |         >>> d('p').filter(lambda i: PyQuery(this).text() == 'Hi')
603 |         [<p.hello>]
604 |         >>> d('p').filter(lambda i, this: PyQuery(this).text() == 'Hi')
605 |         [<p.hello>]
606 |         """
607 |         if not hasattr(selector, '__call__'):
608 |             return self._filter_only(selector, self)
609 |         else:
610 |             elements = []
611 |             args = getargspec(callback)
612 |             try:
613 |                 for i, this in enumerate(self):
614 |                     if len(args) == 1:
615 |                         selector.__globals__['this'] = this
616 |                     if callback(selector, i, this):
617 |                         elements.append(this)
618 |             finally:
619 |                 f_globals = selector.__globals__
620 |                 if 'this' in f_globals:
621 |                     del f_globals['this']
622 |             return self._copy(elements, parent=self)
623 | 
624 |     def not_(self, selector):
625 |         """Return elements that don't match the given selector:
626 | 
627 |         >>> d = PyQuery('<p class="hello">Hi</p><p>Bye</p><div></div>')
628 |         >>> d('p').not_('.hello')
629 |         [<p>]
630 |         """
631 |         exclude = set(self._copy(selector, self))
632 |         return self._copy([e for e in self if e not in exclude],
633 |                           parent=self)
634 | 
635 |     def is_(self, selector):
636 |         """Returns True if selector matches at least one current element, else
637 |         False:
638 | 
639 |         >>> d = PyQuery('<p class="hello"><span>Hi</span></p><p>Bye</p>')
640 |         >>> d('p').eq(0).is_('.hello')
641 |         True
642 | 
643 |         >>> d('p').eq(0).is_('span')
644 |         False
645 | 
646 |         >>> d('p').eq(1).is_('.hello')
647 |         False
648 | 
649 |         ..
650 |         """
651 |         return bool(self._filter_only(selector, self))
652 | 
653 |     def find(self, selector):
654 |         """Find elements using selector traversing down from self:
655 | 
656 |         >>> m = '<p><span><em>Whoah!</em></span></p><p><em> there</em></p>'
657 |         >>> d = PyQuery(m)
658 |         >>> d('p').find('em')
659 |         [<em>, <em>]
660 |         >>> d('p').eq(1).find('em')
661 |         [<em>]
662 |         """
663 |         xpath = self._css_to_xpath(selector)
664 |         results = [child.xpath(xpath, namespaces=self.namespaces)
665 |                    for tag in self
666 |                    for child in tag.getchildren()]
667 |         # Flatten the results
668 |         elements = []
669 |         for r in results:
670 |             elements.extend(r)
671 |         return self._copy(elements, parent=self)
672 | 
673 |     def eq(self, index):
674 |         """Return PyQuery of only the element with the provided index::
675 | 
676 |             >>> d = PyQuery('<p class="hello">Hi</p><p>Bye</p><div></div>')
677 |             >>> d('p').eq(0)
678 |             [<p.hello>]
679 |             >>> d('p').eq(1)
680 |             [<p>]
681 |             >>> d('p').eq(2)
682 |             []
683 | 
684 |         ..
685 |         """
686 |         # Slicing will return empty list when index=-1
687 |         # we should handle out of bound by ourselves
688 |         try:
689 |             items = self[index]
690 |         except IndexError:
691 |             items = []
692 |         return self._copy(items, parent=self)
693 | 
694 |     def each(self, func):
695 |         """apply func on each nodes
696 |         """
697 |         try:
698 |             for i, element in enumerate(self):
699 |                 func.__globals__['this'] = element
700 |                 if callback(func, i, element) is False:
701 |                     break
702 |         finally:
703 |             f_globals = func.__globals__
704 |             if 'this' in f_globals:
705 |                 del f_globals['this']
706 |         return self
707 | 
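    # ``each`` walks the selection, exposing the node both as the second
    # callback argument and as the global ``this``; returning False stops the
    # iteration early, as in jQuery. A small sketch (markup invented):
    #
    #     >>> seen = []
    #     >>> d = PyQuery('<p>a</p><p>b</p>')
    #     >>> d('p').each(lambda i, e: seen.append((i, e.tag)))
    #     [<p>, <p>]
    #     >>> seen
    #     [(0, 'p'), (1, 'p')]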

708 |     def map(self, func):
709 |         """Returns a new PyQuery after transforming current items with func.
710 | 
711 |         func should take two arguments - 'index' and 'element'. Elements can
712 |         also be referred to as 'this' inside of func::
713 | 
714 |             >>> d = PyQuery('<p class="hello">Hi there</p><p>Bye</p><br />')
715 |             >>> d('p').map(lambda i, e: PyQuery(e).text())
716 |             ['Hi there', 'Bye']
717 | 
718 |             >>> d('p').map(lambda i, e: len(PyQuery(this).text()))
719 |             [8, 3]
720 | 
721 |             >>> d('p').map(lambda i, e: PyQuery(this).text().split())
722 |             ['Hi', 'there', 'Bye']
723 | 
724 |         """
725 |         items = []
726 |         try:
727 |             for i, element in enumerate(self):
728 |                 func.__globals__['this'] = element
729 |                 result = callback(func, i, element)
730 |                 if result is not None:
731 |                     if not isinstance(result, list):
732 |                         items.append(result)
733 |                     else:
734 |                         items.extend(result)
735 |         finally:
736 |             f_globals = func.__globals__
737 |             if 'this' in f_globals:
738 |                 del f_globals['this']
739 |         return self._copy(items, parent=self)
740 | 
741 |     @property
742 |     def length(self):
743 |         return len(self)
744 | 
745 |     def size(self):
746 |         return len(self)
747 | 
748 |     def end(self):
749 |         """Break out of a level of traversal and return to the parent level.
750 | 
751 |         >>> m = '<p><span><em>Whoah!</em></span></p><p><em> there</em></p>'
752 |         >>> d = PyQuery(m)
753 |         >>> d('p').eq(1).find('em').end().end()
754 |         [<p>, <p>]
755 |         """
756 |         return self._parent
757 | 
758 |     ##############
759 |     # Attributes #
760 |     ##############
761 |     def attr(self, *args, **kwargs):
762 |         """Attributes manipulation
763 |         """
764 | 
765 |         mapping = {'class_': 'class', 'for_': 'for'}
766 | 
767 |         attr = value = no_default
768 |         length = len(args)
769 |         if length == 1:
770 |             attr = args[0]
771 |             attr = mapping.get(attr, attr)
772 |         elif length == 2:
773 |             attr, value = args
774 |             attr = mapping.get(attr, attr)
775 |         elif kwargs:
776 |             attr = {}
777 |             for k, v in kwargs.items():
778 |                 attr[mapping.get(k, k)] = v
779 |         else:
780 |             raise ValueError('Invalid arguments %s %s' % (args, kwargs))
781 | 
782 |         if not self:
783 |             return None
784 |         elif isinstance(attr, dict):
785 |             for tag in self:
786 |                 for key, value in attr.items():
787 |                     tag.set(key, value)
788 |         elif value is no_default:
789 |             return self[0].get(attr)
790 |         elif value is None:
791 |             return self.remove_attr(attr)
792 |         else:
793 |             for tag in self:
794 |                 tag.set(attr, value)
795 |         return self
796 | 
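    # ``attr`` reads with one argument, writes with two or with keyword
    # arguments (``class_``/``for_`` map to ``class``/``for``), and a ``None``
    # value removes the attribute. A small sketch (markup invented):
    #
    #     >>> d = PyQuery('<a href="#"></a>')
    #     >>> d.attr('href')
    #     '#'
    #     >>> d.attr('title', 'home').attr('title')
    #     'home'
    #     >>> d.attr(class_='nav').attr('class')
    #     'nav'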

797 |     @with_camel_case_alias
798 |     def remove_attr(self, name):
799 |         """Remove an attribute::
800 | 
801 |             >>> d = PyQuery('<div id="myid"></div>')
802 |             >>> d.remove_attr('id')
803 |             [<div>]
804 |             >>> d.removeAttr('id')
805 |             [<div>]
806 | 
807 |         ..
808 |         """
809 |         for tag in self:
810 |             try:
811 |                 del tag.attrib[name]
812 |             except KeyError:
813 |                 pass
814 |         return self
815 | 
816 |     attr = FlexibleElement(pget=attr, pdel=remove_attr)
817 | 
818 |     #######
819 |     # CSS #
820 |     #######
821 |     def height(self, value=no_default):
822 |         """set/get height of element
823 |         """
824 |         return self.attr('height', value)
825 | 
826 |     def width(self, value=no_default):
827 |         """set/get width of element
828 |         """
829 |         return self.attr('width', value)
830 | 
831 |     @with_camel_case_alias
832 |     def has_class(self, name):
833 |         """Return True if element has class::
834 | 
835 |             >>> d = PyQuery('<div class="myclass"></div>')
836 |             >>> d.has_class('myclass')
837 |             True
838 |             >>> d.hasClass('myclass')
839 |             True
840 | 
841 |         ..
842 |         """
843 |         return self.is_('.%s' % name)
844 | 
845 |     @with_camel_case_alias
846 |     def add_class(self, value):
847 |         """Add a css class to elements::
848 | 
849 |             >>> d = PyQuery('<div></div>')
850 |             >>> d.add_class('myclass')
851 |             [<div.myclass>]
852 |             >>> d.addClass('myclass')
853 |             [<div.myclass>]
854 | 
855 |         ..
856 |         """
857 |         for tag in self:
858 |             values = value.split(' ')
859 |             classes = (tag.get('class') or '').split()
860 |             classes += [v for v in values if v not in classes]
861 |             tag.set('class', ' '.join(classes))
862 |         return self
863 | 
864 |     @with_camel_case_alias
865 |     def remove_class(self, value):
866 |         """Remove a css class to elements::
867 | 
868 |             >>> d = PyQuery('<div class="myclass"></div>')
869 |             >>> d.remove_class('myclass')
870 |             [<div>]
871 |             >>> d.removeClass('myclass')
872 |             [<div>]
873 | 
874 |         ..
875 |         """
876 |         for tag in self:
877 |             values = value.split(' ')
878 |             classes = set((tag.get('class') or '').split())
879 |             classes.difference_update(values)
880 |             classes.difference_update([''])
881 |             classes = ' '.join(classes)
882 |             if classes.strip():
883 |                 tag.set('class', classes)
884 |             elif tag.get('class'):
885 |                 tag.set('class', classes)
886 |         return self
887 | 
888 |     @with_camel_case_alias
889 |     def toggle_class(self, value):
890 |         """Toggle a css class to elements
891 | 
892 |             >>> d = PyQuery('<div></div>')
893 |             >>> d.toggle_class('myclass')
894 |             [<div.myclass>]
895 |             >>> d.toggleClass('myclass')
896 |             [<div>]
897 | 
898 |         """
899 |         for tag in self:
900 |             values = value.split(' ')
901 |             classes = (tag.get('class') or '').split()
902 |             values_to_add = [v for v in values if v not in classes]
903 |             values_to_del = [v for v in values if v in classes]
904 |             classes = [v for v in classes if v not in values_to_del]
905 |             classes += values_to_add
906 |             tag.set('class', ' '.join(classes))
907 |         return self
908 | 
909 |     def css(self, *args, **kwargs):
910 |         """css attributes manipulation
911 |         """
912 | 
913 |         attr = value = no_default
914 |         length = len(args)
915 |         if length == 1:
916 |             attr = args[0]
917 |         elif length == 2:
918 |             attr, value = args
919 |         elif kwargs:
920 |             attr = kwargs
921 |         else:
922 |             raise ValueError('Invalid arguments %s %s' % (args, kwargs))
923 | 
924 |         if isinstance(attr, dict):
925 |             for tag in self:
926 |                 stripped_keys = [key.strip().replace('_', '-')
927 |                                  for key in attr.keys()]
928 |                 current = [el.strip()
929 |                            for el in (tag.get('style') or '').split(';')
930 |                            if el.strip()
931 |                            and el.split(':')[0].strip() not in stripped_keys]
932 |                 for key, value in attr.items():
933 |                     key = key.replace('_', '-')
934 |                     current.append('%s: %s' % (key, value))
935 |                 tag.set('style', '; '.join(current))
936 |         elif isinstance(value, basestring):
937 |             attr = attr.replace('_', '-')
938 |             for tag in self:
939 |                 current = [
940 |                     el.strip()
941 |                     for el in (tag.get('style') or '').split(';')
942 |                     if (el.strip() and
943 |                         not el.split(':')[0].strip() == attr.strip())]
944 |                 current.append('%s: %s' % (attr, value))
945 |                 tag.set('style', '; '.join(current))
946 |         return self
947 | 
948 |     css = FlexibleElement(pget=css, pset=css)
949 | 
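    # ``css`` merges declarations into the ``style`` attribute, replacing an
    # existing declaration for the same property and normalising ``foo_bar``
    # keys to ``foo-bar``. A small sketch (markup invented for illustration):
    #
    #     >>> d = PyQuery('<div style="color: red"></div>')
    #     >>> d.css('font_weight', 'bold').attr('style')
    #     'color: red; font-weight: bold'
    #     >>> d.css(color='blue').attr('style')
    #     'font-weight: bold; color: blue'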
950 |     ###################
951 |     # CORE UI EFFECTS #
952 |     ###################
953 |     def hide(self):
954 |         """Add display:none to elements style:
955 | 
956 |         >>> print(PyQuery('<div style="display:none;"/>').hide())
957 |         <div style="display: none"/>
958 | 
959 |         """
960 |         return self.css('display', 'none')
961 | 
962 |     def show(self):
963 |         """Add display:block to elements style:
964 | 
965 |         >>> print(PyQuery('<div style="display:none;"/>').show())
966 |         <div style="display: block"/>
967 | 
968 |         """
969 |         return self.css('display', 'block')
970 | 
971 |     ########
972 |     # HTML #
973 |     ########
974 |     def val(self, value=no_default):
975 |         """Set the attribute value::
976 | 
977 |             >>> d = PyQuery('<input />')
978 |             >>> d.val('Youhou')
979 |             [<input>]
980 | 
981 |         Get the attribute value::
982 | 
983 |             >>> d.val()
984 |             'Youhou'
985 | 
986 |         Set the selected values for a `select` element with the `multiple`
987 |         attribute::
988 | 
989 |             >>> d = PyQuery('''
990 |             ... <select multiple>
991 |             ...     <option value="you"><option value="hou">
992 |             ... </select>
993 |             ... ''')
994 |             >>> d.val(['you', 'hou'])
995 |             [