├── README ├── docs ├── changes.txt ├── api.txt ├── future.txt ├── testing.txt ├── attributes.txt ├── traversing.txt ├── index.txt ├── css.txt ├── tips.txt ├── ajax.txt ├── manipulating.txt ├── Makefile └── conf.py ├── MANIFEST.in ├── .gitignore ├── pyquery ├── test.html ├── __init__.py ├── tests.txt ├── rules.py ├── ajax.py ├── cssselectpatch.py ├── test.py └── pyquery.py ├── setup.cfg ├── CHANGES.txt ├── TODO.txt ├── LICENSE.txt ├── setup.py ├── README.md ├── README.rst ├── bootstrap.py └── bootstrap-py3k.py /README: -------------------------------------------------------------------------------- 1 | README.md -------------------------------------------------------------------------------- /docs/changes.txt: -------------------------------------------------------------------------------- 1 | News 2 | ===== 3 | 4 | .. include:: ../CHANGES.txt 5 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | recursive-include pyquery *.txt 2 | include README.txt 3 | include CHANGES.txt 4 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | develop-eggs/ 2 | bin/ 3 | dist/ 4 | build/ 5 | parts/ 6 | docs/_build/ 7 | .installed.cfg 8 | *.egg-info 9 | *.pyc 10 | *.swp 11 | *~ 12 | -------------------------------------------------------------------------------- /pyquery/test.html: -------------------------------------------------------------------------------- 1 | 2 | 3 |

Hello world !

4 | 5 |

6 | hello python ! 7 |

8 | 9 | 10 | -------------------------------------------------------------------------------- /docs/api.txt: -------------------------------------------------------------------------------- 1 | :mod:`~pyquery.pyquery` -- PyQuery complete API 2 | ================================================ 3 | 4 | .. automodule:: pyquery.pyquery 5 | 6 | .. autoclass:: PyQuery 7 | :members: 8 | 9 | 10 | -------------------------------------------------------------------------------- /pyquery/__init__.py: -------------------------------------------------------------------------------- 1 | #-*- coding:utf-8 -*- 2 | # 3 | # Copyright (C) 2008 - Olivier Lauzanne 4 | # 5 | # Distributed under the BSD license, see LICENSE.txt 6 | 7 | import sys 8 | 9 | try: 10 | import webob 11 | except ImportError: 12 | from .pyquery import PyQuery 13 | else: 14 | from .ajax import PyQuery 15 | 16 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [nosetests] 2 | with-doctest=true 3 | verbosity=3 4 | 5 | [aliases] 6 | sphinx = build_sphinx 7 | release = sdist --formats=zip,gztar register upload build_sphinx upload_sphinx 8 | 9 | [build_sphinx] 10 | source-dir = docs/ 11 | build-dir = docs/_build 12 | all_files = 1 13 | 14 | [upload_sphinx] 15 | upload-dir = docs/_build/html 16 | -------------------------------------------------------------------------------- /docs/future.txt: -------------------------------------------------------------------------------- 1 | Future 2 | ------- 3 | 4 | - SELECTORS: done 5 | 6 | - ATTRIBUTES: done 7 | 8 | - CSS: done 9 | 10 | - HTML: done 11 | 12 | - MANIPULATING: missing the wrapInner method 13 | 14 | - TRAVERSING: about half done 15 | 16 | - EVENTS: nothing to do with server side might be used later for automatic ajax 17 | 18 | - CORE UI EFFECTS: did hide and show the rest doesn't really makes sense on 19 | server side 20 | 21 | - AJAX: some with wsgi app 22 | 23 | -------------------------------------------------------------------------------- /docs/testing.txt: -------------------------------------------------------------------------------- 1 | Testing 2 | ------- 3 | 4 | If you want to run the tests that you can see above you should do:: 5 | 6 | $ hg clone https://bitbucket.org/olauzanne/pyquery/ 7 | $ cd pyquery 8 | $ python bootstrap.py 9 | $ bin/buildout 10 | $ bin/test 11 | 12 | You can build the Sphinx documentation by doing:: 13 | 14 | $ cd docs 15 | $ make html 16 | 17 | If you don't already have lxml installed use this line:: 18 | 19 | $ STATIC_DEPS=true bin/buildout 20 | 21 | 22 | -------------------------------------------------------------------------------- /CHANGES.txt: -------------------------------------------------------------------------------- 1 | 1.1 2 | --- 3 | 4 | 5 | 6 | 1.0 7 | --- 8 | fix issues 24 9 | 10 | 0.7 11 | --- 12 | 13 | Python 3 compatible 14 | 15 | Add __unicode__ method 16 | 17 | Add root and encoding attribute 18 | 19 | fix issues 19, 20, 22, 23 20 | 21 | 0.6.1 22 | ------ 23 | 24 | Move README.txt at package root 25 | 26 | Add CHANGES.txt and add it to long_description 27 | 28 | 0.6 29 | ---- 30 | 31 | Added PyQuery.outerHtml 32 | 33 | Added PyQuery.fn 34 | 35 | Added PyQuery.map 36 | 37 | Change PyQuery.each behavior to reflect jQuery api 38 | 39 | 40 | -------------------------------------------------------------------------------- /TODO.txt: 
-------------------------------------------------------------------------------- 1 | # todo.txt 2 | 3 | * Don't choke on doctypes 4 | * Iteration yields PyQuery objects, not lxml nodes 5 | * Uniformly support selectors (CSS3 + jQuery extensions) 6 | * Support the jQuery API more uniformly: 7 | * Pass a modified version of the Sizzle and/or jQuery test suite 8 | * Remove pyquery.ajax? It's rather outside the core purpose of this library. I'd rather have something focused on fluent DOM parsing than a Swiss Army knife. 9 | 10 | 11 | 12 | ## Welp. Also. 13 | 14 | https://bitbucket.org/olauzanne/pyquery/issues?status=new&status=open 15 | -------------------------------------------------------------------------------- /docs/attributes.txt: -------------------------------------------------------------------------------- 1 | Attributes 2 | ---------- 3 | 4 | 5 | You can play with the attributes using the jQuery API:: 6 | 7 | >>> p = pq('

')('p') 8 | >>> p.attr("id") 9 | 'hello' 10 | >>> p.attr("id", "plop") 11 | [] 12 | >>> p.attr("id", "hello") 13 | [] 14 | 15 | 16 | Or in a more pythonic way:: 17 | 18 | >>> p.attr.id = "plop" 19 | >>> p.attr.id 20 | 'plop' 21 | >>> p.attr["id"] = "ola" 22 | >>> p.attr["id"] 23 | 'ola' 24 | >>> p.attr(id='hello', class_='hello2') 25 | [] 26 | >>> p.attr.class_ 27 | 'hello2' 28 | >>> p.attr.class_ = 'hello' 29 | 30 | 31 | -------------------------------------------------------------------------------- /docs/traversing.txt: -------------------------------------------------------------------------------- 1 | Traversing 2 | ---------- 3 | 4 | Some jQuery traversal methods are supported. Here are a few examples. 5 | 6 | You can filter the selection list using a string selector:: 7 | 8 | >>> d = pq('

') 9 | >>> d('p').filter('.hello') 10 | [] 11 | 12 | It is possible to select a single element with eq:: 13 | 14 | >>> d('p').eq(0) 15 | [] 16 | 17 | You can find nested elements:: 18 | 19 | >>> d('p').find('a') 20 | [, ] 21 | >>> d('p').eq(1).find('a') 22 | [] 23 | 24 | Breaking out of a level of traversal is also supported using end:: 25 | 26 | >>> d('p').find('a').end() 27 | [, ] 28 | >>> d('p').eq(0).end() 29 | [, ] 30 | >>> d('p').filter(lambda i: i == 1).end() 31 | [, ] 32 | 33 | 34 | -------------------------------------------------------------------------------- /pyquery/tests.txt: -------------------------------------------------------------------------------- 1 | 2 | Assume spaces normalization:: 3 | 4 | >>> pq('
').text() 5 | '' 6 | 7 | >>> print(pq('
  • toto
  • tata
').text()) 8 | toto tata 9 | 10 | Complex wrapping:: 11 | 12 | >>> d = pq('
youhou
') 13 | >>> s = d('span') 14 | >>> s is d 15 | False 16 | >>> s.wrap('
') 17 | [
] 18 | 19 | We get the original doc with new node:: 20 | 21 | >>> print(d) 22 |
youhou
23 | 24 | Complex wrapAll:: 25 | 26 | >>> doc = pq('
Heyyou !
') 27 | >>> s = doc('span') 28 | >>> s.wrapAll('
') 29 | [] 30 | 31 | >>> print(doc) 32 |
Heyyou !
33 | -------------------------------------------------------------------------------- /docs/index.txt: -------------------------------------------------------------------------------- 1 | .. include:: ../README.txt 2 | 3 | Full documentation 4 | ================== 5 | 6 | .. toctree:: 7 | :maxdepth: 1 8 | 9 | attributes 10 | css 11 | manipulating 12 | traversing 13 | api 14 | ajax 15 | tips 16 | testing 17 | future 18 | changes 19 | 20 | More documentation 21 | ================== 22 | 23 | First there is the Sphinx documentation `here`_. 24 | Then for more documentation about the API you can use the `jquery website`_. 25 | The reference I'm now using for the API is ... the `color cheat sheet`_. 26 | Then you can always look at the `code`_. 27 | 28 | .. _jquery website: http://docs.jquery.com/ 29 | .. _code: http://www.bitbucket.org/olauzanne/pyquery/src/tip/pyquery/pyquery.py 30 | .. _color cheat sheet: http://colorcharge.com/wp-content/uploads/2007/12/jquery12_colorcharge.png 31 | .. _here: http://packages.python.org/pyquery/ 32 | 33 | Indices and tables 34 | ================== 35 | 36 | * :ref:`genindex` 37 | * :ref:`modindex` 38 | * :ref:`search` 39 | 40 | 41 | 42 | 43 | 44 | -------------------------------------------------------------------------------- /docs/css.txt: -------------------------------------------------------------------------------- 1 | CSS 2 | --- 3 | 4 | .. Initialize tests 5 | 6 | >>> from pyquery import PyQuery 7 | >>> p = PyQuery('

')('p') 8 | 9 | You can play with css classes:: 10 | 11 | >>> p.addClass("toto") 12 | [] 13 | >>> p.toggleClass("titi toto") 14 | [] 15 | >>> p.removeClass("titi") 16 | [] 17 | 18 | Or the css style:: 19 | 20 | >>> p.css("font-size", "15px") 21 | [] 22 | >>> p.attr("style") 23 | 'font-size: 15px' 24 | >>> p.css({"font-size": "17px"}) 25 | [] 26 | >>> p.attr("style") 27 | 'font-size: 17px' 28 | 29 | Same thing the pythonic way ('_' characters are translated to '-'):: 30 | 31 | >>> p.css.font_size = "16px" 32 | >>> p.attr.style 33 | 'font-size: 16px' 34 | >>> p.css['font-size'] = "15px" 35 | >>> p.attr.style 36 | 'font-size: 15px' 37 | >>> p.css(font_size="16px") 38 | [] 39 | >>> p.attr.style 40 | 'font-size: 16px' 41 | >>> p.css = {"font-size": "17px"} 42 | >>> p.attr.style 43 | 'font-size: 17px' 44 | 45 | 46 | -------------------------------------------------------------------------------- /pyquery/rules.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | try: 3 | from deliverance.pyref import PyReference 4 | from deliverance import rules 5 | from ajax import PyQuery as pq 6 | except ImportError: 7 | pass 8 | else: 9 | class PyQuery(rules.AbstractAction): 10 | """Python function""" 11 | name = 'py' 12 | def __init__(self, source_location, pyref): 13 | self.source_location = source_location 14 | self.pyref = pyref 15 | 16 | def apply(self, content_doc, theme_doc, resource_fetcher, log): 17 | self.pyref(pq([content_doc]), pq([theme_doc]), resource_fetcher, log) 18 | 19 | @classmethod 20 | def from_xml(cls, el, source_location): 21 | """Parses and instantiates the class from an element""" 22 | pyref = PyReference.parse_xml( 23 | el, source_location=source_location, 24 | default_function='transform') 25 | return cls(source_location, pyref) 26 | 27 | rules._actions['pyquery'] = PyQuery 28 | 29 | def deliverance_proxy(): 30 | import deliverance.proxycommand 31 | deliverance.proxycommand.main() 32 | -------------------------------------------------------------------------------- /docs/tips.txt: -------------------------------------------------------------------------------- 1 | Tips 2 | ==== 3 | 4 | Making links absolute 5 | --------------------- 6 | 7 | You can make links absolute which can be usefull for screen scrapping:: 8 | 9 | >>> d = pq(url='http://www.w3.org/', parser='html') 10 | >>> d('a[accesskey="0"]').attr('href') 11 | '/Help/' 12 | >>> d.make_links_absolute() 13 | [] 14 | >>> d('a[accesskey="0"]').attr('href') 15 | 'http://www.w3.org/Help/' 16 | 17 | Using different parsers 18 | ----------------------- 19 | 20 | By default pyquery uses the lxml xml parser and then if it doesn't work goes on 21 | to try the html parser from lxml.html. The xml parser can sometimes be 22 | problematic when parsing xhtml pages because the parser will not raise an error 23 | but give an unusable tree (on w3c.org for example). 24 | 25 | You can also choose which parser to use explicitly:: 26 | 27 | >>> pq('

toto

', parser='xml') 28 | [] 29 | >>> pq('

toto

', parser='html') 30 | [] 31 | >>> pq('

toto

', parser='html_fragments') 32 | [

] 33 | 34 | The html and html_fragments parsers are the ones from lxml.html. 35 | 36 | 37 | -------------------------------------------------------------------------------- /docs/ajax.txt: -------------------------------------------------------------------------------- 1 | :mod:`pyquery.ajax` -- PyQuery AJAX extension 2 | ============================================= 3 | 4 | .. automodule:: pyquery.ajax 5 | 6 | 7 | .. fake imports 8 | 9 | >>> from ajax import PyQuery as pq 10 | 11 | You can query a wsgi app if `WebOb`_ is installed (it's not a pyquery 12 | dependency). In this example the test app returns a simple input at `/` and a 13 | submit button at `/submit`:: 14 | 15 | >>> d = pq('

', app=input_app) 16 | >>> d.append(d.get('/')) 17 | [
] 18 | >>> print d 19 |
20 | 21 | The app is also available in new nodes:: 22 | 23 | >>> d.get('/').app is d.app is d('form').app 24 | True 25 | 26 | You can also request another path:: 27 | 28 | >>> d.append(d.get('/submit')) 29 | [
] 30 | >>> print d 31 |
32 | 33 | If `Paste`_ is installed, you are able to get url directly with a `Proxy`_ app:: 34 | 35 | >>> a = d.get('http://pyquery.org/') 36 | >>> a 37 | [] 38 | 39 | You can retrieve the app response:: 40 | 41 | >>> print a.response.status 42 | 200 OK 43 | 44 | The response attribute is a `WebOb`_ `Response`_ 45 | 46 | .. _webob: http://pythonpaste.org/webob/ 47 | .. _response: http://pythonpaste.org/webob/#response 48 | .. _paste: http://pythonpaste.org/ 49 | .. _proxy: http://pythonpaste.org/modules/proxy.html#paste.proxy.Proxy 50 | 51 | Api 52 | --- 53 | 54 | .. autoclass:: PyQuery 55 | :members: 56 | 57 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | Copyright (C) 2008 - Olivier Lauzanne 2 | 3 | Redistribution and use in source and binary forms, with or without 4 | modification, are permitted provided that the following conditions are 5 | met: 6 | 7 | 1. Redistributions of source code must retain the above copyright 8 | notice, this list of conditions and the following disclaimer. 9 | 10 | 2. Redistributions in binary form must reproduce the above copyright 11 | notice, this list of conditions and the following disclaimer in 12 | the documentation and/or other materials provided with the 13 | distribution. 14 | 15 | 3. Neither the name of Infrae nor the names of its contributors may 16 | be used to endorse or promote products derived from this software 17 | without specific prior written permission. 18 | 19 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 20 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 21 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 22 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INFRAE OR 23 | CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 24 | EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 25 | PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 26 | PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 27 | LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 28 | NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 29 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
30 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #-*- coding:utf-8 -*- 2 | # 3 | # Copyright (C) 2008 - Olivier Lauzanne 4 | # Copyright (C) 2011 - David Schoonover 5 | # 6 | # Distributed under the BSD license, see LICENSE.txt 7 | 8 | from setuptools import setup, find_packages 9 | import sys, os 10 | 11 | def read(*names): 12 | values = dict() 13 | for name in names: 14 | filename = name+'.txt' 15 | if os.path.isfile(filename): 16 | value = open(name+'.txt').read() 17 | else: 18 | value = '' 19 | values[name] = value 20 | return values 21 | 22 | long_description=""" 23 | %(README)s 24 | 25 | See http://packages.python.org/pyquery/ for the full documentation 26 | 27 | News 28 | ==== 29 | 30 | %(CHANGES)s 31 | 32 | """ % read('README', 'CHANGES') 33 | 34 | version = '1.1' 35 | 36 | setup(name='pyquery', 37 | version=version, 38 | description='A jQuery-like library for Python', 39 | long_description=long_description, 40 | classifiers=[ 41 | "Intended Audience :: Developers", 42 | "Development Status :: 5 - Production/Stable", 43 | "Programming Language :: Python :: 2", 44 | "Programming Language :: Python :: 3", 45 | ], 46 | keywords='jquery html xml', 47 | maintainer='David Schoonover', 48 | maintainer_email='dsc@less.ly', 49 | author='Olivier Lauzanne', 50 | author_email='olauzanne@gmail.com', 51 | url='https://github.com/dsc/pyquery', 52 | license='BSD', 53 | packages=find_packages(exclude=['ez_setup', 'examples', 'tests']), 54 | include_package_data=True, 55 | zip_safe=False, 56 | install_requires=[ 57 | 'lxml>=2.1' 58 | ], 59 | test_requires=['nose'], 60 | test_suite='nose.collector', 61 | entry_points=""" 62 | # -*- Entry points: -*- 63 | """, 64 | ) 65 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # PyQuery: a jQuery-like library for Python 2 | 3 | PyQuery allows you to make [jQuery](http://jquery.com)-style CSS-selector queries on XML/HTML 4 | documents. The API is intended to match [jQuery's API](http://api.jquery.com) whenever possible, 5 | though it has been made more Pythonic where appropriate. 6 | 7 | This [project](https://github.com/dsc/pyquery) is a fork of the [original](http://www.bitbucket.org/olauzanne/pyquery) 8 | PyQuery developed by Olivier Lauzanne in 2008; it is maintained by [David Schoonover](mailto:dsc@less.ly). Feedback and bug 9 | reports are both very welcome over on [github](https://github.com/dsc/pyquery/issues). 10 | 11 | 12 | ## Quickstart 13 | 14 | You can use the PyQuery class to load an xml document from a string, a lxml 15 | document, from a file or from an url: 16 | 17 | >>> from pyquery import PyQuery as pq 18 | >>> from lxml import etree 19 | >>> import urllib 20 | >>> d = pq("") 21 | >>> d = pq(etree.fromstring("")) 22 | >>> d = pq(url='http://google.com/') 23 | >>> # d = pq(url='http://google.com/', opener=lambda url: urllib.urlopen(url).read()) 24 | >>> d = pq(filename=path_to_html_file) 25 | 26 | Now `d` is like the `$` object in jQuery: 27 | 28 | >>> d("#hello") 29 | [] 30 | >>> p = d("#hello") 31 | >>> print(p.html()) 32 | Hello world ! 33 | >>> p.html("you know
Python rocks") 34 | [] 35 | >>> print(p.html()) 36 | you know Python rocks 37 | >>> print(p.text()) 38 | you know Python rocks 39 | 40 | You can use some of the pseudo classes that are available in jQuery but that 41 | are not standard in css such as `:first`, `:last`, `:even`, `:odd`, `:eq`, 42 | `:lt`, `:gt`, `:checked`, `:selected`, and `:file`. 43 | 44 | >>> d('p:first') 45 | [] 46 | 47 | 48 | ## Notes 49 | 50 | * PyQuery uses lxml for fast XML and HTML manipulation. 51 | * This is not a library to produce or interact with JavaScript code. If 52 | that's what you need, check out 53 | -------------------------------------------------------------------------------- /docs/manipulating.txt: -------------------------------------------------------------------------------- 1 | Manipulating 2 | ------------ 3 | 4 | You can also add content to the end of tags:: 5 | 6 | >>> d = pq('

you know Python rocks

') 7 | >>> d('p').append(' check out reddit') 8 | [] 9 | >>> print d 10 |

you know Python rocks check out reddit

11 | 12 | Or to the beginning:: 13 | 14 | >>> p = d('p') 15 | >>> p.prepend('check out reddit') 16 | [] 17 | >>> p.html() 18 | u'check out reddityou know ...' 19 | 20 | Prepend or append an element into another:: 21 | 22 | >>> d = pq('') 23 | >>> p.prependTo(d('#test')) 24 | [] 25 | >>> d('#test').html() 26 | u'

>> p.insertAfter(d('#test')) 31 | [] 32 | >>> d('#test').html() 33 | u'python !' 34 | 35 | Or before:: 36 | 37 | >>> p.insertBefore(d('#test')) 38 | [] 39 | >>> d('body').html() 40 | u'

...' 41 | 42 | Doing something for each element:: 43 | 44 | >>> p.each(lambda e: e.addClass('hello2')) 45 | [] 46 | 47 | Remove an element:: 48 | 49 | >>> d = pq('

Yeah!

python rocks !

') 50 | >>> d.remove('p#id') 51 | [] 52 | >>> d('p#id') 53 | [] 54 | 55 | Remove what's inside the selection:: 56 | 57 | >>> d('p').empty() 58 | [

] 59 | 60 | And you can get back the modified html:: 61 | 62 | >>> print d 63 |

64 | 65 | You can also generate HTML from scratch:: 66 | 67 | >>> from pyquery import PyQuery as pq 68 | >>> print pq('

Yeah !
').addClass('myclass') + pq('cool') 69 |
Yeah !
cool 70 | 71 | 72 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | PyQuery: a jQuery-like library for Python 2 | ========================================= 3 | 4 | PyQuery allows you to make `jQuery`_-style CSS-selector queries on XML/HTML documents. 5 | The API is intended to match `jQuery's API`_ whenever possible, 6 | though it has been made more Pythonic where appropriate. 7 | 8 | This `project`_ is a fork of the `original`_ PyQuery developed by Olivier Lauzanne in 2008; 9 | it is maintained by `David Schoonover`_. Feedback and bug reports are 10 | both very welcome over on `github`_. 11 | 12 | .. _jQuery: http://jquery.com 13 | .. _jQuery's API: http://api.jquery.com 14 | .. _project: https://github.com/dsc/pyquery 15 | .. _original: http://www.bitbucket.org/olauzanne/pyquery 16 | .. _David Schoonover: mailto:dsc@less.ly 17 | .. _github: https://github.com/dsc/pyquery/issues 18 | 19 | 20 | Quickstart 21 | ========== 22 | 23 | You can use the PyQuery class to load an xml document from a string, a lxml 24 | document, from a file or from an url:: 25 | 26 | >>> from pyquery import PyQuery as pq 27 | >>> from lxml import etree 28 | >>> import urllib 29 | >>> d = pq("") 30 | >>> d = pq(etree.fromstring("")) 31 | >>> d = pq(url='http://google.com/') 32 | >>> # d = pq(url='http://google.com/', opener=lambda url: urllib.urlopen(url).read()) 33 | >>> d = pq(filename=path_to_html_file) 34 | 35 | Now d is like the $ in jQuery:: 36 | 37 | >>> d("#hello") 38 | [] 39 | >>> p = d("#hello") 40 | >>> print(p.html()) 41 | Hello world ! 42 | >>> p.html("you know Python rocks") 43 | [] 44 | >>> print(p.html()) 45 | you know Python rocks 46 | >>> print(p.text()) 47 | you know Python rocks 48 | 49 | You can use some of the pseudo classes that are available in jQuery but that 50 | are not standard in css such as :first :last :even :odd :eq :lt :gt :checked 51 | :selected :file:: 52 | 53 | >>> d('p:first') 54 | [] 55 | 56 | Notes 57 | ===== 58 | 59 | * PyQuery uses lxml for fast XML and HTML manipulation. 60 | * This is not a library to produce or interact with JavaScript code. If that's what you need, check out 61 | 62 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = ../bin/sphinx-build 7 | PAPER = 8 | 9 | # Internal variables. 10 | PAPEROPT_a4 = -D latex_paper_size=a4 11 | PAPEROPT_letter = -D latex_paper_size=letter 12 | ALLSPHINXOPTS = -d _build/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 
13 | 14 | .PHONY: help clean html web pickle htmlhelp latex changes linkcheck 15 | 16 | help: 17 | @echo "Please use \`make ' where is one of" 18 | @echo " html to make standalone HTML files" 19 | @echo " pickle to make pickle files" 20 | @echo " json to make JSON files" 21 | @echo " htmlhelp to make HTML files and a HTML help project" 22 | @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" 23 | @echo " changes to make an overview over all changed/added/deprecated items" 24 | @echo " linkcheck to check all external links for integrity" 25 | 26 | clean: 27 | -rm -rf _build/* 28 | 29 | html: 30 | mkdir -p _build/html _build/doctrees 31 | $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) _build/html 32 | @echo 33 | @echo "Build finished. The HTML pages are in _build/html." 34 | 35 | open: html 36 | open _build/html/index.html 37 | 38 | pickle: 39 | mkdir -p _build/pickle _build/doctrees 40 | $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) _build/pickle 41 | @echo 42 | @echo "Build finished; now you can process the pickle files." 43 | 44 | web: pickle 45 | 46 | json: 47 | mkdir -p _build/json _build/doctrees 48 | $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) _build/json 49 | @echo 50 | @echo "Build finished; now you can process the JSON files." 51 | 52 | htmlhelp: 53 | mkdir -p _build/htmlhelp _build/doctrees 54 | $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) _build/htmlhelp 55 | @echo 56 | @echo "Build finished; now you can run HTML Help Workshop with the" \ 57 | ".hhp project file in _build/htmlhelp." 58 | 59 | latex: 60 | mkdir -p _build/latex _build/doctrees 61 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) _build/latex 62 | @echo 63 | @echo "Build finished; the LaTeX files are in _build/latex." 64 | @echo "Run \`make all-pdf' or \`make all-ps' in that directory to" \ 65 | "run these through (pdf)latex." 66 | 67 | changes: 68 | mkdir -p _build/changes _build/doctrees 69 | $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) _build/changes 70 | @echo 71 | @echo "The overview file is in _build/changes." 72 | 73 | linkcheck: 74 | mkdir -p _build/linkcheck _build/doctrees 75 | $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) _build/linkcheck 76 | @echo 77 | @echo "Link check complete; look for any errors in the above output " \ 78 | "or in _build/linkcheck/output.txt." 
79 | -------------------------------------------------------------------------------- /pyquery/ajax.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import sys 3 | from .pyquery import PyQuery as Base 4 | from .pyquery import no_default 5 | 6 | if sys.version_info < (3,): 7 | from webob import Request, Response 8 | 9 | try: 10 | from paste.proxy import Proxy 11 | except ImportError: 12 | Proxy = no_default 13 | 14 | class PyQuery(Base): 15 | 16 | def __init__(self, *args, **kwargs): 17 | if 'response' in kwargs: 18 | self.response = kwargs.pop('response') 19 | else: 20 | self.response = Response() 21 | if 'app' in kwargs: 22 | self.app = kwargs.pop('app') 23 | if len(args) == 0: 24 | args = [[]] 25 | else: 26 | self.app = no_default 27 | Base.__init__(self, *args, **kwargs) 28 | if self._parent is not no_default: 29 | self.app = self._parent.app 30 | 31 | def _wsgi_get(self, path_info, **kwargs): 32 | if path_info.startswith('/'): 33 | if 'app' in kwargs: 34 | app = kwargs.pop('app') 35 | elif self.app is not no_default: 36 | app = self.app 37 | else: 38 | raise ValueError('There is no app available') 39 | else: 40 | if Proxy is not no_default: 41 | app = Proxy(path_info) 42 | path_info = '/' 43 | else: 44 | raise ImportError('Paste is not installed') 45 | 46 | if 'environ' in kwargs: 47 | environ = kwargs.pop('environ').copy() 48 | else: 49 | environ = {} 50 | if path_info: 51 | kwargs['PATH_INFO'] = path_info 52 | environ.update(kwargs) 53 | 54 | # unsuported (came from Deliverance) 55 | for key in ['HTTP_ACCEPT_ENCODING', 'HTTP_IF_MATCH', 'HTTP_IF_UNMODIFIED_SINCE', 56 | 'HTTP_RANGE', 'HTTP_IF_RANGE']: 57 | if key in environ: 58 | del environ[key] 59 | 60 | req = Request(environ) 61 | resp = req.get_response(app) 62 | status = resp.status.split() 63 | ctype = resp.content_type.split(';')[0] 64 | if status[0] not in '45' and ctype == 'text/html': 65 | body = resp.body 66 | else: 67 | body = [] 68 | result = self.__class__(body, 69 | parent=self._parent, 70 | app=self.app, # always return self.app 71 | response=resp) 72 | return result 73 | 74 | def get(self, path_info, **kwargs): 75 | """GET a path from wsgi app or url 76 | """ 77 | kwargs['REQUEST_METHOD'] = 'GET' 78 | return self._wsgi_get(path_info, **kwargs) 79 | 80 | def post(self, path_info, **kwargs): 81 | """POST a path from wsgi app or url 82 | """ 83 | kwargs['REQUEST_METHOD'] = 'POST' 84 | return self._wsgi_get(path_info, **kwargs) 85 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # pyquery documentation build configuration file, created by 4 | # sphinx-quickstart on Sat Dec 6 13:08:03 2008. 5 | # 6 | # This file is execfile()d with the current directory set to its containing dir. 7 | # 8 | # The contents of this file are pickled, so don't put values in the namespace 9 | # that aren't pickleable (module imports are okay, they're removed automatically). 10 | # 11 | # All configuration values have a default; values that are commented out 12 | # serve to show the default. 13 | 14 | import sys, os 15 | 16 | # If your extensions are in another directory, add it here. If the directory 17 | # is relative to the documentation root, use os.path.abspath to make it 18 | # absolute, like shown here. 
19 | #sys.path.append(os.path.abspath('.')) 20 | 21 | # General configuration 22 | # --------------------- 23 | 24 | # Add any Sphinx extension module names here, as strings. They can be extensions 25 | # coming with Sphinx (named 'sphinx.ext.*') or your custom ones. 26 | extensions = ['sphinx.ext.autodoc', 'sphinx.ext.doctest'] 27 | 28 | # Add any paths that contain templates here, relative to this directory. 29 | templates_path = ['_templates'] 30 | 31 | # The suffix of source filenames. 32 | source_suffix = '.txt' 33 | 34 | # The encoding of source files. 35 | #source_encoding = 'utf-8' 36 | 37 | # The master toctree document. 38 | master_doc = 'index' 39 | 40 | # General information about the project. 41 | project = u'pyquery' 42 | copyright = u'2008, Olivier Lauzanne' 43 | 44 | # The version info for the project you're documenting, acts as replacement for 45 | # |version| and |release|, also used in various other places throughout the 46 | # built documents. 47 | # 48 | # The short X.Y version. 49 | version = '0.5' 50 | # The full version, including alpha/beta/rc tags. 51 | release = version 52 | 53 | # The language for content autogenerated by Sphinx. Refer to documentation 54 | # for a list of supported languages. 55 | #language = None 56 | 57 | # There are two options for replacing |today|: either, you set today to some 58 | # non-false value, then it is used: 59 | #today = '' 60 | # Else, today_fmt is used as the format for a strftime call. 61 | #today_fmt = '%B %d, %Y' 62 | 63 | # List of documents that shouldn't be included in the build. 64 | #unused_docs = [] 65 | 66 | # List of directories, relative to source directory, that shouldn't be searched 67 | # for source files. 68 | exclude_trees = ['.build'] 69 | 70 | # The reST default role (used for this markup: `text`) to use for all documents. 71 | #default_role = None 72 | 73 | # If true, '()' will be appended to :func: etc. cross-reference text. 74 | #add_function_parentheses = True 75 | 76 | # If true, the current module name will be prepended to all description 77 | # unit titles (such as .. function::). 78 | #add_module_names = True 79 | 80 | # If true, sectionauthor and moduleauthor directives will be shown in the 81 | # output. They are ignored by default. 82 | #show_authors = False 83 | 84 | # The name of the Pygments (syntax highlighting) style to use. 85 | pygments_style = 'sphinx' 86 | 87 | 88 | # Options for HTML output 89 | # ----------------------- 90 | 91 | # The style sheet to use for HTML and HTML Help pages. A file of that name 92 | # must exist either in Sphinx' static/ path, or in one of the custom paths 93 | # given in html_static_path. 94 | html_style = 'default.css' 95 | 96 | # The name for this set of Sphinx documents. If None, it defaults to 97 | # " v documentation". 98 | #html_title = None 99 | 100 | # A shorter title for the navigation bar. Default is the same as html_title. 101 | #html_short_title = None 102 | 103 | # The name of an image file (relative to this directory) to place at the top 104 | # of the sidebar. 105 | #html_logo = None 106 | 107 | # The name of an image file (within the static path) to use as favicon of the 108 | # docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 109 | # pixels large. 110 | #html_favicon = None 111 | 112 | # Add any paths that contain custom static files (such as style sheets) here, 113 | # relative to this directory. 
They are copied after the builtin static files, 114 | # so a file named "default.css" will overwrite the builtin "default.css". 115 | html_static_path = ['_static'] 116 | 117 | # If not '', a 'Last updated on:' timestamp is inserted at every page bottom, 118 | # using the given strftime format. 119 | #html_last_updated_fmt = '%b %d, %Y' 120 | 121 | # If true, SmartyPants will be used to convert quotes and dashes to 122 | # typographically correct entities. 123 | #html_use_smartypants = True 124 | 125 | # Custom sidebar templates, maps document names to template names. 126 | #html_sidebars = {} 127 | 128 | # Additional templates that should be rendered to pages, maps page names to 129 | # template names. 130 | #html_additional_pages = {} 131 | 132 | # If false, no module index is generated. 133 | #html_use_modindex = True 134 | 135 | # If false, no index is generated. 136 | #html_use_index = True 137 | 138 | # If true, the index is split into individual pages for each letter. 139 | #html_split_index = False 140 | 141 | # If true, the reST sources are included in the HTML build as _sources/. 142 | #html_copy_source = True 143 | 144 | # If true, an OpenSearch description file will be output, and all pages will 145 | # contain a tag referring to it. The value of this option must be the 146 | # base URL from which the finished HTML is served. 147 | #html_use_opensearch = '' 148 | 149 | # If nonempty, this is the file name suffix for HTML files (e.g. ".xhtml"). 150 | #html_file_suffix = '' 151 | 152 | # Output file base name for HTML help builder. 153 | htmlhelp_basename = 'pyquerydoc' 154 | 155 | 156 | # Options for LaTeX output 157 | # ------------------------ 158 | 159 | # The paper size ('letter' or 'a4'). 160 | #latex_paper_size = 'letter' 161 | 162 | # The font size ('10pt', '11pt' or '12pt'). 163 | #latex_font_size = '10pt' 164 | 165 | # Grouping the document tree into LaTeX files. List of tuples 166 | # (source start file, target name, title, author, document class [howto/manual]). 167 | latex_documents = [ 168 | ('index', 'pyquery.tex', ur'pyquery Documentation', 169 | ur'Olivier Lauzanne', 'manual'), 170 | ] 171 | 172 | # The name of an image file (relative to this directory) to place at the top of 173 | # the title page. 174 | #latex_logo = None 175 | 176 | # For "manual" documents, if this is true, then toplevel headings are parts, 177 | # not chapters. 178 | #latex_use_parts = False 179 | 180 | # Additional stuff for the LaTeX preamble. 181 | #latex_preamble = '' 182 | 183 | # Documents to append as an appendix to all manuals. 184 | #latex_appendices = [] 185 | 186 | # If false, no module index is generated. 
187 | #latex_use_modindex = True 188 | 189 | # Custom stuff 190 | 191 | from os import path 192 | pkg_dir = path.abspath(__file__).split('/docs')[0] 193 | setup = path.join(pkg_dir, 'setup.py') 194 | if path.isfile(setup): 195 | for line_ in open(setup): 196 | if line_.startswith("version"): 197 | version = line_.split('=')[-1] 198 | version = version.strip() 199 | version = version.strip("'\"") 200 | release = version 201 | break 202 | del pkg_dir, setup, path 203 | 204 | -------------------------------------------------------------------------------- /pyquery/cssselectpatch.py: -------------------------------------------------------------------------------- 1 | #-*- coding:utf-8 -*- 2 | # 3 | # Copyright (C) 2008 - Olivier Lauzanne 4 | # 5 | # Distributed under the BSD license, see LICENSE.txt 6 | from lxml.cssselect import Pseudo, XPathExpr, XPathExprOr, Function, css_to_xpath, Element 7 | from lxml import cssselect 8 | 9 | class JQueryPseudo(Pseudo): 10 | """This class is used to implement the css pseudo classes 11 | (:first, :last, ...) that are not defined in the css standard, 12 | but are defined in the jquery API. 13 | """ 14 | def _xpath_first(self, xpath): 15 | """Matches the first selected element. 16 | """ 17 | xpath.add_post_condition('position() = 1') 18 | return xpath 19 | 20 | def _xpath_last(self, xpath): 21 | """Matches the last selected element. 22 | """ 23 | xpath.add_post_condition('position() = last()') 24 | return xpath 25 | 26 | def _xpath_even(self, xpath): 27 | """Matches even elements, zero-indexed. 28 | """ 29 | # the first element is 1 in xpath and 0 in python and js 30 | xpath.add_post_condition('position() mod 2 = 1') 31 | return xpath 32 | 33 | def _xpath_odd(self, xpath): 34 | """Matches odd elements, zero-indexed. 35 | """ 36 | xpath.add_post_condition('position() mod 2 = 0') 37 | return xpath 38 | 39 | def _xpath_checked(self, xpath): 40 | """Matches odd elements, zero-indexed. 41 | """ 42 | xpath.add_condition("@checked and name(.) = 'input'") 43 | return xpath 44 | 45 | def _xpath_selected(self, xpath): 46 | """Matches all elements that are selected. 47 | """ 48 | xpath.add_condition("@selected and name(.) = 'option'") 49 | return xpath 50 | 51 | def _xpath_disabled(self, xpath): 52 | """Matches all elements that are disabled. 53 | """ 54 | xpath.add_condition("@disabled") 55 | return xpath 56 | 57 | def _xpath_enabled(self, xpath): 58 | """Matches all elements that are enabled. 59 | """ 60 | xpath.add_condition("not(@disabled) and name(.) = 'input'") 61 | return xpath 62 | 63 | def _xpath_file(self, xpath): 64 | """Matches all input elements of type file. 65 | """ 66 | xpath.add_condition("@type = 'file' and name(.) = 'input'") 67 | return xpath 68 | 69 | def _xpath_input(self, xpath): 70 | """Matches all input elements. 71 | """ 72 | xpath.add_condition("(name(.) = 'input' or name(.) = 'select') " 73 | + "or (name(.) = 'textarea' or name(.) = 'button')") 74 | return xpath 75 | 76 | def _xpath_button(self, xpath): 77 | """Matches all button input elements and the button element. 78 | """ 79 | xpath.add_condition("(@type = 'button' and name(.) = 'input') " 80 | + "or name(.) = 'button'") 81 | return xpath 82 | 83 | def _xpath_radio(self, xpath): 84 | """Matches all radio input elements. 85 | """ 86 | xpath.add_condition("@type = 'radio' and name(.) = 'input'") 87 | return xpath 88 | 89 | def _xpath_text(self, xpath): 90 | """Matches all text input elements. 91 | """ 92 | xpath.add_condition("@type = 'text' and name(.) 
= 'input'") 93 | return xpath 94 | 95 | def _xpath_checkbox(self, xpath): 96 | """Matches all checkbox input elements. 97 | """ 98 | xpath.add_condition("@type = 'checkbox' and name(.) = 'input'") 99 | return xpath 100 | 101 | def _xpath_password(self, xpath): 102 | """Matches all password input elements. 103 | """ 104 | xpath.add_condition("@type = 'password' and name(.) = 'input'") 105 | return xpath 106 | 107 | def _xpath_submit(self, xpath): 108 | """Matches all submit input elements. 109 | """ 110 | xpath.add_condition("@type = 'submit' and name(.) = 'input'") 111 | return xpath 112 | 113 | def _xpath_image(self, xpath): 114 | """Matches all image input elements. 115 | """ 116 | xpath.add_condition("@type = 'image' and name(.) = 'input'") 117 | return xpath 118 | 119 | def _xpath_reset(self, xpath): 120 | """Matches all reset input elements. 121 | """ 122 | xpath.add_condition("@type = 'reset' and name(.) = 'input'") 123 | return xpath 124 | 125 | def _xpath_header(self, xpath): 126 | """Matches all header elelements (h1, ..., h6) 127 | """ 128 | # this seems kind of brute-force, is there a better way? 129 | xpath.add_condition("(name(.) = 'h1' or name(.) = 'h2' or name (.) = 'h3') " 130 | + "or (name(.) = 'h4' or name (.) = 'h5' or name(.) = 'h6')") 131 | return xpath 132 | 133 | def _xpath_parent(self, xpath): 134 | """Match all elements that contain other elements 135 | """ 136 | xpath.add_condition("count(child::*) > 0") 137 | return xpath 138 | 139 | def _xpath_empty(self, xpath): 140 | """Match all elements that do not contain other elements 141 | """ 142 | xpath.add_condition("count(child::*) = 0") 143 | return xpath 144 | 145 | cssselect.Pseudo = JQueryPseudo 146 | 147 | class JQueryFunction(Function): 148 | """Represents selector:name(expr) that are present in JQuery but not in the 149 | css standard. 150 | """ 151 | def _xpath_eq(self, xpath, expr): 152 | """Matches a single element by its index. 153 | """ 154 | xpath.add_post_condition('position() = %s' % int(expr+1)) 155 | return xpath 156 | 157 | def _xpath_gt(self, xpath, expr): 158 | """Matches all elements with an index over the given one. 159 | """ 160 | xpath.add_post_condition('position() > %s' % int(expr+1)) 161 | return xpath 162 | 163 | def _xpath_lt(self, xpath, expr): 164 | """Matches all elements with an index below the given one. 
165 | """ 166 | xpath.add_post_condition('position() < %s' % int(expr+1)) 167 | return xpath 168 | 169 | def _xpath_contains(self, xpath, expr): 170 | """Matches all elements that contain the given text 171 | """ 172 | xpath.add_post_condition("contains(text(), '%s')" % str(expr)) 173 | return xpath 174 | 175 | cssselect.Function = JQueryFunction 176 | 177 | class AdvancedXPathExpr(XPathExpr): 178 | def __init__(self, prefix=None, path=None, element='*', condition=None, 179 | post_condition=None, star_prefix=False): 180 | self.prefix = prefix 181 | self.path = path 182 | self.element = element 183 | self.condition = condition 184 | self.post_condition = post_condition 185 | self.star_prefix = star_prefix 186 | 187 | def add_post_condition(self, post_condition): 188 | if self.post_condition: 189 | self.post_condition = '%s and (%s)' % (self.post_condition, 190 | post_condition) 191 | else: 192 | self.post_condition = post_condition 193 | 194 | def __str__(self): 195 | path = XPathExpr.__str__(self) 196 | if self.post_condition: 197 | path = '(%s)[%s]' % (path, self.post_condition) 198 | return path 199 | 200 | def join(self, combiner, other): 201 | XPathExpr.join(self, combiner, other) 202 | self.post_condition = other.post_condition 203 | 204 | cssselect.XPathExpr = AdvancedXPathExpr 205 | 206 | class AdvancedXPathExprOr(XPathExprOr): 207 | def __init__(self, items, prefix=None): 208 | self.prefix = prefix = prefix or '' 209 | self.items = items 210 | self.prefix_prepended = False 211 | 212 | def __str__(self): 213 | if not self.prefix_prepended: 214 | # We cannot prepend the prefix at __init__ since it's legal to 215 | # modify it after construction. And because __str__ can be called 216 | # multiple times we have to take care not to prepend it twice. 217 | prefix = self.prefix or '' 218 | for item in self.items: 219 | item.prefix = prefix+(item.prefix or '') 220 | self.prefix_prepended = True 221 | return ' | '.join([str(i) for i in self.items]) 222 | 223 | cssselect.XPathExprOr = AdvancedXPathExprOr 224 | 225 | class JQueryElement(Element): 226 | """ 227 | Represents namespace|element 228 | """ 229 | 230 | def xpath(self): 231 | if self.namespace == '*': 232 | el = self.element 233 | else: 234 | # FIXME: Should we lowercase here? 235 | el = '%s:%s' % (self.namespace, self.element) 236 | return AdvancedXPathExpr(element=el) 237 | 238 | cssselect.Element = JQueryElement 239 | 240 | def selector_to_xpath(selector, prefix='descendant-or-self::'): 241 | """JQuery selector to xpath. 242 | """ 243 | selector = selector.replace('[@', '[') 244 | return css_to_xpath(selector, prefix) 245 | -------------------------------------------------------------------------------- /bootstrap.py: -------------------------------------------------------------------------------- 1 | ############################################################################## 2 | # 3 | # Copyright (c) 2006 Zope Foundation and Contributors. 4 | # All Rights Reserved. 5 | # 6 | # This software is subject to the provisions of the Zope Public License, 7 | # Version 2.1 (ZPL). A copy of the ZPL should accompany this distribution. 8 | # THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED 9 | # WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 10 | # WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS 11 | # FOR A PARTICULAR PURPOSE. 
12 | # 13 | ############################################################################## 14 | """Bootstrap a buildout-based project 15 | 16 | Simply run this script in a directory containing a buildout.cfg. 17 | The script accepts buildout command-line options, so you can 18 | use the -c option to specify an alternate configuration file. 19 | """ 20 | 21 | import os, shutil, sys, tempfile, textwrap, urllib, urllib2, subprocess 22 | from optparse import OptionParser 23 | 24 | if sys.platform == 'win32': 25 | def quote(c): 26 | if ' ' in c: 27 | return '"%s"' % c # work around spawn lamosity on windows 28 | else: 29 | return c 30 | else: 31 | quote = str 32 | 33 | # See zc.buildout.easy_install._has_broken_dash_S for motivation and comments. 34 | stdout, stderr = subprocess.Popen( 35 | [sys.executable, '-Sc', 36 | 'try:\n' 37 | ' import ConfigParser\n' 38 | 'except ImportError:\n' 39 | ' print 1\n' 40 | 'else:\n' 41 | ' print 0\n'], 42 | stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate() 43 | has_broken_dash_S = bool(int(stdout.strip())) 44 | 45 | # In order to be more robust in the face of system Pythons, we want to 46 | # run without site-packages loaded. This is somewhat tricky, in 47 | # particular because Python 2.6's distutils imports site, so starting 48 | # with the -S flag is not sufficient. However, we'll start with that: 49 | if not has_broken_dash_S and 'site' in sys.modules: 50 | # We will restart with python -S. 51 | args = sys.argv[:] 52 | args[0:0] = [sys.executable, '-S'] 53 | args = map(quote, args) 54 | os.execv(sys.executable, args) 55 | # Now we are running with -S. We'll get the clean sys.path, import site 56 | # because distutils will do it later, and then reset the path and clean 57 | # out any namespace packages from site-packages that might have been 58 | # loaded by .pth files. 59 | clean_path = sys.path[:] 60 | import site 61 | sys.path[:] = clean_path 62 | for k, v in sys.modules.items(): 63 | if k in ('setuptools', 'pkg_resources') or ( 64 | hasattr(v, '__path__') and 65 | len(v.__path__)==1 and 66 | not os.path.exists(os.path.join(v.__path__[0],'__init__.py'))): 67 | # This is a namespace package. Remove it. 68 | sys.modules.pop(k) 69 | 70 | is_jython = sys.platform.startswith('java') 71 | 72 | setuptools_source = 'http://peak.telecommunity.com/dist/ez_setup.py' 73 | distribute_source = 'http://python-distribute.org/distribute_setup.py' 74 | 75 | # parsing arguments 76 | def normalize_to_url(option, opt_str, value, parser): 77 | if value: 78 | if '://' not in value: # It doesn't smell like a URL. 79 | value = 'file://%s' % ( 80 | urllib.pathname2url( 81 | os.path.abspath(os.path.expanduser(value))),) 82 | if opt_str == '--download-base' and not value.endswith('/'): 83 | # Download base needs a trailing slash to make the world happy. 84 | value += '/' 85 | else: 86 | value = None 87 | name = opt_str[2:].replace('-', '_') 88 | setattr(parser.values, name, value) 89 | 90 | usage = '''\ 91 | [DESIRED PYTHON FOR BUILDOUT] bootstrap.py [options] 92 | 93 | Bootstraps a buildout-based project. 94 | 95 | Simply run this script in a directory containing a buildout.cfg, using the 96 | Python that you want bin/buildout to use. 97 | 98 | Note that by using --setup-source and --download-base to point to 99 | local resources, you can keep this script from going over the network. 
100 | ''' 101 | 102 | parser = OptionParser(usage=usage) 103 | parser.add_option("-v", "--version", dest="version", 104 | help="use a specific zc.buildout version") 105 | parser.add_option("-d", "--distribute", 106 | action="store_true", dest="use_distribute", default=False, 107 | help="Use Distribute rather than Setuptools.") 108 | parser.add_option("--setup-source", action="callback", dest="setup_source", 109 | callback=normalize_to_url, nargs=1, type="string", 110 | help=("Specify a URL or file location for the setup file. " 111 | "If you use Setuptools, this will default to " + 112 | setuptools_source + "; if you use Distribute, this " 113 | "will default to " + distribute_source +".")) 114 | parser.add_option("--download-base", action="callback", dest="download_base", 115 | callback=normalize_to_url, nargs=1, type="string", 116 | help=("Specify a URL or directory for downloading " 117 | "zc.buildout and either Setuptools or Distribute. " 118 | "Defaults to PyPI.")) 119 | parser.add_option("--eggs", 120 | help=("Specify a directory for storing eggs. Defaults to " 121 | "a temporary directory that is deleted when the " 122 | "bootstrap script completes.")) 123 | parser.add_option("-t", "--accept-buildout-test-releases", 124 | dest='accept_buildout_test_releases', 125 | action="store_true", default=False, 126 | help=("Normally, if you do not specify a --version, the " 127 | "bootstrap script and buildout gets the newest " 128 | "*final* versions of zc.buildout and its recipes and " 129 | "extensions for you. If you use this flag, " 130 | "bootstrap and buildout will get the newest releases " 131 | "even if they are alphas or betas.")) 132 | parser.add_option("-c", None, action="store", dest="config_file", 133 | help=("Specify the path to the buildout configuration " 134 | "file to be used.")) 135 | 136 | options, args = parser.parse_args() 137 | 138 | # if -c was provided, we push it back into args for buildout's main function 139 | if options.config_file is not None: 140 | args += ['-c', options.config_file] 141 | 142 | if options.eggs: 143 | eggs_dir = os.path.abspath(os.path.expanduser(options.eggs)) 144 | else: 145 | eggs_dir = tempfile.mkdtemp() 146 | 147 | if options.setup_source is None: 148 | if options.use_distribute: 149 | options.setup_source = distribute_source 150 | else: 151 | options.setup_source = setuptools_source 152 | 153 | if options.accept_buildout_test_releases: 154 | args.append('buildout:accept-buildout-test-releases=true') 155 | args.append('bootstrap') 156 | 157 | try: 158 | import pkg_resources 159 | import setuptools # A flag. Sometimes pkg_resources is installed alone. 160 | if not hasattr(pkg_resources, '_distribute'): 161 | raise ImportError 162 | except ImportError: 163 | ez_code = urllib2.urlopen( 164 | options.setup_source).read().replace('\r\n', '\n') 165 | ez = {} 166 | exec ez_code in ez 167 | setup_args = dict(to_dir=eggs_dir, download_delay=0) 168 | if options.download_base: 169 | setup_args['download_base'] = options.download_base 170 | if options.use_distribute: 171 | setup_args['no_fake'] = True 172 | ez['use_setuptools'](**setup_args) 173 | if 'pkg_resources' in sys.modules: 174 | reload(sys.modules['pkg_resources']) 175 | import pkg_resources 176 | # This does not (always?) update the default working set. We will 177 | # do it. 
178 | for path in sys.path: 179 | if path not in pkg_resources.working_set.entries: 180 | pkg_resources.working_set.add_entry(path) 181 | 182 | cmd = [quote(sys.executable), 183 | '-c', 184 | quote('from setuptools.command.easy_install import main; main()'), 185 | '-mqNxd', 186 | quote(eggs_dir)] 187 | 188 | if not has_broken_dash_S: 189 | cmd.insert(1, '-S') 190 | 191 | find_links = options.download_base 192 | if not find_links: 193 | find_links = os.environ.get('bootstrap-testing-find-links') 194 | if find_links: 195 | cmd.extend(['-f', quote(find_links)]) 196 | 197 | if options.use_distribute: 198 | setup_requirement = 'distribute' 199 | else: 200 | setup_requirement = 'setuptools' 201 | ws = pkg_resources.working_set 202 | setup_requirement_path = ws.find( 203 | pkg_resources.Requirement.parse(setup_requirement)).location 204 | env = dict( 205 | os.environ, 206 | PYTHONPATH=setup_requirement_path) 207 | 208 | requirement = 'zc.buildout' 209 | version = options.version 210 | if version is None and not options.accept_buildout_test_releases: 211 | # Figure out the most recent final version of zc.buildout. 212 | import setuptools.package_index 213 | _final_parts = '*final-', '*final' 214 | def _final_version(parsed_version): 215 | for part in parsed_version: 216 | if (part[:1] == '*') and (part not in _final_parts): 217 | return False 218 | return True 219 | index = setuptools.package_index.PackageIndex( 220 | search_path=[setup_requirement_path]) 221 | if find_links: 222 | index.add_find_links((find_links,)) 223 | req = pkg_resources.Requirement.parse(requirement) 224 | if index.obtain(req) is not None: 225 | best = [] 226 | bestv = None 227 | for dist in index[req.project_name]: 228 | distv = dist.parsed_version 229 | if _final_version(distv): 230 | if bestv is None or distv > bestv: 231 | best = [dist] 232 | bestv = distv 233 | elif distv == bestv: 234 | best.append(dist) 235 | if best: 236 | best.sort() 237 | version = best[-1].version 238 | if version: 239 | requirement = '=='.join((requirement, version)) 240 | cmd.append(requirement) 241 | 242 | if is_jython: 243 | import subprocess 244 | exitcode = subprocess.Popen(cmd, env=env).wait() 245 | else: # Windows prefers this, apparently; otherwise we would prefer subprocess 246 | exitcode = os.spawnle(*([os.P_WAIT, sys.executable] + cmd + [env])) 247 | if exitcode != 0: 248 | sys.stdout.flush() 249 | sys.stderr.flush() 250 | print ("An error occurred when trying to install zc.buildout. " 251 | "Look above this message for any errors that " 252 | "were output by easy_install.") 253 | sys.exit(exitcode) 254 | 255 | ws.add_entry(eggs_dir) 256 | ws.require(requirement) 257 | import zc.buildout.buildout 258 | zc.buildout.buildout.main(args) 259 | if not options.eggs: # clean up temporary egg directory 260 | shutil.rmtree(eggs_dir) 261 | -------------------------------------------------------------------------------- /bootstrap-py3k.py: -------------------------------------------------------------------------------- 1 | ############################################################################## 2 | # 3 | # Copyright (c) 2006 Zope Foundation and Contributors. 4 | # All Rights Reserved. 5 | # 6 | # This software is subject to the provisions of the Zope Public License, 7 | # Version 2.1 (ZPL). A copy of the ZPL should accompany this distribution. 
8 | # THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED 9 | # WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 10 | # WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS 11 | # FOR A PARTICULAR PURPOSE. 12 | # 13 | ############################################################################## 14 | """Bootstrap a buildout-based project 15 | 16 | Simply run this script in a directory containing a buildout.cfg. 17 | The script accepts buildout command-line options, so you can 18 | use the -c option to specify an alternate configuration file. 19 | """ 20 | 21 | import os, shutil, sys, tempfile, textwrap 22 | try: 23 | import urllib.request as urllib2 24 | except ImportError: 25 | import urllib2 26 | import subprocess 27 | from optparse import OptionParser 28 | 29 | if sys.platform == 'win32': 30 | def quote(c): 31 | if ' ' in c: 32 | return '"%s"' % c # work around spawn lamosity on windows 33 | else: 34 | return c 35 | else: 36 | quote = str 37 | 38 | # See zc.buildout.easy_install._has_broken_dash_S for motivation and comments. 39 | stdout, stderr = subprocess.Popen( 40 | [sys.executable, '-S', '-c', 41 | 'try:\n' 42 | ' import pickle\n' 43 | 'except ImportError:\n' 44 | ' print(1)\n' 45 | 'else:\n' 46 | ' print(0)\n'], 47 | stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate() 48 | has_broken_dash_S = bool(int(stdout.strip())) 49 | 50 | # In order to be more robust in the face of system Pythons, we want to 51 | # run without site-packages loaded. This is somewhat tricky, in 52 | # particular because Python 2.6's distutils imports site, so starting 53 | # with the -S flag is not sufficient. However, we'll start with that: 54 | if not has_broken_dash_S and 'site' in sys.modules: 55 | # We will restart with python -S. 56 | args = sys.argv[:] 57 | args[0:0] = [sys.executable, '-S'] 58 | args = list(map(quote, args)) 59 | os.execv(sys.executable, args) 60 | 61 | # Now we are running with -S. We'll get the clean sys.path, import site 62 | # because distutils will do it later, and then reset the path and clean 63 | # out any namespace packages from site-packages that might have been 64 | # loaded by .pth files. 65 | clean_path = sys.path[:] 66 | import site 67 | sys.path[:] = clean_path 68 | for k, v in list(sys.modules.items()): 69 | if k in ('setuptools', 'pkg_resources') or ( 70 | hasattr(v, '__path__') and 71 | len(v.__path__)==1 and 72 | not os.path.exists(os.path.join(v.__path__[0],'__init__.py'))): 73 | # This is a namespace package. Remove it. 74 | sys.modules.pop(k) 75 | 76 | is_jython = sys.platform.startswith('java') 77 | 78 | setuptools_source = 'http://peak.telecommunity.com/dist/ez_setup.py' 79 | distribute_source = 'http://python-distribute.org/distribute_setup.py' 80 | 81 | # parsing arguments 82 | def normalize_to_url(option, opt_str, value, parser): 83 | if value: 84 | if '://' not in value: # It doesn't smell like a URL. 85 | value = 'file://%s' % ( 86 | urllib2.pathname2url( 87 | os.path.abspath(os.path.expanduser(value))),) 88 | if opt_str == '--download-base' and not value.endswith('/'): 89 | # Download base needs a trailing slash to make the world happy. 90 | value += '/' 91 | else: 92 | value = None 93 | name = opt_str[2:].replace('-', '_') 94 | setattr(parser.values, name, value) 95 | 96 | usage = '''\ 97 | [DESIRED PYTHON FOR BUILDOUT] bootstrap.py [options] 98 | 99 | Bootstraps a buildout-based project. 
100 | 101 | Simply run this script in a directory containing a buildout.cfg, using the 102 | Python that you want bin/buildout to use. 103 | 104 | Note that by using --setup-source and --download-base to point to 105 | local resources, you can keep this script from going over the network. 106 | ''' 107 | 108 | parser = OptionParser(usage=usage) 109 | parser.add_option("-v", "--version", dest="version", 110 | help="use a specific zc.buildout version") 111 | parser.add_option("--setup-version", dest="setup_version", 112 | help="The version of setuptools or distribute to use.") 113 | parser.add_option("-d", "--distribute", 114 | action="store_true", dest="use_distribute", 115 | default= sys.version_info[0] >= 3, 116 | help="Use Distribute rather than Setuptools.") 117 | parser.add_option("--setup-source", action="callback", dest="setup_source", 118 | callback=normalize_to_url, nargs=1, type="string", 119 | help=("Specify a URL or file location for the setup file. " 120 | "If you use Setuptools, this will default to " + 121 | setuptools_source + "; if you use Distribute, this " 122 | "will default to " + distribute_source +".")) 123 | parser.add_option("--download-base", action="callback", dest="download_base", 124 | callback=normalize_to_url, nargs=1, type="string", 125 | help=("Specify a URL or directory for downloading " 126 | "zc.buildout and either Setuptools or Distribute. " 127 | "Defaults to PyPI.")) 128 | parser.add_option("--eggs", 129 | help=("Specify a directory for storing eggs. Defaults to " 130 | "a temporary directory that is deleted when the " 131 | "bootstrap script completes.")) 132 | parser.add_option("-t", "--accept-buildout-test-releases", 133 | dest='accept_buildout_test_releases', 134 | action="store_true", 135 | default=sys.version_info[0] > 2, 136 | help=("Normally, if you do not specify a --version, the " 137 | "bootstrap script and buildout gets the newest " 138 | "*final* versions of zc.buildout and its recipes and " 139 | "extensions for you. If you use this flag, " 140 | "bootstrap and buildout will get the newest releases " 141 | "even if they are alphas or betas.")) 142 | parser.add_option("-c", None, action="store", dest="config_file", 143 | help=("Specify the path to the buildout configuration " 144 | "file to be used.")) 145 | 146 | options, args = parser.parse_args() 147 | 148 | # if -c was provided, we push it back into args for buildout's main function 149 | if options.config_file is not None: 150 | args += ['-c', options.config_file] 151 | 152 | if options.eggs: 153 | eggs_dir = os.path.abspath(os.path.expanduser(options.eggs)) 154 | else: 155 | eggs_dir = tempfile.mkdtemp() 156 | 157 | if options.setup_source is None: 158 | if options.use_distribute: 159 | options.setup_source = distribute_source 160 | else: 161 | options.setup_source = setuptools_source 162 | 163 | if options.accept_buildout_test_releases: 164 | args.append('buildout:accept-buildout-test-releases=true') 165 | args.append('bootstrap') 166 | 167 | try: 168 | import pkg_resources 169 | import setuptools # A flag. Sometimes pkg_resources is installed alone. 
170 | if not hasattr(pkg_resources, '_distribute'): 171 | raise ImportError 172 | except ImportError: 173 | ez_code = urllib2.urlopen( 174 | options.setup_source).read().replace('\r\n'.encode(), '\n'.encode()) 175 | ez = {} 176 | exec(ez_code, ez) 177 | setup_args = dict(to_dir=eggs_dir, download_delay=0) 178 | if options.download_base: 179 | setup_args['download_base'] = options.download_base 180 | if options.setup_version: 181 | setup_args['version'] = options.setup_version 182 | if options.use_distribute: 183 | setup_args['no_fake'] = True 184 | ez['use_setuptools'](**setup_args) 185 | if 'pkg_resources' in sys.modules: 186 | if sys.version_info[0] >= 3: 187 | import imp 188 | reload_ = imp.reload 189 | else: 190 | reload_ = reload 191 | 192 | reload_(sys.modules['pkg_resources']) 193 | import pkg_resources 194 | # This does not (always?) update the default working set. We will 195 | # do it. 196 | for path in sys.path: 197 | if path not in pkg_resources.working_set.entries: 198 | pkg_resources.working_set.add_entry(path) 199 | 200 | cmd = [quote(sys.executable), 201 | '-c', 202 | quote('from setuptools.command.easy_install import main; main()'), 203 | '-mqNxd', 204 | quote(eggs_dir)] 205 | 206 | if not has_broken_dash_S: 207 | cmd.insert(1, '-S') 208 | 209 | find_links = options.download_base 210 | if not find_links: 211 | find_links = os.environ.get('bootstrap-testing-find-links') 212 | if find_links: 213 | cmd.extend(['-f', quote(find_links)]) 214 | 215 | if options.use_distribute: 216 | setup_requirement = 'distribute' 217 | else: 218 | setup_requirement = 'setuptools' 219 | ws = pkg_resources.working_set 220 | setup_requirement_path = ws.find( 221 | pkg_resources.Requirement.parse(setup_requirement)).location 222 | env = dict( 223 | os.environ, 224 | PYTHONPATH=setup_requirement_path) 225 | 226 | requirement = 'zc.buildout' 227 | version = options.version 228 | if version is None and not options.accept_buildout_test_releases: 229 | # Figure out the most recent final version of zc.buildout. 230 | import setuptools.package_index 231 | _final_parts = '*final-', '*final' 232 | def _final_version(parsed_version): 233 | for part in parsed_version: 234 | if (part[:1] == '*') and (part not in _final_parts): 235 | return False 236 | return True 237 | index = setuptools.package_index.PackageIndex( 238 | search_path=[setup_requirement_path]) 239 | if find_links: 240 | index.add_find_links((find_links,)) 241 | req = pkg_resources.Requirement.parse(requirement) 242 | if index.obtain(req) is not None: 243 | best = [] 244 | bestv = None 245 | for dist in index[req.project_name]: 246 | distv = dist.parsed_version 247 | if _final_version(distv): 248 | if bestv is None or distv > bestv: 249 | best = [dist] 250 | bestv = distv 251 | elif distv == bestv: 252 | best.append(dist) 253 | if best: 254 | best.sort() 255 | version = best[-1].version 256 | if version: 257 | requirement = '=='.join((requirement, version)) 258 | cmd.append(requirement) 259 | 260 | if is_jython: 261 | import subprocess 262 | exitcode = subprocess.Popen(cmd, env=env).wait() 263 | else: # Windows prefers this, apparently; otherwise we would prefer subprocess 264 | exitcode = os.spawnle(*([os.P_WAIT, sys.executable] + cmd + [env])) 265 | if exitcode != 0: 266 | sys.stdout.flush() 267 | sys.stderr.flush() 268 | print("An error occurred when trying to install zc.buildout. 
" 269 | "Look above this message for any errors that " 270 | "were output by easy_install.") 271 | sys.exit(exitcode) 272 | 273 | ws.add_entry(eggs_dir) 274 | ws.require(requirement) 275 | import zc.buildout.buildout 276 | zc.buildout.buildout.main(args) 277 | if not options.eggs: # clean up temporary egg directory 278 | shutil.rmtree(eggs_dir) 279 | -------------------------------------------------------------------------------- /pyquery/test.py: -------------------------------------------------------------------------------- 1 | #-*- coding:utf-8 -*- 2 | # 3 | # Copyright (C) 2008 - Olivier Lauzanne 4 | # 5 | # Distributed under the BSD license, see LICENSE.txt 6 | from lxml import etree 7 | import unittest 8 | import doctest 9 | import socket 10 | import sys 11 | import os 12 | 13 | PY3k = sys.version_info >= (3,) 14 | 15 | if PY3k: 16 | from io import StringIO 17 | import pyquery 18 | from pyquery.pyquery import PyQuery as pq 19 | from http.client import HTTPConnection 20 | pqa = pq 21 | else: 22 | from cStringIO import StringIO 23 | import pyquery 24 | from httplib import HTTPConnection 25 | from webob import Request, Response, exc 26 | from pyquery import PyQuery as pq 27 | from ajax import PyQuery as pqa 28 | 29 | socket.setdefaulttimeout(1) 30 | 31 | try: 32 | conn = HTTPConnection("pyquery.org:80") 33 | conn.request("GET", "/") 34 | response = conn.getresponse() 35 | except (socket.timeout, socket.error): 36 | GOT_NET=False 37 | else: 38 | GOT_NET=True 39 | 40 | 41 | def with_net(func): 42 | if GOT_NET: 43 | return func 44 | 45 | def not_py3k(func): 46 | if not PY3k: 47 | return func 48 | 49 | dirname = os.path.dirname(os.path.abspath(pyquery.__file__)) 50 | docs = os.path.join(os.path.dirname(dirname), 'docs') 51 | path_to_html_file = os.path.join(dirname, 'test.html') 52 | 53 | def input_app(environ, start_response): 54 | resp = Response() 55 | req = Request(environ) 56 | if req.path_info == '/': 57 | resp.body = '' 58 | elif req.path_info == '/submit': 59 | resp.body = '' 60 | else: 61 | resp.body = '' 62 | return resp(environ, start_response) 63 | 64 | class TestReadme(doctest.DocFileCase): 65 | path = os.path.join(dirname, '..', 'README.txt') 66 | 67 | def __init__(self, *args, **kwargs): 68 | parser = doctest.DocTestParser() 69 | doc = open(self.path).read() 70 | test = parser.get_doctest(doc, globals(), '', self.path, 0) 71 | doctest.DocFileCase.__init__(self, test, optionflags=doctest.ELLIPSIS) 72 | 73 | def setUp(self): 74 | test = self._dt_test 75 | test.globs.update(globals()) 76 | 77 | for filename in os.listdir(docs): 78 | if filename.endswith('.txt'): 79 | if not GOT_NET and filename in ('ajax.txt', 'tips.txt'): 80 | continue 81 | if PY3k and filename in ('ajax.txt',): 82 | continue 83 | klass_name = 'Test%s' % filename.replace('.txt', '').title() 84 | path = os.path.join(docs, filename) 85 | exec('%s = type("%s", (TestReadme,), dict(path=path))' % (klass_name, klass_name)) 86 | 87 | class TestTests(doctest.DocFileCase): 88 | path = os.path.join(dirname, 'tests.txt') 89 | 90 | def __init__(self, *args, **kwargs): 91 | parser = doctest.DocTestParser() 92 | doc = open(self.path).read() 93 | test = parser.get_doctest(doc, globals(), '', self.path, 0) 94 | doctest.DocFileCase.__init__(self, test, optionflags=doctest.ELLIPSIS) 95 | 96 | class TestUnicode(unittest.TestCase): 97 | 98 | @not_py3k 99 | def test_unicode(self): 100 | xml = pq(unicode("

é

", 'utf-8')) 101 | self.assertEqual(unicode(xml), unicode("

é

", 'utf-8')) 102 | self.assertEqual(type(xml.html()), unicode) 103 | self.assertEqual(str(xml), '

é

') 104 | 105 | 106 | class TestSelector(unittest.TestCase): 107 | klass = pq 108 | html = """ 109 | 110 | 111 |
node1
112 |
node2
113 |
node3
114 | 115 | 116 | """ 117 | 118 | html2 = """ 119 | 120 | 121 |
node1
122 | 123 | 124 | """ 125 | 126 | html3 = """ 127 | 128 | 129 |
node1
130 |
node2
131 |
node3
132 | 133 | 134 | """ 135 | 136 | html4 = """ 137 | 138 | 139 |
140 | 141 | 142 | 143 | 149 | 150 | 151 | 152 | 153 | 154 | 155 | 156 | 157 |
158 | 159 | 160 | """ 161 | 162 | html5 = """ 163 | 164 | 165 |

Heading 1

166 |

Heading 2

167 |

Heading 3

168 |

Heading 4

169 |
Heading 5
170 |
Heading 6
171 | 172 | 173 | """ 174 | 175 | @not_py3k 176 | def test_get_root(self): 177 | doc = pq('

') 178 | self.assertEqual(isinstance(doc.root, etree._ElementTree), True) 179 | self.assertEqual(doc.encoding, 'UTF-8') 180 | 181 | def test_selector_from_doc(self): 182 | doc = etree.fromstring(self.html) 183 | assert len(self.klass(doc)) == 1 184 | assert len(self.klass('div', doc)) == 3 185 | assert len(self.klass('div#node2', doc)) == 1 186 | 187 | def test_selector_from_html(self): 188 | assert len(self.klass(self.html)) == 1 189 | assert len(self.klass('div', self.html)) == 3 190 | assert len(self.klass('div#node2', self.html)) == 1 191 | 192 | def test_selector_from_obj(self): 193 | e = self.klass(self.html) 194 | assert len(e('div')) == 3 195 | assert len(e('div#node2')) == 1 196 | 197 | def test_selector_from_html_from_obj(self): 198 | e = self.klass(self.html) 199 | assert len(e('div', self.html2)) == 1 200 | assert len(e('div#node2', self.html2)) == 0 201 | 202 | def test_class(self): 203 | e = self.klass(self.html) 204 | assert isinstance(e, self.klass) 205 | n = e('div', self.html2) 206 | assert isinstance(n, self.klass) 207 | assert n._parent is e 208 | 209 | def test_pseudo_classes(self): 210 | e = self.klass(self.html) 211 | self.assertEqual(e('div:first').text(), 'node1') 212 | self.assertEqual(e('div:last').text(), 'node3') 213 | self.assertEqual(e('div:even').text(), 'node1 node3') 214 | self.assertEqual(e('div div:even').text(), None) 215 | self.assertEqual(e('body div:even').text(), 'node1 node3') 216 | self.assertEqual(e('div:gt(0)').text(), 'node2 node3') 217 | self.assertEqual(e('div:lt(1)').text(), 'node1') 218 | self.assertEqual(e('div:eq(2)').text(), 'node3') 219 | 220 | #test on the form 221 | e = self.klass(self.html4) 222 | assert len(e(':disabled')) == 1 223 | assert len(e('input:enabled')) == 9 224 | assert len(e(':selected')) == 1 225 | assert len(e(':checked')) == 2 226 | assert len(e(':file')) == 1 227 | assert len(e(':input')) == 12 228 | assert len(e(':button')) == 2 229 | assert len(e(':radio')) == 3 230 | assert len(e(':checkbox')) == 3 231 | 232 | #test on other elements 233 | e = self.klass(self.html5) 234 | assert len(e(":header")) == 6 235 | assert len(e(":parent")) == 2 236 | assert len(e(":empty")) == 6 237 | assert len(e(":contains('Heading')")) == 6 238 | 239 | def test_on_the_fly_dom_creation(self): 240 | e = self.klass(self.html) 241 | assert e('

Hello world

').text() == 'Hello world' 242 | assert e('').text() == None 243 | 244 | class TestTraversal(unittest.TestCase): 245 | klass = pq 246 | html = """ 247 | 248 | 249 |
node1
250 |
node2 booyah
251 | 252 | 253 | """ 254 | 255 | def test_filter(self): 256 | assert len(self.klass('div', self.html).filter('.node3')) == 1 257 | assert len(self.klass('div', self.html).filter('#node2')) == 1 258 | assert len(self.klass('div', self.html).filter(lambda i: i == 0)) == 1 259 | 260 | d = pq('

Hello warming world

') 261 | self.assertEqual(d('strong').filter(lambda el: True), []) 262 | 263 | def test_not(self): 264 | assert len(self.klass('div', self.html).not_('.node3')) == 1 265 | 266 | def test_is(self): 267 | assert self.klass('div', self.html).is_('.node3') 268 | assert not self.klass('div', self.html).is_('.foobazbar') 269 | 270 | def test_find(self): 271 | assert len(self.klass('#node1', self.html).find('span')) == 1 272 | assert len(self.klass('#node2', self.html).find('span')) == 2 273 | assert len(self.klass('div', self.html).find('span')) == 3 274 | 275 | def test_each(self): 276 | doc = self.klass(self.html) 277 | doc('span').each(lambda: doc(this).wrap("")) 278 | assert len(doc('em')) == 3 279 | 280 | def test_map(self): 281 | def ids_minus_one(i, elem): 282 | return int(self.klass(elem).attr('id')[-1]) - 1 283 | assert self.klass('div', self.html).map(ids_minus_one) == [0, 1] 284 | 285 | d = pq('

Hello warming world

') 286 | self.assertEqual(d('strong').map(lambda i,el: pq(this).text()), []) 287 | 288 | def test_end(self): 289 | assert len(self.klass('div', self.html).find('span').end()) == 2 290 | assert len(self.klass('#node2', self.html).find('span').end()) == 1 291 | 292 | def test_closest(self): 293 | assert len(self.klass('#node1 span', self.html).closest('body')) == 1 294 | assert self.klass('#node2', self.html).closest('.node3').attr('id') == 'node2' 295 | assert self.klass('.node3', self.html).closest('form') == [] 296 | 297 | class TestOpener(unittest.TestCase): 298 | 299 | def test_custom_opener(self): 300 | def opener(url): 301 | return '
' 302 | 303 | doc = pq(url='http://example.com', opener=opener) 304 | assert len(doc('.node')) == 1, doc 305 | 306 | class TestHasClass(unittest.TestCase): 307 | def test_child_has_class(self): 308 | doc = pq("""
""") 309 | assert doc('#test').hasClass('on') 310 | assert not doc('#test').hasClass('off') 311 | 312 | class TestCallback(unittest.TestCase): 313 | html = """ 314 |
    315 |
  1. Coffee
  2. 316 |
  3. Tea
  4. 317 |
  5. Milk
  6. 318 |
319 | """ 320 | 321 | def test_S_this_inside_callback(self): 322 | S = pq(self.html) 323 | self.assertEqual(S('li').map(lambda i, el: S(this).html()), ['Coffee', 'Tea', 'Milk']) 324 | 325 | def test_parameterless_callback(self): 326 | S = pq(self.html) 327 | self.assertEqual(S('li').map(lambda: S(this).html()), ['Coffee', 'Tea', 'Milk']) 328 | 329 | def application(environ, start_response): 330 | req = Request(environ) 331 | response = Response() 332 | if req.method == 'GET': 333 | response.body = '
Yeah !
' 334 | else: 335 | response.body = 'Yeah !' 336 | return response(environ, start_response) 337 | 338 | def secure_application(environ, start_response): 339 | if 'REMOTE_USER' not in environ: 340 | return exc.HTTPUnauthorized('vomis')(environ, start_response) 341 | return application(environ, start_response) 342 | 343 | class TestAjaxSelector(TestSelector): 344 | klass = pqa 345 | 346 | @not_py3k 347 | @with_net 348 | def test_proxy(self): 349 | e = self.klass([]) 350 | val = e.get('http://pyquery.org/') 351 | assert len(val('body')) == 1, (str(val.response), val) 352 | 353 | @not_py3k 354 | def test_get(self): 355 | e = self.klass(app=application) 356 | val = e.get('/') 357 | assert len(val('pre')) == 1, val 358 | 359 | @not_py3k 360 | def test_secure_get(self): 361 | e = self.klass(app=secure_application) 362 | val = e.get('/', environ=dict(REMOTE_USER='gawii')) 363 | assert len(val('pre')) == 1, val 364 | val = e.get('/', REMOTE_USER='gawii') 365 | assert len(val('pre')) == 1, val 366 | 367 | @not_py3k 368 | def test_secure_get_not_authorized(self): 369 | e = self.klass(app=secure_application) 370 | val = e.get('/') 371 | assert len(val('pre')) == 0, val 372 | 373 | @not_py3k 374 | def test_post(self): 375 | e = self.klass(app=application) 376 | val = e.post('/') 377 | assert len(val('a')) == 1, val 378 | 379 | @not_py3k 380 | def test_subquery(self): 381 | e = self.klass(app=application) 382 | n = e('div') 383 | val = n.post('/') 384 | assert len(val('a')) == 1, val 385 | 386 | class TestManipulating(unittest.TestCase): 387 | html = ''' 388 |
389 | TestMy link text 390 | My link text 2 391 |
392 | ''' 393 | 394 | def test_remove(self): 395 | d = pq(self.html) 396 | d('img').remove() 397 | val = d('a:first').html() 398 | assert val == 'Test My link text', repr(val) 399 | val = d('a:last').html() 400 | assert val == ' My link text 2', repr(val) 401 | 402 | class TestHTMLParser(unittest.TestCase): 403 | xml = "
I'm valid XML
" 404 | html = ''' 405 |
406 | TestimageMy link text 407 | imageMy link text 2 408 | Behind you, a three-headed HTML‐Entity! 409 |
410 | ''' 411 | def test_parser_persistance(self): 412 | d = pq(self.xml, parser='xml') 413 | self.assertRaises(etree.XMLSyntaxError, lambda: d.after(self.html)) 414 | d = pq(self.xml, parser='html') 415 | d.after(self.html) # this should not fail 416 | 417 | 418 | @not_py3k 419 | def test_soup_parser(self): 420 | d = pq('Hello</head><body onload=crash()>Hi all<p>', parser='soup') 421 | self.assertEqual(str(d), '<html><meta/><head><title>HelloHi all

') 422 | 423 | def test_replaceWith(self): 424 | expected = '''

425 | TestimageMy link text 426 | imageMy link text 2 427 | Behind you, a three-headed HTML&dash;Entity! 428 |
''' 429 | d = pq(self.html) 430 | d('img').replaceWith('image') 431 | val = d.__html__() 432 | assert val == expected, (repr(val), repr(expected)) 433 | 434 | def test_replaceWith_with_function(self): 435 | expected = '''
436 | TestimageMy link text 437 | imageMy link text 2 438 | Behind you, a three-headed HTML&dash;Entity! 439 |
''' 440 | d = pq(self.html) 441 | d('a').replaceWith(lambda i, e: pq(e).html()) 442 | val = d.__html__() 443 | assert val == expected, (repr(val), repr(expected)) 444 | 445 | class TestWebScrapping(unittest.TestCase): 446 | @with_net 447 | def test_get(self): 448 | d = pq('http://www.theonion.com/search/', {'q': 'inconsistency'}, method='get') 449 | self.assertEqual(d('input[name=q]:last').val(), 'inconsistency') 450 | self.assertEqual(d('.news-in-brief h3').text(), 'Slight Inconsistency Found In Bible') 451 | 452 | @with_net 453 | def test_post(self): 454 | d = pq('http://www.theonion.com/search/', {'q': 'inconsistency'}, method='post') 455 | self.assertEqual(d('input[name=q]:last').val(), '') # the onion does not search on post 456 | 457 | if __name__ == '__main__': 458 | fails, total = unittest.main() 459 | if fails == 0: 460 | print('OK') 461 | -------------------------------------------------------------------------------- /pyquery/pyquery.py: -------------------------------------------------------------------------------- 1 | #-*- coding:utf-8 -*- 2 | # 3 | # Copyright (C) 2008 - Olivier Lauzanne 4 | # 5 | # Distributed under the BSD license, see LICENSE.txt 6 | from .cssselectpatch import selector_to_xpath 7 | from copy import deepcopy 8 | from lxml import etree 9 | import lxml.html 10 | import sys 11 | 12 | PY3k = sys.version_info >= (3,) 13 | 14 | if PY3k: 15 | from urllib.request import urlopen 16 | from urllib.parse import urlencode 17 | from urllib.parse import urljoin 18 | basestring = (str, bytes) 19 | unicode = str 20 | else: 21 | from urllib2 import urlopen 22 | from urllib import urlencode 23 | from urlparse import urljoin 24 | 25 | def func_globals(f): 26 | return f.__globals__ if PY3k else f.func_globals 27 | 28 | def func_code(f): 29 | return f.__code__ if PY3k else f.func_code 30 | 31 | def fromstring(context, parser=None, custom_parser=None): 32 | """use html parser if we don't have clean xml 33 | """ 34 | if hasattr(context, 'read') and hasattr(context.read, '__call__'): 35 | meth = 'parse' 36 | else: 37 | meth = 'fromstring' 38 | if custom_parser is None: 39 | if parser is None: 40 | try: 41 | result = getattr(etree, meth)(context) 42 | except etree.XMLSyntaxError: 43 | result = getattr(lxml.html, meth)(context) 44 | if isinstance(result, etree._ElementTree): 45 | return [result.getroot()] 46 | else: 47 | return [result] 48 | elif parser == 'xml': 49 | custom_parser = getattr(etree, meth) 50 | elif parser == 'html': 51 | custom_parser = getattr(lxml.html, meth) 52 | elif parser == 'soup': 53 | from lxml.html import soupparser 54 | custom_parser = getattr(lxml.html.soupparser, meth) 55 | elif parser == 'html_fragments': 56 | custom_parser = lxml.html.fragments_fromstring 57 | else: 58 | ValueError('No such parser: "%s"' % parser) 59 | 60 | result = custom_parser(context) 61 | if type(result) is list: 62 | return result 63 | elif isinstance(result, etree._ElementTree): 64 | return [result.getroot()] 65 | else: 66 | return [result] 67 | 68 | def callback(func, *args): 69 | return func(*args[:func_code(func).co_argcount]) 70 | 71 | class NoDefault(object): 72 | def __repr__(self): 73 | """clean representation in Sphinx""" 74 | return '' 75 | 76 | no_default = NoDefault() 77 | del NoDefault 78 | 79 | class FlexibleElement(object): 80 | """property to allow a flexible api""" 81 | def __init__(self, pget, pset=no_default, pdel=no_default): 82 | self.pget = pget 83 | self.pset = pset 84 | self.pdel = pdel 85 | def __get__(self, instance, klass): 86 | class _element(object): 
87 | """real element to support set/get/del attr and item and js call 88 | style""" 89 | def __call__(prop, *args, **kwargs): 90 | return self.pget(instance, *args, **kwargs) 91 | __getattr__ = __getitem__ = __setattr__ = __setitem__ = __call__ 92 | def __delitem__(prop, name): 93 | if self.pdel is not no_default: 94 | return self.pdel(instance, name) 95 | else: 96 | raise NotImplementedError() 97 | __delattr__ = __delitem__ 98 | def __repr__(prop): 99 | return '' % self.pget.__name__ 100 | return _element() 101 | def __set__(self, instance, value): 102 | if self.pset is not no_default: 103 | self.pset(instance, value) 104 | else: 105 | raise NotImplementedError() 106 | 107 | class PyQuery(list): 108 | """The main class 109 | """ 110 | def __init__(self, *args, **kwargs): 111 | html = None 112 | elements = [] 113 | self._base_url = None 114 | self.parser = kwargs.get('parser', None) 115 | if 'parser' in kwargs: 116 | del kwargs['parser'] 117 | if len(args) >= 1 and isinstance(args[0], basestring) \ 118 | and args[0].startswith('http://'): 119 | kwargs['url'] = args[0] 120 | if len(args) >= 2: 121 | kwargs['data'] = args[1] 122 | args = [] 123 | 124 | if 'parent' in kwargs: 125 | self._parent = kwargs.pop('parent') 126 | else: 127 | self._parent = no_default 128 | 129 | if kwargs: 130 | # specific case to get the dom 131 | if 'filename' in kwargs: 132 | html = open(kwargs['filename']) 133 | elif 'url' in kwargs: 134 | url = kwargs.pop('url') 135 | if 'opener' in kwargs: 136 | opener = kwargs.pop('opener') 137 | html = opener(url) 138 | else: 139 | method = kwargs.get('method') 140 | data = kwargs.get('data') 141 | if type(data) in (dict, list, tuple): 142 | data = urlencode(data) 143 | 144 | if isinstance(method, basestring) and method.lower() == 'get' and data: 145 | if '?' not in url: 146 | url += '?' 147 | elif url[-1] not in ('?', '&'): 148 | url += '&' 149 | url += data 150 | data = None 151 | 152 | if data and PY3k: 153 | data = data.encode('utf-8') 154 | 155 | html = urlopen(url, data) 156 | if not self.parser: 157 | self.parser = 'html' 158 | self._base_url = url 159 | else: 160 | raise ValueError('Invalid keyword arguments %s' % kwargs) 161 | elements = fromstring(html, self.parser) 162 | else: 163 | # get nodes 164 | 165 | # determine context and selector if any 166 | selector = context = no_default 167 | length = len(args) 168 | if len(args) == 1: 169 | context = args[0] 170 | elif len(args) == 2: 171 | selector, context = args 172 | else: 173 | raise ValueError("You can't do that." 
+\ 174 | " Please, provide arguments") 175 | 176 | # get context 177 | if isinstance(context, basestring): 178 | try: 179 | elements = fromstring(context, self.parser) 180 | except Exception: 181 | raise ValueError(context) 182 | elif isinstance(context, self.__class__): 183 | # copy 184 | elements = context[:] 185 | elif isinstance(context, list): 186 | elements = context 187 | elif isinstance(context, etree._Element): 188 | elements = [context] 189 | 190 | # select nodes 191 | if elements and selector is not no_default: 192 | xpath = selector_to_xpath(selector) 193 | results = [tag.xpath(xpath) for tag in elements] 194 | # Flatten the results 195 | elements = [] 196 | for r in results: 197 | elements.extend(r) 198 | 199 | list.__init__(self, elements) 200 | 201 | def __call__(self, *args): 202 | """return a new PyQuery instance 203 | """ 204 | length = len(args) 205 | if length == 0: 206 | raise ValueError('You must provide at least a selector') 207 | if args[0] == '': 208 | return self.__class__([]) 209 | if len(args) == 1 and isinstance(args[0], str) and not args[0].startswith('<'): 210 | args += (self,) 211 | result = self.__class__(*args, **dict(parent=self)) 212 | return result 213 | 214 | # keep original list api prefixed with _ 215 | _append = list.append 216 | _extend = list.extend 217 | 218 | # improve pythonic api 219 | def __add__(self, other): 220 | assert isinstance(other, self.__class__) 221 | return self.__class__(self[:] + other[:]) 222 | 223 | def extend(self, other): 224 | assert isinstance(other, self.__class__) 225 | self._extend(other[:]) 226 | 227 | def __str__(self): 228 | """xml representation of current nodes:: 229 | 230 | >>> xml = PyQuery('', parser='html_fragments') 231 | >>> print(str(xml)) 232 | 233 | 234 | """ 235 | if PY3k: 236 | return ''.join([etree.tostring(e, encoding=str) for e in self]) 237 | else: 238 | return ''.join([etree.tostring(e) for e in self]) 239 | 240 | def __unicode__(self): 241 | """xml representation of current nodes""" 242 | return unicode('').join([etree.tostring(e, encoding=unicode) for e in self]) 243 | 244 | def __html__(self): 245 | """html representation of current nodes:: 246 | 247 | >>> html = PyQuery('', parser='html_fragments') 248 | >>> print(html.__html__()) 249 | 250 | 251 | """ 252 | return unicode('').join([lxml.html.tostring(e, encoding=unicode) for e in self]) 253 | 254 | def __repr__(self): 255 | r = [] 256 | try: 257 | for el in self: 258 | c = el.get('class') 259 | c = c and '.' 
+ '.'.join(c.split(' ')) or '' 260 | id = el.get('id') 261 | id = id and '#' + id or '' 262 | r.append('<%s%s%s>' % (el.tag, id, c)) 263 | return '[' + (', '.join(r)) + ']' 264 | except AttributeError: 265 | if PY3k: 266 | return list.__repr__(self) 267 | else: 268 | for el in self: 269 | if isinstance(el, unicode): 270 | r.append(el.encode('utf-8')) 271 | else: 272 | r.append(el) 273 | return repr(r) 274 | 275 | 276 | @property 277 | def root(self): 278 | """return the xml root element 279 | """ 280 | if self._parent is not no_default: 281 | return self._parent.getroottree() 282 | return self[0].getroottree() 283 | 284 | @property 285 | def encoding(self): 286 | """return the xml encoding of the root element 287 | """ 288 | root = self.root 289 | if root is not None: 290 | return self.root.docinfo.encoding 291 | 292 | ############## 293 | # Traversing # 294 | ############## 295 | 296 | def _filter_only(self, selector, elements, reverse=False, unique=False): 297 | """Filters the selection set only, as opposed to also including 298 | descendants. 299 | """ 300 | if selector is None: 301 | results = elements 302 | else: 303 | xpath = selector_to_xpath(selector, 'self::') 304 | results = [] 305 | for tag in elements: 306 | results.extend(tag.xpath(xpath)) 307 | if reverse: 308 | results.reverse() 309 | if unique: 310 | result_list = results 311 | results = [] 312 | for item in result_list: 313 | if not item in results: 314 | results.append(item) 315 | return self.__class__(results, **dict(parent=self)) 316 | 317 | def parent(self, selector=None): 318 | return self._filter_only(selector, [e.getparent() for e in self if e.getparent() is not None], unique = True) 319 | 320 | def prev(self, selector=None): 321 | return self._filter_only(selector, [e.getprevious() for e in self if e.getprevious() is not None]) 322 | 323 | def next(self, selector=None): 324 | return self._filter_only(selector, [e.getnext() for e in self if e.getnext() is not None]) 325 | 326 | def _traverse(self, method): 327 | for e in self: 328 | current = getattr(e, method)() 329 | while current is not None: 330 | yield current 331 | current = getattr(current, method)() 332 | 333 | def _traverse_parent_topdown(self): 334 | for e in self: 335 | this_list = [] 336 | current = e.getparent() 337 | while current is not None: 338 | this_list.append(current) 339 | current = current.getparent() 340 | this_list.reverse() 341 | for j in this_list: 342 | yield j 343 | 344 | def _nextAll(self): 345 | return [e for e in self._traverse('getnext')] 346 | 347 | def nextAll(self, selector=None): 348 | """ 349 | >>> d = PyQuery('

Hi

Bye

') 350 | >>> d('p:last').nextAll() 351 | [] 352 | """ 353 | return self._filter_only(selector, self._nextAll()) 354 | 355 | def _prevAll(self): 356 | return [e for e in self._traverse('getprevious')] 357 | 358 | def prevAll(self, selector=None): 359 | """ 360 | >>> d = PyQuery('

Hi

Bye

') 361 | >>> d('p:last').prevAll() 362 | [] 363 | """ 364 | return self._filter_only(selector, self._prevAll(), reverse = True) 365 | 366 | def siblings(self, selector=None): 367 | """ 368 | >>> d = PyQuery('

Hi

Bye

') 369 | >>> d('.hello').siblings() 370 | [

, ] 371 | >>> d('.hello').siblings('img') 372 | [] 373 | """ 374 | return self._filter_only(selector, self._prevAll() + self._nextAll()) 375 | 376 | def parents(self, selector=None): 377 | """ 378 | >>> d = PyQuery('

Hi

Bye

') 379 | >>> d('p').parents() 380 | [] 381 | >>> d('.hello').parents('span') 382 | [] 383 | >>> d('.hello').parents('p') 384 | [] 385 | """ 386 | return self._filter_only( 387 | selector, 388 | [e for e in self._traverse_parent_topdown()], 389 | unique = True 390 | ) 391 | 392 | def children(self, selector=None): 393 | """Filter elements that are direct children of self using optional selector. 394 | 395 | >>> d = PyQuery('

Hi

Bye

') 396 | >>> d 397 | [] 398 | >>> d.children() 399 | [,

] 400 | >>> d.children('.hello') 401 | [] 402 | """ 403 | elements = [child for tag in self for child in tag.getchildren()] 404 | return self._filter_only(selector, elements) 405 | 406 | def closest(self, selector=None): 407 | """ 408 | >>> d = PyQuery('

This is a test

') 409 | >>> d('strong').closest('div') 410 | [] 411 | >>> d('strong').closest('.hello') 412 | [] 413 | >>> d('strong').closest('form') 414 | [] 415 | """ 416 | result = [] 417 | for current in self: 418 | while current is not None and not self.__class__(current).is_(selector): 419 | current = current.getparent() 420 | if current is not None: 421 | result.append(current) 422 | return self.__class__(result, **dict(parent=self)) 423 | 424 | def filter(self, selector): 425 | """Filter elements in self using selector (string or function). 426 | 427 | >>> d = PyQuery('

Hi

Bye

') 428 | >>> d('p') 429 | [,

] 430 | >>> d('p').filter('.hello') 431 | [] 432 | >>> d('p').filter(lambda i: i == 1) 433 | [

] 434 | >>> d('p').filter(lambda i: PyQuery(this).text() == 'Hi') 435 | [] 436 | """ 437 | if not hasattr(selector, '__call__'): 438 | return self._filter_only(selector, self) 439 | else: 440 | elements = [] 441 | try: 442 | for i, this in enumerate(self): 443 | func_globals(selector)['this'] = this 444 | if callback(selector, i): 445 | elements.append(this) 446 | finally: 447 | f_globals = func_globals(selector) 448 | if 'this' in f_globals: 449 | del f_globals['this'] 450 | return self.__class__(elements, **dict(parent=self)) 451 | 452 | def not_(self, selector): 453 | """Return elements that don't match the given selector. 454 | 455 | >>> d = PyQuery('

Hi

Bye

') 456 | >>> d('p').not_('.hello') 457 | [

] 458 | """ 459 | exclude = set(self.__class__(selector, self)) 460 | return self.__class__([e for e in self if e not in exclude], **dict(parent=self)) 461 | 462 | def is_(self, selector): 463 | """Returns True if selector matches at least one current element, else False:: 464 | 465 | >>> d = PyQuery('

Hi

Bye

') 466 | >>> d('p').eq(0).is_('.hello') 467 | True 468 | 469 | >>> d('p').eq(1).is_('.hello') 470 | False 471 | 472 | .. 473 | """ 474 | return bool(self.__class__(selector, self)) 475 | 476 | def find(self, selector): 477 | """Find elements using selector traversing down from self:: 478 | 479 | >>> m = '

Whoah!

there

' 480 | >>> d = PyQuery(m) 481 | >>> d('p').find('em') 482 | [, ] 483 | >>> d('p').eq(1).find('em') 484 | [] 485 | 486 | .. 487 | """ 488 | xpath = selector_to_xpath(selector) 489 | results = [child.xpath(xpath) for tag in self for child in tag.getchildren()] 490 | # Flatten the results 491 | elements = [] 492 | for r in results: 493 | elements.extend(r) 494 | return self.__class__(elements, **dict(parent=self)) 495 | 496 | def eq(self, index): 497 | """Return PyQuery of only the element with the provided index:: 498 | 499 | >>> d = PyQuery('

Hi

Bye

') 500 | >>> d('p').eq(0) 501 | [] 502 | >>> d('p').eq(1) 503 | [

] 504 | >>> d('p').eq(2) 505 | [] 506 | 507 | .. 508 | """ 509 | # Use slicing to silently handle out of bounds indexes 510 | items = self[index:index+1] 511 | return self.__class__(items, **dict(parent=self)) 512 | 513 | def each(self, func): 514 | """apply func on each nodes 515 | """ 516 | try: 517 | for i, element in enumerate(self): 518 | func_globals(func)['this'] = element 519 | if callback(func, i, element) == False: 520 | break 521 | finally: 522 | f_globals = func_globals(func) 523 | if 'this' in f_globals: 524 | del f_globals['this'] 525 | return self 526 | 527 | def map(self, func): 528 | """Returns a new PyQuery after transforming current items with func. 529 | 530 | func should take two arguments - 'index' and 'element'. Elements can 531 | also be referred to as 'this' inside of func:: 532 | 533 | >>> d = PyQuery('

Hi there

Bye


') 534 | >>> d('p').map(lambda i, e: PyQuery(e).text()) 535 | ['Hi there', 'Bye'] 536 | 537 | >>> d('p').map(lambda i, e: len(PyQuery(this).text())) 538 | [8, 3] 539 | 540 | >>> d('p').map(lambda i, e: PyQuery(this).text().split()) 541 | ['Hi', 'there', 'Bye'] 542 | 543 | """ 544 | items = [] 545 | try: 546 | for i, element in enumerate(self): 547 | func_globals(func)['this'] = element 548 | result = callback(func, i, element) 549 | if result is not None: 550 | if not isinstance(result, list): 551 | items.append(result) 552 | else: 553 | items.extend(result) 554 | finally: 555 | f_globals = func_globals(func) 556 | if 'this' in f_globals: 557 | del f_globals['this'] 558 | return self.__class__(items, **dict(parent=self)) 559 | 560 | @property 561 | def length(self): 562 | return len(self) 563 | 564 | def size(self): 565 | return len(self) 566 | 567 | def end(self): 568 | """Break out of a level of traversal and return to the parent level. 569 | 570 | >>> m = '

Whoah!

there

' 571 | >>> d = PyQuery(m) 572 | >>> d('p').eq(1).find('em').end().end() 573 | [

,

] 574 | """ 575 | return self._parent 576 | 577 | ############## 578 | # Attributes # 579 | ############## 580 | def attr(self, *args, **kwargs): 581 | """Attributes manipulation 582 | """ 583 | 584 | mapping = {'class_': 'class', 'for_': 'for'} 585 | 586 | attr = value = no_default 587 | length = len(args) 588 | if length == 1: 589 | attr = args[0] 590 | attr = mapping.get(attr, attr) 591 | elif length == 2: 592 | attr, value = args 593 | attr = mapping.get(attr, attr) 594 | elif kwargs: 595 | attr = {} 596 | for k, v in kwargs.items(): 597 | attr[mapping.get(k, k)] = v 598 | else: 599 | raise ValueError('Invalid arguments %s %s' % (args, kwargs)) 600 | 601 | if not self: 602 | return None 603 | elif isinstance(attr, dict): 604 | for tag in self: 605 | for key, value in attr.items(): 606 | tag.set(key, value) 607 | elif value is no_default: 608 | return self[0].get(attr) 609 | elif value is None or value == '': 610 | return self.removeAttr(attr) 611 | else: 612 | for tag in self: 613 | tag.set(attr, value) 614 | return self 615 | 616 | def removeAttr(self, name): 617 | """Remove an attribute:: 618 | 619 | >>> d = PyQuery('

') 620 | >>> d.removeAttr('id') 621 | [
] 622 | 623 | .. 624 | """ 625 | for tag in self: 626 | del tag.attrib[name] 627 | return self 628 | 629 | attr = FlexibleElement(pget=attr, pdel=removeAttr) 630 | 631 | ####### 632 | # CSS # 633 | ####### 634 | def height(self, value=no_default): 635 | """set/get height of element 636 | """ 637 | return self.attr('height', value) 638 | 639 | def width(self, value=no_default): 640 | """set/get width of element 641 | """ 642 | return self.attr('width', value) 643 | 644 | def hasClass(self, name): 645 | """Return True if element has class:: 646 | 647 | >>> d = PyQuery('
') 648 | >>> d.hasClass('myclass') 649 | True 650 | 651 | .. 652 | """ 653 | for tag in self: 654 | classes = set((tag.get('class') or '').split()) 655 | if name in classes: 656 | return True 657 | return False 658 | 659 | def addClass(self, value): 660 | """Add a css class to elements:: 661 | 662 | >>> d = PyQuery('
') 663 | >>> d.addClass('myclass') 664 | [] 665 | 666 | .. 667 | """ 668 | for tag in self: 669 | values = value.split(' ') 670 | classes = set((tag.get('class') or '').split()) 671 | classes = classes.union(values) 672 | classes.difference_update(['']) 673 | tag.set('class', ' '.join(classes)) 674 | return self 675 | 676 | def removeClass(self, value): 677 | """Remove a css class from elements:: 678 | 679 | >>> d = PyQuery('
') 680 | >>> d.removeClass('myclass') 681 | [
] 682 | 683 | .. 684 | """ 685 | for tag in self: 686 | values = value.split(' ') 687 | classes = set((tag.get('class') or '').split()) 688 | classes.difference_update(values) 689 | classes.difference_update(['']) 690 | tag.set('class', ' '.join(classes)) 691 | return self 692 | 693 | def toggleClass(self, value): 694 | """Toggle a css class on elements 695 | 696 | >>> d = PyQuery('
') 697 | >>> d.toggleClass('myclass') 698 | [] 699 | 700 | """ 701 | for tag in self: 702 | values = set(value.split(' ')) 703 | classes = set((tag.get('class') or '').split()) 704 | values_to_add = values.difference(classes) 705 | classes.difference_update(values) 706 | classes = classes.union(values_to_add) 707 | classes.difference_update(['']) 708 | tag.set('class', ' '.join(classes)) 709 | return self 710 | 711 | def css(self, *args, **kwargs): 712 | """css attributes manipulation 713 | """ 714 | 715 | attr = value = no_default 716 | length = len(args) 717 | if length == 1: 718 | attr = args[0] 719 | elif length == 2: 720 | attr, value = args 721 | elif kwargs: 722 | attr = kwargs 723 | else: 724 | raise ValueError('Invalid arguments %s %s' % (args, kwargs)) 725 | 726 | if isinstance(attr, dict): 727 | for tag in self: 728 | stripped_keys = [key.strip().replace('_', '-') 729 | for key in attr.keys()] 730 | current = [el.strip() 731 | for el in (tag.get('style') or '').split(';') 732 | if el.strip() 733 | and not el.split(':')[0].strip() in stripped_keys] 734 | for key, value in attr.items(): 735 | key = key.replace('_', '-') 736 | current.append('%s: %s' % (key, value)) 737 | tag.set('style', '; '.join(current)) 738 | elif isinstance(value, basestring): 739 | attr = attr.replace('_', '-') 740 | for tag in self: 741 | current = [el.strip() 742 | for el in (tag.get('style') or '').split(';') 743 | if el.strip() 744 | and not el.split(':')[0].strip() == attr.strip()] 745 | current.append('%s: %s' % (attr, value)) 746 | tag.set('style', '; '.join(current)) 747 | return self 748 | 749 | css = FlexibleElement(pget=css, pset=css) 750 | 751 | ################### 752 | # CORE UI EFFECTS # 753 | ################### 754 | def hide(self): 755 | """remove display:none to elements style 756 | 757 | >>> print(PyQuery('
').hide()) 758 |
759 | 760 | """ 761 | return self.css('display', 'none') 762 | 763 | def show(self): 764 | """add display:block to elements style 765 | 766 | >>> print(PyQuery('
').show()) 767 |
768 | 769 | """ 770 | return self.css('display', 'block') 771 | 772 | ######## 773 | # HTML # 774 | ######## 775 | def val(self, value=no_default): 776 | """Set the attribute value:: 777 | 778 | >>> d = PyQuery('') 779 | >>> d.val('Youhou') 780 | [] 781 | 782 | Get the attribute value:: 783 | 784 | >>> d.val() 785 | 'Youhou' 786 | 787 | """ 788 | return self.attr('value', value) 789 | 790 | def html(self, value=no_default): 791 | """Get or set the html representation of sub nodes. 792 | 793 | Get the text value:: 794 | 795 | >>> d = PyQuery('
toto
') 796 | >>> print(d.html()) 797 | toto 798 | 799 | Set the text value:: 800 | 801 | >>> d.html('Youhou !') 802 | [
] 803 | >>> print(d) 804 |
Youhou !
805 | """ 806 | if value is no_default: 807 | if not self: 808 | return None 809 | tag = self[0] 810 | children = tag.getchildren() 811 | if not children: 812 | return tag.text 813 | html = tag.text or '' 814 | html += unicode('').join([etree.tostring(e, encoding=unicode) for e in children]) 815 | return html 816 | else: 817 | if isinstance(value, self.__class__): 818 | new_html = unicode(value) 819 | elif isinstance(value, basestring): 820 | new_html = value 821 | elif not value: 822 | new_html = '' 823 | else: 824 | raise ValueError(type(value)) 825 | 826 | for tag in self: 827 | for child in tag.getchildren(): 828 | tag.remove(child) 829 | root = fromstring(unicode('') + new_html + unicode(''), self.parser)[0] 830 | children = root.getchildren() 831 | if children: 832 | tag.extend(children) 833 | tag.text = root.text 834 | tag.tail = root.tail 835 | return self 836 | 837 | def outerHtml(self): 838 | """Get the html representation of the first selected element:: 839 | 840 | >>> d = PyQuery('
toto rocks
') 841 | >>> print(d('span')) 842 | toto rocks 843 | >>> print(d('span').outerHtml()) 844 | toto 845 | 846 | >>> S = PyQuery('

Only me & myself

') 847 | >>> print(S('b').outerHtml()) 848 | me 849 | 850 | .. 851 | """ 852 | 853 | if not self: 854 | return None 855 | e0 = self[0] 856 | if e0.tail: 857 | e0 = deepcopy(e0) 858 | e0.tail = '' 859 | return lxml.html.tostring(e0, encoding=unicode) 860 | 861 | def text(self, value=no_default): 862 | """Get or set the text representation of sub nodes. 863 | 864 | Get the text value:: 865 | 866 | >>> doc = PyQuery('
tototata
') 867 | >>> print(doc.text()) 868 | toto tata 869 | 870 | Set the text value:: 871 | 872 | >>> doc.text('Youhou !') 873 | [
] 874 | >>> print(doc) 875 |
Youhou !
876 | 877 | """ 878 | 879 | if value is no_default: 880 | if not self: 881 | return None 882 | 883 | text = [] 884 | 885 | def add_text(tag, no_tail=False): 886 | if tag.text: 887 | text.append(tag.text) 888 | for child in tag.getchildren(): 889 | add_text(child) 890 | if not no_tail and tag.tail: 891 | text.append(tag.tail) 892 | 893 | for tag in self: 894 | add_text(tag, no_tail=True) 895 | return ' '.join([t.strip() for t in text if t.strip()]) 896 | 897 | for tag in self: 898 | for child in tag.getchildren(): 899 | tag.remove(child) 900 | tag.text = value 901 | return self 902 | 903 | ################ 904 | # Manipulating # 905 | ################ 906 | 907 | def _get_root(self, value): 908 | if isinstance(value, basestring): 909 | root = fromstring(unicode('') + value + unicode(''), self.parser)[0] 910 | elif isinstance(value, etree._Element): 911 | root = self.__class__(value) 912 | elif isinstance(value, PyQuery): 913 | root = value 914 | else: 915 | raise TypeError( 916 | 'Value must be string, PyQuery or Element. Got %r' % value) 917 | if hasattr(root, 'text') and isinstance(root.text, basestring): 918 | root_text = root.text 919 | else: 920 | root_text = '' 921 | return root, root_text 922 | 923 | def append(self, value): 924 | """append value to each nodes 925 | """ 926 | root, root_text = self._get_root(value) 927 | for i, tag in enumerate(self): 928 | if len(tag) > 0: # if the tag has children 929 | last_child = tag[-1] 930 | if not last_child.tail: 931 | last_child.tail = '' 932 | last_child.tail += root_text 933 | else: 934 | if not tag.text: 935 | tag.text = '' 936 | tag.text += root_text 937 | if i > 0: 938 | root = deepcopy(list(root)) 939 | tag.extend(root) 940 | root = tag[-len(root):] 941 | return self 942 | 943 | def appendTo(self, value): 944 | """append nodes to value 945 | """ 946 | value.append(self) 947 | return self 948 | 949 | def prepend(self, value): 950 | """prepend value to nodes 951 | """ 952 | root, root_text = self._get_root(value) 953 | for i, tag in enumerate(self): 954 | if not tag.text: 955 | tag.text = '' 956 | if len(root) > 0: 957 | root[-1].tail = tag.text 958 | tag.text = root_text 959 | else: 960 | tag.text = root_text + tag.text 961 | if i > 0: 962 | root = deepcopy(list(root)) 963 | tag[:0] = root 964 | root = tag[:len(root)] 965 | return self 966 | 967 | def prependTo(self, value): 968 | """prepend nodes to value 969 | """ 970 | value.prepend(self) 971 | return self 972 | 973 | def after(self, value): 974 | """add value after nodes 975 | """ 976 | root, root_text = self._get_root(value) 977 | for i, tag in enumerate(self): 978 | if not tag.tail: 979 | tag.tail = '' 980 | tag.tail += root_text 981 | if i > 0: 982 | root = deepcopy(list(root)) 983 | parent = tag.getparent() 984 | index = parent.index(tag) + 1 985 | parent[index:index] = root 986 | root = parent[index:len(root)] 987 | return self 988 | 989 | def insertAfter(self, value): 990 | """insert nodes after value 991 | """ 992 | value.after(self) 993 | return self 994 | 995 | def before(self, value): 996 | """insert value before nodes 997 | """ 998 | root, root_text = self._get_root(value) 999 | for i, tag in enumerate(self): 1000 | previous = tag.getprevious() 1001 | if previous != None: 1002 | if not previous.tail: 1003 | previous.tail = '' 1004 | previous.tail += root_text 1005 | else: 1006 | parent = tag.getparent() 1007 | if not parent.text: 1008 | parent.text = '' 1009 | parent.text += root_text 1010 | if i > 0: 1011 | root = deepcopy(list(root)) 1012 | parent = tag.getparent() 
1013 | index = parent.index(tag) 1014 | parent[index:index] = root 1015 | root = parent[index:len(root)] 1016 | return self 1017 | 1018 | def insertBefore(self, value): 1019 | """insert nodes before value 1020 | """ 1021 | value.before(self) 1022 | return self 1023 | 1024 | def wrap(self, value): 1025 | """A string of HTML that will be created on the fly and wrapped around 1026 | each target:: 1027 | 1028 | >>> d = PyQuery('youhou') 1029 | >>> d.wrap('
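A minimal usage sketch of the PyQuery API defined in pyquery/pyquery.py above.
This is not part of the package source: it only illustrates the selection,
traversal and attribute methods documented in the docstrings, assuming the
package (and its lxml dependency) is importable. The markup, variable names
and attribute values below are made up for the example::

    # Hypothetical example; the markup and names here are illustrative only.
    from pyquery.pyquery import PyQuery as pq

    # A string context is parsed by fromstring(), which tries lxml.etree and
    # falls back to the lxml.html parser when the input is not clean XML.
    d = pq('<div id="box"><p class="hello">Hi</p><p>Bye</p></div>')

    d('p').eq(0).text()                    # 'Hi'   (selector, then eq/text)
    d('p').filter('.hello')                # [<p.hello>]
    d('p').map(lambda i, e: pq(e).text())  # ['Hi', 'Bye']

    # Class and attribute setters return the PyQuery object, so calls chain.
    d('#box').addClass('seen').attr('data-done', '1')
    d('#box').attr('class')                # 'seen'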
') 1030 | [
] 1031 | >>> print(d) 1032 |
youhou
1033 | 1034 | """ 1035 | assert isinstance(value, basestring) 1036 | value = fromstring(value)[0] 1037 | nodes = [] 1038 | for tag in self: 1039 | wrapper = deepcopy(value) 1040 | # FIXME: using iterchildren is probably not optimal 1041 | if not wrapper.getchildren(): 1042 | wrapper.append(deepcopy(tag)) 1043 | else: 1044 | childs = [c for c in wrapper.iterchildren()] 1045 | child = childs[-1] 1046 | child.append(deepcopy(tag)) 1047 | nodes.append(wrapper) 1048 | 1049 | parent = tag.getparent() 1050 | if parent is not None: 1051 | for t in parent.iterchildren(): 1052 | if t is tag: 1053 | t.addnext(wrapper) 1054 | parent.remove(t) 1055 | break 1056 | self[:] = nodes 1057 | return self 1058 | 1059 | def wrapAll(self, value): 1060 | """Wrap all the elements in the matched set into a single wrapper element:: 1061 | 1062 | >>> d = PyQuery('
Heyyou !
') 1063 | >>> print(d('span').wrapAll('
')) 1064 |
Heyyou !
1065 | 1066 | .. 1067 | """ 1068 | if not self: 1069 | return self 1070 | 1071 | assert isinstance(value, basestring) 1072 | value = fromstring(value)[0] 1073 | wrapper = deepcopy(value) 1074 | if not wrapper.getchildren(): 1075 | child = wrapper 1076 | else: 1077 | childs = [c for c in wrapper.iterchildren()] 1078 | child = childs[-1] 1079 | 1080 | replace_childs = True 1081 | parent = self[0].getparent() 1082 | if parent is None: 1083 | parent = no_default 1084 | 1085 | # add nodes to wrapper and check parent 1086 | for tag in self: 1087 | child.append(deepcopy(tag)) 1088 | if tag.getparent() is not parent: 1089 | replace_childs = False 1090 | 1091 | # replace nodes i parent if possible 1092 | if parent is not no_default and replace_childs: 1093 | childs = [c for c in parent.iterchildren()] 1094 | if len(childs) == len(self): 1095 | for tag in self: 1096 | parent.remove(tag) 1097 | parent.append(wrapper) 1098 | 1099 | self[:] = [wrapper] 1100 | return self 1101 | 1102 | def replaceWith(self, value): 1103 | """replace nodes by value 1104 | """ 1105 | if hasattr(value, '__call__'): 1106 | for i, element in enumerate(self): 1107 | self.__class__(element).before(value(i, element) + (element.tail or '')) 1108 | parent = element.getparent() 1109 | parent.remove(element) 1110 | else: 1111 | for tag in self: 1112 | self.__class__(tag).before(value + (tag.tail or '')) 1113 | parent = tag.getparent() 1114 | parent.remove(tag) 1115 | return self 1116 | 1117 | def replaceAll(self, expr): 1118 | """replace nodes by expr 1119 | """ 1120 | if self._parent is no_default: 1121 | raise ValueError( 1122 | 'replaceAll can only be used with an object with parent') 1123 | self._parent(expr).replaceWith(self) 1124 | return self 1125 | 1126 | def clone(self): 1127 | """return a copy of nodes 1128 | """ 1129 | self[:] = [deepcopy(tag) for tag in self] 1130 | return self 1131 | 1132 | def empty(self): 1133 | """remove nodes content 1134 | """ 1135 | for tag in self: 1136 | tag.text = None 1137 | tag[:] = [] 1138 | return self 1139 | 1140 | def remove(self, expr=no_default): 1141 | """remove nodes 1142 | 1143 | >>> d = PyQuery('
Maybe she does NOT know
') 1144 | >>> d('strong').remove() 1145 | [] 1146 | >>> print(d) 1147 |
Maybe she does know
1148 | """ 1149 | if expr is no_default: 1150 | for tag in self: 1151 | parent = tag.getparent() 1152 | if parent is not None: 1153 | if tag.tail: 1154 | prev = tag.getprevious() 1155 | if prev is None: 1156 | if not parent.text: 1157 | parent.text = '' 1158 | parent.text += ' ' + tag.tail 1159 | else: 1160 | if not prev.tail: 1161 | prev.tail = '' 1162 | prev.tail += ' ' + tag.tail 1163 | parent.remove(tag) 1164 | else: 1165 | results = self.__class__(expr, self) 1166 | results.remove() 1167 | return self 1168 | 1169 | class Fn(object): 1170 | """Hook for defining custom function (like the jQuery.fn) 1171 | 1172 | >>> PyQuery.fn.listOuterHtml = lambda: this.map(lambda i, el: PyQuery(this).outerHtml()) 1173 | >>> S = PyQuery('
  1. Coffee
  2. Tea
  3. Milk
') 1174 | >>> S('li').listOuterHtml() 1175 | ['
  • Coffee
  • ', '
  • Tea
  • ', '
  • Milk
  • '] 1176 | 1177 | """ 1178 | def __setattr__(self, name, func): 1179 | def fn(self, *args): 1180 | func_globals(func)['this'] = self 1181 | return func(*args) 1182 | fn.__name__ = name 1183 | setattr(PyQuery, name, fn) 1184 | fn = Fn() 1185 | 1186 | ##################################################### 1187 | # Additional methods that are not in the jQuery API # 1188 | ##################################################### 1189 | 1190 | @property 1191 | def base_url(self): 1192 | """Return the url of current html document or None if not available. 1193 | """ 1194 | if self._base_url is not None: 1195 | return self._base_url 1196 | if self._parent is not no_default: 1197 | return self._parent.base_url 1198 | 1199 | def make_links_absolute(self, base_url=None): 1200 | """Make all links absolute. 1201 | """ 1202 | if base_url is None: 1203 | base_url = self.base_url 1204 | if base_url is None: 1205 | raise ValueError('You need a base URL to make your links' 1206 | 'absolute. It can be provided by the base_url parameter.') 1207 | 1208 | self('a').each(lambda: self(this).attr('href', urljoin(base_url, self(this).attr('href')))) 1209 | return self 1210 | --------------------------------------------------------------------------------