  ... <p>Some paragraph</p>
  ... </body>
  ... </html>
  ... """
  >>> rules = {"heading": "h1"}
  >>>
  >>> # default text extraction includes tail text
  ... parslepy.Parselet(rules).parse_fromstring(doc)
  {'heading': u'Some heading Some text'}
  >>>
  >>> # passing false() as 2nd argument means: without tail text
  ... rules = {"heading": "parslepy:text(//h1, false())"}
  >>> parslepy.Parselet(rules).parse_fromstring(doc)
  {'heading': 'Some heading'}
  >>>
  >>> # passing true() as 2nd argument is equivalent to the default text extraction
  >>> rules = {"heading": "parslepy:text(//h1, true())"}
  >>> parslepy.Parselet(rules).parse_fromstring(doc)
  {'heading': 'Some heading Some text'}
  >>>

  See http://lxml.de/tutorial.html#elements-contain-text for details
  on how `lxml`_ handles *text* and *tail* element properties.

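  For reference, here is a minimal sketch (a constructed snippet, not from
  the *parslepy* API) of what `lxml`_ means by the *text* and *tail*
  properties of an element:

  >>> import lxml.etree
  >>> root = lxml.etree.fromstring("<root><h1>Some heading</h1>Some text</root>")
  >>> root[0].text, root[0].tail
  ('Some heading', 'Some text')
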
* ``parslepy:textnl(xpath_expression)``:
  similar to ``parslepy:text()`` but appends `\\n` characters to HTML
  block elements such as ``<p>`` or ``<h1>``

  >>> import parslepy
  >>> doc = """<html>
  ... <head>
  ... <title>Some page title</title>
  ... </head>
  ... <body><h1>Some heading</h1><p>Some paragraph</p><p>with some <span>span inside</span></p>ending now.</body>
  ... </html>
  ... """
  >>> parslepy.Parselet({"heading": "parslepy:text(//body)"}).parse_fromstring(doc)
  {'heading': 'Some headingSome paragraphwith some span insideending now.'}
  >>>
  >>> parslepy.Parselet({"heading": "parslepy:textnl(//body)"}).parse_fromstring(doc)
  {'heading': 'Some heading\nSome paragraph\nwith some span inside\nending now.'}
  >>>


* ``parslepy:html(xpath_expression)``
  returns the HTML content for elements matching *xpath_expression*.
  Internally, this calls `lxml.html.tostring(element)`.

  >>> import parslepy
  >>> doc = """<html>
  ... <head>
  ... <title>Some page title</title>
  ... </head>
  ... <body>
  ... <h1>(Some heading)</h1>
  ... <h2>[some sub-heading]</h2>
  ... </body>
  ... </html>
  ... """
  >>> parslepy.Parselet({"heading": "parslepy:html(//h1)"}).parse_fromstring(doc)
  {'heading': '<h1>(Some heading)</h1>'}
  >>> parslepy.Parselet({"heading": "parslepy:html(//body)"}).parse_fromstring(doc)
  {'heading': '<body>\n<h1>(Some heading)</h1>\n<h2>[some sub-heading]</h2>\n</body>'}
  >>>


* ``parslepy:xml(xpath_expression)``
  returns the XML content for elements matching *xpath_expression*.
  Internally, this calls `lxml.etree.tostring(element)`.

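  For instance, reusing the ``doc`` from the previous example (a sketch;
  the exact serialization produced by `lxml.etree` may differ, e.g. for
  void elements):

  >>> parslepy.Parselet({"heading": "parslepy:xml(//h1)"}).parse_fromstring(doc)
  {'heading': '<h1>(Some heading)</h1>'}
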
* ``parslepy:strip(xpath_expression[, chars])``
  behaves like Python's `strip()` method for strings but for the text
  content of elements matching *xpath_expression*.
  See http://docs.python.org/2/library/string.html#string.strip

  >>> import parslepy
  >>> doc = """<html>
  ... <head>
  ... <title>Some page title</title>
  ... </head>
  ... <body>
  ... <h1>(Some heading)</h1>
  ... <h2>[some sub-heading]</h2>
  ... </body>
  ... </html>
  ... """
  >>> parslepy.Parselet({"heading": "parslepy:strip(//h2, '[')"}).parse_fromstring(doc)
  {'heading': 'some sub-heading]'}
  >>> parslepy.Parselet({"heading": "parslepy:strip(//h2, ']')"}).parse_fromstring(doc)
  {'heading': '[some sub-heading'}
  >>> parslepy.Parselet({"heading": "parslepy:strip(//h2, '[]')"}).parse_fromstring(doc)
  {'heading': 'some sub-heading'}
  >>> parslepy.Parselet({"heading": "parslepy:strip(//h1, '()')"}).parse_fromstring(doc)
  {'heading': 'Some heading'}
  >>>

* ``parslepy:attrname(xpath_expression_matching_attribute)``
  returns the name of an attribute, rather than its value. This works with
  the catch-all-attributes `@*` expression or a specific attribute
  expression like `@class`. It may sound like a useless extension, but it
  can be handy when combined with the catch-all `@*` selector, as in the
  example below:

  >>> img_attributes = {
  ...     "images(img)": [{
  ...         "attr_names": ["parslepy:attrname(@*)"],
  ...         "attr_vals": ["@*"],
  ...     }]
  ... }
  >>> extracted = parslepy.Parselet(img_attributes).parse('http://www.python.org')
  >>> for r in extracted["images"]:
  ...     print dict(zip(r.get("attr_names"), r.get("attr_vals")))
  ...
  {'src': '/images/python-logo.gif', 'alt': 'homepage', 'border': '0', 'id': 'logo'}
  {'src': '/images/trans.gif', 'alt': 'skip to navigation', 'border': '0', 'id': 'skiptonav'}
  {'src': '/images/trans.gif', 'alt': 'skip to content', 'border': '0', 'id': 'skiptocontent'}
  {'width': '116', 'alt': '', 'src': '/images/donate.png', 'title': '', 'height': '42'}
  {'width': '94', 'style': 'align:center', 'src': '/images/worldmap.jpg', 'alt': '[Python resources in languages other than English]', 'height': '46'}
  {'src': '/images/success/Carmanah.png', 'alt': 'success story photo', 'class': 'success'}


User-defined extensions
^^^^^^^^^^^^^^^^^^^^^^^

*parslepy* also lets you define your own XPath extensions, just like
`lxml`_ does, except that the function you register must accept a
user-supplied context object as its first argument. Subsequent arguments
to your extension function are the same as for `lxml`_ extensions, i.e.
an XPath context, followed by the matching elements and whatever
additional parameters your XPath call passes.

The user-supplied context should be passed to :meth:`parslepy.base.Parselet.parse`,
or globally, via an XPathSelectorHandler subclass instance used to
instantiate the Parselet.

Let's illustrate this with a custom extension to make `<img>` @src
attributes "absolute".

Suppose we already have an extraction rule that outputs the `@src` attributes
from `<img>` tags on the Python.org homepage:

>>> import parslepy
>>> import pprint
>>> parselet = parslepy.Parselet({"img_abslinks": ["//img/@src"]})
>>> pprint.pprint(parselet.parse('http://www.python.org'))
{'img_abslinks': ['/images/python-logo.gif',
                  '/images/trans.gif',
                  '/images/trans.gif',
                  '/images/donate.png',
                  '/images/worldmap.jpg',
                  '/images/success/afnic.fr.png']}

We now want to generate full URLs for these images, not relative to
http://www.python.org.

**First we need to define our extension function as a Python function**:

*parslepy*'s extension functions must accept a user-context as their first
argument, then an XPath context, followed by the elements or strings
matching the XPath expression, and finally whatever other parameters are
passed to the function call in the extraction rules.

In our example, we expect `@src` attribute values as input from XPath,
and combine them (via `urlparse.urljoin()`) with a base URL,
i.e. the URL from which the HTML document was fetched.
The base URL will be passed as the user-context, and we will receive it
as the first argument. So the Python extension function may look like this:

>>> import urlparse
>>> def absurl(ctx, xpctx, attributes, *args):
...     # user-context "ctx" will be the URL of the page
...     return [urlparse.urljoin(ctx, u) for u in attributes]
...

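As a quick sanity check, we can call the function directly, passing
``None`` for the XPath context (which ``absurl()`` ignores):

>>> absurl('http://www.python.org', None, ['/images/python-logo.gif'])
['http://www.python.org/images/python-logo.gif']
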
**Then, we need to register this function with parslepy** through
a custom selector handler, with a custom namespace and its prefix:

>>> # choose a prefix and namespace, e.g. "myext" and "local-extensions"
... mynamespaces = {
...     "myext": "local-extensions"
... }
>>> myextensions = {
...     ("local-extensions", "absurl"): absurl,
... }
>>>
>>> import parslepy
>>> sh = parslepy.DefaultSelectorHandler(
...     namespaces=mynamespaces,
...     extensions=myextensions)
>>>

Now we can use this **absurl()** XPath extension within *parslepy* rules,
with the "myext" prefix
(**do not forget to pass your selector handler** to your Parselet instance):

>>> rules = {"img_abslinks": ["myext:absurl(//img/@src)"]}
>>> parselet = parslepy.Parselet(rules, selector_handler=sh)

And finally, run the extraction rules on Python.org's homepage again,
this time with a context argument set to the URL:

>>> import pprint
>>> pprint.pprint(parselet.parse('http://www.python.org',
...     context='http://www.python.org'))
{'img_abslinks': ['http://www.python.org/images/python-logo.gif',
                  'http://www.python.org/images/trans.gif',
                  'http://www.python.org/images/trans.gif',
                  'http://www.python.org/images/donate.png',
                  'http://www.python.org/images/worldmap.jpg',
                  'http://www.python.org/images/success/afnic.fr.png']}
>>>

In this case, it may feel odd to have to pass the URL *twice*, but
parse(*URL*) does not store the URL anywhere; it only processes
the HTML stream from the page.
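
If the repetition bothers you, a small convenience wrapper (a hypothetical
helper, not part of *parslepy*) can pass the URL in both places for you:

>>> def parse_with_base_url(parselet, url):
...     # fetch the page at "url" and pass the same URL as user-context
...     return parselet.parse(url, context=url)
...
>>> results = parse_with_base_url(parselet, 'http://www.python.org')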

More examples
=============

Check out more examples and tutorials at `parsley's wiki at GitHub
<https://github.com/fizx/parsley/wiki>`_.

.. include:: ../CHANGELOG