├── .gitignore ├── .hgignore ├── .hgtags ├── .travis.yml ├── CHANGES.txt ├── CREDITS.txt ├── DD.py ├── IDEAS.txt ├── INSTALL.txt ├── LICENSES.txt ├── MANIFEST.in ├── Makefile ├── README.rst ├── TODO.txt ├── benchmark ├── bench_etree.py ├── bench_objectify.py ├── bench_xpath.py ├── bench_xslt.py └── benchbase.py ├── bisect_crashes.py ├── buildlibxml.py ├── doc ├── FAQ.txt ├── api.txt ├── build.txt ├── capi.txt ├── compatibility.txt ├── cssselect.txt ├── docstructure.py ├── element_classes.txt ├── elementsoup.txt ├── extensions.txt ├── html │ ├── flattr-badge-large.png │ ├── paypal_btn_donateCC_LG.gif │ ├── proxies.png │ ├── python-xml-title.png │ ├── python-xml.png │ ├── style.css │ └── tagpython-big.png ├── html5parser.txt ├── intro.txt ├── licenses │ ├── BSD.txt │ ├── GPL.txt │ ├── ZopePublicLicense.txt │ └── elementtree.txt ├── lxml-source-howto.txt ├── lxml.mgp ├── lxml2.txt ├── lxmlhtml.txt ├── main.txt ├── memorymanagement.txt ├── mkhtml.py ├── mklatex.py ├── objectify.txt ├── parsing.txt ├── performance.txt ├── pubkey.asc ├── resolvers.txt ├── rest2html.py ├── rest2latex.py ├── s5 │ ├── Makefile │ ├── ep2008 │ │ ├── atom-example.xml │ │ ├── atom.py │ │ ├── atom.rng │ │ ├── atomgen.py │ │ └── proxies.png │ ├── lxml-ep2008.txt │ ├── rst2s5.py │ ├── tagpython.png │ └── ui │ │ └── default │ │ ├── blank.gif │ │ ├── bodybg.gif │ │ ├── framing.css │ │ ├── iepngfix.htc │ │ ├── lxml-logo64.png │ │ ├── opera.css │ │ ├── outline.css │ │ ├── pretty.css │ │ ├── print.css │ │ ├── s5-core.css │ │ ├── slides.css │ │ ├── slides.js │ │ └── tagpython.png ├── sax.txt ├── test.xml ├── tutorial.txt ├── valgrind.txt ├── validation.txt └── xpathxslt.txt ├── ez_setup.py ├── fake_pyrex └── Pyrex │ ├── Distutils │ ├── __init__.py │ └── build_ext.py │ └── __init__.py ├── samples ├── simple-ns.xml └── simple.xml ├── selftest.py ├── selftest2.py ├── setup.py ├── setupinfo.py ├── src ├── local_doctest.py └── lxml │ ├── ElementInclude.py │ ├── __init__.py │ ├── _elementpath.py │ ├── 
apihelpers.pxi │ ├── builder.py │ ├── classlookup.pxi │ ├── cleanup.pxi │ ├── cssselect.py │ ├── cvarargs.pxd │ ├── debug.pxi │ ├── docloader.pxi │ ├── doctestcompare.py │ ├── dtd.pxi │ ├── extensions.pxi │ ├── html │ ├── ElementSoup.py │ ├── __init__.py │ ├── _diffcommand.py │ ├── _html5builder.py │ ├── _setmixin.py │ ├── builder.py │ ├── clean.py │ ├── defs.py │ ├── diff.py │ ├── formfill.py │ ├── html5parser.py │ ├── soupparser.py │ ├── tests │ │ ├── __init__.py │ │ ├── feedparser-data │ │ │ ├── entry_content_applet.data │ │ │ ├── entry_content_blink.data │ │ │ ├── entry_content_crazy.data │ │ │ ├── entry_content_embed.data │ │ │ ├── entry_content_frame.data │ │ │ ├── entry_content_iframe.data │ │ │ ├── entry_content_link.data │ │ │ ├── entry_content_meta.data │ │ │ ├── entry_content_object.data │ │ │ ├── entry_content_onabort.data │ │ │ ├── entry_content_onblur.data │ │ │ ├── entry_content_onchange.data │ │ │ ├── entry_content_onclick.data │ │ │ ├── entry_content_ondblclick.data │ │ │ ├── entry_content_onerror.data │ │ │ ├── entry_content_onfocus.data │ │ │ ├── entry_content_onkeydown.data │ │ │ ├── entry_content_onkeypress.data │ │ │ ├── entry_content_onkeyup.data │ │ │ ├── entry_content_onload.data │ │ │ ├── entry_content_onmousedown.data │ │ │ ├── entry_content_onmouseout.data │ │ │ ├── entry_content_onmouseover.data │ │ │ ├── entry_content_onmouseup.data │ │ │ ├── entry_content_onreset.data │ │ │ ├── entry_content_onresize.data │ │ │ ├── entry_content_onsubmit.data │ │ │ ├── entry_content_onunload.data │ │ │ ├── entry_content_script.data │ │ │ ├── entry_content_script_cdata.data │ │ │ ├── entry_content_script_inline.data │ │ │ └── entry_content_style.data │ │ ├── hackers-org-data │ │ │ ├── background-image-plus.data │ │ │ ├── background-image-with-unicoded.data │ │ │ ├── downlevel-hidden.data │ │ │ ├── html-plus-time.data │ │ │ ├── javascript-link.data │ │ │ ├── style-comment.data │ │ │ ├── style-expression.data │ │ │ ├── style-import.data │ │ │ ├── 
style-js-tag.data │ │ │ ├── style-url-js.data │ │ │ ├── xml-data-island.data │ │ │ ├── xml-embedded-js.data │ │ │ └── xml-namespace.data.BROKEN │ │ ├── test_autolink.py │ │ ├── test_autolink.txt │ │ ├── test_basic.py │ │ ├── test_basic.txt │ │ ├── test_clean.py │ │ ├── test_clean.txt │ │ ├── test_clean_embed.txt │ │ ├── test_diff.py │ │ ├── test_diff.txt │ │ ├── test_elementsoup.py │ │ ├── test_feedparser_data.py │ │ ├── test_formfill.py │ │ ├── test_formfill.txt │ │ ├── test_forms.py │ │ ├── test_forms.txt │ │ ├── test_frames.py │ │ ├── test_html5parser.py │ │ ├── test_rewritelinks.py │ │ ├── test_rewritelinks.txt │ │ ├── test_xhtml.py │ │ ├── test_xhtml.txt │ │ └── transform_feedparser_data.py │ └── usedoctest.py │ ├── includes │ ├── __init__.py │ ├── c14n.pxd │ ├── config.pxd │ ├── dtdvalid.pxd │ ├── etree_defs.h │ ├── etreepublic.pxd │ ├── htmlparser.pxd │ ├── relaxng.pxd │ ├── schematron.pxd │ ├── tree.pxd │ ├── uri.pxd │ ├── xinclude.pxd │ ├── xmlerror.pxd │ ├── xmlparser.pxd │ ├── xmlschema.pxd │ ├── xpath.pxd │ └── xslt.pxd │ ├── isoschematron │ ├── __init__.py │ └── resources │ │ ├── rng │ │ └── iso-schematron.rng │ │ └── xsl │ │ ├── RNG2Schtrn.xsl │ │ ├── XSD2Schtrn.xsl │ │ └── iso-schematron-xslt1 │ │ ├── iso_abstract_expand.xsl │ │ ├── iso_dsdl_include.xsl │ │ ├── iso_schematron_message.xsl │ │ ├── iso_schematron_skeleton_for_xslt1.xsl │ │ ├── iso_svrl_for_xslt1.xsl │ │ └── readme.txt │ ├── iterparse.pxi │ ├── lxml.etree.pyx │ ├── lxml.objectify.pyx │ ├── nsclasses.pxi │ ├── objectpath.pxi │ ├── parser.pxi │ ├── parsertarget.pxi │ ├── proxy.pxi │ ├── public-api.pxi │ ├── pyclasslookup.py │ ├── python.pxd │ ├── readonlytree.pxi │ ├── relaxng.pxi │ ├── sax.py │ ├── saxparser.pxi │ ├── schematron.pxi │ ├── serializer.pxi │ ├── tests │ ├── __init__.py │ ├── common_imports.py │ ├── dummy_http_server.py │ ├── include │ │ └── test_xinclude.xml │ ├── shakespeare.html │ ├── test-document.xslt │ ├── test-string.xml │ ├── test.dtd │ ├── test.sch │ ├── test.xml │ 
├── test.xsd │ ├── test1.rng │ ├── test1.xslt │ ├── test2.rng │ ├── test2.xslt │ ├── test_broken.xml │ ├── test_builder.py │ ├── test_classlookup.py │ ├── test_css.py │ ├── test_dtd.py │ ├── test_elementtree.py │ ├── test_errors.py │ ├── test_etree.py │ ├── test_htmlparser.py │ ├── test_http_io.py │ ├── test_import.xsd │ ├── test_inc.xsd │ ├── test_incremental_xmlfile.py │ ├── test_io.py │ ├── test_isoschematron.py │ ├── test_nsclasses.py │ ├── test_objectify.py │ ├── test_pyclasslookup.py │ ├── test_relaxng.py │ ├── test_sax.py │ ├── test_schematron.py │ ├── test_threading.py │ ├── test_unicode.py │ ├── test_xmlschema.py │ ├── test_xpathevaluator.py │ └── test_xslt.py │ ├── usedoctest.py │ ├── xinclude.pxi │ ├── xmlerror.pxi │ ├── xmlid.pxi │ ├── xmlschema.pxi │ ├── xpath.pxi │ ├── xslt.pxi │ └── xsltext.pxi ├── test.py ├── tools └── xpathgrep.py ├── tox.ini ├── update-error-constants.py ├── valgrind-python.supp ├── version.txt └── versioninfo.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | .tox 3 | build 4 | libs 5 | *.egg-info 6 | *.so 7 | src/lxml/includes/lxml-version.h 8 | src/lxml/lxml.etree.c 9 | src/lxml/lxml.etree.h 10 | src/lxml/lxml.etree_api.h 11 | src/lxml/lxml.objectify.c 12 | -------------------------------------------------------------------------------- /.hgignore: -------------------------------------------------------------------------------- 1 | syntax: glob 2 | 3 | *.pyc 4 | *.pyo 5 | __pycache__ 6 | 7 | build/ 8 | dist/ 9 | cython_debug/ 10 | .git/ 11 | .gitrev 12 | .coverage 13 | funding.txt 14 | .tox 15 | *.orig 16 | *.rej 17 | *.dep 18 | *.swp 19 | *.so 20 | *.o 21 | *~ 22 | -------------------------------------------------------------------------------- /.hgtags: -------------------------------------------------------------------------------- 1 | 40fdc2efbcf833c2d2de7a1ebff7cc0b634e3a0a lxml-2.3 2 | ea513f9a9811ee9b3991a1df0319b197b361e5cb lxml-0.5.1 3 | 
e0fa117052c57bb83d005b962ff8788605efeadc lxml-0.6 4 | 802f612635d91469d9430bee819713ce7ecb30e2 lxml-0.7 5 | 1623013df810d6b4363dd1daf9f7f6fe5603f458 lxml-0.9 6 | 11e79f443fed94d91f90c8080a2c8a8afeb1ae94 lxml-1.0.beta 7 | a37777a46c55ae77a78266f57c3b6bca2ca04c5f lxml-1.0 8 | 6a117f91ff2ac2824aeed1ccc87512608d131f47 lxml-1.1 9 | 782bc8d9146fd9666879ace31cb4cf541a390173 lxml-1.1alpha 10 | 4144bbe6f24822a7ce5392130b2c98354cd9847e lxml-1.1beta 11 | 8205702eda77bb4a23d6789cc5ee94b4d36e65d4 lxml-1.2 12 | 4d410818a0e10638bb5eb5b54a37350a3477629b lxml-2.0 13 | 1dabace6188ee89b433f30b3838b6f2129698ba5 lxml-2.0alpha1 14 | 9dec5f9222aea1b9c531cfcc7e68d2c328394247 lxml-2.0alpha2 15 | b7873fce37031508d6fb115d8c79abad00b9f219 lxml-2.0alpha3 16 | 2ae894916b47710bbc79c139ff9f4a861ca5815c lxml-2.0alpha4 17 | c81c85642ca9eafe85630c46ef828828e692842e lxml-2.0alpha5 18 | 7a9b9811fadbd32b34d2b3e07901e213daca98d3 lxml-2.0alpha6 19 | 68ba2cbfc422d59bcba09216a7707153bd58b2e8 lxml-2.0beta1 20 | b1389dfc312b7d438fd673f7f7ee75d892ef81d7 lxml-2.0beta2 21 | 9b2be5208b1ecf4cfe5fd3cd0de4a396267abd97 lxml-2.0.1 22 | c5790462867c207a3c78dd510055589bf2950f9d lxml-2.1alpha1 23 | d1f3cf7d078796553de3b276db580796d5aeb048 lxml-2.1beta1 24 | fb891a783f270aefd03df44105677d49d765f2b2 lxml-2.1beta2 25 | 714552f48a53c2555994b6c56deb3de1e7ee702e lxml-2.1beta3 26 | 89227c4d5809f866f4a54a791d2452dc0ccc8d3b lxml-2.1 27 | e38e2a1162010841eebb60174be16797e0d34a87 lxml-2.2 28 | 3f730df23e58592418e22572fea5d8dfce7cf87d lxml-2.2.1 29 | 376b4baba7c91b98fae1ebd07592b21e5e535ba8 lxml-2.2.2 30 | 405a1fb3486ee8a9be7f37fd000553df21b95d5e lxml-2.3alpha1 31 | 901463d324cda95df28b2cab3ee86f715103fa84 lxml-2.3alpha2 32 | c9fef2d447ca436a83fa41183b73ccc825052f48 lxml-2.3beta1 33 | 5f5143534860cfba7fbe3ceab98dc749e79a4fc3 lxml-2.3.1 34 | 65ce4c8efb51013363dcc7318c847fc2f28f2eb2 lxml-2.3.2 35 | 945b29e5b54abf07897b46ebcb6d2227c05f8137 lxml-2.3.3 36 | cf0980063266b383d0403759993536eaa18ebe93 lxml-2.3.4 37 | 
c161cb55f4d4ebd93f5aee72ed73f267155bd894 lxml-2.3.5 38 | 66c66707c7d8a89b99a24bb29c791dbe9dc860f1 lxml-2.3.6 39 | 36e5b10c3ae6256e613554e7d71c34de0d71f385 lxml-3.0beta1 40 | 5bd7af62e93207ff58d54fc83f96b078b621eef8 lxml-3.0alpha2 41 | 6d41ed7c4b756792c9be44a5b8a383c10718016f lxml-3.0alpha1 42 | 60dd2d56701944e05c7655d1f47c56657d7837b3 lxml-3.0 43 | 22efeb405c9c4fb326541f56e431fd8e2686c435 lxml-3.0.1 44 | 714ab3c31e40ab2fef58e2be523de1ef4cb2a8a0 lxml-3.0.2 45 | 3e04be8a649395b193c70dd7d0e2b2b5acecb563 lxml-3.1beta1 46 | 91c436e11e2a822154fef48abe64274646cfde45 lxml-3.1.0 47 | e408b1f0eca00cb226acd28cd169988f9690067f lxml-3.1.1 48 | 862039d37b73e0250c9d8af6e5a689f6fe6321cd lxml-3.1.2 49 | 76262b9d449e75624b9dea745364f87e2e99b2a3 lxml-3.2.0 50 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | 3 | python: 4 | - 2.5 5 | - 2.6 6 | - 2.7 7 | - 3.2 8 | - 3.3 9 | - pypy 10 | 11 | install: 12 | pip install --use-mirrors cython 13 | 14 | script: 15 | - python setup.py clean 16 | - python setup.py build_ext --inplace 17 | - make test 18 | 19 | matrix: 20 | allow_failures: 21 | - python: pypy 22 | -------------------------------------------------------------------------------- /CREDITS.txt: -------------------------------------------------------------------------------- 1 | ======= 2 | Credits 3 | ======= 4 | 5 | Main contributors 6 | ================= 7 | 8 | Stefan Behnel 9 | main developer and maintainer 10 | 11 | Martijn Faassen 12 | creator of lxml and initial main developer 13 | 14 | Ian Bicking 15 | creator and maintainer of lxml.html 16 | 17 | Holger Joukl 18 | ISO-Schematron support, development on lxml.objectify, bug reports, feedback 19 | 20 | Simon Sapin 21 | external maintenance and development of the cssselect package 22 | 23 | Marc-Antoine Parent 24 | XPath extension function help and patches 25 | 26 | Olivier 
Grisel 27 | improved (c)ElementTree compatibility patches, 28 | website improvements. 29 | 30 | Kasimier Buchcik 31 | help with specs and libxml2 32 | 33 | Florian Wagner 34 | help with copy.deepcopy support, bug reporting 35 | 36 | Emil Kroymann 37 | help with encoding support, bug reporting 38 | 39 | Paul Everitt 40 | bug reporting, feedback on API design 41 | 42 | Victor Ng 43 | Discussions on memory management strategies, vlibxml2 44 | 45 | Robert Kern 46 | feedback on API design 47 | 48 | Andreas Pakulat 49 | rpath linking support, doc improvements 50 | 51 | David Sankel 52 | building statically on Windows 53 | 54 | Marcin Kasperski 55 | PDF documentation generation 56 | 57 | Sidnei da Silva 58 | official MS Windows builds 59 | 60 | Pascal Oberndörfer 61 | official Mac-OS builds 62 | 63 | ... and lots of other people who contributed to lxml by reporting 64 | bugs, discussing its functionality or blaming the docs for the bugs in 65 | their code. Thank you all, user feedback and discussions form a very 66 | important part of an Open Source project! 67 | 68 | 69 | Special thanks goes to: 70 | ======================= 71 | 72 | * Daniel Veillard and the libxml2 project for a great XML library. 73 | 74 | * Fredrik Lundh for ElementTree, its API, and the competition through 75 | cElementTree. 76 | 77 | * Greg Ewing (Pyrex) and Robert Bradshaw et al. (Cython) for the 78 | binding technology. 79 | 80 | * Jonathan Stoppani for hosting the new mailing list on lxml.de. 
81 | 82 | * the codespeak crew, in particular Philipp von Weitershausen and 83 | Holger Krekel for originally hosting lxml on codespeak.net 84 | -------------------------------------------------------------------------------- /IDEAS.txt: -------------------------------------------------------------------------------- 1 | Things to try out when life permits 2 | =================================== 3 | 4 | * zlib-based parsing/serialising of compressed in-memory data 5 | 6 | * requires a libxml2 I/O OutputBuffer with appropriate I/O functions 7 | that call into the zlib compression routines 8 | 9 | * lzma-based parsing/serialising of compressed in-memory data 10 | 11 | * requires a libxml2 I/O OutputBuffer with appropriate I/O functions 12 | that call into the lzma compression routines 13 | 14 | * advantage over zlib: probably faster and better compression 15 | 16 | * maybe embed the lzma C sources in the distro 17 | http://www.7-zip.org/sdk.html 18 | 19 | * generating XML using the ``with`` statement 20 | 21 | http://comments.gmane.org/gmane.comp.python.general/579950?set_lines=100000 22 | 23 | * parse-time validation against a user provided DTD 24 | 25 | * currently only works for XML Schema 26 | 27 | * somehow integrate RelaxNG compact notation (rnc versus rng) 28 | 29 | * currently not supported by libxml2 (patch exists) 30 | 31 | * support subclassing XSLTAccessControl to provide custom per-URL 32 | access check methods 33 | 34 | * maybe custom resolvers are enough, or can be combined with this? 35 | 36 | * reimplement iterparse() using the libxml2 xmlReader API 37 | 38 | * Advantage: the implementation can be made safer than the current 39 | SAX implementation, as the parser would not interact with the 40 | Python-level tree. 41 | 42 | * Disadvantage: the tree has to be built manually. In the current 43 | SAX based implementation, libxml2 does it for us. 
44 | 45 | * rewrite iterparse() to accept a parser as argument instead of being 46 | one 47 | 48 | * disadvantage: iterparse() can't deal with all parser options 49 | 50 | * provide an HTMLParser wrapper that handles broken encodings in broken 51 | HTML better, e.g. using BeautifulSoup's "unicode dammit" analyser 52 | 53 | * expose namespace prefixes through the QName class 54 | 55 | -------------------------------------------------------------------------------- /LICENSES.txt: -------------------------------------------------------------------------------- 1 | lxml is copyright Infrae and distributed under the BSD license (see 2 | doc/licenses/BSD.txt), with the following exceptions: 3 | 4 | Some code, such as selftest.py, selftest2.py and 5 | src/lxml/_elementpath.py are derived from ElementTree and 6 | cElementTree. See doc/licenses/elementtree.txt for the license text. 7 | 8 | lxml.cssselect and lxml.html are copyright Ian Bicking and distributed 9 | under the BSD license (see doc/licenses/BSD.txt). 10 | 11 | test.py, the test-runner script, is GPL and copyright Shuttleworth 12 | Foundation. See doc/licenses/GPL.txt. It is believed the unchanged 13 | inclusion of test.py to run the unit test suite falls under the 14 | "aggregation" clause of the GPL and thus does not affect the license 15 | of the rest of the package. 16 | 17 | The doctest.py module is taken from the Python library and falls under 18 | the PSF Python License.
19 | 20 | The isoschematron implementation uses several XSL and RelaxNG resources: 21 | * The (XML syntax) RelaxNG schema for schematron, copyright International 22 | Organization for Standardization (see 23 | src/lxml/isoschematron/resources/rng/iso-schematron.rng for the license 24 | text) 25 | * The skeleton iso-schematron-xslt1 pure-xslt schematron implementation 26 | xsl stylesheets, copyright Rick Jelliffe and Academia Sinica Computing 27 | Center, Taiwan (see the xsl files here for the license text: 28 | src/lxml/isoschematron/resources/xsl/iso-schematron-xslt1/) 29 | * The xsd/rng schema schematron extraction xsl transformations are unlicensed 30 | and copyright the respective authors as noted (see 31 | src/lxml/isoschematron/resources/xsl/RNG2Schtrn.xsl and 32 | src/lxml/isoschematron/resources/xsl/XSD2Schtrn.xsl) 33 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | exclude *.py 2 | include setup.py ez_setup.py setupinfo.py versioninfo.py buildlibxml.py 3 | include test.py selftest.py selftest2.py 4 | include update-error-constants.py 5 | include MANIFEST.in Makefile version.txt 6 | include CHANGES.txt CREDITS.txt INSTALL.txt LICENSES.txt README.rst TODO.txt 7 | recursive-include src *.pyx *.pxd *.pxi *.py 8 | recursive-include src/lxml lxml.etree.c lxml.objectify.c 9 | recursive-include src/lxml lxml.etree.h lxml.etree_api.h etree_defs.h 10 | recursive-include src/lxml/isoschematron *.rng *.xsl *.txt 11 | recursive-include src/lxml/tests *.rng *.xslt *.xml *.dtd *.xsd *.sch *.html *.txt 12 | recursive-include src/lxml/html/tests *.data *.txt 13 | recursive-include samples *.xml 14 | recursive-include benchmark *.py 15 | recursive-include doc *.txt *.html *.css *.xml *.mgp pubkey.asc tagpython*.png Makefile 16 | recursive-include doc/s5/ui *.gif *.htc *.png *.js 17 | recursive-include doc/s5/ep2008 *.py *.png *.rng 18 |
recursive-include fake_pyrex *.py 19 | include doc/*.py 20 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | PYTHON?=python 2 | PYTHON3?=python3 3 | TESTFLAGS=-p -v 4 | TESTOPTS= 5 | SETUPFLAGS= 6 | LXMLVERSION=`cat version.txt` 7 | 8 | PY2_WITH_CYTHON=$(shell $(PYTHON) -c 'import Cython.Compiler' >/dev/null 2>/dev/null && echo " --with-cython" || true) 9 | PY3_WITH_CYTHON=$(shell $(PYTHON3) -c 'import Cython.Compiler' >/dev/null 2>/dev/null && echo " --with-cython" || true) 10 | 11 | all: inplace 12 | 13 | # Build in-place 14 | inplace: 15 | $(PYTHON) setup.py $(SETUPFLAGS) build_ext -i $(PY2_WITH_CYTHON) 16 | 17 | build: 18 | $(PYTHON) setup.py $(SETUPFLAGS) build $(PY2_WITH_CYTHON) 19 | 20 | test_build: build 21 | $(PYTHON) test.py $(TESTFLAGS) $(TESTOPTS) 22 | 23 | test_inplace: inplace 24 | $(PYTHON) test.py $(TESTFLAGS) $(TESTOPTS) 25 | PYTHONPATH=src:$(PYTHONPATH) $(PYTHON) selftest.py 26 | PYTHONPATH=src:$(PYTHONPATH) $(PYTHON) selftest2.py 27 | 28 | test_inplace3: inplace 29 | $(PYTHON3) setup.py $(SETUPFLAGS) build_ext -i $(PY3_WITH_CYTHON) 30 | $(PYTHON3) test.py $(TESTFLAGS) $(TESTOPTS) 31 | PYTHONPATH=src:$(PYTHONPATH) $(PYTHON3) selftest.py 32 | PYTHONPATH=src:$(PYTHONPATH) $(PYTHON3) selftest2.py 33 | 34 | valgrind_test_inplace: inplace 35 | valgrind --tool=memcheck --leak-check=full --num-callers=30 --suppressions=valgrind-python.supp \ 36 | $(PYTHON) test.py 37 | 38 | gdb_test_inplace: inplace 39 | @echo -e "file $(PYTHON)\nrun test.py" > .gdb.command 40 | gdb -x .gdb.command -d src -d src/lxml 41 | 42 | bench_inplace: inplace 43 | $(PYTHON) benchmark/bench_etree.py -i 44 | $(PYTHON) benchmark/bench_xpath.py -i 45 | $(PYTHON) benchmark/bench_xslt.py -i 46 | $(PYTHON) benchmark/bench_objectify.py -i 47 | 48 | ftest_build: build 49 | $(PYTHON) test.py -f $(TESTFLAGS) $(TESTOPTS) 50 | 51 | ftest_inplace: inplace 52 | 
$(PYTHON) test.py -f $(TESTFLAGS) $(TESTOPTS) 53 | 54 | apihtml: inplace 55 | rm -fr doc/html/api 56 | @[ -x "`which epydoc`" ] \ 57 | && (cd src && echo "Generating API docs ..." && \ 58 | PYTHONPATH=. epydoc -v --docformat "restructuredtext en" \ 59 | -o ../doc/html/api --exclude='[.]html[.]tests|[.]_' \ 60 | --exclude-introspect='[.]usedoctest' \ 61 | --name "lxml API" --url / lxml/) \ 62 | || (echo "not generating epydoc API documentation") 63 | 64 | website: inplace 65 | PYTHONPATH=src:$(PYTHONPATH) $(PYTHON) doc/mkhtml.py doc/html . ${LXMLVERSION} 66 | 67 | html: inplace website apihtml s5 68 | 69 | s5: 70 | $(MAKE) -C doc/s5 slides 71 | 72 | apipdf: inplace 73 | rm -fr doc/pdf 74 | mkdir -p doc/pdf 75 | @[ -x "`which epydoc`" ] \ 76 | && (cd src && echo "Generating API docs ..." && \ 77 | PYTHONPATH=. epydoc -v --latex --docformat "restructuredtext en" \ 78 | -o ../doc/pdf --exclude='([.]html)?[.]tests|[.]_' \ 79 | --exclude-introspect='html[.]clean|[.]usedoctest' \ 80 | --name "lxml API" --url / lxml/) \ 81 | || (echo "not generating epydoc API documentation") 82 | 83 | pdf: apipdf 84 | $(PYTHON) doc/mklatex.py doc/pdf . ${LXMLVERSION} 85 | (cd doc/pdf && pdflatex lxmldoc.tex \ 86 | && pdflatex lxmldoc.tex \ 87 | && pdflatex lxmldoc.tex) 88 | @pdfopt doc/pdf/lxmldoc.pdf doc/pdf/lxmldoc-${LXMLVERSION}.pdf 89 | @echo "PDF available as doc/pdf/lxmldoc-${LXMLVERSION}.pdf" 90 | 91 | # Two pdflatex runs are needed to build the correct Table of contents. 92 | 93 | test: test_inplace 94 | 95 | test3: test_inplace3 96 | 97 | valtest: valgrind_test_inplace 98 | 99 | gdbtest: gdb_test_inplace 100 | 101 | bench: bench_inplace 102 | 103 | ftest: ftest_inplace 104 | 105 | clean: 106 | find . 
\( -name '*.o' -o -name '*.so' -o -name '*.py[cod]' -o -name '*.dll' \) -exec rm -f {} \; 107 | rm -rf build 108 | 109 | docclean: 110 | $(MAKE) -C doc/s5 clean 111 | rm -f doc/html/*.html 112 | rm -fr doc/html/api 113 | rm -fr doc/pdf 114 | 115 | realclean: clean docclean 116 | find . -name '*.c' -exec rm -f {} \; 117 | rm -f TAGS 118 | $(PYTHON) setup.py clean -a --without-cython 119 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | What is lxml? 2 | ============= 3 | 4 | lxml is the most feature-rich and easy-to-use library for processing XML and HTML in the Python language. 5 | It's also very fast and memory friendly, just so you know. 6 | 7 | For an introduction and further documentation, see `doc/main.txt`_. 8 | 9 | For installation information, see `INSTALL.txt`_. 10 | 11 | 12 | Support the project 13 | ------------------- 14 | 15 | lxml has been downloaded from the `Python Package Index`_ more than 16 | two million times and is also available directly in many package 17 | distributions, e.g. for Linux or MacOS-X. 18 | 19 | .. _`Python Package Index`: https://pypi.python.org/pypi/lxml 20 | 21 | Most people who use lxml do so because they like using it. 22 | You can show us that you like it by blogging about your experience 23 | with it and linking to the project website. 24 | 25 | If you are using lxml for your work and feel like giving a bit of 26 | your own benefit back to support the project, consider sending us 27 | money through PayPal that we can use for fixing bugs in the software 28 | and improving its features and documentation. Please read the Legal 29 | Notice below, at the bottom of this page. Thank you for your support. 30 | 31 | .. class:: center 32 | 33 | |Donate|_ 34 | 35 | .. 
_Donate: https://www.paypal.com/cgi-bin/webscr?cmd=_s-xclick&hosted_button_id=R56JE3VCPDA9N 36 | 37 | Alternatively, if you prefer expressing your appreciation in a monthly 38 | dose of pennies rather than a dedicated donation, you can also use 39 | Flattr to do so. 40 | 41 | .. class:: center 42 | 43 | |FlattrLink|_ 44 | 45 | .. _FlattrLink: https://flattr.com/thing/268156/lxml-The-Python-XML-Toolkit 46 | 47 | Note that Flattr keeps 10% of the transactions for itself, which is money 48 | you pay that will not reach us. Do not send any larger amounts through 49 | Flattr. Use PayPal for donations instead, or `contact Stefan Behnel`_ for 50 | other ways to support the lxml project, as well as commercial consulting, 51 | customisations and trainings on lxml and fast Python XML processing. 52 | 53 | .. |Donate| image:: https://github.com/lxml/lxml/raw/master/doc/html/paypal_btn_donateCC_LG.gif 54 | :width: 160 55 | :height: 47 56 | :alt: Donate to the lxml project 57 | 58 | .. |FlattrLink| image:: https://github.com/lxml/lxml/raw/master/doc/html/flattr-badge-large.png 59 | :width: 93 60 | :height: 20 61 | :alt: Flattr the lxml project 62 | 63 | .. _`contact Stefan Behnel`: http://consulting.behnel.de/ 64 | .. _`doc/main.txt`: http://lxml.de/ 65 | .. _`INSTALL.txt`: http://lxml.de/installation.html 66 | 67 | 68 | Legal Notice for Donations 69 | -------------------------- 70 | 71 | Any donation that you make to the lxml project is voluntary and 72 | is not a fee for any services, goods, or advantages. By making 73 | a donation to the lxml project, you acknowledge that we have the 74 | right to use the money you donate in any lawful way and for any 75 | lawful purpose we see fit and we are not obligated to disclose 76 | the way and purpose to any party unless required by applicable 77 | law. Although lxml is free software, to the best of our knowledge 78 | the lxml project does not have any tax exempt status.
The lxml 79 | project is neither a registered non-profit corporation nor a 80 | registered charity in any country. Your donation may or may not 81 | be tax-deductible; please consult your tax advisor in this matter. 82 | We will not publish or disclose your name and/or e-mail address 83 | without your consent, unless required by applicable law. Your 84 | donation is non-refundable. 85 | -------------------------------------------------------------------------------- /TODO.txt: -------------------------------------------------------------------------------- 1 | =============== 2 | ToDo's for lxml 3 | =============== 4 | 5 | lxml 6 | ==== 7 | 8 | In general 9 | ---------- 10 | 11 | * more testing on multi-threading 12 | 13 | * better exception messages for XPath and schemas based on error log, 14 | e.g. missing namespace mappings in XPath 15 | 16 | * when building statically, compile everything into one shared library 17 | instead of one for lxml.etree and one for lxml.objectify to prevent 18 | the redundant static linking of the library dependencies. 19 | 20 | * more testing on input/output of encoded filenames, including custom 21 | resolvers, relative XSLT imports, ... 22 | 23 | * always use '' as URL when tree was parsed from string? (can libxml2 24 | handle this?) 25 | 26 | * follow PEP 8 in API naming (avoidCamelCase in_favour_of_underscores) 27 | 28 | * use per-call or per-thread error logs in XSLT/XPath/etc. to keep the 29 | messages separate, especially in exceptions 30 | 31 | * add 'nsmap' parameter to cleanup_namespaces() 32 | 33 | 34 | Entities 35 | -------- 36 | 37 | * clean support for entities (is the Entity element class enough?) 
class BenchMark(benchbase.TreeBenchMark):
    # Benchmarks for lxml.objectify: attribute access, ObjectPath lookup,
    # annotation, type inference and element creation through objectify.E.
    #
    # NOTE(review): the short string literals such as "1 2 4" at the top of
    # most bench_* methods are their docstrings; presumably benchbase reads
    # them to choose which generated trees a benchmark runs on -- confirm
    # against benchbase before editing them or adding docstrings to the
    # methods that have none.  For that reason this class is documented
    # with comments only.

    # Shared iteration ranges used by the repeat loops below.
    repeat100 = range(100)
    repeat1000 = range(1000)
    repeat3000 = range(3000)

    def __init__(self, lib):
        # ``lib`` is accepted for interface compatibility but not used:
        # this benchmark always runs against lxml.etree / lxml.objectify.
        from lxml import etree, objectify
        self.objectify = objectify
        parser = etree.XMLParser(remove_blank_text=True)
        lookup = objectify.ObjectifyElementClassLookup()
        # Use the PEP 8 method name; ``setElementClassLookup`` is only a
        # deprecated alias for it.
        parser.set_element_class_lookup(lookup)
        super(BenchMark, self).__init__(etree, parser)

    # Repeated child lookup by attribute access on the root element.
    @nochange
    def bench_attribute(self, root):
        "1 2 4"
        for i in self.repeat3000:
            root.zzzzz

    # Repeated assignment of an int value to a child attribute.
    def bench_attribute_assign_int(self, root):
        "1 2 4"
        for i in self.repeat3000:
            root.XYZ = 5

    # Repeated assignment of a string value to a child attribute.
    def bench_attribute_assign_string(self, root):
        "1 2 4"
        for i in self.repeat3000:
            root.XYZ = "5"

    # Same as bench_attribute, but a reference to the looked-up child is
    # held in ``cache`` for the duration of the loop.  The local is
    # intentionally unused otherwise (presumably it keeps the element proxy
    # alive so later lookups can reuse it -- confirm).
    @nochange
    def bench_attribute_cached(self, root):
        "1 2 4"
        cache = root.zzzzz
        for i in self.repeat3000:
            root.zzzzz

    # Two-level lookup: attribute access followed by item access with a
    # namespace-qualified tag name.
    @nochange
    def bench_attributes_deep(self, root):
        "1 2 4"
        for i in self.repeat3000:
            root.zzzzz['{cdefg}a00001']

    # Same as bench_attributes_deep, with references to both levels held in
    # otherwise unused locals while the loop runs.
    @nochange
    def bench_attributes_deep_cached(self, root):
        "1 2 4"
        cache1 = root.zzzzz
        cache2 = cache1['{cdefg}a00001']
        for i in self.repeat3000:
            root.zzzzz['{cdefg}a00001']

    # Single-step lookup through an ObjectPath built once outside the loop.
    @nochange
    def bench_objectpath(self, root):
        "1 2 4"
        path = self.objectify.ObjectPath(".zzzzz")
        for i in self.repeat3000:
            path(root)

    # Two-step lookup through an ObjectPath built once outside the loop.
    @nochange
    def bench_objectpath_deep(self, root):
        "1 2 4"
        path = self.objectify.ObjectPath(".zzzzz.{cdefg}a00001")
        for i in self.repeat3000:
            path(root)

    # Same as bench_objectpath_deep, with references to both path targets
    # held in otherwise unused locals while the loop runs.
    @nochange
    def bench_objectpath_deep_cached(self, root):
        "1 2 4"
        cache1 = root.zzzzz
        cache2 = cache1['{cdefg}a00001']
        path = self.objectify.ObjectPath(".zzzzz.{cdefg}a00001")
        for i in self.repeat3000:
            path(root)

    # One objectify.annotate() call over the whole tree.
    @with_text(text=True, utext=True, no_text=True)
    def bench_annotate(self, root):
        self.objectify.annotate(root)

    # One descendantpaths() call on the root element.
    @nochange
    def bench_descendantpaths(self, root):
        root.descendantpaths()

    # Repeated getchildren() on an element of a tree that carries text.
    @nochange
    @with_text(text=True)
    def bench_type_inference(self, root):
        "1 2 4"
        el = root.aaaaa
        for i in self.repeat1000:
            el.getchildren()

    # Same as bench_type_inference, after annotating the element once.
    @nochange
    @with_text(text=True)
    def bench_type_inference_annotated(self, root):
        "1 2 4"
        el = root.aaaaa
        self.objectify.annotate(el)
        for i in self.repeat1000:
            el.getchildren()

    # Builds one small tree with objectify.E per item in ``children``; the
    # items themselves only drive the number of iterations.
    @nochange
    @children
    def bench_elementmaker(self, children):
        E = self.objectify.E
        for child in children:
            root = E.this(
                "test",
                E.will(
                    E.do("nothing"),
                    E.special,
                    )
                )
# XPath benchmarks; every method is restricted to the 'lxe' library via
# @onlylib and leaves the tree unmodified (@nochange).
#
# NOTE(review): benchbase presumably inspects bench_* method docstrings, so
# the methods are documented with comments rather than docstrings -- confirm
# in benchbase before adding any.
class XPathBenchMark(benchbase.TreeBenchMark):
    # Compile the expression once, evaluate it on every child.
    @nochange
    @onlylib('lxe')
    @children
    def bench_xpath_class(self, children):
        first_child = self.etree.XPath("./*[1]")
        for node in children:
            first_child(node)

    # Compile the expression anew for every child before evaluating it.
    @nochange
    @onlylib('lxe')
    @children
    def bench_xpath_class_repeat(self, children):
        for node in children:
            first_child = self.etree.XPath("./*[1]")
            first_child(node)

    # One evaluator bound to the root, evaluated once per root child.
    @nochange
    @onlylib('lxe')
    def bench_xpath_element(self, root):
        evaluator = self.etree.XPathElementEvaluator(root)
        for _ in root:
            evaluator.evaluate("./*[1]")

    # Use the element's own .xpath() method on every child.
    @nochange
    @onlylib('lxe')
    @children
    def bench_xpath_method(self, children):
        for node in children:
            node.xpath("./*[1]")

    # A single union expression matching three namespaced tags.
    @nochange
    @onlylib('lxe')
    @children
    def bench_multiple_xpath_or(self, children):
        union = self.etree.XPath(
            ".//p:a00001|.//p:b00001|.//p:c00001",
            namespaces={'p':'cdefg'})
        for node in children:
            union(node)

    # The iter() counterpart of the union expression: one pass per tag.
    @nochange
    @onlylib('lxe')
    @children
    def bench_multiple_iter_tag(self, children):
        tags = ("{cdefg}a00001", "{cdefg}b00001", "{cdefg}c00001")
        for node in children:
            for tag in tags:
                list(node.iter(tag))

    # Extension function passed through the `extensions` mapping.
    @nochange
    @onlylib('lxe')
    @children
    def bench_xpath_old_extensions(self, children):
        def return_child(_, elements):
            # first child of the first element, or an empty result
            return elements[0][0] if elements else ()

        call_child = self.etree.XPath(
            "t:child(.)",
            namespaces={"t":"test"},
            extensions={("test", "child") : return_child})
        for node in children:
            call_child(node)

    # Extension function registered in a FunctionNamespace and removed
    # again once the benchmark is done.
    @nochange
    @onlylib('lxe')
    @children
    def bench_xpath_extensions(self, children):
        def return_child(_, elements):
            # first child of the first element, or an empty result
            return elements[0][0] if elements else ()

        self.etree.FunctionNamespace("testns")["t"] = return_child

        try:
            call_child = self.etree.XPath(
                "test:t(.)", namespaces={"test":"testns"})
            for node in children:
                call_child(node)
        finally:
            del self.etree.FunctionNamespace("testns")["t"]
class DDTester(DD):
    """Delta-debugging driver that runs test subsets in a forked child.

    A subset counts as failing only when the child process is killed by a
    signal, i.e. when the tests crash the interpreter instead of merely
    failing.
    """

    def _test(self, test_cases):
        """Run *test_cases* in a child process and classify the outcome.

        *test_cases* is a list of tuples whose last item is the test case
        (as built by ``dd_tests()`` via ``enumerate``).

        Returns ``self.PASS`` for an empty subset or when the child was not
        terminated by a signal greater than 2, ``self.FAIL`` otherwise.
        """
        if not test_cases:
            return self.PASS
        write('Running subset of %d tests %s',
              len(test_cases), self.coerce(test_cases))
        test_cases = [item[-1] for item in test_cases]
        pid = os.fork()
        if not pid:
            # child executes tests
            runner = test.CustomTestRunner(cfg, None)
            suite = unittest.TestSuite()
            suite.addTests(test_cases)
            os._exit(not runner.run(suite).wasSuccessful())
        _, status = os.waitpid(pid, 0)
        if status:
            # Decode the wait status with the os.W* helpers.  The previous
            # ``status % 0xFF`` was a modulo by 255, not a bit mask, so plain
            # exit codes >= 3 were misreported as signals (and as crashes).
            exit_code = os.WEXITSTATUS(status) if os.WIFEXITED(status) else 0
            signal_no = os.WTERMSIG(status) if os.WIFSIGNALED(status) else 0
            write('exit status: %d, signal: %d', exit_code, signal_no)
            # As before, signals 1 and 2 are not treated as a crash.
            if signal_no > 2:
                return self.FAIL
        return self.PASS

    def coerce(self, test_cases):
        """Return a short ``[first .. last]`` description of *test_cases*."""
        if not test_cases:
            return '[]'
        test_cases = [item[-1] for item in test_cases]
        return '[%s .. %s]' % (test_cases[0].id(), test_cases[-1].id())
A Cython_ 26 | (or Pyrex_) module should start like this:: 27 | 28 | # My Cython extension 29 | 30 | # import the public functions and classes of lxml.etree 31 | cimport etreepublic as cetree 32 | 33 | # import the lxml.etree module in Python 34 | cdef object etree 35 | from lxml import etree 36 | 37 | # initialize the access to the C-API of lxml.etree 38 | cetree.import_lxml__etree() 39 | 40 | From this line on, you can access all public functions of lxml.etree 41 | from the ``cetree`` namespace like this:: 42 | 43 | # build a tag name from namespace and element name 44 | py_tag = cetree.namespacedNameFromNsName("http://some/url", "myelement") 45 | 46 | Public lxml classes are easily subclassed. For example, to implement 47 | and set a new default element class, you can write Cython code like 48 | the following:: 49 | 50 | from etreepublic cimport ElementBase 51 | cdef class NewElementClass(ElementBase): 52 | def set_value(self, myval): 53 | self.set("my_attribute", myval) 54 | 55 | etree.set_element_class_lookup( 56 | etree.DefaultElementClassLookup(element=NewElementClass)) 57 | 58 | 59 | Writing external modules in C 60 | ----------------------------- 61 | 62 | If you really feel like it, you can also interface with lxml.etree straight 63 | from C code. All you have to do is include the header file for the public 64 | API, import the ``lxml.etree`` module and then call the import function: 65 | 66 | .. 
sourcecode:: c 67 | 68 | /* My C extension */ 69 | 70 | /* common includes */ 71 | #include "Python.h" 72 | #include "stdio.h" 73 | #include "string.h" 74 | #include "stdarg.h" 75 | #include "libxml/xmlversion.h" 76 | #include "libxml/encoding.h" 77 | #include "libxml/hash.h" 78 | #include "libxml/tree.h" 79 | #include "libxml/xmlIO.h" 80 | #include "libxml/xmlsave.h" 81 | #include "libxml/globals.h" 82 | #include "libxml/xmlstring.h" 83 | 84 | /* lxml.etree specific includes */ 85 | #include "lxml-version.h" 86 | #include "etree_defs.h" 87 | #include "etree.h" 88 | 89 | /* setup code */ 90 | import_lxml__etree() 91 | 92 | Note that including ``etree.h`` does not automatically include the 93 | header files it requires. Note also that the above list of common 94 | includes may not be sufficient. 95 | -------------------------------------------------------------------------------- /doc/cssselect.txt: -------------------------------------------------------------------------------- 1 | ============== 2 | lxml.cssselect 3 | ============== 4 | 5 | lxml supports a number of interesting languages for tree traversal and element 6 | selection. The most important is obviously XPath_, but there is also 7 | ObjectPath_ in the `lxml.objectify`_ module. The newest child of this family 8 | is `CSS selection`_, which is made available in form of the ``lxml.cssselect`` 9 | module. 10 | 11 | Although it started its life in lxml, cssselect_ is now an independent project. 12 | It translates CSS selectors to XPath 1.0 expressions that can be used with 13 | lxml's XPath engine. ``lxml.cssselect`` adds a few convenience shortcuts into 14 | that package. 15 | 16 | 17 | .. _XPath: xpathxslt.html#xpath 18 | .. _ObjectPath: objectify.html#objectpath 19 | .. _`lxml.objectify`: objectify.html 20 | .. _`CSS selection`: http://www.w3.org/TR/CSS21/selector.html 21 | .. _cssselect: http://packages.python.org/cssselect/ 22 | 23 | .. contents:: 24 | .. 
 25 | 1 The CSSSelector class 26 | 2 CSS Selectors 27 | 2.1 Namespaces 28 | 3 Limitations 29 | 30 | 31 | The CSSSelector class 32 | ===================== 33 | 34 | The most important class in the ``lxml.cssselect`` module is ``CSSSelector``. It 35 | provides the same interface as the XPath_ class, but accepts a CSS selector 36 | expression as input: 37 | 38 | .. sourcecode:: pycon 39 | 40 | >>> from lxml.cssselect import CSSSelector 41 | >>> sel = CSSSelector('div.content') 42 | >>> sel #doctest: +ELLIPSIS 43 | <CSSSelector ... for 'div.content'> 44 | >>> sel.css 45 | 'div.content' 46 | 47 | The selector actually compiles to XPath, and you can see the 48 | expression by inspecting the object: 49 | 50 | .. sourcecode:: pycon 51 | 52 | >>> sel.path 53 | "descendant-or-self::div[@class and contains(concat(' ', normalize-space(@class), ' '), ' content ')]" 54 | 55 | To use the selector, simply call it with a document or element 56 | object: 57 | 58 | .. sourcecode:: pycon 59 | 60 | >>> from lxml.etree import fromstring 61 | >>> h = fromstring('''<div id="outer">
 62 | ...   <div id="inner" class="content body">
 63 | ...       text 64 | ...   </div></div>
''') 65 | >>> [e.get('id') for e in sel(h)] 66 | ['inner'] 67 | 68 | Using ``CSSSelector`` is equivalent to translating with ``cssselect`` 69 | and using the ``XPath`` class: 70 | 71 | .. sourcecode:: pycon 72 | 73 | >>> from cssselect import GenericTranslator 74 | >>> from lxml.etree import XPath 75 | >>> sel = XPath(GenericTranslator().css_to_xpath('div.content')) 76 | 77 | ``CSSSelector`` takes a ``translator`` parameter to let you choose which 78 | translator to use. It can be ``'xml'`` (the default), ``'xhtml'``, ``'html'`` 79 | or a `Translator object`_. 80 | 81 | .. _Translator object: http://packages.python.org/cssselect/#cssselect.GenericTranslator 82 | 83 | 84 | The cssselect method 85 | ==================== 86 | 87 | lxml ``Element`` objects have a ``cssselect`` convenience method. 88 | 89 | .. sourcecode:: pycon 90 | 91 | >>> h.cssselect('div.content') == sel(h) 92 | True 93 | 94 | Note however that pre-compiling the expression with the ``CSSSelector`` or 95 | ``XPath`` class can provide a substantial speedup. 96 | 97 | The method also accepts a ``translator`` parameter. On ``HtmlElement`` 98 | objects, the default is changed to ``'html'``. 99 | 100 | 101 | Supported Selectors 102 | =================== 103 | 104 | Most `Level 3`_ selectors are supported. The details are in the 105 | `cssselect documentation`_. 106 | 107 | .. _Level 3: http://www.w3.org/TR/2011/REC-css3-selectors-20110929/ 108 | .. _cssselect documentation: http://packages.python.org/cssselect/#supported-selectors 109 | 110 | 111 | Namespaces 112 | ========== 113 | 114 | In CSS you can use ``namespace-prefix|element``, similar to 115 | ``namespace-prefix:element`` in an XPath expression. In fact, it maps 116 | one-to-one, and the same rules are used to map namespace prefixes to 117 | namespace URIs: the ``CSSSelector`` class accepts a dictionary as its 118 | ``namespaces`` argument. 
import os

# funding.txt is optional: it is only added to the site when it exists in
# the directory above this file.
funding = (
    ('../funding.txt',)
    if os.path.exists(
        os.path.join(os.path.dirname(__file__), '..', 'funding.txt'))
    else ()
)

# (section title, source files) pairs, in site order.
# NOTE(review): entries starting with '@' look like link placeholders that
# are resolved through HREF_MAP rather than source files -- confirm in the
# doc build scripts.  ('lxml2.txt' is deliberately left out of the first
# section.)
SITE_STRUCTURE = [
    ('lxml',
     ('main.txt',
      'intro.txt',
      '../INSTALL.txt',
      'performance.txt',
      'compatibility.txt',
      'FAQ.txt') + funding),
    ('Developing with lxml',
     ('tutorial.txt',
      '@API reference',
      'api.txt',
      'parsing.txt',
      'validation.txt',
      'xpathxslt.txt',
      'objectify.txt',
      'lxmlhtml.txt',
      'cssselect.txt',
      'elementsoup.txt',
      'html5parser.txt')),
    ('Extending lxml',
     ('resolvers.txt',
      'extensions.txt',
      'element_classes.txt',
      'sax.txt',
      'capi.txt')),
    ('Developing lxml',
     ('build.txt',
      'lxml-source-howto.txt',
      '@Release Changelog',
      '../CREDITS.txt')),
]

# Link targets keyed by placeholder title.
HREF_MAP = {"API reference": "api/index.html"}

# Source base names whose generated page gets a different name.
BASENAME_MAP = {
    'main': 'index',
    'INSTALL': 'installation',
    'CREDITS': 'credits',
}
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/aglyzov/lxml/98efdc08f9886af20b48b225873f96a636f06056/doc/html/proxies.png -------------------------------------------------------------------------------- /doc/html/python-xml-title.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aglyzov/lxml/98efdc08f9886af20b48b225873f96a636f06056/doc/html/python-xml-title.png -------------------------------------------------------------------------------- /doc/html/python-xml.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aglyzov/lxml/98efdc08f9886af20b48b225873f96a636f06056/doc/html/python-xml.png -------------------------------------------------------------------------------- /doc/html/tagpython-big.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aglyzov/lxml/98efdc08f9886af20b48b225873f96a636f06056/doc/html/tagpython-big.png -------------------------------------------------------------------------------- /doc/html5parser.txt: -------------------------------------------------------------------------------- 1 | =============== 2 | html5lib Parser 3 | =============== 4 | 5 | `html5lib`_ is a Python package that implements the HTML5 parsing algorithm 6 | which is heavily influenced by current browsers and based on the `WHATWG 7 | HTML5 specification`_. 8 | 9 | .. _html5lib: http://code.google.com/p/html5lib/ 10 | .. _BeautifulSoup: http://www.crummy.com/software/BeautifulSoup/ 11 | .. _WHATWG HTML5 specification: http://www.whatwg.org/specs/web-apps/current-work/ 12 | 13 | lxml can benefit from the parsing capabilities of `html5lib` through 14 | the ``lxml.html.html5parser`` module. 
 It provides a similar interface 15 | to the ``lxml.html`` module by providing ``fromstring()``, 16 | ``parse()``, ``document_fromstring()``, ``fragment_fromstring()`` and 17 | ``fragments_fromstring()`` that work like the regular html parsing 18 | functions. 19 | 20 | 21 | Differences to regular HTML parsing 22 | =================================== 23 | 24 | There are a few differences in the returned tree to the regular HTML 25 | parsing functions from ``lxml.html``. html5lib normalizes some elements 26 | and element structures to a common format. For example even if a table 27 | does not have a `tbody` html5lib will inject one automatically: 28 | 29 | .. sourcecode:: pycon 30 | 31 | >>> from lxml.html import tostring, html5parser 32 | >>> tostring(html5parser.fromstring("<table><td>foo"))
 33 | '<table><tbody><tr><td>foo</td></tr></tbody></table>' 34 | 35 | Also the parameters the functions accept are different. 36 | 37 | 38 | Function Reference 39 | ================== 40 | 41 | ``parse(filename_url_or_file)``: 42 | Parses the named file or url, or if the object has a ``.read()`` 43 | method, parses from that. 44 | 45 | ``document_fromstring(html, guess_charset=True)``: 46 | Parses a document from the given string. This always creates a 47 | correct HTML document, which means the parent node is ``<html>``, 48 | and there is a body and possibly a head. 49 | 50 | If a bytestring is passed and ``guess_charset`` is true the chardet 51 | library (if installed) will guess the charset if ambiguities exist. 52 | 53 | ``fragment_fromstring(string, create_parent=False, guess_charset=False)``: 54 | Returns an HTML fragment from a string. The fragment must contain 55 | just a single element, unless ``create_parent`` is given; 56 | e.g., ``fragment_fromstring(string, create_parent='div')`` will 57 | wrap the element in a ``<div>``. If ``create_parent`` is true the 58 | default parent tag (div) is used. 59 | 60 | If a bytestring is passed and ``guess_charset`` is true the chardet 61 | library (if installed) will guess the charset if ambiguities exist. 62 | 63 | ``fragments_fromstring(string, no_leading_text=False, guess_charset=False, parser=None)``: 64 | Returns a list of the elements found in the fragment. The first item in 65 | the list may be a string. If ``no_leading_text`` is true, then it will 66 | be an error if there is leading text, and it will always be a list of 67 | only elements. 68 | 69 | If a bytestring is passed and ``guess_charset`` is true the chardet 70 | library (if installed) will guess the charset if ambiguities exist. 71 | 72 | ``fromstring(string)``: 73 | Returns ``document_fromstring`` or ``fragment_fromstring``, based 74 | on whether the string looks like a full document, or just a 75 | fragment. 76 | 77 | Additionally all parsing functions accept a ``parser`` keyword argument 78 | that can be set to a custom parser instance. To create custom parsers 79 | you can subclass the ``HTMLParser`` and ``XHTMLParser`` from the same 80 | module. Note that these are the parser classes provided by html5lib. 81 | -------------------------------------------------------------------------------- /doc/intro.txt: -------------------------------------------------------------------------------- 1 | Why lxml? 2 | ========= 3 | 4 | .. contents:: 5 | .. 6 | 1 Motto 7 | 2 Aims 8 | 9 | 10 | Motto 11 | ----- 12 | 13 | "the thrills without the strangeness" 14 | 15 | To explain the motto: 16 | 17 | "Programming with libxml2 is like the thrilling embrace of an exotic stranger. 18 | It seems to have the potential to fulfill your wildest dreams, but there's a 19 | nagging voice somewhere in your head warning you that you're about to get 20 | screwed in the worst way."
(`a quote by Mark Pilgrim`_) 21 | 22 | Mark Pilgrim was describing in particular the experience a Python programmer 23 | has when dealing with libxml2. The default Python bindings of libxml2 are 24 | fast, thrilling, powerful, and your code might fail in some horrible way that 25 | you really shouldn't have to worry about when writing Python code. lxml 26 | combines the power of libxml2 with the ease of use of Python. 27 | 28 | .. _`a quote by Mark Pilgrim`: http://diveintomark.org/archives/2004/02/18/libxml2 29 | 30 | 31 | Aims 32 | ---- 33 | 34 | The C libraries libxml2_ and libxslt_ have huge benefits: 35 | 36 | * Standards-compliant XML support. 37 | 38 | * Support for (broken) HTML. 39 | 40 | * Full-featured. 41 | 42 | * Actively maintained by XML experts. 43 | 44 | * fast. fast! FAST! 45 | 46 | .. _libxml2: http://www.xmlsoft.org 47 | 48 | .. _libxslt: http://xmlsoft.org/XSLT 49 | 50 | 51 | These libraries already ship with Python bindings, but these Python bindings 52 | mimic the C-level interface. This yields a number of problems: 53 | 54 | * very low level and C-ish (not Pythonic). 55 | 56 | * underdocumented and huge, you get lost in them. 57 | 58 | * UTF-8 in API, instead of Python unicode strings. 59 | 60 | * Can easily cause segfaults from Python. 61 | 62 | * Require manual memory management! 63 | 64 | 65 | lxml is a new Python binding for libxml2 and libxslt, completely independent 66 | from these existing Python bindings. Its aims: 67 | 68 | * Pythonic API. 69 | 70 | * Documented. 71 | 72 | * Use Python unicode strings in API. 73 | 74 | * Safe (no segfaults). 75 | 76 | * No manual memory management! 77 | 78 | lxml aims to provide a Pythonic API by following as much as possible the 79 | `ElementTree API`_. We're trying to avoid inventing too many new APIs, or you 80 | having to learn new things -- XML is complicated enough. 81 | 82 | .. 
_`ElementTree API`: http://effbot.org/zone/element-index.htm 83 | -------------------------------------------------------------------------------- /doc/licenses/BSD.txt: -------------------------------------------------------------------------------- 1 | Copyright (c) 2004 Infrae. All rights reserved. 2 | 3 | Redistribution and use in source and binary forms, with or without 4 | modification, are permitted provided that the following conditions are 5 | met: 6 | 7 | 1. Redistributions of source code must retain the above copyright 8 | notice, this list of conditions and the following disclaimer. 9 | 10 | 2. Redistributions in binary form must reproduce the above copyright 11 | notice, this list of conditions and the following disclaimer in 12 | the documentation and/or other materials provided with the 13 | distribution. 14 | 15 | 3. Neither the name of Infrae nor the names of its contributors may 16 | be used to endorse or promote products derived from this software 17 | without specific prior written permission. 18 | 19 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 20 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 21 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 22 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INFRAE OR 23 | CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 24 | EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 25 | PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 26 | PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 27 | LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 28 | NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 29 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
30 | -------------------------------------------------------------------------------- /doc/licenses/ZopePublicLicense.txt: -------------------------------------------------------------------------------- 1 | Zope Public License (ZPL) Version 2.0 2 | ----------------------------------------------- 3 | 4 | This software is Copyright (c) Zope Corporation (tm) and 5 | Contributors. All rights reserved. 6 | 7 | This license has been certified as open source. It has also 8 | been designated as GPL compatible by the Free Software 9 | Foundation (FSF). 10 | 11 | Redistribution and use in source and binary forms, with or 12 | without modification, are permitted provided that the 13 | following conditions are met: 14 | 15 | 1. Redistributions in source code must retain the above 16 | copyright notice, this list of conditions, and the following 17 | disclaimer. 18 | 19 | 2. Redistributions in binary form must reproduce the above 20 | copyright notice, this list of conditions, and the following 21 | disclaimer in the documentation and/or other materials 22 | provided with the distribution. 23 | 24 | 3. The name Zope Corporation (tm) must not be used to 25 | endorse or promote products derived from this software 26 | without prior written permission from Zope Corporation. 27 | 28 | 4. The right to distribute this software or to use it for 29 | any purpose does not give you the right to use Servicemarks 30 | (sm) or Trademarks (tm) of Zope Corporation. Use of them is 31 | covered in a separate agreement (see 32 | http://www.zope.com/Marks). 33 | 34 | 5. If any files are modified, you must cause the modified 35 | files to carry prominent notices stating that you changed 36 | the files and the date of any change. 37 | 38 | Disclaimer 39 | 40 | THIS SOFTWARE IS PROVIDED BY ZOPE CORPORATION ``AS IS'' 41 | AND ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT 42 | NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY 43 | AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN 44 | NO EVENT SHALL ZOPE CORPORATION OR ITS CONTRIBUTORS BE 45 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 46 | EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 47 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 48 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 49 | HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 50 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE 51 | OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 52 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH 53 | DAMAGE. 54 | 55 | 56 | This software consists of contributions made by Zope 57 | Corporation and many individuals on behalf of Zope 58 | Corporation. Specific attributions are listed in the 59 | accompanying credits file. 60 | -------------------------------------------------------------------------------- /doc/licenses/elementtree.txt: -------------------------------------------------------------------------------- 1 | The ElementTree / XML Toys Library is 2 | 3 | Copyright (c) 1999-2003 by Secret Labs AB 4 | Copyright (c) 1999-2003 by Fredrik Lundh 5 | 6 | By obtaining, using, and/or copying this software and/or its 7 | associated documentation, you agree that you have read, understood, 8 | and will comply with the following terms and conditions: 9 | 10 | Permission to use, copy, modify, and distribute this software and its 11 | associated documentation for any purpose and without fee is hereby 12 | granted, provided that the above copyright notice appears in all 13 | copies, and that both that copyright notice and this permission notice 14 | appear in supporting documentation, and that the name of Secret Labs 15 | AB or the author not be used in advertising or publicity pertaining to 16 | distribution of the software without specific, written prior 17 | permission. 
18 | 19 | SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO 20 | THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND 21 | FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR 22 | ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 23 | WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 24 | ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT 25 | OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 26 | -------------------------------------------------------------------------------- /doc/lxml.mgp: -------------------------------------------------------------------------------- 1 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 2 | %deffont "standard" xfont "helvetica-medium-r" 3 | %deffont "thick" xfont "helvetica-bold-r" 4 | %deffont "typewriter" xfont "courier-medium-r" 5 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 6 | %% 7 | %% Default settings per each line numbers. 8 | %% 9 | %default 1 area 90 90, leftfill, size 2, fore "gray20", back "white", font "standard", hgap 0 10 | %default 2 size 7, vgap 10, prefix " ", ccolor "blue" 11 | %default 3 size 2, bar "gray70", vgap 10 12 | %default 4 size 5, fore "gray20", vgap 30, prefix " ", font "standard" 13 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 14 | %% 15 | %% Default settings that are applied to TAB-indented lines. 
 16 | %% 17 | %tab 1 size 5, vgap 40, prefix " ", icon box "red" 50 18 | %tab 2 size 4, vgap 40, prefix " ", icon arc "yellow" 50 19 | %tab 3 size 3, vgap 40, prefix " ", icon delta3 "white" 40 20 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 21 | %page 22 | 23 | lxml - a sane Python wrapper for libxml 24 | 25 | 26 | 27 | %center 28 | Martijn Faassen, Infrae 29 | faassen@infrae.com 30 | 31 | %page 32 | 33 | The C library libxml has huge benefits 34 | 35 | 36 | Standards-compliant XML support 37 | 38 | full-featured 39 | 40 | actively maintained by XML experts 41 | 42 | fast. fast! FAST! 43 | 44 | %page 45 | 46 | Features of libxml 47 | 48 | 49 | Parsing 50 | 51 | Tree based (DOM-ish) XML structure 52 | 53 | XPath support 54 | 55 | XSLT support (libxslt) 56 | 57 | Relax NG (schema) support 58 | 59 | And more 60 | 61 | %page 62 | 63 | But libxml already has Python bindings! 64 | 65 | 66 | very low level and C-ish (not Pythonic) 67 | 68 | underdocumented. huge, you get lost in them 69 | 70 | works with UTF-8, not native Python unicode 71 | 72 | can cause segfaults from Python 73 | 74 | have to do manual memory management! 75 | 76 | %page 77 | 78 | lxml is a new Python binding for libxml 79 | 80 | Aims (read: TODOS) 81 | 82 | Pythonic API 83 | 84 | Documented 85 | 86 | Use Python unicode strings in API 87 | 88 | Safe (no segfaults) 89 | 90 | No manual memory management! 91 | 92 | %page 93 | 94 | Tradeoffs 95 | 96 | 97 | Slower because of better wrapping. 98 | 99 | But libxml is so fast this likely doesn't matter much. 100 | 101 | Not all features of libxml exposed (unless you help) 102 | 103 | %page 104 | 105 | What is there now - Proof of concept 106 | 107 | 108 | Automatic destruction of documents (refcounted) 109 | 110 | Start of ElementTree style API for tree 111 | 112 | %page 113 | 114 | Future 115 | 116 | 117 | Fix bugs, add features 118 | 119 | Moving into svn repository on codespeak.net 120 | 121 | Help! 
122 | 123 | -------------------------------------------------------------------------------- /doc/memorymanagement.txt: -------------------------------------------------------------------------------- 1 | Memory management 2 | ================= 3 | 4 | There can be two types of nodes: 5 | 6 | * those connected to an existing tree 7 | 8 | * those unconnected. These may be the top node of a tree 9 | 10 | Nodes consist of a C-level libxml2 node, Node for short, and 11 | optionally a Python-level proxy node, Proxy. Zero, one or more Proxies can 12 | exist for a single Node. 13 | 14 | Proxies are garbage collected automatically by Python. Nodes are not 15 | garbage collected at all. Instead, explicit mechanisms exist for 16 | Nodes to clear them and the tree they may be the top of. 17 | 18 | A Node can be safely freed when: 19 | 20 | * no Proxy is connected to this Node 21 | 22 | * no Proxy can be created for this Node 23 | 24 | A Proxy cannot be created for a Node when: 25 | 26 | * no Proxy exists for nodes that are connected to that Node 27 | 28 | This is the case when: 29 | 30 | * the Node is in a tree that has no Proxy connected to any of the nodes. 31 | 32 | This means that the whole tree in such a condition can be freed. 33 | 34 | Detecting whether a Node is in a tree that has no Proxies connected to 35 | it can be done by relying on Python's garbage collection 36 | algorithm. Each Proxy can have a reference to the Proxy that points to 37 | the top of the tree. In case of a document tree, this reference is to 38 | the Document Proxy. When no more references exist in the system to the 39 | top Proxy, this means no more Proxies exist that point to the Node 40 | tree the top Proxy is the top of. If this Node tree is unconnected; 41 | i.e. it is not a subtree, this means that tree can be safely garbage 42 | collected. 43 | 44 | A special case exists for document references. 
Each Proxy will always 45 | have a reference to the Document Proxy, as any Node will have such a 46 | reference to the Document Node. This means that a Document Node can 47 | only be garbage collected when no more Proxies at all exist anymore 48 | which refer to the Document. This is a separate system from the 49 | top-Node references, even though the top-node in many cases will be 50 | the Document. This is because there is no way to get to a node that is 51 | not connected to the Document tree from a Document Proxy. 52 | 53 | This approach requires a system that can keep track of the top of the 54 | tree in any case. Usually this is simple: when a Proxy gets connected, 55 | the tree top becomes the tree top of whatever node it is connected 56 | to. 57 | 58 | Sometimes this is more difficult: a Proxy may exist pointing to a node 59 | in a subtree that just got connected. The top reference cannot be 60 | updated. This is a problem in the following case: 61 | 62 | a 63 | b c h 64 | d e f g i j 65 | k 66 | 67 | now imagine we have a proxy to k, K, and a proxy of i, I. They both 68 | have a pointer to proxy H. 69 | 70 | Now imagine i gets moved under g through proxy I. Proxy I will have an 71 | updated pointer to proxy A. However, proxy K cannot be updated and still 72 | points to H, from which it is now in fact disconnected. 73 | 74 | proxy H cannot be removed now until proxy A is removed. In addition, 75 | proxy A has a refcount that is too low because proxy K doesn't point 76 | to it but should. 77 | 78 | Another strategy involves having a reference count on the underlying 79 | nodes, one per proxy. A node can only be freed if there is no 80 | descendant-or-self that has the refcount higher than 0. A node, when 81 | no more Python references to it exist, will check for refcounts first. 82 | The drawback of this is potentially heavy tree-walking each time a proxy 83 | can be removed. 
84 | -------------------------------------------------------------------------------- /doc/pubkey.asc: -------------------------------------------------------------------------------- 1 | -----BEGIN PGP PUBLIC KEY BLOCK----- 2 | Version: GnuPG v1.4.2 (GNU/Linux) 3 | 4 | mQGiBEQf3JQRBACciSqxoX0q3VurkRENVVtG/pVqtFh/d2CohbVJlLCrO4s7nnPj 5 | CTfZFt6tmykZjsLJl24XpEJt0O/C0jLcaBqvXVgVvRXHz4DjEYYuQF4LPthhI4MA 6 | 4T7ExptX4lU5g3BVJ46vPU8uRBbbxarBRas9rYewgnrYKWpZZCa7yMq+9wCgnyyR 7 | Si4E3viLwi77jda135nA6vcD/iqu8zIl9/dFuUcOvxJrhrm+UdY72puZ1TVczSAH 8 | GOqMjrKkfyHlaJh/ZzWENpTZIfOdVhy7Chvva18vH4Wz7jKj5UeIpRrBvjAD28r3 9 | Y3W5bfsnpPkvDOyU1vqBsw4q+/250GXEX0JqV2Rbf5yLVgEZPdGrswO460dr4UVS 10 | 8RS0BACYTmyrz57AugHc5tRkqNw6o7ux2deOT0c3AbUcOWtOocGumCsUf+M1nOrc 11 | VWkeBWTv4HIIiecWYY/KwIemTthQGjxywaZDxOlBT0BOL/+vfYTq/plZULXr+g90 12 | rSe82+kLl9N5onkBDJKeDIcJDzRoxIRPV1i0Om/5JBI4jmUnv7QnU3RlZmFuIEJl 13 | aG5lbCA8c2NvZGVyQHVzZXJzLmJlcmxpb3MuZGU+iF8EExECACAFAkQiqKYCGwMG 14 | CwkIBwMCBBUCCAMEFgIDAQIeAQIXgAAKCRANPVNpCNOgHi+2AJ0a0JH8iP3RqrOL 15 | JefvHz1dSl3MxACYo7Ma6CeIgsGnyaSSdNOmNVXn+IhGBBARAgAGBQJEIqk0AAoJ 16 | ELO5mMzzmgZbmCcAoKZ2En1IlsxBpaPPxgWYrUOWfc6hAKCBWODMMOYptCBkSrjg 17 | m3gsrjHgYbQsU3RlZmFuIEJlaG5lbCA8c2NvZGVyQHVzZXJzLnNvdXJjZWZvcmdl 18 | Lm5ldD6IYAQTEQIAIAUCRB/clAIbAwYLCQgHAwIEFQIIAwQWAgMBAh4BAheAAAoJ 19 | EA09U2kI06Aen2YAn0hvuDs+Gslq9vPRFFbsFNJI40PmAJ0chjiiEy0xV5C+n6YX 20 | XFuldRDILYhGBBARAgAGBQJEIp4AAAoJELO5mMzzmgZbgKQAn3pWrmFdj8YaEyuR 21 | tEjKVZJDQ6ZVAJ0Y1igwADT40BPra+G/xiLa3YbCrrkCDQREH9ynEAgAiR4/0r0d 22 | doViNECfSLClllu5K0Bo1SEiMtvVNC3sJYgVzBddD8Xn8UAdjyAgmaL5FC2FsNQu 23 | RxxKkNlHNYCq8ZSWtZaL2MQ+SyMUyHv6VXVCGuSW0COpzbx58u+SZpjyESJ1kaZc 24 | 73SaIw6kv/dVQHjurwmlo1lg3dLZ3PG08WGCYUMqkkv2K+J7+puzE2Cjo31gTq4s 25 | LYDCV26wjVQ6BqT2EcHQhVEjh0xq5ugc908cr/2FQAKkTifEbF+OVBGWiFMGgri+ 26 | 6+G54/BV/RakpvNCFYBiZHn/M9mQaWt7XoTmnEQ1ldq5KNlRhkqnQRF/NK5VpGcQ 27 | 29As28aqpZTECwADBgf/WlRvBRI1Q1eIv2falEv7C6sOxqc3kr5z1uUBTRG5v9t6 28 | 
ff9k/J4oC6cnQx00GK3ZR8ija6bl8zwu+0m0M3rW49Krb1rsiT7r4ahOZ7p9RRro 29 | oG3NbUJYgMG10D1nxpaioYqa/m+PpILJM0wfYZZEuX0xkZcOB24yb+J7EIcGR09T 30 | mMd5sXtdTU+w/p7Xi2cP61uQ8qixyHBH8E06qgW2JtVFV9rGn7CNUOvkNaUBRnY5 31 | QxhdkvKJRx7voOLYWZFUBIWgto+6vmTgKmc2Ho6qddzME9UgwUNcknRgm0cf6Cxr 32 | 6zPtxZl8a6KemjQcK7kARSmMNCDkqp/Pohe519A5vYhJBBgRAgAJBQJEH9ynAhsM 33 | AAoJEA09U2kI06Aesv4AnjiVQVLzqnNS/64vvMMP1UARY3HtAJ90YxNGhRNIhWYL 34 | UU16oJlGD/9M1Q== 35 | =gWy2 36 | -----END PGP PUBLIC KEY BLOCK----- 37 | -------------------------------------------------------------------------------- /doc/rest2html.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | """ 4 | A minimal front end to the Docutils Publisher, producing HTML with 5 | Pygments syntax highlighting. 6 | """ 7 | 8 | # Set to True if you want inline CSS styles instead of classes 9 | INLINESTYLES = False 10 | 11 | 12 | try: 13 | import locale 14 | locale.setlocale(locale.LC_ALL, '') 15 | except: 16 | pass 17 | 18 | # set up Pygments 19 | 20 | from pygments.formatters import HtmlFormatter 21 | 22 | # The default formatter 23 | DEFAULT = HtmlFormatter(noclasses=INLINESTYLES, cssclass='syntax') 24 | 25 | # Add name -> formatter pairs for every variant you want to use 26 | VARIANTS = { 27 | # 'linenos': HtmlFormatter(noclasses=INLINESTYLES, linenos=True), 28 | } 29 | 30 | 31 | from docutils import nodes 32 | from docutils.parsers.rst import directives 33 | 34 | from pygments import highlight 35 | from pygments.lexers import get_lexer_by_name, TextLexer 36 | 37 | def pygments_directive(name, arguments, options, content, lineno, 38 | content_offset, block_text, state, state_machine): 39 | try: 40 | lexer = get_lexer_by_name(arguments[0]) 41 | except ValueError, e: 42 | # no lexer found - use the text one instead of an exception 43 | lexer = TextLexer() 44 | # take an arbitrary option if more than one is given 45 | formatter = options and VARIANTS[options.keys()[0]] or 
DEFAULT 46 | parsed = highlight(u'\n'.join(content), lexer, formatter) 47 | return [nodes.raw('', parsed, format='html')] 48 | 49 | pygments_directive.arguments = (1, 0, 1) 50 | pygments_directive.content = 1 51 | pygments_directive.options = dict([(key, directives.flag) for key in VARIANTS]) 52 | 53 | directives.register_directive('sourcecode', pygments_directive) 54 | 55 | 56 | # run the generation 57 | 58 | from docutils.core import publish_cmdline, default_description 59 | 60 | description = ('Generates (X)HTML documents from standalone reStructuredText ' 61 | 'sources. ' + default_description) 62 | 63 | publish_cmdline(writer_name='html', description=description) 64 | -------------------------------------------------------------------------------- /doc/rest2latex.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | # Testing: 4 | # python rest2latex.py objectify.txt > latex/objectify.tex 5 | 6 | """ 7 | A minimal front end to the Docutils Publisher, producing LaTeX with 8 | some syntax highlighting. 
9 | """ 10 | 11 | # Set to True if you want inline CSS styles instead of classes 12 | INLINESTYLES = False 13 | 14 | 15 | try: 16 | import locale 17 | locale.setlocale(locale.LC_ALL, '') 18 | except: 19 | pass 20 | 21 | # set up Pygments 22 | 23 | from pygments.formatters import LatexFormatter 24 | 25 | # The default formatter 26 | DEFAULT = LatexFormatter() 27 | 28 | # Add name -> formatter pairs for every variant you want to use 29 | VARIANTS = { 30 | # 'linenos': HtmlFormatter(noclasses=INLINESTYLES, linenos=True), 31 | } 32 | 33 | 34 | from docutils import nodes 35 | from docutils.parsers.rst import directives 36 | 37 | from pygments import highlight 38 | from pygments.lexers import get_lexer_by_name, TextLexer 39 | 40 | def pygments_directive(name, arguments, options, content, lineno, 41 | content_offset, block_text, state, state_machine): 42 | try: 43 | lexer = get_lexer_by_name(arguments[0]) 44 | except ValueError, e: 45 | # no lexer found - use the text one instead of an exception 46 | lexer = TextLexer() 47 | # take an arbitrary option if more than one is given 48 | formatter = options and VARIANTS[options.keys()[0]] or DEFAULT 49 | parsed = highlight(u'\n'.join(content), lexer, formatter) 50 | return [nodes.raw('', parsed, format='latex')] 51 | 52 | pygments_directive.arguments = (1, 0, 1) 53 | pygments_directive.content = 1 54 | pygments_directive.options = dict([(key, directives.flag) for key in VARIANTS]) 55 | 56 | directives.register_directive('sourcecode', pygments_directive) 57 | 58 | 59 | # run the generation 60 | 61 | from docutils.core import publish_cmdline, default_description 62 | 63 | description = ('Generates LaTeX documents from standalone reStructuredText ' 64 | 'sources. 
' + default_description) 65 | 66 | publish_cmdline(writer_name='latex2e', description=description) 67 | -------------------------------------------------------------------------------- /doc/s5/Makefile: -------------------------------------------------------------------------------- 1 | PYTHON?=python 2 | 3 | SLIDES=$(subst .txt,.html,$(wildcard *.txt)) 4 | 5 | slides: $(SLIDES) 6 | 7 | %.html: %.txt 8 | $(PYTHON) rst2s5.py --current-slide --language=en $< $@ 9 | 10 | clean: 11 | rm -f *~ $(SLIDES) 12 | -------------------------------------------------------------------------------- /doc/s5/ep2008/atom-example.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Example Feed 5 | 6 | 2003-12-13T18:30:02Z 7 | 8 | John Doe 9 | 10 | urn:uuid:60a76c80-d399-11d9-b93C-0003939e0af6 11 | 12 | 13 | Atom-Powered Robots Run Amok 14 | 15 | urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a 16 | 2003-12-13T18:30:02Z 17 | Some text. 18 | 19 | 20 | 21 | -------------------------------------------------------------------------------- /doc/s5/ep2008/atomgen.py: -------------------------------------------------------------------------------- 1 | # atomgen.py 2 | 3 | import os.path 4 | 5 | from lxml import etree 6 | from lxml.builder import ElementMaker 7 | 8 | ATOM_NAMESPACE = "http://www.w3.org/2005/Atom" 9 | 10 | A = ElementMaker(namespace=ATOM_NAMESPACE, 11 | nsmap={None : ATOM_NAMESPACE}) 12 | 13 | feed = A.feed 14 | entry = A.entry 15 | title = A.title 16 | author = A.author 17 | name = A.name 18 | link = A.link 19 | summary = A.summary 20 | id = A.id 21 | updated = A.updated 22 | # ... and so on and so forth ... 
23 | 24 | 25 | # plus a little validation function: isvalid() 26 | isvalid = etree.RelaxNG( 27 | file=os.path.join(os.path.abspath(os.path.dirname(__file__)), "atom.rng")) 28 | -------------------------------------------------------------------------------- /doc/s5/ep2008/proxies.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aglyzov/lxml/98efdc08f9886af20b48b225873f96a636f06056/doc/s5/ep2008/proxies.png -------------------------------------------------------------------------------- /doc/s5/rst2s5.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | The Pygments reStructuredText directive 4 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 5 | 6 | This fragment is a Docutils_ 0.5 directive that renders source code 7 | (to HTML only, currently) via Pygments. 8 | 9 | To use it, adjust the options below and copy the code into a module 10 | that you import on initialization. The code then automatically 11 | registers a ``sourcecode`` directive that you can use instead of 12 | normal code blocks like this:: 13 | 14 | .. sourcecode:: python 15 | 16 | My code goes here. 17 | 18 | If you want to have different code styles, e.g. one with line numbers 19 | and one without, add formatters with their names in the VARIANTS dict 20 | below. You can invoke them instead of the DEFAULT one by using a 21 | directive option:: 22 | 23 | .. sourcecode:: python 24 | :linenos: 25 | 26 | My code goes here. 27 | 28 | Look at the `directive documentation`_ to get all the gory details. 29 | 30 | .. _Docutils: http://docutils.sf.net/ 31 | .. _directive documentation: 32 | http://docutils.sourceforge.net/docs/howto/rst-directives.html 33 | 34 | :copyright: Copyright 2006-2009 by the Pygments team, see AUTHORS. 35 | :license: BSD, see LICENSE for details. 
36 | """ 37 | 38 | # Options 39 | # ~~~~~~~ 40 | 41 | # Set to True if you want inline CSS styles instead of classes 42 | INLINESTYLES = False 43 | STYLE = "fruity" 44 | 45 | from pygments.formatters import HtmlFormatter 46 | 47 | # The default formatter 48 | DEFAULT = HtmlFormatter(noclasses=INLINESTYLES, style=STYLE) 49 | 50 | # Add name -> formatter pairs for every variant you want to use 51 | VARIANTS = { 52 | # 'linenos': HtmlFormatter(noclasses=INLINESTYLES, linenos=True), 53 | } 54 | 55 | 56 | from docutils import nodes 57 | from docutils.parsers.rst import directives, Directive 58 | 59 | from pygments import highlight 60 | from pygments.lexers import get_lexer_by_name, TextLexer 61 | 62 | class Pygments(Directive): 63 | """ Source code syntax hightlighting. 64 | """ 65 | required_arguments = 1 66 | optional_arguments = 0 67 | final_argument_whitespace = True 68 | option_spec = dict([(key, directives.flag) for key in VARIANTS]) 69 | has_content = True 70 | 71 | def run(self): 72 | self.assert_has_content() 73 | try: 74 | lexer = get_lexer_by_name(self.arguments[0]) 75 | except ValueError: 76 | # no lexer found - use the text one instead of an exception 77 | lexer = TextLexer() 78 | # take an arbitrary option if more than one is given 79 | formatter = self.options and VARIANTS[self.options.keys()[0]] or DEFAULT 80 | 81 | # print >>open('ui/default/pygments.css', 'w'), formatter.get_style_defs('.highlight') 82 | parsed = highlight(u'\n'.join(self.content), lexer, formatter) 83 | return [nodes.raw('', parsed, format='html')] 84 | 85 | directives.register_directive('sourcecode', Pygments) 86 | 87 | from docutils.core import publish_cmdline, default_description 88 | 89 | description = ('Generates S5 (X)HTML slideshow documents from standalone ' 90 | 'reStructuredText sources. 
' + default_description) 91 | 92 | publish_cmdline(writer_name='s5', description=description) 93 | -------------------------------------------------------------------------------- /doc/s5/tagpython.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aglyzov/lxml/98efdc08f9886af20b48b225873f96a636f06056/doc/s5/tagpython.png -------------------------------------------------------------------------------- /doc/s5/ui/default/blank.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aglyzov/lxml/98efdc08f9886af20b48b225873f96a636f06056/doc/s5/ui/default/blank.gif -------------------------------------------------------------------------------- /doc/s5/ui/default/bodybg.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aglyzov/lxml/98efdc08f9886af20b48b225873f96a636f06056/doc/s5/ui/default/bodybg.gif -------------------------------------------------------------------------------- /doc/s5/ui/default/framing.css: -------------------------------------------------------------------------------- 1 | /* The following styles size, place, and layer the slide components. 2 | Edit these if you want to change the overall slide layout. 3 | The commented lines can be uncommented (and modified, if necessary) 4 | to help you with the rearrangement process. 
*/ 5 | 6 | /* target = 1024x768 */ 7 | 8 | div#header, div#footer, .slide {width: 100%; top: 0; left: 0;} 9 | div#header {top: 0; height: 3em; z-index: 1;} 10 | div#footer {top: auto; bottom: 0; height: 2.5em; z-index: 5;} 11 | .slide {top: 0; width: 92%; padding: 3.5em 4% 4%; z-index: 2; list-style: none;} 12 | div#controls {left: 50%; bottom: 0; width: 50%; z-index: 100;} 13 | div#controls form {position: absolute; bottom: 0; right: 0; width: 100%; 14 | margin: 0;} 15 | #currentSlide {position: absolute; width: 10%; left: 45%; bottom: 1em; z-index: 10;} 16 | html>body #currentSlide {position: fixed;} 17 | 18 | /* 19 | div#header {background: #FCC;} 20 | div#footer {background: #CCF;} 21 | div#controls {background: #BBD;} 22 | div#currentSlide {background: #FFC;} 23 | */ 24 | -------------------------------------------------------------------------------- /doc/s5/ui/default/iepngfix.htc: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 42 | -------------------------------------------------------------------------------- /doc/s5/ui/default/lxml-logo64.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aglyzov/lxml/98efdc08f9886af20b48b225873f96a636f06056/doc/s5/ui/default/lxml-logo64.png -------------------------------------------------------------------------------- /doc/s5/ui/default/opera.css: -------------------------------------------------------------------------------- 1 | /* DO NOT CHANGE THESE unless you really want to break Opera Show */ 2 | .slide { 3 | visibility: visible !important; 4 | position: static !important; 5 | page-break-before: always; 6 | } 7 | #slide0 {page-break-before: avoid;} 8 | -------------------------------------------------------------------------------- /doc/s5/ui/default/outline.css: -------------------------------------------------------------------------------- 1 | /* don't change this unless you want the layout 
stuff to show up in the outline view! */ 2 | 3 | .layout div, #footer *, #controlForm * {display: none;} 4 | #footer, #controls, #controlForm, #navLinks, #toggle { 5 | display: block; visibility: visible; margin: 0; padding: 0;} 6 | #toggle {float: right; padding: 0.5em;} 7 | html>body #toggle {position: fixed; top: 0; right: 0;} 8 | 9 | /* making the outline look pretty-ish */ 10 | 11 | #slide0 h1, #slide0 h2, #slide0 h3, #slide0 h4 {border: none; margin: 0;} 12 | #slide0 h1 {padding-top: 1.5em;} 13 | .slide h1 {margin: 1.5em 0 0; padding-top: 0.25em; 14 | border-top: 1px solid #888; border-bottom: 1px solid #AAA;} 15 | #toggle {border: 1px solid; border-width: 0 0 1px 1px; background: #FFF;} 16 | -------------------------------------------------------------------------------- /doc/s5/ui/default/print.css: -------------------------------------------------------------------------------- 1 | /* The following rule is necessary to have all slides appear in print! DO NOT REMOVE IT! */ 2 | .slide, ul {page-break-inside: avoid; visibility: visible !important;} 3 | h1 {page-break-after: avoid;} 4 | 5 | body {font-size: 12pt; background: white;} 6 | * {color: black;} 7 | 8 | #slide0 h1 {font-size: 200%; border: none; margin: 0.5em 0 0.25em;} 9 | #slide0 h3 {margin: 0; padding: 0;} 10 | #slide0 h4 {margin: 0 0 0.5em; padding: 0;} 11 | #slide0 {margin-bottom: 3em;} 12 | 13 | h1 {border-top: 2pt solid gray; border-bottom: 1px dotted silver;} 14 | .extra {background: transparent !important;} 15 | div.extra, pre.extra, .example {font-size: 10pt; color: #333;} 16 | ul.extra a {font-weight: bold;} 17 | p.example {display: none;} 18 | 19 | #header {display: none;} 20 | #footer h1 {margin: 0; border-bottom: 1px solid; color: gray; font-style: italic;} 21 | #footer h2, #controls {display: none;} 22 | 23 | /* The following rule keeps the layout stuff out of print. Remove at your own risk! 
*/ 24 | .layout, .layout * {display: none !important;} 25 | -------------------------------------------------------------------------------- /doc/s5/ui/default/s5-core.css: -------------------------------------------------------------------------------- 1 | /* Do not edit or override these styles! The system will likely break if you do. */ 2 | 3 | div#header, div#footer, div#controls, .slide {position: absolute;} 4 | html>body div#header, html>body div#footer, 5 | html>body div#controls, html>body .slide {position: fixed;} 6 | .handout {display: none;} 7 | .layout {display: block;} 8 | .slide, .hideme, .incremental {visibility: hidden;} 9 | #slide0 {visibility: visible;} 10 | -------------------------------------------------------------------------------- /doc/s5/ui/default/slides.css: -------------------------------------------------------------------------------- 1 | @import url(s5-core.css); /* required to make the slide show run at all */ 2 | @import url(framing.css); /* sets basic placement and size of slide components */ 3 | @import url(pretty.css); /* stuff that makes the slides look better than blah */ -------------------------------------------------------------------------------- /doc/s5/ui/default/tagpython.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aglyzov/lxml/98efdc08f9886af20b48b225873f96a636f06056/doc/s5/ui/default/tagpython.png -------------------------------------------------------------------------------- /doc/sax.txt: -------------------------------------------------------------------------------- 1 | Sax support 2 | =========== 3 | 4 | In this document we'll describe lxml's SAX support. lxml has support for 5 | producing SAX events for an ElementTree or Element. lxml can also turn SAX 6 | events into an ElementTree. 
The SAX API used by lxml is compatible with that 7 | in the Python core (xml.sax), so is useful for interfacing lxml with code that 8 | uses the Python core SAX facilities. 9 | 10 | .. contents:: 11 | .. 12 | 1 Building a tree from SAX events 13 | 2 Producing SAX events from an ElementTree or Element 14 | 3 Interfacing with pulldom/minidom 15 | 16 | .. 17 | >>> try: from StringIO import StringIO 18 | ... except ImportError: 19 | ... from io import BytesIO 20 | ... def StringIO(s): 21 | ... if isinstance(s, str): s = s.encode("UTF-8") 22 | ... return BytesIO(s) 23 | 24 | 25 | Building a tree from SAX events 26 | ------------------------------- 27 | 28 | First of all, lxml has support for building a new tree given SAX events. To 29 | do this, we use the special SAX content handler defined by lxml named 30 | ``lxml.sax.ElementTreeContentHandler``: 31 | 32 | .. sourcecode:: pycon 33 | 34 | >>> import lxml.sax 35 | >>> handler = lxml.sax.ElementTreeContentHandler() 36 | 37 | Now let's fire some SAX events at it: 38 | 39 | .. sourcecode:: pycon 40 | 41 | >>> handler.startElementNS((None, 'a'), 'a', {}) 42 | >>> handler.startElementNS((None, 'b'), 'b', {(None, 'foo'): 'bar'}) 43 | >>> handler.characters('Hello world') 44 | >>> handler.endElementNS((None, 'b'), 'b') 45 | >>> handler.endElementNS((None, 'a'), 'a') 46 | 47 | This constructs an equivalent tree. You can access it through the ``etree`` 48 | property of the handler: 49 | 50 | .. sourcecode:: pycon 51 | 52 | >>> tree = handler.etree 53 | >>> lxml.etree.tostring(tree.getroot()) 54 | b'Hello world' 55 | 56 | By passing a ``makeelement`` function to the constructor of 57 | ``ElementTreeContentHandler``, e.g. the one of a parser you configured, you 58 | can determine which element class lookup scheme should be used. 59 | 60 | 61 | Producing SAX events from an ElementTree or Element 62 | --------------------------------------------------- 63 | 64 | Let's make a tree we can generate SAX events for: 65 | 66 | .. 
sourcecode:: pycon 67 | 68 | >>> f = StringIO('Text') 69 | >>> tree = lxml.etree.parse(f) 70 | 71 | To see whether the correct SAX events are produced, we'll write a custom 72 | content handler: 73 | 74 | .. sourcecode:: pycon 75 | 76 | >>> from xml.sax.handler import ContentHandler 77 | >>> class MyContentHandler(ContentHandler): 78 | ... def __init__(self): 79 | ... self.a_amount = 0 80 | ... self.b_amount = 0 81 | ... self.text = None 82 | ... 83 | ... def startElementNS(self, name, qname, attributes): 84 | ... uri, localname = name 85 | ... if localname == 'a': 86 | ... self.a_amount += 1 87 | ... if localname == 'b': 88 | ... self.b_amount += 1 89 | ... 90 | ... def characters(self, data): 91 | ... self.text = data 92 | 93 | Note that it only defines the startElementNS() method and not startElement(). 94 | The SAX event generator in lxml.sax currently only supports namespace-aware 95 | processing. 96 | 97 | To test the content handler, we can produce SAX events from the tree: 98 | 99 | .. sourcecode:: pycon 100 | 101 | >>> handler = MyContentHandler() 102 | >>> lxml.sax.saxify(tree, handler) 103 | 104 | This is what we expect: 105 | 106 | .. sourcecode:: pycon 107 | 108 | >>> handler.a_amount 109 | 1 110 | >>> handler.b_amount 111 | 1 112 | >>> handler.text 113 | 'Text' 114 | 115 | 116 | Interfacing with pulldom/minidom 117 | -------------------------------- 118 | 119 | lxml.sax is a simple way to interface with the standard XML support in the 120 | Python library. Note, however, that this is a one-way solution, as Python's 121 | DOM implementation cannot generate SAX events from a DOM tree. 122 | 123 | You can use xml.dom.pulldom to build a minidom from lxml: 124 | 125 | .. sourcecode:: pycon 126 | 127 | >>> from xml.dom.pulldom import SAX2DOM 128 | >>> handler = SAX2DOM() 129 | >>> lxml.sax.saxify(tree, handler) 130 | 131 | PullDOM makes the result available through the ``document`` attribute: 132 | 133 | .. 
sourcecode:: pycon 134 | 135 | >>> dom = handler.document 136 | >>> print(dom.firstChild.localName) 137 | a 138 | -------------------------------------------------------------------------------- /doc/test.xml: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /doc/valgrind.txt: -------------------------------------------------------------------------------- 1 | The command used to run the tests with valgrind: 2 | 3 | valgrind --tool=memcheck --leak-check=full --suppressions=valgrind-python.supp python2.7 test.py 4 | -------------------------------------------------------------------------------- /fake_pyrex/Pyrex/Distutils/__init__.py: -------------------------------------------------------------------------------- 1 | # work around broken setuptools monkey patching 2 | -------------------------------------------------------------------------------- /fake_pyrex/Pyrex/Distutils/build_ext.py: -------------------------------------------------------------------------------- 1 | build_ext = "yes, it's there!" 
2 | -------------------------------------------------------------------------------- /fake_pyrex/Pyrex/__init__.py: -------------------------------------------------------------------------------- 1 | # work around broken setuptools monkey patching 2 | -------------------------------------------------------------------------------- /samples/simple-ns.xml: -------------------------------------------------------------------------------- 1 | 2 | text 3 | texttail 4 | 5 | 6 | -------------------------------------------------------------------------------- /samples/simple.xml: -------------------------------------------------------------------------------- 1 | 2 | text 3 | texttail 4 | 5 | 6 | -------------------------------------------------------------------------------- /selftest.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aglyzov/lxml/98efdc08f9886af20b48b225873f96a636f06056/selftest.py -------------------------------------------------------------------------------- /src/lxml/__init__.py: -------------------------------------------------------------------------------- 1 | # this is a package 2 | 3 | def get_include(): 4 | """ 5 | Returns a list of header include paths (for lxml itself, libxml2 6 | and libxslt) needed to compile C code against lxml if it was built 7 | with statically linked libraries. 8 | """ 9 | import os 10 | lxml_path = __path__[0] 11 | include_path = os.path.join(lxml_path, 'includes') 12 | includes = [include_path, lxml_path] 13 | 14 | for name in os.listdir(include_path): 15 | path = os.path.join(include_path, name) 16 | if os.path.isdir(path): 17 | includes.append(path) 18 | 19 | return includes 20 | 21 | -------------------------------------------------------------------------------- /src/lxml/cssselect.py: -------------------------------------------------------------------------------- 1 | """CSS Selectors based on XPath. 
2 | 3 | This module supports selecting XML/HTML tags based on CSS selectors. 4 | See the `CSSSelector` class for details. 5 | 6 | This is a thin wrapper around cssselect 0.7 or later. 7 | """ 8 | 9 | import sys 10 | from lxml import etree 11 | 12 | ## Work-around the lack of absolute import in Python 2.4 13 | #from __future__ import absolute_import 14 | #from cssselect import ... 15 | try: 16 | external_cssselect = __import__('cssselect') 17 | except ImportError: 18 | raise ImportError('cssselect seems not to be installed. ' 19 | 'See http://packages.python.org/cssselect/') 20 | 21 | SelectorSyntaxError = external_cssselect.SelectorSyntaxError 22 | ExpressionError = external_cssselect.ExpressionError 23 | SelectorError = external_cssselect.SelectorError 24 | 25 | 26 | __all__ = ['SelectorSyntaxError', 'ExpressionError', 'SelectorError', 27 | 'CSSSelector'] 28 | 29 | 30 | class LxmlTranslator(external_cssselect.GenericTranslator): 31 | """ 32 | A custom CSS selector to XPath translator with lxml-specific extensions. 33 | """ 34 | def xpath_contains_function(self, xpath, function): 35 | # Defined there, removed in later drafts: 36 | # http://www.w3.org/TR/2001/CR-css3-selectors-20011113/#content-selectors 37 | if function.argument_types() not in (['STRING'], ['IDENT']): 38 | raise ExpressionError( 39 | "Expected a single string or ident for :contains(), got %r" 40 | % function.arguments) 41 | value = function.arguments[0].value 42 | return xpath.add_condition( 43 | 'contains(__lxml_internal_css:lower-case(string(.)), %s)' 44 | % self.xpath_literal(value.lower())) 45 | 46 | 47 | class LxmlHTMLTranslator(LxmlTranslator, external_cssselect.HTMLTranslator): 48 | """ 49 | lxml extensions + HTML support. 
50 | """ 51 | 52 | 53 | def _make_lower_case(context, s): 54 | return s.lower() 55 | 56 | ns = etree.FunctionNamespace('http://codespeak.net/lxml/css/') 57 | ns.prefix = '__lxml_internal_css' 58 | ns['lower-case'] = _make_lower_case 59 | 60 | 61 | class CSSSelector(etree.XPath): 62 | """A CSS selector. 63 | 64 | Usage:: 65 | 66 | >>> from lxml import etree, cssselect 67 | >>> select = cssselect.CSSSelector("a tag > child") 68 | 69 | >>> root = etree.XML("TEXT") 70 | >>> [ el.tag for el in select(root) ] 71 | ['child'] 72 | 73 | To use CSS namespaces, you need to pass a prefix-to-namespace 74 | mapping as ``namespaces`` keyword argument:: 75 | 76 | >>> rdfns = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#' 77 | >>> select_ns = cssselect.CSSSelector('root > rdf|Description', 78 | ... namespaces={'rdf': rdfns}) 79 | 80 | >>> rdf = etree.XML(( 81 | ... '' 82 | ... 'blah' 83 | ... '') % rdfns) 84 | >>> [(el.tag, el.text) for el in select_ns(rdf)] 85 | [('{http://www.w3.org/1999/02/22-rdf-syntax-ns#}Description', 'blah')] 86 | 87 | """ 88 | def __init__(self, css, namespaces=None, translator='xml'): 89 | if translator == 'xml': 90 | translator = LxmlTranslator() 91 | elif translator == 'html': 92 | translator = LxmlHTMLTranslator() 93 | elif translator == 'xhtml': 94 | translator = LxmlHTMLTranslator(xhtml=True) 95 | path = translator.css_to_xpath(css) 96 | etree.XPath.__init__(self, path, namespaces=namespaces) 97 | self.css = css 98 | 99 | def __repr__(self): 100 | return '<%s %s for %r>' % ( 101 | self.__class__.__name__, 102 | hex(abs(id(self)))[2:], 103 | self.css) 104 | -------------------------------------------------------------------------------- /src/lxml/cvarargs.pxd: -------------------------------------------------------------------------------- 1 | cdef extern from "stdarg.h": 2 | ctypedef void *va_list 3 | void va_start(va_list ap, void *last) nogil 4 | void va_end(va_list ap) nogil 5 | 6 | cdef extern from "etree_defs.h": 7 | cdef int va_int(va_list ap) 
@cython.final
@cython.internal
cdef class _MemDebug:
    """Debugging support for the memory allocation in libxml2.
    """
    def bytes_used(self):
        """bytes_used(self)

        Returns the total amount of memory (in bytes) currently used by libxml2.
        Note that libxml2 constrains this value to a C int, which limits
        the accuracy on 64 bit systems.
        """
        return tree.xmlMemUsed()

    def blocks_used(self):
        """blocks_used(self)

        Returns the total number of memory blocks currently allocated by libxml2.
        Note that libxml2 constrains this value to a C int, which limits
        the accuracy on 64 bit systems.
        """
        return tree.xmlMemBlocks()

    def dict_size(self):
        """dict_size(self)

        Returns the current size of the global name dictionary used by libxml2
        for the current thread.  Each thread has its own dictionary.

        Raises MemoryError if no dictionary could be obtained for the thread.
        """
        c_dict = __GLOBAL_PARSER_CONTEXT._getThreadDict(NULL)
        if c_dict is NULL:
            raise MemoryError()
        return tree.xmlDictSize(c_dict)

    def dump(self, output_file=None, byte_count=None):
        """dump(self, output_file=None, byte_count=None)

        Dumps the current memory blocks allocated by libxml2 to a file.

        The optional parameter 'output_file' specifies the file path.  It defaults
        to the file ".memorylist" in the current directory.  A unicode path is
        encoded with the file system encoding before the file is opened.

        The optional parameter 'byte_count' limits the number of bytes in the dump.
        Note that this parameter is ignored when lxml is compiled against a libxml2
        version before 2.7.0.

        Raises IOError if the output file cannot be created.
        """
        cdef Py_ssize_t c_count
        if output_file is None:
            output_file = b'.memorylist'
        elif isinstance(output_file, unicode):
            # encode() returns a new object; the result must be rebound,
            # otherwise fopen() below still receives the unicode string.
            output_file = output_file.encode(sys.getfilesystemencoding())

        f = stdio.fopen(output_file, "w")
        if f is NULL:
            raise IOError("Failed to create file %s" % output_file.decode(sys.getfilesystemencoding()))
        try:
            if tree.LIBXML_VERSION < 20700:
                tree.xmlMemDisplay(f)
            elif byte_count is None:
                tree.xmlMemDisplay(f)
            else:
                c_count = byte_count
                tree.xmlMemDisplayLast(f, c_count)
        finally:
            # always release the C file handle, even if the dump fails
            stdio.fclose(f)

    def show(self, output_file=None, block_count=None):
        """show(self, output_file=None, block_count=None)

        Dumps the current memory blocks allocated by libxml2 to a file.
        The output file format is suitable for line diffing.

        The optional parameter 'output_file' specifies the file path.  It defaults
        to the file ".memorydump" in the current directory.  A unicode path is
        encoded with the file system encoding before the file is opened.

        The optional parameter 'block_count' limits the number of blocks
        in the dump.  When omitted, all currently allocated blocks are shown.

        Raises IOError if the output file cannot be created.
        """
        if output_file is None:
            output_file = b'.memorydump'
        elif isinstance(output_file, unicode):
            # rebind the encoded path (the result of encode() was
            # previously discarded)
            output_file = output_file.encode(sys.getfilesystemencoding())

        f = stdio.fopen(output_file, "w")
        if f is NULL:
            raise IOError("Failed to create file %s" % output_file.decode(sys.getfilesystemencoding()))
        try:
            tree.xmlMemShow(f, block_count if block_count is not None else tree.xmlMemBlocks())
        finally:
            # always release the C file handle, even if the dump fails
            stdio.fclose(f)

memory_debugger = _MemDebug()
2 | """ 3 | 4 | __all__ = ["parse", "convert_tree"] 5 | 6 | from soupparser import convert_tree, parse as _parse 7 | 8 | def parse(file, beautifulsoup=None, makeelement=None): 9 | root = _parse(file, beautifulsoup=beautifulsoup, makeelement=makeelement) 10 | return root.getroot() 11 | -------------------------------------------------------------------------------- /src/lxml/html/_diffcommand.py: -------------------------------------------------------------------------------- 1 | import optparse 2 | import sys 3 | import re 4 | import os 5 | from lxml.html.diff import htmldiff 6 | 7 | description = """\ 8 | """ 9 | 10 | parser = optparse.OptionParser( 11 | usage="%prog [OPTIONS] FILE1 FILE2\n" 12 | "%prog --annotate [OPTIONS] INFO1 FILE1 INFO2 FILE2 ...", 13 | description=description, 14 | ) 15 | 16 | parser.add_option( 17 | '-o', '--output', 18 | metavar="FILE", 19 | dest="output", 20 | default="-", 21 | help="File to write the difference to", 22 | ) 23 | 24 | parser.add_option( 25 | '-a', '--annotation', 26 | action="store_true", 27 | dest="annotation", 28 | help="Do an annotation") 29 | 30 | def main(args=None): 31 | if args is None: 32 | args = sys.argv[1:] 33 | options, args = parser.parse_args(args) 34 | if options.annotation: 35 | return annotate(options, args) 36 | if len(args) != 2: 37 | print('Error: you must give two files') 38 | parser.print_help() 39 | sys.exit(1) 40 | file1, file2 = args 41 | input1 = read_file(file1) 42 | input2 = read_file(file2) 43 | body1 = split_body(input1)[1] 44 | pre, body2, post = split_body(input2) 45 | result = htmldiff(body1, body2) 46 | result = pre + result + post 47 | if options.output == '-': 48 | if not result.endswith('\n'): 49 | result += '\n' 50 | sys.stdout.write(result) 51 | else: 52 | f = open(options.output, 'wb') 53 | f.write(result) 54 | f.close() 55 | 56 | def read_file(filename): 57 | if filename == '-': 58 | c = sys.stdin.read() 59 | elif not os.path.exists(filename): 60 | raise OSError( 61 | "Input file %s 
def split_body(html):
    """Split an HTML document into the parts around and inside its body.

    Returns a 3-tuple ``(pre, body, post)``:

    * ``pre`` is everything up to and including the match of
      ``body_start_re`` (empty string if there is no match),
    * ``body`` is the text between the two matches,
    * ``post`` is everything from the match of ``body_end_re`` onwards
      (empty string if there is no match).

    Input without a body start and/or end marker is therefore returned
    unchanged in the middle element instead of failing.
    """
    # Default to empty strings so that fragments lacking either marker
    # do not hit an unbound local at the return statement.
    pre = post = ''
    match = body_start_re.search(html)
    if match:
        pre = html[:match.end()]
        html = html[match.end():]
    # The end marker is searched in the remainder only, so the offsets
    # used below are relative to the already shortened string.
    match = body_end_re.search(html)
    if match:
        post = html[match.start():]
        html = html[:match.start()]
    return pre, html, post
9 | """ 10 | 11 | from html5lib.treebuilders import _base, etree as etree_builders 12 | from lxml import html, etree 13 | 14 | 15 | class DocumentType(object): 16 | 17 | def __init__(self, name, publicId, systemId): 18 | self.name = name 19 | self.publicId = publicId 20 | self.systemId = systemId 21 | 22 | class Document(object): 23 | 24 | def __init__(self): 25 | self._elementTree = None 26 | self.childNodes = [] 27 | 28 | def appendChild(self, element): 29 | self._elementTree.getroot().addnext(element._element) 30 | 31 | 32 | class TreeBuilder(_base.TreeBuilder): 33 | documentClass = Document 34 | doctypeClass = DocumentType 35 | elementClass = None 36 | commentClass = None 37 | fragmentClass = Document 38 | 39 | def __init__(self, *args, **kwargs): 40 | html_builder = etree_builders.getETreeModule(html, fullTree=False) 41 | etree_builder = etree_builders.getETreeModule(etree, fullTree=False) 42 | self.elementClass = html_builder.Element 43 | self.commentClass = etree_builder.Comment 44 | _base.TreeBuilder.__init__(self, *args, **kwargs) 45 | 46 | def reset(self): 47 | _base.TreeBuilder.reset(self) 48 | self.rootInserted = False 49 | self.initialComments = [] 50 | self.doctype = None 51 | 52 | def getDocument(self): 53 | return self.document._elementTree 54 | 55 | def getFragment(self): 56 | fragment = [] 57 | element = self.openElements[0]._element 58 | if element.text: 59 | fragment.append(element.text) 60 | fragment.extend(element.getchildren()) 61 | if element.tail: 62 | fragment.append(element.tail) 63 | return fragment 64 | 65 | def insertDoctype(self, name, publicId, systemId): 66 | doctype = self.doctypeClass(name, publicId, systemId) 67 | self.doctype = doctype 68 | 69 | def insertComment(self, data, parent=None): 70 | if not self.rootInserted: 71 | self.initialComments.append(data) 72 | else: 73 | _base.TreeBuilder.insertComment(self, data, parent) 74 | 75 | def insertRoot(self, name): 76 | buf = [] 77 | if self.doctype and self.doctype.name: 78 | 
class SetMixin(object):

    """
    Mix-in for sets.  You must define __iter__, add, remove

    Everything else is built on top of these three methods.  ``remove``
    is expected to raise KeyError for a missing item (``discard`` relies
    on that).  The non-mutating operations work on the object returned
    by ``copy()``, which is a builtin ``set`` unless a subclass
    overrides it.
    """

    def __len__(self):
        """Return the number of items, counted by iterating."""
        length = 0
        for item in self:
            length += 1
        return length

    def __contains__(self, item):
        """Return True if an item equal to ``item`` is in the set."""
        for has_item in self:
            if item == has_item:
                return True
        return False

    def issubset(self, other):
        """Return True if every item of this set is also in ``other``."""
        for item in self:
            if item not in other:
                return False
        return True

    __le__ = issubset

    def issuperset(self, other):
        """Return True if every item of ``other`` is also in this set."""
        for item in other:
            if item not in self:
                return False
        return True

    __ge__ = issuperset

    def union(self, other):
        """Return a new set with the items of this set and of ``other``.

        ``other`` may be any iterable, not only a set.
        """
        # Use the named update method rather than ``|=``: the in-place
        # operators of the builtin set reject operands that are not sets
        # (including other SetMixin instances).
        new = self.copy()
        new.update(other)
        return new

    def __or__(self, other):
        return self.union(other)

    def intersection(self, other):
        """Return a new set with the items common to this set and ``other``."""
        new = self.copy()
        new.intersection_update(other)
        return new

    def __and__(self, other):
        return self.intersection(other)

    def difference(self, other):
        """Return a new set with the items of this set that are not in ``other``."""
        new = self.copy()
        new.difference_update(other)
        return new

    def __sub__(self, other):
        return self.difference(other)

    def symmetric_difference(self, other):
        """Return a new set with the items that are in exactly one of the two."""
        new = self.copy()
        new.symmetric_difference_update(other)
        return new

    def __xor__(self, other):
        return self.symmetric_difference(other)

    def copy(self):
        """Return a builtin ``set`` holding the current items."""
        return set(self)

    def update(self, other):
        """Add all items of the iterable ``other`` to this set."""
        for item in other:
            self.add(item)

    def __ior__(self, other):
        self.update(other)
        return self

    def intersection_update(self, other):
        """Remove all items from this set that are not in ``other``."""
        # Iterate over a snapshot: removing from the underlying container
        # while iterating over it skips items or raises, depending on the
        # container type.
        for item in list(self):
            if item not in other:
                self.remove(item)

    def __iand__(self, other):
        self.intersection_update(other)
        return self

    def difference_update(self, other):
        """Remove all items of the iterable ``other`` from this set."""
        for item in other:
            if item in self:
                self.remove(item)

    def __isub__(self, other):
        self.difference_update(other)
        return self

    def symmetric_difference_update(self, other):
        """Toggle each item of the iterable ``other`` in this set."""
        for item in other:
            if item in self:
                self.remove(item)
            else:
                self.add(item)

    def __ixor__(self, other):
        self.symmetric_difference_update(other)
        return self

    def discard(self, item):
        """Remove ``item`` if it is present; do nothing otherwise."""
        try:
            self.remove(item)
        except KeyError:
            pass

    def clear(self):
        """Remove all items from this set."""
        for item in list(self):
            self.remove(item)
) 18 | 19 | >>> import lxml.etree 20 | >>> print lxml.etree.tostring(html, pretty_print=True) 21 | 22 | 23 | Hello World 24 | 25 | 26 |

Hello World !

27 | 28 | 29 | 30 | """ 31 | 32 | from lxml.builder import ElementMaker 33 | from lxml.html import html_parser 34 | 35 | E = ElementMaker(makeelement=html_parser.makeelement) 36 | 37 | # elements 38 | A = E.a # anchor 39 | ABBR = E.abbr # abbreviated form (e.g., WWW, HTTP, etc.) 40 | ACRONYM = E.acronym # 41 | ADDRESS = E.address # information on author 42 | APPLET = E.applet # Java applet (DEPRECATED) 43 | AREA = E.area # client-side image map area 44 | B = E.b # bold text style 45 | BASE = E.base # document base URI 46 | BASEFONT = E.basefont # base font size (DEPRECATED) 47 | BDO = E.bdo # I18N BiDi over-ride 48 | BIG = E.big # large text style 49 | BLOCKQUOTE = E.blockquote # long quotation 50 | BODY = E.body # document body 51 | BR = E.br # forced line break 52 | BUTTON = E.button # push button 53 | CAPTION = E.caption # table caption 54 | CENTER = E.center # shorthand for DIV align=center (DEPRECATED) 55 | CITE = E.cite # citation 56 | CODE = E.code # computer code fragment 57 | COL = E.col # table column 58 | COLGROUP = E.colgroup # table column group 59 | DD = E.dd # definition description 60 | DEL = getattr(E, 'del') # deleted text 61 | DFN = E.dfn # instance definition 62 | DIR = E.dir # directory list (DEPRECATED) 63 | DIV = E.div # generic language/style container 64 | DL = E.dl # definition list 65 | DT = E.dt # definition term 66 | EM = E.em # emphasis 67 | FIELDSET = E.fieldset # form control group 68 | FONT = E.font # local change to font (DEPRECATED) 69 | FORM = E.form # interactive form 70 | FRAME = E.frame # subwindow 71 | FRAMESET = E.frameset # window subdivision 72 | H1 = E.h1 # heading 73 | H2 = E.h2 # heading 74 | H3 = E.h3 # heading 75 | H4 = E.h4 # heading 76 | H5 = E.h5 # heading 77 | H6 = E.h6 # heading 78 | HEAD = E.head # document head 79 | HR = E.hr # horizontal rule 80 | HTML = E.html # document root element 81 | I = E.i # italic text style 82 | IFRAME = E.iframe # inline subwindow 83 | IMG = E.img # Embedded image 84 | INPUT = 
E.input # form control 85 | INS = E.ins # inserted text 86 | ISINDEX = E.isindex # single line prompt (DEPRECATED) 87 | KBD = E.kbd # text to be entered by the user 88 | LABEL = E.label # form field label text 89 | LEGEND = E.legend # fieldset legend 90 | LI = E.li # list item 91 | LINK = E.link # a media-independent link 92 | MAP = E.map # client-side image map 93 | MENU = E.menu # menu list (DEPRECATED) 94 | META = E.meta # generic metainformation 95 | NOFRAMES = E.noframes # alternate content container for non frame-based rendering 96 | NOSCRIPT = E.noscript # alternate content container for non script-based rendering 97 | OBJECT = E.object # generic embedded object 98 | OL = E.ol # ordered list 99 | OPTGROUP = E.optgroup # option group 100 | OPTION = E.option # selectable choice 101 | P = E.p # paragraph 102 | PARAM = E.param # named property value 103 | PRE = E.pre # preformatted text 104 | Q = E.q # short inline quotation 105 | S = E.s # strike-through text style (DEPRECATED) 106 | SAMP = E.samp # sample program output, scripts, etc. 
107 | SCRIPT = E.script # script statements 108 | SELECT = E.select # option selector 109 | SMALL = E.small # small text style 110 | SPAN = E.span # generic language/style container 111 | STRIKE = E.strike # strike-through text (DEPRECATED) 112 | STRONG = E.strong # strong emphasis 113 | STYLE = E.style # style info 114 | SUB = E.sub # subscript 115 | SUP = E.sup # superscript 116 | TABLE = E.table # 117 | TBODY = E.tbody # table body 118 | TD = E.td # table data cell 119 | TEXTAREA = E.textarea # multi-line text field 120 | TFOOT = E.tfoot # table footer 121 | TH = E.th # table header cell 122 | THEAD = E.thead # table header 123 | TITLE = E.title # document title 124 | TR = E.tr # table row 125 | TT = E.tt # teletype or monospaced text style 126 | U = E.u # underlined text style (DEPRECATED) 127 | UL = E.ul # unordered list 128 | VAR = E.var # instance of a variable or program argument 129 | 130 | # attributes (only reserved words are included here) 131 | ATTR = dict 132 | def CLASS(v): return {'class': v} 133 | def FOR(v): return {'for': v} 134 | -------------------------------------------------------------------------------- /src/lxml/html/defs.py: -------------------------------------------------------------------------------- 1 | # FIXME: this should all be confirmed against what a DTD says 2 | # (probably in a test; this may not match the DTD exactly, but we 3 | # should document just how it differs). 4 | 5 | # Data taken from http://www.w3.org/TR/html401/index/elements.html 6 | # and http://www.w3.org/community/webed/wiki/HTML/New_HTML5_Elements 7 | # for html5_tags. 
8 | 9 | try: 10 | frozenset 11 | except NameError: 12 | from sets import Set as frozenset 13 | 14 | 15 | empty_tags = frozenset([ 16 | 'area', 'base', 'basefont', 'br', 'col', 'frame', 'hr', 17 | 'img', 'input', 'isindex', 'link', 'meta', 'param']) 18 | 19 | deprecated_tags = frozenset([ 20 | 'applet', 'basefont', 'center', 'dir', 'font', 'isindex', 21 | 'menu', 's', 'strike', 'u']) 22 | 23 | # archive actually takes a space-separated list of URIs 24 | link_attrs = frozenset([ 25 | 'action', 'archive', 'background', 'cite', 'classid', 26 | 'codebase', 'data', 'href', 'longdesc', 'profile', 'src', 27 | 'usemap', 28 | # Not standard: 29 | 'dynsrc', 'lowsrc', 30 | ]) 31 | 32 | # Not in the HTML 4 spec: 33 | # onerror, onresize 34 | event_attrs = frozenset([ 35 | 'onblur', 'onchange', 'onclick', 'ondblclick', 'onerror', 36 | 'onfocus', 'onkeydown', 'onkeypress', 'onkeyup', 'onload', 37 | 'onmousedown', 'onmousemove', 'onmouseout', 'onmouseover', 38 | 'onmouseup', 'onreset', 'onresize', 'onselect', 'onsubmit', 39 | 'onunload', 40 | ]) 41 | 42 | safe_attrs = frozenset([ 43 | 'abbr', 'accept', 'accept-charset', 'accesskey', 'action', 'align', 44 | 'alt', 'axis', 'border', 'cellpadding', 'cellspacing', 'char', 'charoff', 45 | 'charset', 'checked', 'cite', 'class', 'clear', 'cols', 'colspan', 46 | 'color', 'compact', 'coords', 'datetime', 'dir', 'disabled', 'enctype', 47 | 'for', 'frame', 'headers', 'height', 'href', 'hreflang', 'hspace', 'id', 48 | 'ismap', 'label', 'lang', 'longdesc', 'maxlength', 'media', 'method', 49 | 'multiple', 'name', 'nohref', 'noshade', 'nowrap', 'prompt', 'readonly', 50 | 'rel', 'rev', 'rows', 'rowspan', 'rules', 'scope', 'selected', 'shape', 51 | 'size', 'span', 'src', 'start', 'summary', 'tabindex', 'target', 'title', 52 | 'type', 'usemap', 'valign', 'value', 'vspace', 'width']) 53 | 54 | # From http://htmlhelp.com/reference/html40/olist.html 55 | top_level_tags = frozenset([ 56 | 'html', 'head', 'body', 'frameset', 57 | ]) 58 | 59 | head_tags 
= frozenset([ 60 | 'base', 'isindex', 'link', 'meta', 'script', 'style', 'title', 61 | ]) 62 | 63 | general_block_tags = frozenset([ 64 | 'address', 65 | 'blockquote', 66 | 'center', 67 | 'del', 68 | 'div', 69 | 'h1', 70 | 'h2', 71 | 'h3', 72 | 'h4', 73 | 'h5', 74 | 'h6', 75 | 'hr', 76 | 'ins', 77 | 'isindex', 78 | 'noscript', 79 | 'p', 80 | 'pre', 81 | ]) 82 | 83 | list_tags = frozenset([ 84 | 'dir', 'dl', 'dt', 'dd', 'li', 'menu', 'ol', 'ul', 85 | ]) 86 | 87 | table_tags = frozenset([ 88 | 'table', 'caption', 'colgroup', 'col', 89 | 'thead', 'tfoot', 'tbody', 'tr', 'td', 'th', 90 | ]) 91 | 92 | # just this one from 93 | # http://www.georgehernandez.com/h/XComputers/HTML/2BlockLevel.htm 94 | block_tags = general_block_tags | list_tags | table_tags | frozenset([ 95 | # Partial form tags 96 | 'fieldset', 'form', 'legend', 'optgroup', 'option', 97 | ]) 98 | 99 | form_tags = frozenset([ 100 | 'form', 'button', 'fieldset', 'legend', 'input', 'label', 101 | 'select', 'optgroup', 'option', 'textarea', 102 | ]) 103 | 104 | special_inline_tags = frozenset([ 105 | 'a', 'applet', 'basefont', 'bdo', 'br', 'embed', 'font', 'iframe', 106 | 'img', 'map', 'area', 'object', 'param', 'q', 'script', 107 | 'span', 'sub', 'sup', 108 | ]) 109 | 110 | phrase_tags = frozenset([ 111 | 'abbr', 'acronym', 'cite', 'code', 'del', 'dfn', 'em', 112 | 'ins', 'kbd', 'samp', 'strong', 'var', 113 | ]) 114 | 115 | font_style_tags = frozenset([ 116 | 'b', 'big', 'i', 's', 'small', 'strike', 'tt', 'u', 117 | ]) 118 | 119 | frame_tags = frozenset([ 120 | 'frameset', 'frame', 'noframes', 121 | ]) 122 | 123 | html5_tags = frozenset([ 124 | 'article', 'aside', 'audio', 'canvas', 'command', 'datalist', 125 | 'details', 'embed', 'figcaption', 'figure', 'footer', 'header', 126 | 'hgroup', 'keygen', 'mark', 'math', 'meter', 'nav', 'output', 127 | 'progress', 'rp', 'rt', 'ruby', 'section', 'source', 'summary', 128 | 'svg', 'time', 'track', 'video', 'wbr' 129 | ]) 130 | 131 | # These tags aren't standard 132 | 
nonstandard_tags = frozenset(['blink', 'marquee']) 133 | 134 | tags = (top_level_tags | head_tags | general_block_tags | list_tags 135 | | table_tags | form_tags | special_inline_tags | phrase_tags 136 | | font_style_tags | nonstandard_tags | html5_tags) 137 | -------------------------------------------------------------------------------- /src/lxml/html/tests/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | -------------------------------------------------------------------------------- /src/lxml/html/tests/feedparser-data/entry_content_applet.data: -------------------------------------------------------------------------------- 1 | Description: entry content contains applet 2 | Expect: not bozo and entries[0]['content'][0]['value'] == u'safe description' 3 | Options: 4 | 5 |
safe description
6 | ---------- 7 |
safe description
-------------------------------------------------------------------------------- /src/lxml/html/tests/feedparser-data/entry_content_blink.data: -------------------------------------------------------------------------------- 1 | Description: entry content contains embed 2 | Expect: not bozo and entries[0]['content'][0]['value'] == u'safe description' 3 | Options: 4 | Notes:
wrapper 5 | 6 |
safe description
7 | ---------- 8 |
safe description
-------------------------------------------------------------------------------- /src/lxml/html/tests/feedparser-data/entry_content_crazy.data: -------------------------------------------------------------------------------- 1 | Description: entry content is crazy 2 | Expect: not bozo and entries[0]['content'][0]['value'] == u'Crazy HTML -' + u'- Can Your Regex Parse This?\n\n\n\n -' + u'->\n\n \n-' + u'->\n\n\n\nfunction executeMe()\n{\n\n\n\n\n/* \n

Did The Javascript Execute?

\n
\nI will execute here, too, if you mouse over me\n
' 3 | Options: -page_structure 4 | Notes: for some reason the comments in the expected field are acting weird 5 | 6 | 7 | 8 | 9 | 10 | 11 | Crazy HTML -- Can Your Regex Parse This? 12 | 13 | 14 | 17 | 18 | 19 | 22 | 23 | 24 | 25 | 26 | 61 |

Did The Javascript Execute?

62 |
66 | I will execute here, too, if you mouse over me 67 |
68 | 69 | 70 | 71 | 72 | 73 | ---------- 74 | 75 | 76 | Crazy HTML -- Can Your Regex Parse This? 77 | 78 | 79 |

Did The Javascript Execute?

80 |
81 | I will execute here, too, if you mouse over me 82 |
83 | 84 | -------------------------------------------------------------------------------- /src/lxml/html/tests/feedparser-data/entry_content_embed.data: -------------------------------------------------------------------------------- 1 | Description: entry content contains embed 2 | Expect: not bozo and entries[0]['content'][0]['value'] == u'safe description' 3 | Options: 4 | Notes:
wrapper, close tag (not closing it lost the tag) 5 | 6 |
safe description
7 | ---------- 8 |
safe description
9 | -------------------------------------------------------------------------------- /src/lxml/html/tests/feedparser-data/entry_content_frame.data: -------------------------------------------------------------------------------- 1 | Description: entry content contains frameset 2 | Expect: not bozo and entries[0]['content'][0]['value'] == u'safe description' 3 | Options: 4 | 5 |
safe description
6 | ---------- 7 |
safe description
8 | -------------------------------------------------------------------------------- /src/lxml/html/tests/feedparser-data/entry_content_iframe.data: -------------------------------------------------------------------------------- 1 | Description: entry content contains iframe 2 | Expect: not bozo and entries[0]['content'][0]['value'] == u'safe description' 3 | Options: 4 | Notes: div wrapper, close description
7 | ---------- 8 |
safe description
-------------------------------------------------------------------------------- /src/lxml/html/tests/feedparser-data/entry_content_link.data: -------------------------------------------------------------------------------- 1 | Description: entry content contains link 2 | Expect: not bozo and entries[0]['content'][0]['value'] == u'safe description' 3 | Options: 4 | 5 |
safe description
6 | ---------- 7 |
safe description
8 | -------------------------------------------------------------------------------- /src/lxml/html/tests/feedparser-data/entry_content_meta.data: -------------------------------------------------------------------------------- 1 | Description: entry content contains meta 2 | Expect: not bozo and entries[0]['content'][0]['value'] == u'safe description' 3 | Options: 4 | 5 |
safe description
6 | ---------- 7 |
safe description
-------------------------------------------------------------------------------- /src/lxml/html/tests/feedparser-data/entry_content_object.data: -------------------------------------------------------------------------------- 1 | Description: entry content contains object 2 | Expect: not bozo and entries[0]['content'][0]['value'] == u'safe description' 3 | Options: 4 | Notes: div wrapper, close 5 | 6 |
safe description
7 | ---------- 8 |
safe description
-------------------------------------------------------------------------------- /src/lxml/html/tests/feedparser-data/entry_content_onabort.data: -------------------------------------------------------------------------------- 1 | Description: entry content contains onabort 2 | Expect: not bozo and entries[0]['content'][0]['value'] == u'' 3 | Options: 4 | 5 | 6 | ---------- 7 | -------------------------------------------------------------------------------- /src/lxml/html/tests/feedparser-data/entry_content_onblur.data: -------------------------------------------------------------------------------- 1 | Description: entry content contains onblur 2 | Expect: not bozo and entries[0]['content'][0]['value'] == u'' 3 | Options: 4 | 5 | 6 | ---------- 7 | -------------------------------------------------------------------------------- /src/lxml/html/tests/feedparser-data/entry_content_onchange.data: -------------------------------------------------------------------------------- 1 | Description: entry content contains onchange 2 | Expect: not bozo and entries[0]['content'][0]['value'] == u'' 3 | Options: 4 | 5 | 6 | ---------- 7 | -------------------------------------------------------------------------------- /src/lxml/html/tests/feedparser-data/entry_content_onclick.data: -------------------------------------------------------------------------------- 1 | Description: entry content contains onclick 2 | Expect: not bozo and entries[0]['content'][0]['value'] == u'' 3 | Options: 4 | 5 | 6 | ---------- 7 | -------------------------------------------------------------------------------- /src/lxml/html/tests/feedparser-data/entry_content_ondblclick.data: -------------------------------------------------------------------------------- 1 | Description: entry content contains ondblclick 2 | Expect: not bozo and entries[0]['content'][0]['value'] == u'' 3 | Options: javascript 4 | 5 | 6 | ---------- 7 | 
-------------------------------------------------------------------------------- /src/lxml/html/tests/feedparser-data/entry_content_onerror.data: -------------------------------------------------------------------------------- 1 | Description: entry content contains onerror 2 | Expect: not bozo and entries[0]['content'][0]['value'] == u'' 3 | Options: 4 | 5 | 6 | ---------- 7 | -------------------------------------------------------------------------------- /src/lxml/html/tests/feedparser-data/entry_content_onfocus.data: -------------------------------------------------------------------------------- 1 | Description: entry content contains onfocus 2 | Expect: not bozo and entries[0]['content'][0]['value'] == u'' 3 | Options: 4 | 5 | 6 | ---------- 7 | -------------------------------------------------------------------------------- /src/lxml/html/tests/feedparser-data/entry_content_onkeydown.data: -------------------------------------------------------------------------------- 1 | Description: entry content contains onkeydown 2 | Expect: not bozo and entries[0]['content'][0]['value'] == u'' 3 | Options: 4 | 5 | 6 | ---------- 7 | -------------------------------------------------------------------------------- /src/lxml/html/tests/feedparser-data/entry_content_onkeypress.data: -------------------------------------------------------------------------------- 1 | Description: entry content contains onkeypress 2 | Expect: not bozo and entries[0]['content'][0]['value'] == u'' 3 | Options: 4 | 5 | 6 | ---------- 7 | -------------------------------------------------------------------------------- /src/lxml/html/tests/feedparser-data/entry_content_onkeyup.data: -------------------------------------------------------------------------------- 1 | Description: entry content contains onkeyup 2 | Expect: not bozo and entries[0]['content'][0]['value'] == u'' 3 | Options: 4 | 5 | 6 | ---------- 7 | -------------------------------------------------------------------------------- 
/src/lxml/html/tests/feedparser-data/entry_content_onload.data: -------------------------------------------------------------------------------- 1 | Description: entry content contains onload 2 | Expect: not bozo and entries[0]['content'][0]['value'] == u'' 3 | Options: 4 | 5 | 6 | ---------- 7 | -------------------------------------------------------------------------------- /src/lxml/html/tests/feedparser-data/entry_content_onmousedown.data: -------------------------------------------------------------------------------- 1 | Description: entry content contains onmousedown 2 | Expect: not bozo and entries[0]['content'][0]['value'] == u'' 3 | Options: 4 | 5 | 6 | ---------- 7 | -------------------------------------------------------------------------------- /src/lxml/html/tests/feedparser-data/entry_content_onmouseout.data: -------------------------------------------------------------------------------- 1 | Description: entry content contains onmouseout 2 | Expect: not bozo and entries[0]['content'][0]['value'] == u'' 3 | Options: 4 | 5 | 6 | ---------- 7 | -------------------------------------------------------------------------------- /src/lxml/html/tests/feedparser-data/entry_content_onmouseover.data: -------------------------------------------------------------------------------- 1 | Description: entry content contains onmouseover 2 | Expect: not bozo and entries[0]['content'][0]['value'] == u'' 3 | Options: 4 | 5 | 6 | ---------- 7 | -------------------------------------------------------------------------------- /src/lxml/html/tests/feedparser-data/entry_content_onmouseup.data: -------------------------------------------------------------------------------- 1 | Description: entry content contains onmouseup 2 | Expect: not bozo and entries[0]['content'][0]['value'] == u'' 3 | Options: 4 | 5 | 6 | ---------- 7 | -------------------------------------------------------------------------------- /src/lxml/html/tests/feedparser-data/entry_content_onreset.data: 
-------------------------------------------------------------------------------- 1 | Description: entry content contains onreset 2 | Expect: not bozo and entries[0]['content'][0]['value'] == u'' 3 | Options: 4 | 5 | 6 | ---------- 7 | -------------------------------------------------------------------------------- /src/lxml/html/tests/feedparser-data/entry_content_onresize.data: -------------------------------------------------------------------------------- 1 | Description: entry content contains onresize 2 | Expect: not bozo and entries[0]['content'][0]['value'] == u'' 3 | Options: 4 | 5 | 6 | ---------- 7 | -------------------------------------------------------------------------------- /src/lxml/html/tests/feedparser-data/entry_content_onsubmit.data: -------------------------------------------------------------------------------- 1 | Description: entry content contains onsubmit 2 | Expect: not bozo and entries[0]['content'][0]['value'] == u'' 3 | Options: 4 | 5 | 6 | ---------- 7 | -------------------------------------------------------------------------------- /src/lxml/html/tests/feedparser-data/entry_content_onunload.data: -------------------------------------------------------------------------------- 1 | Description: entry content contains onunload 2 | Expect: not bozo and entries[0]['content'][0]['value'] == u'' 3 | Options: 4 | 5 | 6 | ---------- 7 | -------------------------------------------------------------------------------- /src/lxml/html/tests/feedparser-data/entry_content_script.data: -------------------------------------------------------------------------------- 1 | Description: entry content contains script 2 | Expect: not bozo and entries[0]['content'][0]['value'] == u'safe description' 3 | Options: 4 | 5 |
safe description
6 | ---------- 7 |
safe description
-------------------------------------------------------------------------------- /src/lxml/html/tests/feedparser-data/entry_content_script_cdata.data: -------------------------------------------------------------------------------- 1 | Description: entry content contains script (cdata) 2 | Expect: not bozo and entries[0]['content'][0]['value'] == u'safe description' 3 | Options: 4 | Notes: div wrapper. Currently not working because of how HTML() is parsing the CDATA (not in a useful way) 5 | The resulting code is safe, it just includes crap from the description]]> 11 | 12 | ---------- 13 |
safe description
-------------------------------------------------------------------------------- /src/lxml/html/tests/feedparser-data/entry_content_script_inline.data: -------------------------------------------------------------------------------- 1 | Description: entry content contains script (inline) 2 | Expect: not bozo and entries[0]['content'][0]['value'] == u'
safe description
' 3 | Options: 4 | 5 |
safe description
6 | ---------- 7 |
safe description
-------------------------------------------------------------------------------- /src/lxml/html/tests/feedparser-data/entry_content_style.data: -------------------------------------------------------------------------------- 1 | Description: entry content contains style 2 | Expect: not bozo and entries[0]['content'][0]['value'] == u'never trust your upstream platypus' 3 | Options: style 4 | 5 | never trust your upstream platypus 6 | ---------- 7 | never trust your upstream platypus -------------------------------------------------------------------------------- /src/lxml/html/tests/hackers-org-data/background-image-plus.data: -------------------------------------------------------------------------------- 1 | Description: I built a quick XSS fuzzer to detect any erroneous characters that are allowed after the open parenthesis but before the JavaScript directive in IE and Netscape 8.1 in secure site mode. These are in decimal but you can include hex and add padding of course. (Any of the following chars can be used: 1-32, 34, 39, 160, 8192-8.13, 12288, 65279) 2 | http://ha.ckers.org/xss.html#XSS_DIV_background-image_plus 3 | Options: -safe_attrs_only 4 | Notes: As you see, the CSS gets corrupted, but I don't really care that much. 5 | 6 |
text
7 | ---------- 8 |
text
9 | -------------------------------------------------------------------------------- /src/lxml/html/tests/hackers-org-data/background-image-with-unicoded.data: -------------------------------------------------------------------------------- 1 | Description: exploit (this has been modified slightly to obfuscate the url parameter). The original vulnerability was found by Renaud Lifchitz as a vulnerability in Hotmail. 2 | http://ha.ckers.org/xss.html#XSS_DIV_background_image_unicode 3 | Options: -safe_attrs_only 4 | Ignore: true 5 | Notes: I don't understand how this exploit works. It seems like the description actually refers to 6 | the unicode you'd import, but why that matters I don't know. 7 | 8 |
text
9 | ---------- 10 |
text
11 | -------------------------------------------------------------------------------- /src/lxml/html/tests/hackers-org-data/downlevel-hidden.data: -------------------------------------------------------------------------------- 1 | Description: Downlevel-Hidden-Hidden block (only works in IE5.0 and later and Netscape 8.1 in IE rendering engine mode). Some websites consider anything inside a comment block to be safe and therefore does not need to be removed, which allows our Cross Site Scripting vector. Or the system could add comment tags around something to attempt to render it harmless. As we can see, that probably wouldn't do the job 2 | http://ha.ckers.org/xss.html#XSS_Downlevel-Hidden 3 | Options: -comments, -processing_instructions 4 | 5 |
8 | ---------- 9 |
10 | -------------------------------------------------------------------------------- /src/lxml/html/tests/hackers-org-data/html-plus-time.data: -------------------------------------------------------------------------------- 1 | Description: HTML+TIME in XML. This is how Grey Magic hacked Hotmail and Yahoo!. This only works in Internet Explorer and Netscape 8.1 in IE rendering engine mode and remember that you need to be between HTML and BODY tags for this to work 2 | http://ha.ckers.org/xss.html#XSS_HTML_plus_time 3 | Ignore: true 4 | Notes: I don't understand the vector here, or how this is supposed to work. 5 | 6 |
7 | 8 |
9 | ---------- 10 |
11 | 12 | x
13 | -------------------------------------------------------------------------------- /src/lxml/html/tests/hackers-org-data/javascript-link.data: -------------------------------------------------------------------------------- 1 | Description: javascript: in many forms 2 | 3 |
4 | x 6 | x 7 | x 9 |
10 | ---------- 11 |
12 | x 13 | x 14 | x 15 |
16 | -------------------------------------------------------------------------------- /src/lxml/html/tests/hackers-org-data/style-comment.data: -------------------------------------------------------------------------------- 1 | Description: to break up expression (Thanks to Roman Ivanov for this one) 2 | http://ha.ckers.org/xss.html#XSS_STYLE_comment 3 | Options: -safe_attrs_only 4 | Notes: Because of the suspicious stuff in there, the style is removed entirely 5 | 6 | 7 | ---------- 8 | 9 | -------------------------------------------------------------------------------- /src/lxml/html/tests/hackers-org-data/style-expression.data: -------------------------------------------------------------------------------- 1 | Description: (this is really a hybrid of the above XSS vectors, but it really does show how hard STYLE tags can be to parse apart, like above this can send IE into a loop) 2 | http://ha.ckers.org/xss.html#XSS_IMG_STYLE_expression 3 | Options: -safe_attrs_only 4 | Notes: Modified to avoid a parsing in libxml2 that ruins the XSS (the " marks). 5 | Also there seemed to be an extra "p" in exppression 6 | 7 |
9 | ---------- 10 |
11 | -------------------------------------------------------------------------------- /src/lxml/html/tests/hackers-org-data/style-import.data: -------------------------------------------------------------------------------- 1 | Description: tags with broken up JavaScript for XSS (this XSS at times sends IE into an infinite loop of alerts) 2 | http://ha.ckers.org/xss.html#XSS_STYLE 3 | Options: -safe_attrs_only 4 | 5 |
6 | ---------- 7 |
8 | 9 | -------------------------------------------------------------------------------- /src/lxml/html/tests/hackers-org-data/style-js-tag.data: -------------------------------------------------------------------------------- 1 | Description: (Older versions of Netscape only) 2 | http://ha.ckers.org/xss.html#XSS_STYLE_tag 3 | Options: -safe_attrs_only 4 | 5 |
6 | ---------- 7 |
8 | -------------------------------------------------------------------------------- /src/lxml/html/tests/hackers-org-data/style-url-js.data: -------------------------------------------------------------------------------- 1 | Description: http://ha.ckers.org/xss.html#XSS_STYLE_background-image 2 | Options: -style, -safe_attrs_only 3 | Notes: The CSS is messed up here, but so it goes 4 | 5 |
6 | ---------- 7 |
8 | 9 | -------------------------------------------------------------------------------- /src/lxml/html/tests/hackers-org-data/xml-data-island.data: -------------------------------------------------------------------------------- 1 | Description: XML data island with comment obfuscation (this is another take on the same exploit that doesn't use CDATA fields, but rather uses comments to break up the javascript directive) 2 | http://ha.ckers.org/xss.html#XSS_XML_data_island_comment 3 | Ignore: true 4 | Notes: I don't understand the vector here. Maybe datasrc should be filtered? 5 | 6 |
<IMG SRC="javascript:alert('XSS')"> 7 |
8 | ---------- 9 |
<IMG SRC="javascript:alert('XSS')"> 10 | x
11 | -------------------------------------------------------------------------------- /src/lxml/html/tests/hackers-org-data/xml-embedded-js.data: -------------------------------------------------------------------------------- 1 | Description: Locally hosted XML with embedded JavaScript#XSS_Local_XML that is generated using an XML data island. This is the same as above but instead referrs to a locally hosted (must be on the same server) XML file that contains your cross site scripting vector. You can see the result here 2 | http://ha.ckers.org/xss.html#XSS_Local_XML 3 | 4 |
5 |
6 | ---------- 7 |
8 | 9 |
10 | -------------------------------------------------------------------------------- /src/lxml/html/tests/hackers-org-data/xml-namespace.data.BROKEN: -------------------------------------------------------------------------------- 1 | Description: XML namespace. The htc file must be located on the same server as your XSS vector 2 | http://ha.ckers.org/xss.html#XSS_XML_namespace 3 | Note: I don't completely understand the vector here. page_structure is what does this. 4 | 5 | 6 | 7 | 8 | XSS 9 | 10 | 11 | ---------- 12 | 13 | 14 |
XSS
15 | 16 | 17 | -------------------------------------------------------------------------------- /src/lxml/html/tests/test_autolink.py: -------------------------------------------------------------------------------- 1 | import unittest, sys 2 | from lxml.tests.common_imports import make_doctest 3 | 4 | def test_suite(): 5 | suite = unittest.TestSuite() 6 | if sys.version_info >= (2,4): 7 | suite.addTests([make_doctest('test_autolink.txt')]) 8 | return suite 9 | 10 | if __name__ == '__main__': 11 | unittest.main() 12 | -------------------------------------------------------------------------------- /src/lxml/html/tests/test_autolink.txt: -------------------------------------------------------------------------------- 1 | This tests autolink:: 2 | 3 | >>> from lxml.html import usedoctest 4 | >>> from lxml.html.clean import autolink_html 5 | >>> print(autolink_html(''' 6 | ...
Link here: http://test.com/foo.html.
7 | ... ''')) 8 | 9 | >>> print(autolink_html(''' 10 | ...
Mail me at mailto:ianb@test.com or http://myhome.com
11 | ... ''')) 12 |
Mail me at ianb@test.com 13 | or http://myhome.com
14 | >>> print(autolink_html(''' 15 | ...
The great thing is the http://link.com links and 16 | ... the http://foobar.com links.
''')) 17 |
The great thing is the http://link.com links and 18 | the http://foobar.com links.
19 | >>> print(autolink_html(''' 20 | ...
Link: <http://foobar.com>
''')) 21 | 22 | >>> print(autolink_html(''' 23 | ...
Link: (http://foobar.com)
''')) 24 | 25 | 26 | Parentheses are tricky, we'll do our best:: 27 | 28 | >>> print(autolink_html(''' 29 | ...
(Link: http://en.wikipedia.org/wiki/PC_Tools_(Central_Point_Software))
30 | ... ''')) 31 | 32 | >>> print(autolink_html(''' 33 | ...
... a link: http://foo.com)
34 | ... ''')) 35 |
... a link: http://foo.com)
36 | 37 | Some cases that won't be caught (on purpose):: 38 | 39 | >>> print(autolink_html(''' 40 | ...
A link to http://localhost/foo/bar won't, but a link to 41 | ... http://test.com will
''')) 42 |
A link to http://localhost/foo/bar won't, but a link to 43 | http://test.com will
44 | >>> print(autolink_html(''' 45 | ...
A link in
''')) 46 |
A link in
47 | >>> print(autolink_html(''' 48 | ...
A link in http://bar.com
''')) 49 |
A link in http://bar.com
50 | >>> print(autolink_html(''' 51 | ...
A link in http://foo.com or 52 | ... http://bar.com
''')) 53 |
A link in http://foo.com or 54 | http://bar.com
55 | 56 | There's also a word wrapping function, that should probably be run 57 | after autolink:: 58 | 59 | >>> from lxml.html.clean import word_break_html 60 | >>> def pascii(s): 61 | ... print(s.encode('ascii', 'xmlcharrefreplace').decode('ascii')) 62 | >>> pascii(word_break_html( u''' 63 | ...
Hey you 64 | ... 12345678901234567890123456789012345678901234567890
''')) 65 |
Hey you 66 | 1234567890123456789012345678901234567890​1234567890
67 | 68 | Not everything is broken: 69 | 70 | >>> pascii(word_break_html(''' 71 | ...
Hey you 72 | ... 12345678901234567890123456789012345678901234567890
''')) 73 |
Hey you 74 | 12345678901234567890123456789012345678901234567890
def test_suite():
    """Return the unittest suite for the basic ``lxml.html`` tests.

    On Python 2.4 or later the suite holds the ``test_basic.txt`` doctest
    file followed by the doctests embedded in the ``lxml.html`` module;
    on older interpreters it is empty.
    """
    basic_tests = unittest.TestSuite()
    if sys.version_info < (2, 4):
        return basic_tests
    file_doctests = make_doctest('test_basic.txt')
    basic_tests.addTests([file_doctests])
    module_doctests = doctest.DocTestSuite(lxml.html)
    basic_tests.addTests([module_doctests])
    return basic_tests

some text

16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 |
helloworld
helloworld
24 | 25 | 26 | 27 | """ 28 | 29 | html_root = lxml.html.document_fromstring(html) 30 | cleaner = Cleaner( 31 | remove_unknown_tags = False, 32 | allow_tags = ['table', 'tr', 'td']) 33 | result = cleaner.clean_html(html_root) 34 | 35 | self.assertEqual(12-5+1, len(list(result.iter()))) 36 | 37 | def test_safe_attrs_included(self): 38 | html = """

Cyan

""" 39 | 40 | safe_attrs=set(lxml.html.defs.safe_attrs) 41 | safe_attrs.add('style') 42 | 43 | cleaner = Cleaner( 44 | safe_attrs_only=True, 45 | safe_attrs=safe_attrs) 46 | result = cleaner.clean_html(html) 47 | 48 | self.assertEqual(html, result) 49 | 50 | def test_safe_attrs_excluded(self): 51 | html = """

Cyan

""" 52 | expected = """

Cyan

""" 53 | 54 | safe_attrs=set() 55 | 56 | cleaner = Cleaner( 57 | safe_attrs_only=True, 58 | safe_attrs=safe_attrs) 59 | result = cleaner.clean_html(html) 60 | 61 | self.assertEqual(expected, result) 62 | 63 | def test_suite(): 64 | suite = unittest.TestSuite() 65 | if sys.version_info >= (2,4): 66 | suite.addTests([make_doctest('test_clean.txt')]) 67 | if LIBXML_VERSION >= (2,6,31): 68 | suite.addTests([make_doctest('test_clean_embed.txt')]) 69 | suite.addTests(unittest.makeSuite(CleanerTest)) 70 | return suite 71 | -------------------------------------------------------------------------------- /src/lxml/html/tests/test_clean_embed.txt: -------------------------------------------------------------------------------- 1 | THIS FAILS IN libxml2 2.6.29 AND 2.6.30 !! 2 | 3 | 4 | >>> from lxml.html import fromstring, tostring 5 | >>> from lxml.html.clean import clean, clean_html, Cleaner 6 | >>> from lxml.html import usedoctest 7 | 8 | >>> def tostring(el): # work-around for Py3 'bytes' type 9 | ... from lxml.html import tostring 10 | ... s = tostring(el) 11 | ... if not isinstance(s, str): 12 | ... s = s.decode('UTF-8') 13 | ... return s 14 | 15 | >>> doc_embed = '''
16 | ... 17 | ... 18 | ... 19 | ... 20 | ...
''' 21 | >>> print(tostring(fromstring(doc_embed))) 22 |
23 | 24 | 25 | 26 | 27 |
28 | >>> print(Cleaner().clean_html(doc_embed)) 29 |
30 |
31 | >>> print(Cleaner(host_whitelist=['www.youtube.com']).clean_html(doc_embed)) 32 |
33 | 34 |
35 | >>> print(Cleaner(host_whitelist=['www.youtube.com'], whitelist_tags=None).clean_html(doc_embed)) 36 |
37 | 38 | 39 |
def test_suite():
    """Return the unittest suite for the HTML diff tests.

    On Python 2.4 or later the suite holds the ``test_diff.txt`` doctest
    file and the doctests embedded in the ``diff`` module, in that order;
    on older interpreters it is empty.
    """
    collected = unittest.TestSuite()
    if sys.version_info < (2, 4):
        return collected
    doctests = [
        make_doctest('test_diff.txt'),
        doctest.DocTestSuite(diff),
    ]
    collected.addTests(doctests)
    return collected
class FeedTestCase(unittest.TestCase):
    """Run the HTML cleaner over one ``.data`` file and check its output.

    A data file starts with RFC 822 style headers (``Description``,
    ``Options``, ``Ignore``), followed by the input markup, a separator
    matched by ``bar_re`` and the expected output.
    """

    def __init__(self, filename):
        """Remember the path of the data file this case will run."""
        self.filename = filename
        unittest.TestCase.__init__(self)

    def parse(self):
        """Read ``self.filename`` and populate the test attributes.

        Sets ``description``, ``ignore`` and ``options`` from the headers,
        ``input`` from the text before the first separator, and ``expect``
        from the text after it (``None`` when there is no separator).

        Raises ``Exception`` if the file contains no headers.
        """
        f = open(self.filename, 'r')
        try:
            headers = Message(f)
            c = f.read()
        finally:
            # Release the handle even if header parsing or reading fails.
            f.close()
        if not c.strip():
            # Nothing was left in the file after the headers were parsed,
            # so take the body from the parsed message instead.
            c = headers.get_payload()
        if not headers.keys():
            raise Exception(
                "File %s has no headers" % self.filename)
        self.description = headers['Description']
        # The 'Expect' header is not stored: ``self.expect`` always comes
        # from the section after the separator (or is None).
        self.ignore = headers.get('Ignore')
        self.options = [
            o.strip() for o in headers.get('Options', '').split(',')
            if o.strip()]
        parts = bar_re.split(c)
        self.input = parts[0].rstrip() + '\n'
        if parts[1:]:
            self.expect = parts[1].rstrip() + '\n'
        else:
            self.expect = None

    def runTest(self):
        """Clean the input markup and compare it with the expected output.

        Options become ``Cleaner`` keyword arguments: ``name`` maps to
        ``name=True`` and ``-name`` to ``name=False``.  When the ``clean``
        option is switched off the input is compared unmodified.

        Raises ``AssertionError`` if the file has no expected output, and
        ``Exception`` (carrying the checker's difference report) if the
        output does not match.
        """
        self.parse()
        if self.ignore:
            # We've marked this test to be ignored.
            return
        kw = {}
        for name in self.options:
            if name.startswith('-'):
                kw[name[1:]] = False
            else:
                kw[name] = True
        if kw.get('clean', True):
            transformed = Cleaner(**kw).clean_html(self.input)
        else:
            transformed = self.input
        if self.expect is None:
            # Explicit raise instead of ``assert`` so that the check is
            # still performed when Python runs with -O.
            raise AssertionError(
                "No expected output in %s" % self.filename)
        checker = LHTMLOutputChecker()
        if not checker.check_output(self.expect, transformed, 0):
            result = checker.output_difference(
                DummyInput(want=self.expect), transformed, 0)
            raise Exception("\n"+result)

    def shortDescription(self):
        """Return the data file path as the test's description."""
        return self.filename
-------------------------------------------------------------------------------- /src/lxml/html/tests/test_formfill.txt: -------------------------------------------------------------------------------- 1 | Some basic imports: 2 | 3 | >>> from lxml.html import usedoctest 4 | >>> from lxml.html.formfill import fill_form_html 5 | 6 | The simplest kind of filling is just filling an input with a value: 7 | 8 | >>> print(fill_form_html(''' 9 | ...
''', dict(foo='bar'))) 10 |
11 | 12 | You can also fill multiple inputs, like: 13 | 14 | >>> print(fill_form_html(''' 15 | ...
16 | ... 17 | ... 18 | ...
''', dict(foo=['bar1', 'bar2']))) 19 |
20 | 21 | 22 |
23 | 24 | Checkboxes can work either as boolean true/false, or be selected based 25 | on their inclusion in a set of values:: 26 | 27 | >>> print(fill_form_html(''' 28 | ...
29 | ... Would you like to be spammed? 30 | ...
31 | ... Spam you'd like to receive:
32 | ... Viagra spam: 33 | ...
34 | ... Stock spam: 35 | ...
36 | ... Other spam: 37 | ...
38 | ... 39 | ...
''', dict(spam_me=True, type=['viagra', 'other']))) 40 |
41 | Would you like to be spammed? 42 |
43 | Spam you'd like to receive:
44 | Viagra spam: 45 |
46 | Stock spam: 47 |
48 | Other spam: 49 |
50 | 51 |
52 | 53 | FIXME: I need to test more of this. But I'm lazy and want to use the 54 | coverage report for some of this. 55 | 56 | 57 | This module also allows you to add error messages to the form. The errors 58 | add an "error" class to the input fields, and any labels if the field 59 | has a label. It also inserts an error message into the form, using a 60 | function you can provide (or the default function). 61 | 62 | Example:: 63 | 64 | >>> from lxml.html.formfill import insert_errors_html 65 | >>> print(insert_errors_html(''' 66 | ...
67 | ...
68 | ...
69 | ... 70 | ...
71 | ...
72 | ... 73 | ... 74 | ... 75 | ... 76 | ...
''', { 77 | ... 'v1': "err1", 78 | ... 'v2': "err2", 79 | ... 'v3': [None, "err3-2"], 80 | ... 'v4': "err4", 81 | ... None: 'general error', 82 | ... '#fieldset': 'area error', 83 | ... })) 84 |
85 |
general error
86 |
87 |
area error
88 |
err1
89 |
90 | 91 |
err2
92 |
93 |
94 | 95 |
err3-2
96 | 97 |
err4
98 | 99 | 100 |
def test_suite():
    """Return the unittest suite for the form handling doctests.

    The ``test_forms.txt`` doctest file is included on Python 2.4 or
    later; on older interpreters the suite is empty.
    """
    doctest_files = []
    if sys.version_info >= (2, 4):
        doctest_files.append(make_doctest('test_forms.txt'))
    form_tests = unittest.TestSuite()
    form_tests.addTests(doctest_files)
    return form_tests
def test_suite():
    """Return the unittest suite for the link rewriting doctests.

    The ``test_rewritelinks.txt`` doctest file is included on Python 2.4
    or later; on older interpreters the suite is empty.
    """
    rewrite_tests = unittest.TestSuite()
    new_enough = sys.version_info >= (2, 4)
    if new_enough:
        for doctest_file in [make_doctest('test_rewritelinks.txt')]:
            rewrite_tests.addTest(doctest_file)
    return rewrite_tests

Hi!

" 7 | 8 | >>> root = document_fromstring(html, parser=HTMLParser()) 9 | >>> print(root.tag) 10 | html 11 | 12 | >>> root = document_fromstring(html, parser=XHTMLParser()) 13 | >>> print(root.tag) 14 | html 15 | 16 | There are two functions for converting between HTML and XHTML: 17 | 18 | >>> from lxml.html import xhtml_to_html, html_to_xhtml 19 | 20 | >>> doc = document_fromstring(html, parser=HTMLParser()) 21 | >>> tostring(doc) 22 | b'

Hi!

' 23 | 24 | >>> html_to_xhtml(doc) 25 | >>> tostring(doc) 26 | b'Hi!' 27 | 28 | >>> xhtml_to_html(doc) 29 | >>> tostring(doc) 30 | b'

Hi!

"""
This takes the feedparser tests from here:

  http://feedparser.org/tests/wellformed/sanitize/

and rewrites them to be easier to handle (not using the internal model
of feedparser).  Each input is an XML file that contains a
``Description: {description}`` line, an ``Expect: {expression}`` line and
the content to sanitize inside one of the known feed elements
(``content``, ``summary``, ``title``, ...).

The Expect expression is checked for
``entries[0]['content'][0]['value'] == {data}``.

The output format is::

  Description: {description}
  Expect: {expression} (if data couldn't be parsed)
  Options:

  {content, unescaped}
  ----------
  {data, unescaped, if found}

"""

import re
import os
import traceback

_desc_re = re.compile(r'\s*Description:\s*(.*)')
_expect_re = re.compile(r'\s*Expect:\s*(.*)')
_data_expect_re = re.compile(r"entries\[0\]\['[^']+'\](?:\[0\]\['value'\])?\s*==\s*(.*)")
_feed_data_expect_re = re.compile(r"feed\['[^']+'\]\s*==\s*(.*)")

# Elements that may carry the test content, tried in this order.
_content_tags = [
    'content', 'summary', 'title', 'copyright', 'tagline', 'info',
    'subtitle', 'fullitem', 'body', 'description', 'content:encoded']


def parse_content(content):
    """Extract description, expectation and payload from a feedparser test.

    ``content`` is the text of one test file.  Returns a dict with the keys
    ``Description``, ``Expect``, ``data`` (the evaluated right-hand side of
    the Expect comparison, or None if it was not recognised) and
    ``content`` (the text inside the first matching element).

    Raises AttributeError if the Description or Expect line is missing and
    AssertionError if none of the known elements is found.
    """
    match = _desc_re.search(content)
    desc = match.group(1)
    match = _expect_re.search(content)
    expect = match.group(1)
    data = None
    for regex in [_data_expect_re, _feed_data_expect_re]:
        match = regex.search(expect)
        if match:
            # Icky, but I'll trust it
            # NOTE(security): eval() executes whatever follows '==' in the
            # test file; only run this on test files you trust.
            data = eval(match.group(1).strip())
            break
    c = None
    for tag in _content_tags:
        # Two placeholders for the two arguments: opening and closing tag.
        regex = re.compile(r"<%s.*?>(.*)</%s>" % (tag, tag), re.S)
        match = regex.search(content)
        if match:
            c = match.group(1)
            break
    if c is None:
        # Explicit raise so the check survives running under -O.
        raise AssertionError("no known content element found")
    # Seems like body isn't quoted
    if tag != 'body':
        # '&amp;' goes last so an escaped ampersand is decoded only once.
        c = c.replace('&lt;', '<')
        c = c.replace('&amp;', '&')
    # FIXME: I should really do more unescaping...
    return {
        'Description': desc,
        'Expect': expect,
        'data': data,
        'content': c}


def serialize_content(d):
    """Render a dict produced by ``parse_content`` in the output format.

    The expected-data section (separator line plus data) is only appended
    when ``d['data']`` is present and not None.
    """
    s = '''\
Description: %(Description)s
Expect: %(Expect)s
Options:

%(content)s
''' % d
    if d.get('data') is not None:
        s += '----------\n%s' % d['data']
    return s


def translate_file(filename):
    """Convert one test file and write the result next to it as ``.data``.

    Files that cannot be parsed are reported on stdout, together with the
    traceback, and skipped; no output file is written for them.
    """
    f = open(filename, 'rb')
    try:
        c = f.read()
    finally:
        f.close()
    try:
        if not isinstance(c, str):
            # Python 3 returns bytes here, but the patterns above are text
            # patterns, so decode before parsing.
            c = c.decode('utf-8')
        output = serialize_content(parse_content(c))
    except Exception:
        print('Bad data in %s:' % filename)
        print(c)
        traceback.print_exc()
        print('-'*60)
        return
    if not isinstance(output, bytes):
        # The output file is opened in binary mode, so encode text first.
        output = output.encode('utf-8')
    new = os.path.splitext(filename)[0] + '.data'
    f = open(new, 'wb')
    try:
        f.write(output)
    finally:
        f.close()


def translate_all(dir):
    """Run ``translate_file`` on every ``.xml`` file directly inside ``dir``."""
    for fn in os.listdir(dir):
        fn = os.path.join(dir, fn)
        if fn.endswith('.xml'):
            translate_file(fn)


if __name__ == '__main__':
    import sys
    translate_all(os.path.join(os.path.dirname(__file__), 'feedparser-data'))
9 | """ 10 | 11 | from lxml import doctestcompare 12 | 13 | doctestcompare.temp_install(html=True, del_module=__name__) 14 | -------------------------------------------------------------------------------- /src/lxml/includes/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aglyzov/lxml/98efdc08f9886af20b48b225873f96a636f06056/src/lxml/includes/__init__.py -------------------------------------------------------------------------------- /src/lxml/includes/c14n.pxd: -------------------------------------------------------------------------------- 1 | from lxml.includes.tree cimport xmlDoc, xmlOutputBuffer, xmlChar 2 | from lxml.includes.xpath cimport xmlNodeSet 3 | 4 | cdef extern from "libxml/c14n.h": 5 | cdef int xmlC14NDocDumpMemory(xmlDoc* doc, 6 | xmlNodeSet* nodes, 7 | int exclusive, 8 | xmlChar** inclusive_ns_prefixes, 9 | int with_comments, 10 | xmlChar** doc_txt_ptr) nogil 11 | 12 | cdef int xmlC14NDocSave(xmlDoc* doc, 13 | xmlNodeSet* nodes, 14 | int exclusive, 15 | xmlChar** inclusive_ns_prefixes, 16 | int with_comments, 17 | char* filename, 18 | int compression) nogil 19 | 20 | cdef int xmlC14NDocSaveTo(xmlDoc* doc, 21 | xmlNodeSet* nodes, 22 | int exclusive, 23 | xmlChar** inclusive_ns_prefixes, 24 | int with_comments, 25 | xmlOutputBuffer* buffer) nogil 26 | 27 | -------------------------------------------------------------------------------- /src/lxml/includes/config.pxd: -------------------------------------------------------------------------------- 1 | cdef extern from "etree_defs.h": 2 | cdef bint ENABLE_THREADING 3 | cdef bint ENABLE_SCHEMATRON 4 | -------------------------------------------------------------------------------- /src/lxml/includes/dtdvalid.pxd: -------------------------------------------------------------------------------- 1 | from lxml.includes cimport tree 2 | from lxml.includes.tree cimport xmlDoc, xmlDtd 3 | 4 | cdef extern from "libxml/valid.h": 5 | 
ctypedef struct xmlValidCtxt 6 | 7 | cdef xmlValidCtxt* xmlNewValidCtxt() nogil 8 | cdef void xmlFreeValidCtxt(xmlValidCtxt* cur) nogil 9 | 10 | cdef int xmlValidateDtd(xmlValidCtxt* ctxt, xmlDoc* doc, xmlDtd* dtd) nogil 11 | -------------------------------------------------------------------------------- /src/lxml/includes/htmlparser.pxd: -------------------------------------------------------------------------------- 1 | from lxml.includes.tree cimport xmlDoc, xmlDict 2 | from lxml.includes.tree cimport xmlInputReadCallback, xmlInputCloseCallback 3 | from lxml.includes.xmlparser cimport xmlParserCtxt, xmlSAXHandler, xmlSAXHandlerV1 4 | from lxml.includes.xmlerror cimport xmlError 5 | 6 | cdef extern from "libxml/HTMLparser.h": 7 | ctypedef enum htmlParserOption: 8 | HTML_PARSE_NOERROR # suppress error reports 9 | HTML_PARSE_NOWARNING # suppress warning reports 10 | HTML_PARSE_PEDANTIC # pedantic error reporting 11 | HTML_PARSE_NOBLANKS # remove blank nodes 12 | HTML_PARSE_NONET # Forbid network access 13 | # libxml2 2.6.21+ only: 14 | HTML_PARSE_RECOVER # Relaxed parsing 15 | HTML_PARSE_COMPACT # compact small text nodes 16 | 17 | xmlSAXHandlerV1 htmlDefaultSAXHandler 18 | 19 | cdef xmlParserCtxt* htmlCreateMemoryParserCtxt( 20 | char* buffer, int size) nogil 21 | cdef xmlParserCtxt* htmlCreateFileParserCtxt( 22 | char* filename, char* encoding) nogil 23 | cdef xmlParserCtxt* htmlCreatePushParserCtxt(xmlSAXHandler* sax, 24 | void* user_data, 25 | char* chunk, int size, 26 | char* filename, int enc) nogil 27 | cdef void htmlFreeParserCtxt(xmlParserCtxt* ctxt) nogil 28 | cdef void htmlCtxtReset(xmlParserCtxt* ctxt) nogil 29 | cdef int htmlCtxtUseOptions(xmlParserCtxt* ctxt, int options) nogil 30 | cdef int htmlParseDocument(xmlParserCtxt* ctxt) nogil 31 | cdef int htmlParseChunk(xmlParserCtxt* ctxt, 32 | char* chunk, int size, int terminate) nogil 33 | 34 | cdef xmlDoc* htmlCtxtReadFile(xmlParserCtxt* ctxt, 35 | char* filename, char* encoding, 36 | int options) 
nogil 37 | cdef xmlDoc* htmlCtxtReadDoc(xmlParserCtxt* ctxt, 38 | char* buffer, char* URL, char* encoding, 39 | int options) nogil 40 | cdef xmlDoc* htmlCtxtReadIO(xmlParserCtxt* ctxt, 41 | xmlInputReadCallback ioread, 42 | xmlInputCloseCallback ioclose, 43 | void* ioctx, 44 | char* URL, char* encoding, 45 | int options) nogil 46 | cdef xmlDoc* htmlCtxtReadMemory(xmlParserCtxt* ctxt, 47 | char* buffer, int size, 48 | char* filename, char* encoding, 49 | int options) nogil 50 | -------------------------------------------------------------------------------- /src/lxml/includes/relaxng.pxd: -------------------------------------------------------------------------------- 1 | from lxml.includes cimport tree 2 | from lxml.includes.tree cimport xmlDoc 3 | from lxml.includes.xmlerror cimport xmlStructuredErrorFunc 4 | 5 | cdef extern from "libxml/relaxng.h": 6 | ctypedef struct xmlRelaxNG 7 | ctypedef struct xmlRelaxNGParserCtxt 8 | 9 | ctypedef struct xmlRelaxNGValidCtxt 10 | 11 | ctypedef enum xmlRelaxNGValidErr: 12 | XML_RELAXNG_OK = 0 13 | XML_RELAXNG_ERR_MEMORY = 1 14 | XML_RELAXNG_ERR_TYPE = 2 15 | XML_RELAXNG_ERR_TYPEVAL = 3 16 | XML_RELAXNG_ERR_DUPID = 4 17 | XML_RELAXNG_ERR_TYPECMP = 5 18 | XML_RELAXNG_ERR_NOSTATE = 6 19 | XML_RELAXNG_ERR_NODEFINE = 7 20 | XML_RELAXNG_ERR_LISTEXTRA = 8 21 | XML_RELAXNG_ERR_LISTEMPTY = 9 22 | XML_RELAXNG_ERR_INTERNODATA = 10 23 | XML_RELAXNG_ERR_INTERSEQ = 11 24 | XML_RELAXNG_ERR_INTEREXTRA = 12 25 | XML_RELAXNG_ERR_ELEMNAME = 13 26 | XML_RELAXNG_ERR_ATTRNAME = 14 27 | XML_RELAXNG_ERR_ELEMNONS = 15 28 | XML_RELAXNG_ERR_ATTRNONS = 16 29 | XML_RELAXNG_ERR_ELEMWRONGNS = 17 30 | XML_RELAXNG_ERR_ATTRWRONGNS = 18 31 | XML_RELAXNG_ERR_ELEMEXTRANS = 19 32 | XML_RELAXNG_ERR_ATTREXTRANS = 20 33 | XML_RELAXNG_ERR_ELEMNOTEMPTY = 21 34 | XML_RELAXNG_ERR_NOELEM = 22 35 | XML_RELAXNG_ERR_NOTELEM = 23 36 | XML_RELAXNG_ERR_ATTRVALID = 24 37 | XML_RELAXNG_ERR_CONTENTVALID = 25 38 | XML_RELAXNG_ERR_EXTRACONTENT = 26 39 | XML_RELAXNG_ERR_INVALIDATTR = 
27 40 | XML_RELAXNG_ERR_DATAELEM = 28 41 | XML_RELAXNG_ERR_VALELEM = 29 42 | XML_RELAXNG_ERR_LISTELEM = 30 43 | XML_RELAXNG_ERR_DATATYPE = 31 44 | XML_RELAXNG_ERR_VALUE = 32 45 | XML_RELAXNG_ERR_LIST = 33 46 | XML_RELAXNG_ERR_NOGRAMMAR = 34 47 | XML_RELAXNG_ERR_EXTRADATA = 35 48 | XML_RELAXNG_ERR_LACKDATA = 36 49 | XML_RELAXNG_ERR_INTERNAL = 37 50 | XML_RELAXNG_ERR_ELEMWRONG = 38 51 | XML_RELAXNG_ERR_TEXTWRONG = 39 52 | 53 | cdef xmlRelaxNGValidCtxt* xmlRelaxNGNewValidCtxt(xmlRelaxNG* schema) nogil 54 | cdef int xmlRelaxNGValidateDoc(xmlRelaxNGValidCtxt* ctxt, xmlDoc* doc) nogil 55 | cdef xmlRelaxNG* xmlRelaxNGParse(xmlRelaxNGParserCtxt* ctxt) nogil 56 | cdef xmlRelaxNGParserCtxt* xmlRelaxNGNewParserCtxt(char* URL) nogil 57 | cdef xmlRelaxNGParserCtxt* xmlRelaxNGNewDocParserCtxt(xmlDoc* doc) nogil 58 | cdef void xmlRelaxNGFree(xmlRelaxNG* schema) nogil 59 | cdef void xmlRelaxNGFreeParserCtxt(xmlRelaxNGParserCtxt* ctxt) nogil 60 | cdef void xmlRelaxNGFreeValidCtxt(xmlRelaxNGValidCtxt* ctxt) nogil 61 | 62 | cdef void xmlRelaxNGSetValidStructuredErrors( 63 | xmlRelaxNGValidCtxt* ctxt, xmlStructuredErrorFunc serror, void *ctx) nogil 64 | cdef void xmlRelaxNGSetParserStructuredErrors( 65 | xmlRelaxNGParserCtxt* ctxt, xmlStructuredErrorFunc serror, void *ctx) nogil 66 | -------------------------------------------------------------------------------- /src/lxml/includes/schematron.pxd: -------------------------------------------------------------------------------- 1 | from lxml.includes cimport tree, xmlerror 2 | from lxml.includes.tree cimport xmlDoc, xmlDtd 3 | 4 | cdef extern from "libxml/schematron.h": 5 | ctypedef struct xmlSchematron 6 | ctypedef struct xmlSchematronParserCtxt 7 | ctypedef struct xmlSchematronValidCtxt 8 | 9 | ctypedef enum xmlSchematronValidOptions: 10 | XML_SCHEMATRON_OUT_QUIET = 1 # quiet no report 11 | XML_SCHEMATRON_OUT_TEXT = 2 # build a textual report 12 | XML_SCHEMATRON_OUT_XML = 4 # output SVRL 13 | XML_SCHEMATRON_OUT_ERROR = 8 # output via 
xmlStructuredErrorFunc 14 | XML_SCHEMATRON_OUT_FILE = 256 # output to a file descriptor 15 | XML_SCHEMATRON_OUT_BUFFER = 512 # output to a buffer 16 | XML_SCHEMATRON_OUT_IO = 1024 # output to I/O mechanism 17 | 18 | cdef xmlSchematronParserCtxt* xmlSchematronNewDocParserCtxt( 19 | xmlDoc* doc) nogil 20 | cdef xmlSchematronParserCtxt* xmlSchematronNewParserCtxt( 21 | char* filename) nogil 22 | cdef xmlSchematronValidCtxt* xmlSchematronNewValidCtxt( 23 | xmlSchematron* schema, int options) nogil 24 | 25 | cdef xmlSchematron* xmlSchematronParse(xmlSchematronParserCtxt* ctxt) nogil 26 | cdef int xmlSchematronValidateDoc(xmlSchematronValidCtxt* ctxt, 27 | xmlDoc* instance) nogil 28 | 29 | cdef void xmlSchematronFreeParserCtxt(xmlSchematronParserCtxt* ctxt) nogil 30 | cdef void xmlSchematronFreeValidCtxt(xmlSchematronValidCtxt* ctxt) nogil 31 | cdef void xmlSchematronFree(xmlSchematron* schema) nogil 32 | cdef void xmlSchematronSetValidStructuredErrors( 33 | xmlSchematronValidCtxt* ctxt, 34 | xmlerror.xmlStructuredErrorFunc error_func, void *data) 35 | -------------------------------------------------------------------------------- /src/lxml/includes/uri.pxd: -------------------------------------------------------------------------------- 1 | cdef extern from "libxml/uri.h": 2 | ctypedef struct xmlURI 3 | 4 | cdef xmlURI* xmlParseURI(char* str) 5 | cdef void xmlFreeURI(xmlURI* uri) 6 | -------------------------------------------------------------------------------- /src/lxml/includes/xinclude.pxd: -------------------------------------------------------------------------------- 1 | from lxml.includes.tree cimport xmlDoc, xmlNode 2 | 3 | cdef extern from "libxml/xinclude.h": 4 | 5 | ctypedef struct xmlXIncludeCtxt 6 | 7 | cdef int xmlXIncludeProcess(xmlDoc* doc) nogil 8 | cdef int xmlXIncludeProcessFlags(xmlDoc* doc, int parser_opts) nogil 9 | cdef int xmlXIncludeProcessTree(xmlNode* doc) nogil 10 | cdef int xmlXIncludeProcessTreeFlags(xmlNode* doc, int parser_opts) nogil 
11 | 12 | cdef xmlXIncludeCtxt* xmlXIncludeNewContext(xmlDoc* doc) nogil 13 | cdef int xmlXIncludeProcessNode(xmlXIncludeCtxt* ctxt, xmlNode* node) nogil 14 | cdef int xmlXIncludeSetFlags(xmlXIncludeCtxt* ctxt, int flags) nogil 15 | 16 | # libxml2 >= 2.6.27 17 | cdef int xmlXIncludeProcessFlagsData( 18 | xmlDoc* doc, int flags, void* data) nogil 19 | -------------------------------------------------------------------------------- /src/lxml/includes/xmlschema.pxd: -------------------------------------------------------------------------------- 1 | from lxml.includes.tree cimport xmlDoc 2 | from lxml.includes.xmlparser cimport xmlSAXHandler 3 | from lxml.includes.xmlerror cimport xmlStructuredErrorFunc 4 | 5 | cdef extern from "libxml/xmlschemas.h": 6 | ctypedef struct xmlSchema 7 | ctypedef struct xmlSchemaParserCtxt 8 | 9 | ctypedef struct xmlSchemaSAXPlugStruct 10 | ctypedef struct xmlSchemaValidCtxt 11 | 12 | ctypedef enum xmlSchemaValidOption: 13 | XML_SCHEMA_VAL_VC_I_CREATE = 1 14 | 15 | cdef xmlSchemaValidCtxt* xmlSchemaNewValidCtxt(xmlSchema* schema) nogil 16 | cdef void xmlSchemaSetParserStructuredErrors(xmlSchemaParserCtxt* ctxt, 17 | xmlStructuredErrorFunc serror, void *ctx) 18 | cdef void xmlSchemaSetValidStructuredErrors(xmlSchemaValidCtxt* ctxt, 19 | xmlStructuredErrorFunc serror, void *ctx) 20 | 21 | cdef int xmlSchemaValidateDoc(xmlSchemaValidCtxt* ctxt, xmlDoc* doc) nogil 22 | cdef xmlSchema* xmlSchemaParse(xmlSchemaParserCtxt* ctxt) nogil 23 | cdef xmlSchemaParserCtxt* xmlSchemaNewParserCtxt(char* URL) nogil 24 | cdef xmlSchemaParserCtxt* xmlSchemaNewDocParserCtxt(xmlDoc* doc) nogil 25 | cdef void xmlSchemaFree(xmlSchema* schema) nogil 26 | cdef void xmlSchemaFreeParserCtxt(xmlSchemaParserCtxt* ctxt) nogil 27 | cdef void xmlSchemaFreeValidCtxt(xmlSchemaValidCtxt* ctxt) nogil 28 | cdef int xmlSchemaSetValidOptions(xmlSchemaValidCtxt* ctxt, 29 | int options) nogil 30 | 31 | cdef xmlSchemaSAXPlugStruct* xmlSchemaSAXPlug(xmlSchemaValidCtxt* ctxt, 32 | 
xmlSAXHandler** sax, 33 | void** data) nogil 34 | cdef int xmlSchemaSAXUnplug(xmlSchemaSAXPlugStruct* sax_plug) 35 | cdef int xmlSchemaIsValid(xmlSchemaValidCtxt* ctxt) 36 | -------------------------------------------------------------------------------- /src/lxml/isoschematron/resources/xsl/RNG2Schtrn.xsl: -------------------------------------------------------------------------------- 1 | 2 | 8 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | -------------------------------------------------------------------------------- /src/lxml/isoschematron/resources/xsl/XSD2Schtrn.xsl: -------------------------------------------------------------------------------- 1 | 2 | 8 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 40 | 41 | 42 | 43 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | -------------------------------------------------------------------------------- /src/lxml/isoschematron/resources/xsl/iso-schematron-xslt1/iso_schematron_message.xsl: -------------------------------------------------------------------------------- 1 | 2 | 25 | 26 | 27 | 28 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 44 | 45 | 46 | 47 | 48 | 49 | ( 51 | / 52 | ) 53 | 54 | 55 | -------------------------------------------------------------------------------- /src/lxml/isoschematron/resources/xsl/iso-schematron-xslt1/readme.txt: -------------------------------------------------------------------------------- 1 | ISO SCHEMATRON 2009 2 | 3 | XSLT implementation by Rick Jelliffe with assistance from 
members of Schematron-love-in maillist. 4 | 5 | 2009-03-18 6 | 7 | Two distributions are available. One is for XSLT1 engines. 8 | The other is for XSLT2 engines, such as SAXON 9. 9 | 10 | 11 | This version of Schematron splits the process into a pipeline of several different XSLT stages. 12 | 13 | 1) First, preprocess your Schematron schema with iso_dsdl_include.xsl. 14 | This is a macro processor to assemble the schema from various parts. 15 | If your schema is not in separate parts, you can skip this stage. 16 | 17 | 2) Second, preprocess the output from stage 1 with iso_abstract_expand.xsl. 18 | This is a macro processor to convert abstract patterns to real patterns. 19 | If your schema does not use abstract patterns, you can skip this 20 | stage. 21 | 22 | 3) Third, compile the Schematron schema into an XSLT script. 23 | This will typically use iso_svrl_for_xslt1.xsl or iso_svrl_for_xslt2.xsl 24 | (which in turn invoke iso_schematron_skeleton_for_xslt1.xsl or iso_schematron_skeleton_for_saxon.xsl). 25 | However, other "meta-stylesheets" are also in common use; the principle of operation is the same. 26 | If your schema uses Schematron phases, supply these as command line/invocation parameters 27 | to this process. 28 | 29 | 4) Fourth, run the script generated by stage 3 against the document being validated. 30 | If you are using the SVRL script, then the output of validation will be an XML document. 31 | If your schema uses Schematron parameters, supply these as command line/invocation parameters 32 | to this process. 33 | 34 | 35 | The XSLT2 distribution also features several next generation features, 36 | such as validating multiple documents. See the source code for details. 37 | 38 | Schematron assertions can be written in any language, of course; the file 39 | sch-messages-en.xhtml contains the diagnostics messages from the XSLT2 skeleton 40 | in English, and this can be used as a template to localize the skeleton's 41 | error messages.
"""
Simple HTTP request dumper for tests in Python 2.5+.
"""

import sys
import threading
from contextlib import contextmanager

try:
    import urlparse
except ImportError:
    # Python 3
    import urllib.parse as urlparse

try:
    from SocketServer import ThreadingMixIn
except ImportError:
    # Python 3
    from socketserver import ThreadingMixIn

import wsgiref.simple_server as wsgiserver


@contextmanager
def webserver(app, port=0, host=None):
    """Context manager entry point for the 'with' statement.

    Serves the WSGI callable ``app`` from a background thread and yields the
    base URL of the server.  Pass 0 as port number to dynamically allocate a
    free port.  ``host`` defaults to ``'127.0.0.1'``.

    Usage:

    with webserver(wsgi_app_function, 8080) as host_url:
        do_ws_calls(host_url)
    """
    server = build_web_server(app, port, host or '127.0.0.1')
    # Read the address back from the socket so a dynamically allocated port
    # (port=0) ends up in the yielded URL.
    host, port = server.socket.getsockname()

    thread = threading.Thread(target=server.serve_forever,
                              kwargs={'poll_interval': 0.5})
    thread.daemon = True
    thread.start()
    try:
        yield 'http://%s:%s/' % (host, port)  # yield control to 'with' body
    finally:
        server.shutdown()
        thread.join()
        # shutdown() only stops the serve loop; the listening socket stays
        # open until server_close() is called.
        server.server_close()


class WebServer(ThreadingMixIn, wsgiserver.WSGIServer):
    """A web server that starts a new thread for each request.
    """
    # The mix-in has to come first in the bases: it overrides
    # process_request(), which the server class would otherwise supply.

    # Request threads must not keep the interpreter (or server_close())
    # waiting once a test is done with the server.
    daemon_threads = True


class _RequestHandler(wsgiserver.WSGIRequestHandler):
    """Request handler that keeps the test output quiet."""

    def get_stderr(self):
        # don't write to stderr
        return sys.stdout

    def log_message(self, format, *args):
        # message = "wsmock(%s) %s" % (self.address_string(), format % args)
        pass  # don't log messages


def build_web_server(app, port, host=None):
    """Create (but do not start) a ``WebServer`` serving ``app``.

    An empty or missing ``host`` is passed on as ``''``.
    """
    server = wsgiserver.make_server(
        host or '', port, app,
        server_class=WebServer,
        handler_class=_RequestHandler)
    return server


class HTTPRequestCollector(object):
    """WSGI application that records requests and sends a canned response.

    Every call appends ``(PATH_INFO, parsed query parameters)`` to
    ``self.requests`` and answers with ``response_code``, ``headers`` and
    ``response_data``.
    """

    def __init__(self, response_data, response_code=200, headers=()):
        self.requests = []
        self.response_code = response_code
        self.response_data = response_data
        self.headers = list(headers or ())

    def __call__(self, environ, start_response):
        self.requests.append((
            environ.get('PATH_INFO'),
            # A missing QUERY_STRING is treated like an empty one.
            urlparse.parse_qsl(environ.get('QUERY_STRING') or '')))
        start_response('%s OK' % self.response_code, self.headers)
        return [self.response_data]
/src/lxml/tests/include/test_xinclude.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | -------------------------------------------------------------------------------- /src/lxml/tests/test-document.xslt: -------------------------------------------------------------------------------- 1 | 2 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | -------------------------------------------------------------------------------- /src/lxml/tests/test-string.xml: -------------------------------------------------------------------------------- 1 | 2 | Søk på nettet 3 | -------------------------------------------------------------------------------- /src/lxml/tests/test.dtd: -------------------------------------------------------------------------------- 1 | 2 | 5 | 6 | 7 | 10 | 11 | 12 | -------------------------------------------------------------------------------- /src/lxml/tests/test.sch: -------------------------------------------------------------------------------- 1 | 2 | 3 | mandatory number_of_entries tests 4 | 5 | [ERROR] number_of_entries () must equal the number of entries/entry elements () 6 | 7 | 8 | 9 | -------------------------------------------------------------------------------- /src/lxml/tests/test.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | -------------------------------------------------------------------------------- /src/lxml/tests/test.xsd: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | -------------------------------------------------------------------------------- /src/lxml/tests/test1.rng: -------------------------------------------------------------------------------- 1 | 3 | 4 | 5 | 6 | 7 | -------------------------------------------------------------------------------- /src/lxml/tests/test1.xslt: -------------------------------------------------------------------------------- 1 | 3 | 4 
| 5 | 6 |

Foo

7 |
8 | 9 |
10 | -------------------------------------------------------------------------------- /src/lxml/tests/test2.rng: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | -------------------------------------------------------------------------------- /src/lxml/tests/test2.xslt: -------------------------------------------------------------------------------- 1 | 3 | 4 | 5 |

hello

6 |
7 | 8 |
9 | -------------------------------------------------------------------------------- /src/lxml/tests/test_broken.xml: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/lxml/tests/test_builder.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import unittest 3 | 4 | """ 5 | Tests that ElementMaker works properly. 6 | """ 7 | 8 | import sys, os.path 9 | from lxml import etree 10 | from lxml.builder import E 11 | 12 | this_dir = os.path.dirname(__file__) 13 | if this_dir not in sys.path: 14 | sys.path.insert(0, this_dir) # needed for Py3 15 | 16 | from common_imports import HelperTestCase, BytesIO, _bytes 17 | 18 | class BuilderTestCase(HelperTestCase): 19 | etree = etree 20 | 21 | def test_build_from_xpath_result(self): 22 | class StringSubclass(str): pass 23 | wrapped = E.b(StringSubclass('Hello')) 24 | self.assertEqual(_bytes('Hello'), etree.tostring(wrapped)) 25 | 26 | def test_unknown_type_raises(self): 27 | class UnknownType(object): 28 | pass 29 | self.assertRaises(TypeError, E.b, UnknownType()) 30 | 31 | 32 | def test_suite(): 33 | suite = unittest.TestSuite() 34 | suite.addTests([unittest.makeSuite(BuilderTestCase)]) 35 | return suite 36 | 37 | if __name__ == '__main__': 38 | print('to test use test.py %s' % __file__) 39 | -------------------------------------------------------------------------------- /src/lxml/tests/test_css.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | import lxml.html 4 | 5 | from lxml.tests.common_imports import doctest, HelperTestCase, skipif 6 | 7 | try: 8 | import cssselect 9 | except ImportError: 10 | cssselect = None 11 | 12 | 13 | HTML = ''' 14 |
15 | link 16 | anchor 17 |
18 | ''' 19 | 20 | 21 | class CSSTestCase(HelperTestCase): 22 | 23 | pytestmark = skipif('cssselect is None') 24 | 25 | def test_cssselect(self): 26 | div, = lxml.html.fromstring(HTML).xpath('//div') 27 | 28 | def count(selector, expected_count, **kwargs): 29 | result = div.cssselect(selector, **kwargs) 30 | self.assertEqual(len(result), expected_count) 31 | 32 | count('div', 1) 33 | count('a', 2) 34 | count('em', 0) 35 | # Element names are case-insensitive in HTML 36 | count('DIV', 1) 37 | # ... but not in XHTML or XML 38 | count('DIV', 0, translator='xhtml') 39 | count('DIV', 0, translator='xml') 40 | 41 | # :contains() is case-insensitive in lxml 42 | count(':contains("link")', 2) # div, a 43 | count(':contains("LInk")', 2) 44 | # Whatever the document language 45 | count(':contains("LInk")', 2, translator='xhtml') 46 | count(':contains("LInk")', 2, translator='xml') 47 | # ... but not in upstream cssselect 48 | import cssselect 49 | count(':contains("link")', 2, translator=cssselect.HTMLTranslator()) 50 | count(':contains("LInk")', 0, translator=cssselect.HTMLTranslator()) 51 | 52 | 53 | def test_suite(): 54 | suite = unittest.TestSuite() 55 | try: 56 | import cssselect 57 | except ImportError: 58 | # no 'cssselect' installed 59 | print("Skipping tests in lxml.cssselect - external cssselect package is not installed") 60 | return suite 61 | 62 | import lxml.cssselect 63 | suite.addTests(doctest.DocTestSuite(lxml.cssselect)) 64 | suite.addTests([unittest.makeSuite(CSSTestCase)]) 65 | return suite 66 | -------------------------------------------------------------------------------- /src/lxml/tests/test_errors.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import unittest, doctest 3 | 4 | # These tests check that error handling in the Pyrex code is 5 | # complete. 6 | # It is likely that if there are errors, instead of failing the code 7 | # will simply crash. 
8 | 9 | import sys, gc, os.path 10 | from lxml import etree 11 | 12 | this_dir = os.path.dirname(__file__) 13 | if this_dir not in sys.path: 14 | sys.path.insert(0, this_dir) # needed for Py3 15 | 16 | from common_imports import HelperTestCase 17 | 18 | class ErrorTestCase(HelperTestCase): 19 | etree = etree 20 | 21 | def test_bad_element(self): 22 | # attrib argument of Element() should be a dictionary, so if 23 | # we pass a string we should get an error. 24 | self.assertRaises(TypeError, self.etree.Element, 'a', 'b') 25 | 26 | def test_empty_parse(self): 27 | self.assertRaises(etree.XMLSyntaxError, etree.fromstring, '') 28 | 29 | def test_element_cyclic_gc_none(self): 30 | # test if cyclic reference can crash etree 31 | Element = self.etree.Element 32 | gc.collect() 33 | 34 | count = sys.getrefcount(None) 35 | 36 | l = [Element('name'), Element('name')] 37 | l.append(l) 38 | 39 | del l 40 | gc.collect() 41 | 42 | self.assertEqual(sys.getrefcount(None), count) 43 | 44 | def test_suite(): 45 | suite = unittest.TestSuite() 46 | suite.addTests([unittest.makeSuite(ErrorTestCase)]) 47 | return suite 48 | 49 | if __name__ == '__main__': 50 | print('to test use test.py %s' % __file__) 51 | -------------------------------------------------------------------------------- /src/lxml/tests/test_http_io.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """ 4 | Web IO test cases that need Python 2.5+ (wsgiref) 5 | """ 6 | 7 | from __future__ import with_statement 8 | 9 | import unittest 10 | import textwrap 11 | import os 12 | import sys 13 | import gzip 14 | 15 | this_dir = os.path.dirname(__file__) 16 | if this_dir not in sys.path: 17 | sys.path.insert(0, this_dir) # needed for Py3 18 | 19 | from .common_imports import ( 20 | etree, HelperTestCase, BytesIO, _bytes) 21 | from .dummy_http_server import webserver, HTTPRequestCollector 22 | 23 | 24 | class HttpIOTestCase(HelperTestCase): 25 | etree = etree 26 
| 27 | def _parse_from_http(self, data, code=200, headers=None, parser=None): 28 | handler = HTTPRequestCollector(data, code, headers) 29 | with webserver(handler) as host_url: 30 | tree = self.etree.parse(host_url + 'TEST', parser=parser) 31 | self.assertEqual([('/TEST', [])], handler.requests) 32 | return tree 33 | 34 | def test_http_client(self): 35 | tree = self._parse_from_http(_bytes('')) 36 | self.assertEqual('root', tree.getroot().tag) 37 | self.assertEqual('a', tree.getroot()[0].tag) 38 | 39 | def test_http_client_404(self): 40 | try: 41 | self._parse_from_http(_bytes(''), code=404) 42 | except IOError: 43 | self.assertTrue(True) 44 | else: 45 | self.assertTrue(False, "expected IOError") 46 | 47 | def test_http_client_gzip(self): 48 | f = BytesIO() 49 | gz = gzip.GzipFile(fileobj=f, mode='w', filename='test.xml') 50 | gz.write(_bytes('')) 51 | gz.close() 52 | data = f.getvalue() 53 | del f, gz 54 | 55 | headers = [('Content-Encoding', 'gzip')] 56 | tree = self._parse_from_http(data, headers=headers) 57 | self.assertEqual('root', tree.getroot().tag) 58 | self.assertEqual('a', tree.getroot()[0].tag) 59 | 60 | def test_parser_input_mix(self): 61 | data = _bytes('') 62 | handler = HTTPRequestCollector(data) 63 | 64 | with webserver(handler) as host_url: 65 | tree = self.etree.parse(host_url) 66 | root = tree.getroot() 67 | self.assertEqual('a', root[0].tag) 68 | 69 | root = self.etree.fromstring(data) 70 | self.assertEqual('a', root[0].tag) 71 | 72 | tree = self.etree.parse(host_url) 73 | root = tree.getroot() 74 | self.assertEqual('a', root[0].tag) 75 | 76 | root = self.etree.fromstring(data) 77 | self.assertEqual('a', root[0].tag) 78 | 79 | root = self.etree.fromstring(data) 80 | self.assertEqual('a', root[0].tag) 81 | 82 | def test_network_dtd(self): 83 | data = [_bytes(textwrap.dedent(s)) for s in [ 84 | # XML file 85 | '''\ 86 | 87 | 88 | &myentity; 89 | ''', 90 | # DTD 91 | '', 92 | ]] 93 | 94 | responses = [] 95 | def handler(environ, start_response): 
96 | start_response('200 OK', []) 97 | return [responses.pop()] 98 | 99 | with webserver(handler) as host_url: 100 | # DTD network loading enabled 101 | responses = data[::-1] 102 | tree = self.etree.parse( 103 | host_url + 'dir/test.xml', 104 | parser=self.etree.XMLParser( 105 | load_dtd=True, no_network=False)) 106 | self.assertFalse(responses) # all read 107 | root = tree.getroot() 108 | self.assertEqual('DEFINED', root.text) 109 | 110 | # DTD network loading disabled 111 | responses = data[::-1] 112 | try: 113 | self.etree.parse( 114 | host_url + 'dir/test.xml', 115 | parser=self.etree.XMLParser( 116 | load_dtd=True, no_network=True)) 117 | except self.etree.XMLSyntaxError: 118 | self.assertTrue("myentity" in str(sys.exc_info()[1])) 119 | else: 120 | self.assertTrue(False) 121 | self.assertEqual(1, len(responses)) # DTD not read 122 | 123 | 124 | def test_suite(): 125 | suite = unittest.TestSuite() 126 | suite.addTests([unittest.makeSuite(HttpIOTestCase)]) 127 | return suite 128 | 129 | 130 | if __name__ == '__main__': 131 | print('to test use test.py %s' % __file__) 132 | -------------------------------------------------------------------------------- /src/lxml/tests/test_import.xsd: -------------------------------------------------------------------------------- 1 | 5 | 8 | 9 | 10 | 11 | -------------------------------------------------------------------------------- /src/lxml/tests/test_inc.xsd: -------------------------------------------------------------------------------- 1 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | -------------------------------------------------------------------------------- /src/lxml/tests/test_schematron.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """ 4 | Test cases related to Schematron parsing and validation 5 | """ 6 | 7 | import unittest, sys, os.path 8 | 9 | this_dir = os.path.dirname(__file__) 10 | if this_dir not in sys.path: 11 | sys.path.insert(0, 
this_dir) # needed for Py3 12 | 13 | from common_imports import etree, HelperTestCase, fileInTestDir 14 | from common_imports import doctest, make_doctest 15 | 16 | class ETreeSchematronTestCase(HelperTestCase): 17 | def test_schematron(self): 18 | tree_valid = self.parse('') 19 | tree_invalid = self.parse('') 20 | schema = self.parse('''\ 21 | 22 | 23 | 24 | BBB element is not present 25 | CCC element is not present 26 | 27 | 28 | 29 | 30 | BBB element is not present 31 | CCC element is not present 32 | There is an extra element 33 | 34 | 35 | 36 | ''') 37 | schema = etree.Schematron(schema) 38 | self.assertTrue(schema.validate(tree_valid)) 39 | self.assertTrue(not schema.validate(tree_invalid)) 40 | 41 | def test_schematron_elementtree_error(self): 42 | self.assertRaises(ValueError, etree.Schematron, etree.ElementTree()) 43 | 44 | def test_schematron_invalid_schema(self): 45 | schema = self.parse('''\ 46 | 47 | 48 | 49 | 50 | ''') 51 | self.assertRaises(etree.SchematronParseError, 52 | etree.Schematron, schema) 53 | 54 | def test_schematron_invalid_schema_empty(self): 55 | schema = self.parse('''\ 56 | 57 | ''') 58 | self.assertRaises(etree.SchematronParseError, 59 | etree.Schematron, schema) 60 | 61 | def test_schematron_invalid_schema_namespace(self): 62 | # segfault 63 | schema = self.parse('''\ 64 | 65 | ''') 66 | self.assertRaises(etree.SchematronParseError, 67 | etree.Schematron, schema) 68 | 69 | 70 | def test_suite(): 71 | suite = unittest.TestSuite() 72 | suite.addTests([unittest.makeSuite(ETreeSchematronTestCase)]) 73 | suite.addTests( 74 | [make_doctest('../../../doc/validation.txt')]) 75 | return suite 76 | 77 | if __name__ == '__main__': 78 | print('to test use test.py %s' % __file__) 79 | -------------------------------------------------------------------------------- /src/lxml/tests/test_unicode.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import unittest, doctest, sys, os.path 
3 | 4 | this_dir = os.path.dirname(__file__) 5 | if this_dir not in sys.path: 6 | sys.path.insert(0, this_dir) # needed for Py3 7 | 8 | from common_imports import StringIO, etree, SillyFileLike, HelperTestCase 9 | from common_imports import _str, _bytes 10 | 11 | try: 12 | unicode 13 | except NameError: 14 | unicode = str 15 | 16 | ascii_uni = _str('a') 17 | 18 | klingon = _bytes("\\uF8D2").decode("unicode_escape") # not valid for XML names 19 | 20 | invalid_tag = _str("test") + klingon 21 | 22 | uni = _bytes('\\xc3\\u0680\\u3120').decode("unicode_escape") # some non-ASCII characters 23 | 24 | uxml = _bytes("test \\xc3\\xa1\\u3120

page \\xc3\\xa1\\u3120 title

" 25 | ).decode("unicode_escape") 26 | 27 | class UnicodeTestCase(HelperTestCase): 28 | def test_unicode_xml(self): 29 | tree = etree.XML(_str('

%s

') % uni) 30 | self.assertEqual(uni, tree.text) 31 | 32 | def test_unicode_xml_broken(self): 33 | uxml = _str('') + \ 34 | _str('

%s

') % uni 35 | self.assertRaises(ValueError, etree.XML, uxml) 36 | 37 | def test_unicode_tag(self): 38 | el = etree.Element(uni) 39 | self.assertEqual(uni, el.tag) 40 | 41 | def test_unicode_tag_invalid(self): 42 | # sadly, Klingon is not well-formed 43 | self.assertRaises(ValueError, etree.Element, invalid_tag) 44 | 45 | def test_unicode_nstag(self): 46 | tag = _str("{http://abc/}%s") % uni 47 | el = etree.Element(tag) 48 | self.assertEqual(tag, el.tag) 49 | 50 | def test_unicode_ns_invalid(self): 51 | # namespace URIs must conform to RFC 3986 52 | tag = _str("{http://%s/}abc") % uni 53 | self.assertRaises(ValueError, etree.Element, tag) 54 | 55 | def test_unicode_nstag_invalid(self): 56 | # sadly, Klingon is not well-formed 57 | tag = _str("{http://abc/}%s") % invalid_tag 58 | self.assertRaises(ValueError, etree.Element, tag) 59 | 60 | def test_unicode_qname(self): 61 | qname = etree.QName(uni, uni) 62 | tag = _str("{%s}%s") % (uni, uni) 63 | self.assertEqual(qname.text, tag) 64 | self.assertEqual(unicode(qname), tag) 65 | 66 | def test_unicode_qname_invalid(self): 67 | self.assertRaises(ValueError, etree.QName, invalid_tag) 68 | 69 | def test_unicode_attr(self): 70 | el = etree.Element('foo', {'bar': uni}) 71 | self.assertEqual(uni, el.attrib['bar']) 72 | 73 | def test_unicode_comment(self): 74 | el = etree.Comment(uni) 75 | self.assertEqual(uni, el.text) 76 | 77 | def test_unicode_parse_stringio(self): 78 | el = etree.parse(StringIO(_str('

%s

') % uni)).getroot() 79 | self.assertEqual(uni, el.text) 80 | 81 | ## def test_parse_fileobject_unicode(self): 82 | ## # parse unicode from unnamed file object (not supported by ElementTree) 83 | ## f = SillyFileLike(uxml) 84 | ## root = etree.parse(f).getroot() 85 | ## self.assertEqual(unicode(etree.tostring(root, 'UTF-8'), 'UTF-8'), 86 | ## uxml) 87 | 88 | def test_suite(): 89 | suite = unittest.TestSuite() 90 | suite.addTests([unittest.makeSuite(UnicodeTestCase)]) 91 | return suite 92 | -------------------------------------------------------------------------------- /src/lxml/usedoctest.py: -------------------------------------------------------------------------------- 1 | """Doctest module for XML comparison. 2 | 3 | Usage:: 4 | 5 | >>> import lxml.usedoctest 6 | >>> # now do your XML doctests ... 7 | 8 | See `lxml.doctestcompare` 9 | """ 10 | 11 | from lxml import doctestcompare 12 | 13 | doctestcompare.temp_install(del_module=__name__) 14 | -------------------------------------------------------------------------------- /src/lxml/xinclude.pxi: -------------------------------------------------------------------------------- 1 | # XInclude processing 2 | 3 | from lxml.includes cimport xinclude 4 | 5 | class XIncludeError(LxmlError): 6 | u"""Error during XInclude processing. 7 | """ 8 | pass 9 | 10 | cdef class XInclude: 11 | u"""XInclude(self) 12 | XInclude processor. 13 | 14 | Create an instance and call it on an Element to run XInclude 15 | processing. 16 | """ 17 | cdef _ErrorLog _error_log 18 | def __init__(self): 19 | self._error_log = _ErrorLog() 20 | 21 | property error_log: 22 | def __get__(self): 23 | assert self._error_log is not None, "XInclude instance not initialised" 24 | return self._error_log.copy() 25 | 26 | def __call__(self, _Element node not None): 27 | u"__call__(self, node)" 28 | # We cannot pass the XML_PARSE_NOXINCNODE option as this would free 29 | # the XInclude nodes - there may still be Python references to them! 
30 | # Therefore, we allow XInclude nodes to be converted to 31 | # XML_XINCLUDE_START nodes. XML_XINCLUDE_END nodes are added as 32 | # siblings. Tree traversal will simply ignore them as they are not 33 | # typed as elements. The included fragment is added between the two, 34 | # i.e. as a sibling, which does not conflict with traversal. 35 | cdef int result 36 | _assertValidNode(node) 37 | assert self._error_log is not None, "XPath evaluator not initialised" 38 | self._error_log.connect() 39 | __GLOBAL_PARSER_CONTEXT.pushImpliedContextFromParser( 40 | node._doc._parser) 41 | with nogil: 42 | if node._doc._parser is not None: 43 | result = xinclude.xmlXIncludeProcessTreeFlags( 44 | node._c_node, node._doc._parser._parse_options) 45 | else: 46 | result = xinclude.xmlXIncludeProcessTree(node._c_node) 47 | __GLOBAL_PARSER_CONTEXT.popImpliedContext() 48 | self._error_log.disconnect() 49 | 50 | if result == -1: 51 | raise XIncludeError( 52 | self._error_log._buildExceptionMessage( 53 | u"XInclude processing failed"), 54 | self._error_log) 55 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | # Tox (http://tox.testrun.org/) is a tool for running tests 2 | # in multiple virtualenvs. This configuration file will run the 3 | # test suite on all supported python versions. To use it, "pip install tox" 4 | # and then run "tox" from this directory. 5 | 6 | [tox] 7 | envlist = py24, py25, py26, py27, py30, py31, py32, py33 8 | 9 | [testenv] 10 | commands = 11 | {envpython} setup.py clean 12 | {envpython} setup.py build_ext --inplace 13 | make test 14 | deps = 15 | Cython>=0.17.2 16 | -------------------------------------------------------------------------------- /version.txt: -------------------------------------------------------------------------------- 1 | 3.2.1 2 | --------------------------------------------------------------------------------