├── .gitignore ├── .travis.yml ├── CHANGELOG ├── LICENSE ├── MANIFEST ├── README.md ├── TODO.md ├── docs ├── Makefile ├── conf.py ├── index.rst ├── make.bat └── preview_docs.sh ├── examples ├── engadget.let.json ├── engadget_css.let.json └── engadget_xpath.let.json ├── parslepy ├── __init__.py ├── base.py ├── funcs.py ├── selectors.py └── utils │ ├── README.md │ ├── __init__.py │ └── scrapytools.py ├── requirements-extra.txt ├── requirements.txt ├── run_parslepy.py ├── setup.cfg ├── setup.py ├── tests ├── __init__.py ├── data │ ├── creativecommons.org__licenses__by__3.0.html │ ├── itunes.topalbums.rss │ ├── parselet.json │ ├── parselet.yml │ └── validator.w3.org.html ├── requirements.txt ├── test_parslepy_compile.py ├── test_parslepy_extensions.py ├── test_parslepy_extraction.py ├── test_parslepy_init.py ├── test_parslepy_parse.py ├── test_parslepy_parselets.py ├── test_parslepy_selector.py └── tools.py └── tox.ini /.gitignore: -------------------------------------------------------------------------------- 1 | *.py[cod] 2 | 3 | # C extensions 4 | *.so 5 | 6 | # Packages 7 | *.egg 8 | *.egg-info 9 | dist 10 | build 11 | eggs 12 | parts 13 | bin 14 | var 15 | sdist 16 | develop-eggs 17 | .installed.cfg 18 | lib 19 | lib64 20 | 21 | # Installer logs 22 | pip-log.txt 23 | 24 | # Unit test / coverage reports 25 | .coverage 26 | .tox 27 | nosetests.xml 28 | 29 | # Translations 30 | *.mo 31 | 32 | # Mr Developer 33 | .mr.developer.cfg 34 | .project 35 | .pydevproject 36 | 37 | # docs 38 | docs/_build 39 | docs/_static 40 | docs/_templates 41 | 42 | 43 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | python: 3 | - "2.7" 4 | - "3.5" 5 | - "3.6" 6 | install: 7 | - pip install -U tox tox-travis twine wheel 8 | script: tox 9 | 10 | -------------------------------------------------------------------------------- /CHANGELOG: -------------------------------------------------------------------------------- 1 | Changelog 2 | ========= 3 | 4 | Version 0.3.0 - March 3., 2015 5 | ---------------------------------- 6 | 7 | * Improvements: 8 | * CSS selectors extensions: 9 | * pseudo-elements ``::text`` (borrowed from Scrapy) and ``::comment`` 10 | * functional pseudo-element ``::attr(name)`` 11 | * Cleaned up documentation (thanks @eliasdorneles!) 
12 | * New ``keys()`` method for ``Parselet`` nodes 13 | 14 | Version 0.2.0 - August 5., 2013 15 | --------------------------------- 16 | 17 | * Improvements: 18 | * Support XPath namespace prefixes (``namespace:element``) and CSS namespace 19 | prefixes (``namespace|element``) in DefaultSelectorHandler 20 | * new built-in extension function ``parslepy:strip()`` mapped 21 | to Python's strip() for strings 22 | * new built-in extension function ``parslepy:attrname()`` 23 | that takes an attribute selector and returns the attribute's name 24 | * support for user-defined extension functions, which take an additional 25 | context parameter when called (context is passed either at selector 26 | handler instantiation or when calling Parselet.parse()) 27 | * use ``smart_strings=False`` for XPath compiled expressions, 28 | except for user-defined extensions and some built-in extensions 29 | (see http://lxml.de/xpathxslt.html#xpath-return-values) 30 | 31 | * Bug fixes: 32 | * #2: XPath namespace prefixes raise cssselect.xpath.ExpressionError 33 | with DefaultSelectorHandler 34 | * #3: Docs suggest using \*.js files when they are JSON documents 35 | * #4: The example usage should not have both url_css and url_xpath 36 | * #5: In example usage, skip lines between "configuration" and "execution" 37 | * #6: add underscore to _version__ 38 | * #7: Empty result set on boolean or numerical selectors 39 | 40 | Version 0.1.2 - July 9, 2013 41 | ----------------------------- 42 | 43 | * Bug fixes: 44 | * #1: headingxpath rule does not seem to work as expected 45 | 46 | Version 0.1.1 - July 3, 2013 47 | ----------------------------- 48 | 49 | * Docstrings added to main classes and methods. 50 | * Added parse_fromstring() method to Parselet 51 | * Added tests for Parselet.parse() and Parselet.parse_fromstring() 52 | 53 | Version 0.1 - June 30, 2013 54 | --------------------------- 55 | 56 | Initial release 57 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2013 Paul Tremberth, Newlynn Labs 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy 4 | of this software and associated documentation files (the "Software"), to deal 5 | in the Software without restriction, including without limitation the rights 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | copies of the Software, and to permit persons to whom the Software is 8 | furnished to do so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in 11 | all copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 19 | THE SOFTWARE. 
20 | -------------------------------------------------------------------------------- /MANIFEST: -------------------------------------------------------------------------------- 1 | # file GENERATED by distutils, do NOT edit 2 | README 3 | setup.cfg 4 | setup.py 5 | parslepy/__init__.py 6 | parslepy/base.py 7 | parslepy/funcs.py 8 | parslepy/selectors.py 9 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | parslepy 2 | ======== 3 | 4 | [![Build Status](https://travis-ci.org/redapple/parslepy.png?branch=master)](https://travis-ci.org/redapple/parslepy) 5 | 6 | *parslepy* (pronounced *"parsley-pie"*, */ˈpɑːslipaɪ/*) is a Python implementation 7 | (built on top of [lxml](http://lxml.de) and [cssselect](https://github.com/SimonSapin/cssselect)) of the 8 | [Parsley DSL](https://github.com/fizx/parsley) 9 | for extracting structured data from web pages, as defined by Kyle Maxwell and Andrew Cantino 10 | (see [Parsley's wiki](https://github.com/fizx/parsley/wiki) for more details and original C implementation). 11 | 12 | Kudos to Kyle Maxwell (@fizx) for coming up with this smart and easy syntax to define extracting rules. 13 | 14 | > Please note that this *Parsley DSL* is **NOT** the same as the Parsley parsing library at https://pypi.python.org/pypi/Parsley 15 | 16 | Check out the [official docs](http://pythonhosted.org/parslepy) for more information on how to install 17 | and use *parslepy*. There is also some useful information at the [parslepy Wiki](https://github.com/redapple/parslepy/wiki) 18 | 19 | Here is an example of a parselet script that extracts the questions from StackOverflow first page: 20 | 21 | { 22 | "first_page_questions(//div[contains(@class,'question-summary')])": [{ 23 | "title": ".//h3/a", 24 | "tags": "div.tags", 25 | "votes": "div.votes div.mini-counts", 26 | "views": "div.views div.mini-counts", 27 | "answers": "div.status div.mini-counts" 28 | }] 29 | } 30 | 31 | ### Install 32 | 33 | Install via pip with: 34 | 35 | sudo pip install parslepy 36 | 37 | Alternatively, you can install from the latest source code: 38 | 39 | git clone https://github.com/redapple/parslepy.git 40 | sudo python setup.py install 41 | 42 | 43 | ### Online Resources ### 44 | 45 | * [Official Documentation](http://pythonhosted.org/parslepy) 46 | * [Wiki with examples and tutorials](https://github.com/redapple/parslepy/wiki) 47 | * [Parsley DSL](https://github.com/fizx/parsley) 48 | * [JSON Structure details -- Parsley wiki](https://github.com/fizx/parsley/wiki/JSON-Structure) 49 | * [Example Scrapy Spider using Parsley](http://snipplr.com/view/67016/parsley-spider/) 50 | * [Parsley DSL on Hacker News](https://news.ycombinator.com/item?id=1585301) 51 | 52 | -------------------------------------------------------------------------------- /TODO.md: -------------------------------------------------------------------------------- 1 | # TODO 2 | 3 | * add more tests 4 | * support XPath functions with CSS selectors 5 | * ~~support optionality operator ("?")~~ 6 | * support complete arrays with the "!" 
operator (https://github.com/fizx/parsley/wiki/JSON-Structure#requiring-complete-arrays-with-the--operator) 7 | * support bucketed arrays (https://github.com/fizx/parsley/wiki/JSON-Structure#bucketed-arrays); 8 | see https://github.com/redapple/parslepy/wiki/Implementing-bucketed-arrays-(work-in-progess) 9 | * investigate PyParsley API 10 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | PAPER = 8 | BUILDDIR = _build 9 | 10 | # Internal variables. 11 | PAPEROPT_a4 = -D latex_paper_size=a4 12 | PAPEROPT_letter = -D latex_paper_size=letter 13 | ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 14 | # the i18n builder cannot share the environment and doctrees with the others 15 | I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 16 | 17 | .PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest gettext 18 | 19 | help: 20 | @echo "Please use \`make ' where is one of" 21 | @echo " html to make standalone HTML files" 22 | @echo " dirhtml to make HTML files named index.html in directories" 23 | @echo " singlehtml to make a single large HTML file" 24 | @echo " pickle to make pickle files" 25 | @echo " json to make JSON files" 26 | @echo " htmlhelp to make HTML files and a HTML help project" 27 | @echo " qthelp to make HTML files and a qthelp project" 28 | @echo " devhelp to make HTML files and a Devhelp project" 29 | @echo " epub to make an epub" 30 | @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" 31 | @echo " latexpdf to make LaTeX files and run them through pdflatex" 32 | @echo " text to make text files" 33 | @echo " man to make manual pages" 34 | @echo " texinfo to make Texinfo files" 35 | @echo " info to make Texinfo files and run them through makeinfo" 36 | @echo " gettext to make PO message catalogs" 37 | @echo " changes to make an overview of all changed/added/deprecated items" 38 | @echo " linkcheck to check all external links for integrity" 39 | @echo " doctest to run all doctests embedded in the documentation (if enabled)" 40 | 41 | clean: 42 | -rm -rf $(BUILDDIR)/* 43 | 44 | html: 45 | $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html 46 | @echo 47 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." 48 | 49 | dirhtml: 50 | $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml 51 | @echo 52 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." 53 | 54 | singlehtml: 55 | $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml 56 | @echo 57 | @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml." 58 | 59 | pickle: 60 | $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle 61 | @echo 62 | @echo "Build finished; now you can process the pickle files." 63 | 64 | json: 65 | $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json 66 | @echo 67 | @echo "Build finished; now you can process the JSON files." 68 | 69 | htmlhelp: 70 | $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp 71 | @echo 72 | @echo "Build finished; now you can run HTML Help Workshop with the" \ 73 | ".hhp project file in $(BUILDDIR)/htmlhelp." 
74 | 75 | qthelp: 76 | $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp 77 | @echo 78 | @echo "Build finished; now you can run "qcollectiongenerator" with the" \ 79 | ".qhcp project file in $(BUILDDIR)/qthelp, like this:" 80 | @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/parslepy.qhcp" 81 | @echo "To view the help file:" 82 | @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/parslepy.qhc" 83 | 84 | devhelp: 85 | $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp 86 | @echo 87 | @echo "Build finished." 88 | @echo "To view the help file:" 89 | @echo "# mkdir -p $$HOME/.local/share/devhelp/parslepy" 90 | @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/parslepy" 91 | @echo "# devhelp" 92 | 93 | epub: 94 | $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub 95 | @echo 96 | @echo "Build finished. The epub file is in $(BUILDDIR)/epub." 97 | 98 | latex: 99 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 100 | @echo 101 | @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." 102 | @echo "Run \`make' in that directory to run these through (pdf)latex" \ 103 | "(use \`make latexpdf' here to do that automatically)." 104 | 105 | latexpdf: 106 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 107 | @echo "Running LaTeX files through pdflatex..." 108 | $(MAKE) -C $(BUILDDIR)/latex all-pdf 109 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 110 | 111 | text: 112 | $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text 113 | @echo 114 | @echo "Build finished. The text files are in $(BUILDDIR)/text." 115 | 116 | man: 117 | $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man 118 | @echo 119 | @echo "Build finished. The manual pages are in $(BUILDDIR)/man." 120 | 121 | texinfo: 122 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 123 | @echo 124 | @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo." 125 | @echo "Run \`make' in that directory to run these through makeinfo" \ 126 | "(use \`make info' here to do that automatically)." 127 | 128 | info: 129 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 130 | @echo "Running Texinfo files through makeinfo..." 131 | make -C $(BUILDDIR)/texinfo info 132 | @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo." 133 | 134 | gettext: 135 | $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale 136 | @echo 137 | @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale." 138 | 139 | changes: 140 | $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes 141 | @echo 142 | @echo "The overview file is in $(BUILDDIR)/changes." 143 | 144 | linkcheck: 145 | $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck 146 | @echo 147 | @echo "Link check complete; look for any errors in the above output " \ 148 | "or in $(BUILDDIR)/linkcheck/output.txt." 149 | 150 | doctest: 151 | $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest 152 | @echo "Testing of doctests in the sources finished, look at the " \ 153 | "results in $(BUILDDIR)/doctest/output.txt." 154 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # parslepy documentation build configuration file, created by 4 | # sphinx-quickstart on Mon Jul 1 15:20:50 2013. 
5 | # 6 | # This file is execfile()d with the current directory set to its containing dir. 7 | # 8 | # Note that not all possible configuration values are present in this 9 | # autogenerated file. 10 | # 11 | # All configuration values have a default; values that are commented out 12 | # serve to show the default. 13 | 14 | import sys, os 15 | 16 | # If extensions (or modules to document with autodoc) are in another directory, 17 | # add these directories to sys.path here. If the directory is relative to the 18 | # documentation root, use os.path.abspath to make it absolute, like shown here. 19 | sys.path.insert(0, os.path.abspath('../parslepy')) 20 | sys.path.insert(0, os.path.abspath('../')) 21 | 22 | # -- General configuration ----------------------------------------------------- 23 | 24 | # If your documentation needs a minimal Sphinx version, state it here. 25 | #needs_sphinx = '1.0' 26 | 27 | # Add any Sphinx extension module names here, as strings. They can be extensions 28 | # coming with Sphinx (named 'sphinx.ext.*') or your custom ones. 29 | extensions = ['sphinx.ext.autodoc', 'sphinx.ext.coverage', 'sphinx.ext.viewcode'] 30 | 31 | # Add any paths that contain templates here, relative to this directory. 32 | templates_path = ['_templates'] 33 | 34 | # The suffix of source filenames. 35 | source_suffix = '.rst' 36 | 37 | # The encoding of source files. 38 | #source_encoding = 'utf-8-sig' 39 | 40 | # The master toctree document. 41 | master_doc = 'index' 42 | 43 | # General information about the project. 44 | project = u'parslepy' 45 | copyright = u'2013, Paul Tremberth' 46 | 47 | # The version info for the project you're documenting, acts as replacement for 48 | # |version| and |release|, also used in various other places throughout the 49 | # built documents. 50 | # 51 | # The short X.Y version. 52 | version = '0.3' 53 | # The full version, including alpha/beta/rc tags. 54 | release = '0.3.0' 55 | 56 | # The language for content autogenerated by Sphinx. Refer to documentation 57 | # for a list of supported languages. 58 | #language = None 59 | 60 | # There are two options for replacing |today|: either, you set today to some 61 | # non-false value, then it is used: 62 | #today = '' 63 | # Else, today_fmt is used as the format for a strftime call. 64 | #today_fmt = '%B %d, %Y' 65 | 66 | # List of patterns, relative to source directory, that match files and 67 | # directories to ignore when looking for source files. 68 | exclude_patterns = ['_build'] 69 | 70 | # The reST default role (used for this markup: `text`) to use for all documents. 71 | #default_role = None 72 | 73 | # If true, '()' will be appended to :func: etc. cross-reference text. 74 | #add_function_parentheses = True 75 | 76 | # If true, the current module name will be prepended to all description 77 | # unit titles (such as .. function::). 78 | #add_module_names = True 79 | 80 | # If true, sectionauthor and moduleauthor directives will be shown in the 81 | # output. They are ignored by default. 82 | #show_authors = False 83 | 84 | # The name of the Pygments (syntax highlighting) style to use. 85 | pygments_style = 'sphinx' 86 | 87 | # A list of ignored prefixes for module index sorting. 88 | #modindex_common_prefix = [] 89 | 90 | autoclass_content = 'both' 91 | 92 | # -- Options for HTML output --------------------------------------------------- 93 | 94 | # The theme to use for HTML and HTML Help pages. See the documentation for 95 | # a list of builtin themes. 
96 | html_theme = 'default' 97 | 98 | # Theme options are theme-specific and customize the look and feel of a theme 99 | # further. For a list of options available for each theme, see the 100 | # documentation. 101 | #html_theme_options = {} 102 | 103 | # Add any paths that contain custom themes here, relative to this directory. 104 | #html_theme_path = [] 105 | 106 | # The name for this set of Sphinx documents. If None, it defaults to 107 | # " v documentation". 108 | #html_title = None 109 | 110 | # A shorter title for the navigation bar. Default is the same as html_title. 111 | #html_short_title = None 112 | 113 | # The name of an image file (relative to this directory) to place at the top 114 | # of the sidebar. 115 | #html_logo = None 116 | 117 | # The name of an image file (within the static path) to use as favicon of the 118 | # docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 119 | # pixels large. 120 | #html_favicon = None 121 | 122 | # Add any paths that contain custom static files (such as style sheets) here, 123 | # relative to this directory. They are copied after the builtin static files, 124 | # so a file named "default.css" will overwrite the builtin "default.css". 125 | html_static_path = ['_static'] 126 | 127 | # If not '', a 'Last updated on:' timestamp is inserted at every page bottom, 128 | # using the given strftime format. 129 | #html_last_updated_fmt = '%b %d, %Y' 130 | 131 | # If true, SmartyPants will be used to convert quotes and dashes to 132 | # typographically correct entities. 133 | #html_use_smartypants = True 134 | 135 | # Custom sidebar templates, maps document names to template names. 136 | #html_sidebars = {} 137 | 138 | # Additional templates that should be rendered to pages, maps page names to 139 | # template names. 140 | #html_additional_pages = {} 141 | 142 | # If false, no module index is generated. 143 | #html_domain_indices = True 144 | 145 | # If false, no index is generated. 146 | #html_use_index = True 147 | 148 | # If true, the index is split into individual pages for each letter. 149 | #html_split_index = False 150 | 151 | # If true, links to the reST sources are added to the pages. 152 | #html_show_sourcelink = True 153 | 154 | # If true, "Created using Sphinx" is shown in the HTML footer. Default is True. 155 | #html_show_sphinx = True 156 | 157 | # If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. 158 | #html_show_copyright = True 159 | 160 | # If true, an OpenSearch description file will be output, and all pages will 161 | # contain a tag referring to it. The value of this option must be the 162 | # base URL from which the finished HTML is served. 163 | #html_use_opensearch = '' 164 | 165 | # This is the file name suffix for HTML files (e.g. ".xhtml"). 166 | #html_file_suffix = None 167 | 168 | # Output file base name for HTML help builder. 169 | htmlhelp_basename = 'parslepydoc' 170 | 171 | 172 | # -- Options for LaTeX output -------------------------------------------------- 173 | 174 | latex_elements = { 175 | # The paper size ('letterpaper' or 'a4paper'). 176 | #'papersize': 'letterpaper', 177 | 178 | # The font size ('10pt', '11pt' or '12pt'). 179 | #'pointsize': '10pt', 180 | 181 | # Additional stuff for the LaTeX preamble. 182 | #'preamble': '', 183 | } 184 | 185 | # Grouping the document tree into LaTeX files. List of tuples 186 | # (source start file, target name, title, author, documentclass [howto/manual]). 
187 | latex_documents = [ 188 | ('index', 'parslepy.tex', u'parslepy Documentation', 189 | u'Paul Tremberth', 'manual'), 190 | ] 191 | 192 | # The name of an image file (relative to this directory) to place at the top of 193 | # the title page. 194 | #latex_logo = None 195 | 196 | # For "manual" documents, if this is true, then toplevel headings are parts, 197 | # not chapters. 198 | #latex_use_parts = False 199 | 200 | # If true, show page references after internal links. 201 | #latex_show_pagerefs = False 202 | 203 | # If true, show URL addresses after external links. 204 | #latex_show_urls = False 205 | 206 | # Documents to append as an appendix to all manuals. 207 | #latex_appendices = [] 208 | 209 | # If false, no module index is generated. 210 | #latex_domain_indices = True 211 | 212 | 213 | # -- Options for manual page output -------------------------------------------- 214 | 215 | # One entry per manual page. List of tuples 216 | # (source start file, name, description, authors, manual section). 217 | man_pages = [ 218 | ('index', 'parslepy', u'parslepy Documentation', 219 | [u'Paul Tremberth'], 1) 220 | ] 221 | 222 | # If true, show URL addresses after external links. 223 | #man_show_urls = False 224 | 225 | 226 | # -- Options for Texinfo output ------------------------------------------------ 227 | 228 | # Grouping the document tree into Texinfo files. List of tuples 229 | # (source start file, target name, title, author, 230 | # dir menu entry, description, category) 231 | texinfo_documents = [ 232 | ('index', 'parslepy', u'parslepy Documentation', 233 | u'Paul Tremberth', 'parslepy', 'One line description of project.', 234 | 'Miscellaneous'), 235 | ] 236 | 237 | # Documents to append as an appendix to all manuals. 238 | #texinfo_appendices = [] 239 | 240 | # If false, no module index is generated. 241 | #texinfo_domain_indices = True 242 | 243 | # How to display URL addresses: 'footnote', 'no', or 'inline'. 244 | #texinfo_show_urls = 'footnote' 245 | 246 | 247 | # -- Options for Epub output --------------------------------------------------- 248 | 249 | # Bibliographic Dublin Core info. 250 | epub_title = u'parslepy' 251 | epub_author = u'Paul Tremberth' 252 | epub_publisher = u'Paul Tremberth' 253 | epub_copyright = u'2013, Paul Tremberth' 254 | 255 | # The language of the text. It defaults to the language option 256 | # or en if the language is not set. 257 | #epub_language = '' 258 | 259 | # The scheme of the identifier. Typical schemes are ISBN or URL. 260 | #epub_scheme = '' 261 | 262 | # The unique identifier of the text. This can be a ISBN number 263 | # or the project homepage. 264 | #epub_identifier = '' 265 | 266 | # A unique identification for the text. 267 | #epub_uid = '' 268 | 269 | # A tuple containing the cover image and cover page html template filenames. 270 | #epub_cover = () 271 | 272 | # HTML files that should be inserted before the pages created by sphinx. 273 | # The format is a list of tuples containing the path and title. 274 | #epub_pre_files = [] 275 | 276 | # HTML files shat should be inserted after the pages created by sphinx. 277 | # The format is a list of tuples containing the path and title. 278 | #epub_post_files = [] 279 | 280 | # A list of files that should not be packed into the epub file. 281 | #epub_exclude_files = [] 282 | 283 | # The depth of the table of contents in toc.ncx. 284 | #epub_tocdepth = 3 285 | 286 | # Allow duplicate toc entries. 
287 | #epub_tocdup = True 288 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | .. parslepy documentation master file, created by 2 | sphinx-quickstart on Mon Jul 1 15:20:50 2013. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | parslepy -- Documentation 7 | ========================= 8 | 9 | Introduction 10 | ------------ 11 | 12 | *parslepy* lets you extract content from HTML and XML documents 13 | **using rules defined in a JSON object** (or a Python :class:`dict`). 14 | The object keys mean the names you want to assign for the data in each 15 | document section and the values are CSS selectors or XPath expressions 16 | that will match the document parts (elements or attributes). 17 | 18 | Here is an example for extracting questions in StackOverflow first page:: 19 | 20 | { 21 | "first_page_questions(//div[contains(@class,'question-summary')])": [{ 22 | "title": ".//h3/a", 23 | "tags": "div.tags", 24 | "votes": "div.votes div.mini-counts", 25 | "views": "div.views div.mini-counts", 26 | "answers": "div.status div.mini-counts" 27 | }] 28 | } 29 | 30 | Some details 31 | ^^^^^^^^^^^^ 32 | 33 | *parslepy* is a Python implementation (built on top of `lxml`_ and `cssselect`_) 34 | of the `Parsley DSL`_ for extraction content from structured documents, 35 | defined by Kyle Maxwell and Andrew Cantino 36 | (see the `parsley wiki`_ for more details and original C implementation). 37 | 38 | The default behavior for the selectors is: 39 | 40 | * selectors for elements will output their matching textual content (children elements' content is also included) 41 | * selectors matching element attributes will output the attribute's value 42 | 43 | So, if you use ``//h1/a`` in a selector, *parslepy* will extract the text inside of the ``a`` element 44 | and its children, and if you use ``//h1/a/@href`` it will extract the value for ``href``, i.e., 45 | the address the link is pointing to. 46 | 47 | 48 | You can also nest objects, generate lists of objects, and mix CSS and XPath 49 | -- although not in the same selector. 50 | 51 | *parslepy* understands what `lxml`_ and `cssselect`_ understand, 52 | which is roughly `CSS3 Selectors`_ and `XPath 1.0`_. 53 | 54 | 55 | .. _CSS3 Selectors: http://www.w3.org/TR/2011/REC-css3-selectors-20110929/ 56 | .. _XPath 1.0: http://www.w3.org/TR/xpath/ 57 | .. _lxml: http://lxml.de/ 58 | .. _cssselect: https://github.com/SimonSapin/cssselect 59 | .. _Parsley DSL: https://github.com/fizx/parsley 60 | .. _parsley wiki: https://github.com/fizx/parsley/wiki 61 | 62 | 63 | Syntax summary 64 | ^^^^^^^^^^^^^^ 65 | 66 | Here is a quick description of the rules format:: 67 | 68 | output key (mandatory) 69 | | 70 | | optionality operator (optional) 71 | | | 72 | | | scope, always within brackets (optional) 73 | | | | 74 | v v v 75 | "somekey?(someselector)": "someCSSSelector" 76 | 77 | or // : "someXPathExpression" 78 | 79 | or // : ["someXPathOrCSSExpression"] 80 | 81 | or // : { ...some other rules... } 82 | 83 | or // : [{ ...some other rules... 
}] 84 | 85 | 86 | A collection of extraction rules (also called a *parselet*, 87 | or *Parsley script*) looks like this in JSON format:: 88 | 89 | { 90 | "somekey": "#someID .someclass", # using a CSS selector 91 | "anotherkey": "//sometag[@someattribute='somevalue']", # using an XPath expression 92 | "nestedkey(.somelistclass)": [{ # CSS selector for multiple elements (scope selector) 93 | "somenestedkey": "somenestedtag/@someattribute" # XPath expression for an attribute 94 | }] 95 | } 96 | 97 | ... or like this in YAML format:: 98 | 99 | --- 100 | somekey: "#someID .someclass" # using a CSS selector 101 | anotherkey: "//sometag[@someattribute='somevalue']" # using an XPath expression 102 | nestedkey(.somelistclass): # CSS selector for multiple elements (scope selector) 103 | - somenestedkey: somenestedtag/@someattribute # XPath expression for an attribute 104 | 105 | And the output would be something like:: 106 | 107 | { 108 | "somekey": "some value inside the first element matching the CSS selector", 109 | "anotherkey": "some value inside the first element matching the XPath expression", 110 | "nestedkey": [ 111 | {"somenestedkey": "attribute of 1st nested element"}, 112 | {"somenestedkey": "attribute of 2nd nested element"}, 113 | ... 114 | {"somenestedkey": "attribute of last nested element"} 115 | ] 116 | } 117 | 118 | 119 | 120 | Quickstart 121 | ---------- 122 | 123 | Install 124 | ^^^^^^^ 125 | 126 | From PyPI 127 | ######### 128 | 129 | You can install *parslepy* from `PyPI <https://pypi.python.org/pypi/parslepy>`_: 130 | 131 | .. code-block:: bash 132 | 133 | sudo pip install parslepy 134 | 135 | 136 | From latest source 137 | ################## 138 | 139 | You can also install from source code (make sure you have the 140 | ``lxml`` and ``cssselect`` libraries already installed): 141 | 142 | .. code-block:: bash 143 | 144 | git clone https://github.com/redapple/parslepy.git 145 | sudo python setup.py install 146 | 147 | You probably also want to make sure the tests pass: 148 | 149 | .. code-block:: bash 150 | 151 | sudo pip install nose # only needed if you don't have nosetests installed 152 | nosetests -v tests 153 | 154 | Usage 155 | ^^^^^ 156 | 157 | Here are some examples of how to use parslepy. 158 | You can also check out the examples and tutorials at `parsley's wiki at GitHub <https://github.com/fizx/parsley/wiki>`_. 159 | 160 | Extract the questions from StackOverflow's first page: 161 | 162 | >>> import parslepy, urllib2 163 | >>> rules = {"questions(//div[contains(@class,'question-summary')])": [{"title": ".//h3/a", "votes": "div.votes div.mini-counts"}]} 164 | >>> parslepy.Parselet(rules).parse(urllib2.urlopen('http://stackoverflow.com')) 165 | {'questions': [{'title': u'node.js RSS memory grows over time despite fairly consistent heap sizes', 166 | 'votes': u'0'}, 167 | {'title': u'SQL query for count of predicate applied on rows of subquery', 168 | 'votes': u'3'}, 169 | ... 170 | } 171 | 172 | Extract a page heading and a list of item links from a string containing HTML: 173 | 174 | >>> import lxml.etree 175 | >>> import parslepy 176 | >>> import pprint 177 | >>> html = """ 178 | ... 179 | ... 180 | ... 181 | ... Sample document to test parslepy 182 | ... 183 | ... 184 | ... 185 | ...
<h1 id="main">What’s new</h1> 186 | ... <ul> 187 | ...  <li class="newsitem"><a href="/article-001.html">This is the first article</a></li> 188 | ...  <li class="newsitem"><a href="/article-002.html">A second report on something</a></li> 189 | ...  <li class="newsitem"><a href="/article-003.html">Python is great!</a> <span class="fresh">New!</span></li> 190 | ... </ul> 191 | ... 192 | ... """ 193 | >>> rules = { 194 | ... "heading": "h1#main", 195 | ... "news(li.newsitem)": [{ 196 | ... "title": ".", 197 | ... "url": "a/@href", 198 | ... "fresh": ".fresh" 199 | ... }], 200 | ... } 201 | >>> p = parslepy.Parselet(rules) 202 | >>> extracted = p.parse_fromstring(html) 203 | >>> pprint.pprint(extracted) 204 | {'heading': u'What\u2019s new', 205 | 'news': [{'title': u'This is the first article', 'url': '/article-001.html'}, 206 | {'title': u'A second report on something', 207 | 'url': '/article-002.html'}, 208 | {'fresh': u'New!', 209 | 'title': u'Python is great! New!', 210 | 'url': '/article-003.html'}]} 211 | >>> 212 | 213 | 214 | Extract using the rules in a JSON file (from *parslepy*'s ``examples/`` directory): 215 | 216 | .. code-block:: bash 217 | 218 | # Parselet file containing CSS selectors 219 | $ cat examples/engadget_css.let.json 220 | { 221 | "sections(nav#nav-main > ul li)": [{ 222 | "title": ".", 223 | "url": "a.item @href" 224 | }] 225 | } 226 | $ python run_parslepy.py --script examples/engadget_css.let.json --url http://www.engadget.com 227 | {u'sections': [{u'title': u'News', u'url': '/'}, 228 | {u'title': u'Reviews', u'url': '/reviews/'}, 229 | {u'title': u'Features', u'url': '/features/'}, 230 | {u'title': u'Galleries', u'url': '/galleries/'}, 231 | {u'title': u'Videos', u'url': '/videos/'}, 232 | {u'title': u'Events', u'url': '/events/'}, 233 | {u'title': u'Podcasts', 234 | u'url': '/podcasts/the-engadget-podcast/'}, 235 | {u'title': u'Engadget Show', u'url': '/videos/show/'}, 236 | {u'title': u'Topics', u'url': '#nav-topics'}]} 237 | 238 | 239 | You may want to check out the other examples given in the ``examples/`` directory. 240 | You can run them using the ``run_parslepy.py`` script as shown above. 241 | 242 | 243 | Selector syntax 244 | ^^^^^^^^^^^^^^^ 245 | 246 | *parslepy* understands `CSS3 Selectors`_ and `XPath 1.0`_ expressions. 247 | 248 | Select element attributes by name 249 | ################################## 250 | 251 | It also accepts `Parsley DSL`_'s ``@attributename`` at the end of CSS 252 | selectors, to get the attribute(s) of the preceding selected element(s). 253 | *parslepy* supports `Scrapy`_'s ``::attr(attributename)`` functional pseudo 254 | element extension to CSS3, which gets attributes by ``attributename``. 255 | 256 | See the two syntax variants in use: 257 | 258 | .. code-block:: bash 259 | 260 | >>> import parslepy 261 | >>> import pprint 262 | >>> 263 | >>> html = """ 264 | ... 265 | ... 266 | ... 267 | ... Sample document to test parslepy 268 | ... 269 | ... 270 | ... 271 | ...
272 | ... <a class="first" href="http://www.example.com/first">First link</a> 273 | ... <a class="second" href="http://www.example.com/second">Second link</a> 274 | ...
275 | ... 276 | ... """ 277 | >>> rules = { 278 | ... "links": { 279 | ... "first_class": ["a.first::attr(href)"], 280 | ... "second_class": ["a.second @href"], 281 | ... } 282 | ... } 283 | >>> p = parslepy.Parselet(rules) 284 | >>> extracted = p.parse_fromstring(html) 285 | >>> pprint.pprint(extracted) 286 | {'links': {'first_class': ['http://www.example.com/first'], 287 | 'second_class': ['http://www.example.com/second']}} 288 | >>> 289 | 290 | 291 | Select text and comment nodes 292 | ############################## 293 | 294 | Borrowing from `Scrapy`_'s extension to CSS3 selectors, 295 | *parslepy* supports the ``::text`` and ``::comment`` pseudo-elements 296 | (which respectively get the text nodes of an element, and extract 297 | comments from XML/HTML documents). 298 | 299 | .. code-block:: bash 300 | 301 | >>> import parslepy 302 | >>> import pprint 303 | >>> 304 | >>> html = """ 305 | ... 306 | ... 307 | ... 308 | ... Sample document to test parslepy 309 | ... 310 | ... 311 | ... 312 | ...
<h1>News</h1> 313 | ... <!-- this is a comment --> 314 | ... <div>
315 | ...
<p>Something to say</p> 316 | ... <!-- this is another comment --> 317 | ... </div>
318 | ... 319 | ... """ 320 | >>> rules = { 321 | ... "comments": { 322 | ... "all": ["::comment"], 323 | ... "inside_div": "div::comment" 324 | ... } 325 | ... } 326 | >>> p = parslepy.Parselet(rules) 327 | >>> extracted = p.parse_fromstring(html) 328 | >>> pprint.pprint(extracted) 329 | {'comments': {'all': [u'this is a comment', u'this is another comment'], 330 | 'inside_div': u'this is another comment'}} 331 | >>> 332 | 333 | 334 | .. _CSS3 Selectors: http://www.w3.org/TR/2011/REC-css3-selectors-20110929/ 335 | .. _XPath 1.0: http://www.w3.org/TR/xpath/ 336 | .. _Parsley DSL: https://github.com/fizx/parsley 337 | .. _Scrapy: http://scrapy.org/ 338 | 339 | 340 | Dependencies 341 | ------------ 342 | 343 | The current dependencies of the master branch are: 344 | 345 | * lxml>=2.3 (http://lxml.de/, https://pypi.python.org/pypi/lxml) 346 | * cssselect (https://github.com/SimonSapin/cssselect/, https://pypi.python.org/pypi/cssselect) (for lxml>=3) 347 | 348 | 349 | API 350 | --- 351 | 352 | :class:`.Parselet` is the main class for extracting content 353 | from documents with *parslepy*. 354 | 355 | Instantiate it with a Parsley script, containing 356 | a mapping of name keys, and selectors (CSS or XPath) to apply on documents, or document parts. 357 | 358 | Then, run the extraction rules by passing an HTML or XML document to 359 | :meth:`~.Parselet.extract` or :meth:`~.Parselet.parse` 360 | 361 | The output will be a :class:`dict` containing the same keys as in your Parsley 362 | script, and, depending on your selectors, values will be: 363 | 364 | * text serialization of matching elements 365 | * element attributes 366 | * nested lists of extraction content 367 | 368 | .. autoclass:: parslepy.base.Parselet 369 | :members: parse, from_jsonfile, from_jsonstring, from_yamlfile, from_yamlstring, extract, parse_fromstring, keys 370 | 371 | Customizing 372 | ----------- 373 | 374 | You can use a :class:`.Parselet` directly with it's default configuration, 375 | which should work fine for HTML documents when the content you want to 376 | extract can be accessed by regular CSS3 selectors or XPath 1.0 expressions. 377 | 378 | But you can also customize how selectors are interpreted by sub-classing 379 | :class:`.SelectorHandler` and passing an instance of your selector handler 380 | to the Parselet constructor. 381 | 382 | .. autoclass:: parslepy.selectors.Selector 383 | 384 | .. autoclass:: parslepy.selectors.SelectorHandler 385 | :members: 386 | 387 | .. autoclass:: parslepy.selectors.XPathSelectorHandler 388 | 389 | .. autoclass:: parslepy.selectors.DefaultSelectorHandler 390 | 391 | Example with iTunes RSS feed: 392 | 393 | >>> import lxml.etree 394 | >>> xml_parser = lxml.etree.XMLParser() 395 | >>> url = 'http://itunes.apple.com/us/rss/topalbums/limit=10/explicit=true/xml' 396 | >>> 397 | >>> # register Atom and iTunes namespaces with prefixes "atom" and "im" 398 | ... # with a custom SelectorHandler 399 | ... xsh = parslepy.XPathSelectorHandler( 400 | ... namespaces={ 401 | ... 'atom': 'http://www.w3.org/2005/Atom', 402 | ... 'im': 'http://itunes.apple.com/rss' 403 | ... }) 404 | >>> 405 | >>> # use prefixes to target elements in the XML document 406 | >>> rules = { 407 | ... "entries(//atom:feed/atom:entry)": [ 408 | ... { 409 | ... "title": "atom:title", 410 | ... "name": "im:name", 411 | ... "id": "atom:id/@im:id", 412 | ... "artist(im:artist)": { 413 | ... "name": ".", 414 | ... "href": "@href" 415 | ... }, 416 | ... "images(im:image)": [{ 417 | ... "height": "@height", 418 | ... 
"url": "." 419 | ... }], 420 | ... "releasedate": "im:releaseDate" 421 | ... } 422 | ... ] 423 | ... } 424 | >>> parselet = parslepy.Parselet(rules, selector_handler=xsh) 425 | >>> parselet.parse(url, parser=xml_parser) 426 | {'entries': [{'name': u'Born Sinner (Deluxe Version)', ... 427 | 428 | Exceptions 429 | ---------- 430 | 431 | .. autoexception:: parslepy.base.InvalidKeySyntax 432 | 433 | .. autoexception:: parslepy.base.NonMatchingNonOptionalKey 434 | 435 | 436 | Extension functions 437 | ------------------- 438 | 439 | *parslepy* extends XPath 1.0 functions through `lxml`_'s XPath extensions. 440 | See http://lxml.de/extensions.html for details. 441 | 442 | Built-in extensions 443 | ^^^^^^^^^^^^^^^^^^^ 444 | 445 | *parslepy* comes with a few XPath extension functions. These functions 446 | are available by default when you use :class:`.XPathSelectorHandler` 447 | or :class:`.DefaultSelectorHandler`. 448 | 449 | * ``parslepy:text(xpath_expression[, false()])``: 450 | returns the text content for elements matching *xpath_expression*. 451 | The optional boolean second parameter indicates whether *tail* content 452 | should be included or not. 453 | (Internally, this calls `lxml.etree.tostring(..., method="text", encoding=unicode)`.) 454 | Use *true()* and *false()* XPath functions, not only *true* or *false*, 455 | (or 1 or 0). Defaults to *true()*. 456 | 457 | >>> import parslepy 458 | >>> doc = """ 459 | ... 460 | ... 461 | ... Some page title 462 | ... 463 | ... 464 | ... 465 | ...
<h1>Some heading</h1>
466 | ... 467 | ... Some text 468 | ... 469 | ...
470 | ... <p>Some paragraph</p> 471 | ...
472 | ... 473 | ... 474 | ... """ 475 | >>> rules = {"heading": "h1"} 476 | >>> 477 | >>> # default text extraction includes tail text 478 | ... parslepy.Parselet(rules).parse_fromstring(doc) 479 | {'heading': u'Some heading Some text'} 480 | >>> 481 | >>> # 2nd argument to false means without tail text 482 | ... rules = {"heading": "parslepy:text(//h1, false())"} 483 | >>> parslepy.Parselet(rules).parse_fromstring(doc) 484 | {'heading': 'Some heading'} 485 | >>> 486 | >>> # 2nd argument to true is equivalent to default text extraction 487 | >>> rules = {"heading": "parslepy:text(//h1, true())"} 488 | >>> parslepy.Parselet(rules).parse_fromstring(doc) 489 | {'heading': 'Some heading Some text'} 490 | >>> 491 | 492 | See http://lxml.de/tutorial.html#elements-contain-text for details 493 | on how `lxml`_ handles text and tail element properties 494 | 495 | * ``parslepy:textnl(xpath_expression)``: 496 | similar to ``parslepy:text()`` but appends `\\n` characters to HTML 497 | block elements such as `
<p>`, `<div>`, `<br>` 498 | 499 | >>> import parslepy 500 | >>> doc = """ 501 | ... 502 | ... 503 | ... Some page title 504 | ... 505 | ... 506 | ...
<h1>Some heading</h1><p>Some paragraph</p>with some <span>span inside</span><br/>ending now.
507 | ... 508 | ... 509 | ... """ 510 | >>> parslepy.Parselet({"heading": "parslepy:text(//body)"}).parse_fromstring(doc) 511 | {'heading': 'Some headingSome paragraphwith some span insideending now.'} 512 | >>> 513 | >>> parslepy.Parselet({"heading": "parslepy:textnl(//body)"}).parse_fromstring(doc) 514 | {'heading': 'Some heading\nSome paragraph\nwith some span inside\nending now.'} 515 | >>> 516 | 517 | 518 | * ``parslepy:html(xpath_expression)`` 519 | returns the HTML content for elements matching *xpath_expression*. 520 | Internally, this calls `lxml.html.tostring(element)`. 521 | 522 | >>> import parslepy 523 | >>> doc = """ 524 | ... 525 | ... 526 | ... Some page title 527 | ... 528 | ... 529 | ...
<h1>(Some heading)</h1>
530 | ...
<h2>[some sub-heading]</h2>
531 | ... 532 | ... 533 | ... """ 534 | >>> parslepy.Parselet({"heading": "parslepy:html(//h1)"}).parse_fromstring(doc) 535 | {'heading': '
<h1>(Some heading)</h1>
'} 536 | >>> parslepy.Parselet({"heading": "parslepy:html(//body)"}).parse_fromstring(doc) 537 | {'heading': '<body>\n<h1>(Some heading)</h1>\n
<h2>[some sub-heading]</h2>\n</body>'} 538 | >>> 539 | 540 | 541 | * ``parslepy:xml(xpath_expression)`` 542 | returns the XML content for elements matching *xpath_expression*. 543 | Internally, this calls `lxml.etree.tostring(element)`. 544 | 545 | * ``parslepy:strip(xpath_expression[, chars])`` 546 | behaves like Python's `strip()` method for strings but for the text 547 | content of elements matching *xpath_expression*. 548 | See http://docs.python.org/2/library/string.html#string.strip 549 | 550 | >>> import parslepy 551 | >>> doc = """ 552 | ... 553 | ... 554 | ... Some page title 555 | ... 556 | ... 557 | ...
<h1>(Some heading)</h1>
558 | ...
<h2>[some sub-heading]</h2>
559 | ... 560 | ... 561 | ... """ 562 | >>> parslepy.Parselet({"heading": "parslepy:strip(//h2, '[')"}).parse_fromstring(doc) 563 | {'heading': 'some sub-heading]'} 564 | >>> parslepy.Parselet({"heading": "parslepy:strip(//h2, ']')"}).parse_fromstring(doc) 565 | {'heading': '[some sub-heading'} 566 | >>> parslepy.Parselet({"heading": "parslepy:strip(//h2, '[]')"}).parse_fromstring(doc) 567 | {'heading': 'some sub-heading'} 568 | >>> parslepy.Parselet({"heading": "parslepy:strip(//h1, '()')"}).parse_fromstring(doc) 569 | {'heading': 'Some heading'} 570 | >>> 571 | 572 | * ``parslepy:attrname(xpath_expression_matching_attribute)`` 573 | returns the name of an attribute. This works with the catch-all-attributes 574 | `@*` expression or a specific attribute expression like `@class`. 575 | It may sound like a useless extension but it can be useful 576 | when combined with the simple `@*` selector, as in the example below: 577 | 578 | >>> img_attributes = { 579 | ... "images(img)": [{ 580 | ... "attr_names": ["parslepy:attrname(@*)"], 581 | ... "attr_vals": ["@*"], 582 | ... }] 583 | ... } 584 | >>> extracted = parslepy.Parselet(img_attributes).parse('http://www.python.org') 585 | >>> for r in extracted["images"]: 586 | ...: print dict(zip(r.get("attr_names"), r.get("attr_vals"))) 587 | ...: 588 | {'src': '/images/python-logo.gif', 'alt': 'homepage', 'border': '0', 'id': 'logo'} 589 | {'src': '/images/trans.gif', 'alt': 'skip to navigation', 'border': '0', 'id': 'skiptonav'} 590 | {'src': '/images/trans.gif', 'alt': 'skip to content', 'border': '0', 'id': 'skiptocontent'} 591 | {'width': '116', 'alt': '', 'src': '/images/donate.png', 'title': '', 'height': '42'} 592 | {'width': '94', 'style': 'align:center', 'src': '/images/worldmap.jpg', 'alt': '[Python resources in languages other than English]', 'height': '46'} 593 | {'src': '/images/success/Carmanah.png', 'alt': 'success story photo', 'class': 'success'} 594 | 595 | 596 | User-defined extensions 597 | ^^^^^^^^^^^^^^^^^^^^^^^ 598 | 599 | *parslepy* also lets you define your own XPath extensions, just like 600 | `lxml`_ does, except that the function you register must accept a user-supplied 601 | context object passed as its first argument; subsequent arguments to your extension 602 | function will be the same as for `lxml`_ extensions, i.e. an XPath context, 603 | followed by matching elements and whatever additional parameters your XPath 604 | call passes. 605 | 606 | The user-supplied context should be passed to :meth:`parslepy.base.Parselet.parse`, 607 | or globally to an XPathSelectorHandler subclass instance passed to instantiate a Parselet. 608 | 609 | Let's illustrate this with a custom extension to make `<img>` @src 610 | attributes "absolute". 611 | 612 | Suppose we already have an extraction rule that outputs the `@src` attributes 613 | from `<img>` tags on the Python.org homepage: 614 | 615 | >>> import parslepy 616 | >>> import pprint 617 | >>> parselet = parslepy.Parselet({"img_abslinks": ["//img/@src"]}) 618 | >>> pprint.pprint(parselet.parse('http://www.python.org')) 619 | {'img_abslinks': ['/images/python-logo.gif', 620 | '/images/trans.gif', 621 | '/images/trans.gif', 622 | '/images/donate.png', 623 | '/images/worldmap.jpg', 624 | '/images/success/afnic.fr.png']} 625 | 626 | We now want to generate full URLs for these images, not relative to 627 | http://www.python.org.
628 | 629 | **First we need to define our extension function as a Python function**: 630 | 631 | *parslepy*'s extension functions must accept a user-context as first argument, 632 | then should expect an XPath context, followed by elements or strings 633 | matching the XPath expression, 634 | and finally whatever other parameters are passed to the function call 635 | in extraction rules. 636 | 637 | In our example, we expect `@src` attribute values as input from XPath, 638 | and combine them with a base URL (via `urlparse.urljoin()`), 639 | the URL from which the HTML document was fetched. 640 | The base URL will be passed as user-context, and we will receive it as 641 | first argument. 642 | So the Python extension function may look like this: 643 | 644 | >>> import urlparse 645 | >>> def absurl(ctx, xpctx, attributes, *args): 646 | ... # user-context "ctx" will be the URL of the page 647 | ... return [urlparse.urljoin(ctx, u) for u in attributes] 648 | ... 649 | 650 | **Then, we need to register this function with parslepy** through 651 | a custom selector handler, with a custom namespace and its prefix: 652 | 653 | >>> # choose a prefix and namespace, e.g. "myext" and "local-extensions" 654 | ... mynamespaces = { 655 | ... "myext": "local-extensions" 656 | ... } 657 | >>> myextensions = { 658 | ... ("local-extensions", "absurl"): absurl, 659 | ... } 660 | >>> 661 | >>> import parslepy 662 | >>> sh = parslepy.DefaultSelectorHandler( 663 | ... namespaces=mynamespaces, 664 | ... extensions=myextensions) 665 | >>> 666 | 667 | 668 | Now we can use this **absurl()** XPath extension within *parslepy* rules, 669 | with the "myext" prefix 670 | (**do not forget to pass your selector handler** to your Parselet instance): 671 | 672 | >>> rules = {"img_abslinks": ["myext:absurl(//img/@src)"]} 673 | >>> parselet = parslepy.Parselet(rules, selector_handler=sh) 674 | 675 | And finally, run the extraction rules on Python.org's homepage again, 676 | with a context argument set to the URL 677 | 678 | >>> import pprint 679 | >>> pprint.pprint(parselet.parse('http://www.python.org', 680 | ... context='http://www.python.org')) 681 | {'img_abslinks': ['http://www.python.org/images/python-logo.gif', 682 | 'http://www.python.org/images/trans.gif', 683 | 'http://www.python.org/images/trans.gif', 684 | 'http://www.python.org/images/donate.png', 685 | 'http://www.python.org/images/worldmap.jpg', 686 | 'http://www.python.org/images/success/afnic.fr.png']} 687 | >>> 688 | 689 | In this case, it may feel odd to have to pass the URL *twice*, 690 | but parse(*URL*) does not store the URL anywhere, it processes only 691 | the HTML stream from the page. 692 | 693 | More examples 694 | ============= 695 | 696 | Check out more examples and tutorials at `parsley's wiki at GitHub `_. 697 | 698 | .. include:: ../CHANGELOG 699 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | REM Command file for Sphinx documentation 4 | 5 | if "%SPHINXBUILD%" == "" ( 6 | set SPHINXBUILD=sphinx-build 7 | ) 8 | set BUILDDIR=_build 9 | set ALLSPHINXOPTS=-d %BUILDDIR%/doctrees %SPHINXOPTS% . 10 | set I18NSPHINXOPTS=%SPHINXOPTS% . 
11 | if NOT "%PAPER%" == "" ( 12 | set ALLSPHINXOPTS=-D latex_paper_size=%PAPER% %ALLSPHINXOPTS% 13 | set I18NSPHINXOPTS=-D latex_paper_size=%PAPER% %I18NSPHINXOPTS% 14 | ) 15 | 16 | if "%1" == "" goto help 17 | 18 | if "%1" == "help" ( 19 | :help 20 | echo.Please use `make ^` where ^ is one of 21 | echo. html to make standalone HTML files 22 | echo. dirhtml to make HTML files named index.html in directories 23 | echo. singlehtml to make a single large HTML file 24 | echo. pickle to make pickle files 25 | echo. json to make JSON files 26 | echo. htmlhelp to make HTML files and a HTML help project 27 | echo. qthelp to make HTML files and a qthelp project 28 | echo. devhelp to make HTML files and a Devhelp project 29 | echo. epub to make an epub 30 | echo. latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter 31 | echo. text to make text files 32 | echo. man to make manual pages 33 | echo. texinfo to make Texinfo files 34 | echo. gettext to make PO message catalogs 35 | echo. changes to make an overview over all changed/added/deprecated items 36 | echo. linkcheck to check all external links for integrity 37 | echo. doctest to run all doctests embedded in the documentation if enabled 38 | goto end 39 | ) 40 | 41 | if "%1" == "clean" ( 42 | for /d %%i in (%BUILDDIR%\*) do rmdir /q /s %%i 43 | del /q /s %BUILDDIR%\* 44 | goto end 45 | ) 46 | 47 | if "%1" == "html" ( 48 | %SPHINXBUILD% -b html %ALLSPHINXOPTS% %BUILDDIR%/html 49 | if errorlevel 1 exit /b 1 50 | echo. 51 | echo.Build finished. The HTML pages are in %BUILDDIR%/html. 52 | goto end 53 | ) 54 | 55 | if "%1" == "dirhtml" ( 56 | %SPHINXBUILD% -b dirhtml %ALLSPHINXOPTS% %BUILDDIR%/dirhtml 57 | if errorlevel 1 exit /b 1 58 | echo. 59 | echo.Build finished. The HTML pages are in %BUILDDIR%/dirhtml. 60 | goto end 61 | ) 62 | 63 | if "%1" == "singlehtml" ( 64 | %SPHINXBUILD% -b singlehtml %ALLSPHINXOPTS% %BUILDDIR%/singlehtml 65 | if errorlevel 1 exit /b 1 66 | echo. 67 | echo.Build finished. The HTML pages are in %BUILDDIR%/singlehtml. 68 | goto end 69 | ) 70 | 71 | if "%1" == "pickle" ( 72 | %SPHINXBUILD% -b pickle %ALLSPHINXOPTS% %BUILDDIR%/pickle 73 | if errorlevel 1 exit /b 1 74 | echo. 75 | echo.Build finished; now you can process the pickle files. 76 | goto end 77 | ) 78 | 79 | if "%1" == "json" ( 80 | %SPHINXBUILD% -b json %ALLSPHINXOPTS% %BUILDDIR%/json 81 | if errorlevel 1 exit /b 1 82 | echo. 83 | echo.Build finished; now you can process the JSON files. 84 | goto end 85 | ) 86 | 87 | if "%1" == "htmlhelp" ( 88 | %SPHINXBUILD% -b htmlhelp %ALLSPHINXOPTS% %BUILDDIR%/htmlhelp 89 | if errorlevel 1 exit /b 1 90 | echo. 91 | echo.Build finished; now you can run HTML Help Workshop with the ^ 92 | .hhp project file in %BUILDDIR%/htmlhelp. 93 | goto end 94 | ) 95 | 96 | if "%1" == "qthelp" ( 97 | %SPHINXBUILD% -b qthelp %ALLSPHINXOPTS% %BUILDDIR%/qthelp 98 | if errorlevel 1 exit /b 1 99 | echo. 100 | echo.Build finished; now you can run "qcollectiongenerator" with the ^ 101 | .qhcp project file in %BUILDDIR%/qthelp, like this: 102 | echo.^> qcollectiongenerator %BUILDDIR%\qthelp\parslepy.qhcp 103 | echo.To view the help file: 104 | echo.^> assistant -collectionFile %BUILDDIR%\qthelp\parslepy.ghc 105 | goto end 106 | ) 107 | 108 | if "%1" == "devhelp" ( 109 | %SPHINXBUILD% -b devhelp %ALLSPHINXOPTS% %BUILDDIR%/devhelp 110 | if errorlevel 1 exit /b 1 111 | echo. 112 | echo.Build finished. 
113 | goto end 114 | ) 115 | 116 | if "%1" == "epub" ( 117 | %SPHINXBUILD% -b epub %ALLSPHINXOPTS% %BUILDDIR%/epub 118 | if errorlevel 1 exit /b 1 119 | echo. 120 | echo.Build finished. The epub file is in %BUILDDIR%/epub. 121 | goto end 122 | ) 123 | 124 | if "%1" == "latex" ( 125 | %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex 126 | if errorlevel 1 exit /b 1 127 | echo. 128 | echo.Build finished; the LaTeX files are in %BUILDDIR%/latex. 129 | goto end 130 | ) 131 | 132 | if "%1" == "text" ( 133 | %SPHINXBUILD% -b text %ALLSPHINXOPTS% %BUILDDIR%/text 134 | if errorlevel 1 exit /b 1 135 | echo. 136 | echo.Build finished. The text files are in %BUILDDIR%/text. 137 | goto end 138 | ) 139 | 140 | if "%1" == "man" ( 141 | %SPHINXBUILD% -b man %ALLSPHINXOPTS% %BUILDDIR%/man 142 | if errorlevel 1 exit /b 1 143 | echo. 144 | echo.Build finished. The manual pages are in %BUILDDIR%/man. 145 | goto end 146 | ) 147 | 148 | if "%1" == "texinfo" ( 149 | %SPHINXBUILD% -b texinfo %ALLSPHINXOPTS% %BUILDDIR%/texinfo 150 | if errorlevel 1 exit /b 1 151 | echo. 152 | echo.Build finished. The Texinfo files are in %BUILDDIR%/texinfo. 153 | goto end 154 | ) 155 | 156 | if "%1" == "gettext" ( 157 | %SPHINXBUILD% -b gettext %I18NSPHINXOPTS% %BUILDDIR%/locale 158 | if errorlevel 1 exit /b 1 159 | echo. 160 | echo.Build finished. The message catalogs are in %BUILDDIR%/locale. 161 | goto end 162 | ) 163 | 164 | if "%1" == "changes" ( 165 | %SPHINXBUILD% -b changes %ALLSPHINXOPTS% %BUILDDIR%/changes 166 | if errorlevel 1 exit /b 1 167 | echo. 168 | echo.The overview file is in %BUILDDIR%/changes. 169 | goto end 170 | ) 171 | 172 | if "%1" == "linkcheck" ( 173 | %SPHINXBUILD% -b linkcheck %ALLSPHINXOPTS% %BUILDDIR%/linkcheck 174 | if errorlevel 1 exit /b 1 175 | echo. 176 | echo.Link check complete; look for any errors in the above output ^ 177 | or in %BUILDDIR%/linkcheck/output.txt. 178 | goto end 179 | ) 180 | 181 | if "%1" == "doctest" ( 182 | %SPHINXBUILD% -b doctest %ALLSPHINXOPTS% %BUILDDIR%/doctest 183 | if errorlevel 1 exit /b 1 184 | echo. 185 | echo.Testing of doctests in the sources finished, look at the ^ 186 | results in %BUILDDIR%/doctest/output.txt. 
187 | goto end 188 | ) 189 | 190 | :end 191 | -------------------------------------------------------------------------------- /docs/preview_docs.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | abort() { 4 | echo >&2 "$*"; exit 1; 5 | } 6 | 7 | usage() { 8 | abort """Usage: $(basename $0) OPTIONS 9 | -p|--port Port number for the HTTP server to use (default: 8000) 10 | -b|--browser Browser to open the link (default: xdg-open)""" 11 | } 12 | 13 | require() { 14 | type $1 >/dev/null 2>&1 15 | } 16 | 17 | port=8000 18 | browser=xdg-open 19 | while [ "${1#-}" != "$1" ]; do 20 | case "$1" in 21 | -h) usage;; 22 | -p|--port) [ -z "$2" ] && usage; port="$2"; shift;; 23 | -b|--browser) [ -z "$2" ] && usage; browser="$2"; shift;; 24 | *) usage;; 25 | esac 26 | shift 27 | done 28 | 29 | require when-changed || abort "Please install this first: sudo pip install when-changed" 30 | require "$browser" || abort "$browser is not available -- please specify another browser" 31 | 32 | # compile the first time 33 | make html 34 | 35 | # open web-browser 36 | $browser http://localhost:$port/html & 37 | 38 | # run HTTP server in background inside the _build dir 39 | # NOTE: here, cd also runs in bg, so the script's current dir stays the same 40 | cd _build && python -m SimpleHTTPServer $port & 41 | 42 | # watch for changes 43 | when-changed index.rst -c "make html; echo Use Ctrl-C to quit preview" 44 | 45 | -------------------------------------------------------------------------------- /examples/engadget.let.json: -------------------------------------------------------------------------------- 1 | { 2 | "sections(nav#nav-main > ul li)": [{ 3 | "title": ".", 4 | "url_css": "a.item @href", 5 | "url_xpath": "a[re:test(@class, 'item')]/@href" 6 | }] 7 | } 8 | -------------------------------------------------------------------------------- /examples/engadget_css.let.json: -------------------------------------------------------------------------------- 1 | { 2 | "sections(nav#nav-main > ul li)": [{ 3 | "title": ".", 4 | "url": "a.item @href" 5 | }] 6 | } 7 | -------------------------------------------------------------------------------- /examples/engadget_xpath.let.json: -------------------------------------------------------------------------------- 1 | { 2 | "sections(//nav[@id='nav-main']/ul/li)": [{ 3 | "title": ".", 4 | "url": ".//a[contains(@class, 'item')]/@href" 5 | }] 6 | } 7 | -------------------------------------------------------------------------------- /parslepy/__init__.py: -------------------------------------------------------------------------------- 1 | from parslepy.base import Parselet, Parslet, NonMatchingNonOptionalKey, InvalidKeySyntax 2 | from parslepy.selectors import DefaultSelectorHandler, XPathSelectorHandler 3 | 4 | __version__ = '0.2.0' 5 | __all__ = [ 6 | 'Parselet', 'Parslet', 7 | 'DefaultSelectorHandler', 'XPathSelectorHandler', 8 | 'NonMatchingNonOptionalKey', 'InvalidKeySyntax'] 9 | -------------------------------------------------------------------------------- /parslepy/base.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from __future__ import unicode_literals 4 | from parslepy.selectors import DefaultSelectorHandler, SelectorHandler, Selector 5 | import lxml.etree 6 | import lxml.html 7 | import re 8 | import json 9 | 10 | # http://stackoverflow.com/questions/11301138/how-to-check-if-variable-is-string-with-python-2-and-3-compatibility 11 | 
try:
12 |     isinstance("", basestring)
13 |     def isstr(s):
14 |         return isinstance(s, basestring)
15 | except NameError:
16 |     def isstr(s):
17 |         return isinstance(s, str)
18 | 
19 | # ----------------------------------------------------------------------
20 | 
21 | # compiled Parsley scripts look like this
22 | # ParsleyNode(
23 | #     ParsleyContext(key, options[, Selector]): ParsleyNode(...),
24 | #     ...or
25 | #     ParsleyContext(key, options[, Selector]): Selector,
26 | # ...)
27 | # --> a tree of ParsleyNode instances,
28 | #     with terminal leaves of type Selector,
29 | #     a parent ParsleyNode having 1 or more ParsleyNode children
30 | #     references through ParsleyContext keys
31 | #
32 | class ParsleyNode(dict):
33 |     pass
34 | 
35 | 
36 | class ParsleyContext(object):
37 |     """
38 |     Stores parameters associated with extraction keys in `ParsleyNode` trees.
39 |     Used as keys in `ParsleyNode` objects
40 |     """
41 | 
42 |     def __init__(self, key, operator=None, required=True, scope=None, iterate=False):
43 |         """
44 |         Only `key` is required
45 | 
46 |         Arguments:
47 |         operator (str) -- "?" optional, "!" for complete arrays; defaults to None (i.e. required)
48 |         required (boolean) -- whether the key is required in the output (defaults to True)
49 |         scope (`Selector`) -- restrict extraction to elements matching this selector
50 |         iterate (boolean) -- whether multiple objects will be extracted (defaults to False)
51 |         """
52 | 
53 |         self.key = key
54 |         self.operator = operator
55 |         self.required = required
56 |         self.scope = scope
57 |         self.iterate = iterate
58 | 
59 |     def __repr__(self):
60 |         return "<ParsleyContext: key=%s; operator=%s; required=%s; scope=%s; iterate=%s>" % (
61 |             self.key, self.operator, self.required, self.scope, self.iterate)
62 | 
63 | 
64 | class NonMatchingNonOptionalKey(RuntimeError):
65 |     """
66 |     Raised by a :class:`.Parselet` instance while extracting content in strict mode,
67 |     when a required key does not yield any content.
68 | 
69 |     >>> import parslepy
70 |     >>> html = '''
71 |     ... <html>
72 |     ... <head>
73 |     ...
74 |     ...     <title>Sample document to test parslepy</title>
75 |     ...
76 |     ... </head>
77 |     ... <body>
78 |     ...
<h1 id="main">What’s new</h1>
79 |     ...
84 |     ...
85 |     ...
86 |     ... '''
87 |     >>> rules = {
88 |     ...     "heading1": "h1#main",
89 |     ...     "heading2": "h2#main",
90 |     ... }
91 |     >>> p = parslepy.Parselet(rules, strict=True)
92 |     >>> try:
93 |     ...     p.parse_fromstring(html)
94 |     ... except parslepy.base.NonMatchingNonOptionalKey as e:
95 |     ...     print("Missing mandatory key")
96 |     Missing mandatory key
97 |     """
98 | 
99 |     pass
100 | 
101 | 
102 | class InvalidKeySyntax(SyntaxError):
103 |     """
104 |     Raised when the input Parsley script's syntax is invalid
105 | 
106 |     >>> import parslepy
107 |     >>> try:
108 |     ...     p = parslepy.Parselet({"heading@": "#main"})
109 |     ... except parslepy.base.InvalidKeySyntax as e:
110 |     ...     print(e)
111 |     Key heading@ is not valid
112 |     """
113 | 
114 |     pass
115 | 
116 | 
117 | class Parselet(object):
118 | 
119 |     DEBUG = False
120 |     SPECIAL_LEVEL_KEY = "--"
121 |     KEEP_ONLY_FIRST_ELEMENT_IF_LIST = True
122 |     STRICT_MODE = False
123 | 
124 |     def __init__(self, parselet, selector_handler=None, strict=False, debug=False):
125 |         """
126 |         Take a parselet and optional selector_handler
127 |         and build an abstract representation of the Parsley extraction
128 |         logic.
129 | 
130 |         Four helper class methods can be used to instantiate a Parselet
131 |         from JSON/YAML rules: :meth:`.from_jsonstring`, :meth:`.from_jsonfile`,
132 |         :meth:`.from_yamlstring`, :meth:`.from_yamlfile`.
133 | 
134 |         :param dict parselet: Parsley script as a Python dict object
135 |         :param boolean strict: Set to *True* if you want to
136 |             enforce that missing required keys raise an Exception; default is False
137 |             (i.e. lenient/non-strict mode)
138 |         :param selector_handler: an instance of :class:`selectors.SelectorHandler`
139 |             optional selector handler instance;
140 |             defaults to an instance of :class:`selectors.DefaultSelectorHandler`
141 |         :raises: :class:`.InvalidKeySyntax`
142 | 
143 |         Example:
144 | 
145 |         >>> import parslepy
146 |         >>> rules = {
147 |         ...     "heading": "h1#main",
148 |         ...     "news(li.newsitem)": [{
149 |         ...         "title": ".",
150 |         ...         "url": "a/@href"
151 |         ...     }],
152 |         ... }
153 |         >>> p = parslepy.Parselet(rules)
154 |         >>> type(p)
155 |         <class 'parslepy.base.Parselet'>
156 | 
157 |         Use :meth:`~base.Parselet.extract` or :meth:`~base.Parselet.parse`
158 |         to get extracted content from documents.
159 |         """
160 | 
161 |         if debug:
162 |             self.DEBUG = True
163 |         if strict:
164 |             self.STRICT_MODE = True
165 | 
166 |         self.parselet = parselet
167 | 
168 |         if not selector_handler:
169 |             self.selector_handler = DefaultSelectorHandler(debug=self.DEBUG)
170 | 
171 |         elif not(isinstance(selector_handler, SelectorHandler)):
172 |             raise ValueError("You must provide a SelectorHandler instance")
173 | 
174 |         else:
175 |             self.selector_handler = selector_handler
176 | 
177 |         self.compile()
178 | 
179 |     # accept comments in parselets
180 |     REGEX_COMMENT_LINE = re.compile(r'^\s*#')
181 |     @classmethod
182 |     def from_jsonfile(cls, fp, selector_handler=None, strict=False, debug=False):
183 |         """
184 |         Create a Parselet instance from a file containing
185 |         the Parsley script as a JSON object
186 | 
187 |         >>> import parslepy
188 |         >>> with open('parselet.json') as fp:
189 |         ...     parslepy.Parselet.from_jsonfile(fp)
190 |         ...
191 |         <parslepy.base.Parselet object at 0x...>
192 | 
193 |         :param file fp: an open file-like pointer containing the Parsley script
194 |         :rtype: :class:`.Parselet`
195 | 
196 |         Other arguments: same as for :class:`.Parselet` constructor
197 |         """
198 | 
199 |         return cls._from_jsonlines(fp,
200 |             selector_handler=selector_handler, strict=strict, debug=debug)
201 | 
202 |     @classmethod
203 |     def from_yamlfile(cls, fp, selector_handler=None, strict=False, debug=False):
204 |         """
205 |         Create a Parselet instance from a file containing
206 |         the Parsley script as a YAML object
207 | 
208 |         >>> import parslepy
209 |         >>> with open('parselet.yml') as fp:
210 |         ...     parslepy.Parselet.from_yamlfile(fp)
211 |         ...
212 |         <parslepy.base.Parselet object at 0x...>
213 | 
214 |         :param file fp: an open file-like pointer containing the Parsley script
215 |         :rtype: :class:`.Parselet`
216 | 
217 |         Other arguments: same as for :class:`.Parselet` constructor
218 |         """
219 | 
220 |         return cls.from_yamlstring(fp.read(), selector_handler=selector_handler, strict=strict, debug=debug)
221 | 
222 |     @classmethod
223 |     def from_yamlstring(cls, s, selector_handler=None, strict=False, debug=False):
224 |         """
225 |         Create a Parselet instance from s (str) containing
226 |         the Parsley script as YAML
227 | 
228 |         >>> import parslepy
229 |         >>> parsley_string = '''---
230 |         title: h1
231 |         link: a @href
232 |         '''
233 |         >>> p = parslepy.Parselet.from_yamlstring(parsley_string)
234 |         >>> type(p)
235 |         <class 'parslepy.base.Parselet'>
236 |         >>>
237 | 
238 |         :param string s: a Parsley script as a YAML string
239 |         :rtype: :class:`.Parselet`
240 | 
241 |         Other arguments: same as for :class:`.Parselet` constructor
242 |         """
243 | 
244 |         import yaml
245 |         return cls(yaml.load(s), selector_handler=selector_handler, strict=strict, debug=debug)
246 | 
247 |     @classmethod
248 |     def from_jsonstring(cls, s, selector_handler=None, strict=False, debug=False):
249 |         """
250 |         Create a Parselet instance from s (str) containing
251 |         the Parsley script as JSON
252 | 
253 |         >>> import parslepy
254 |         >>> parsley_string = '{ "title": "h1", "link": "a @href"}'
255 |         >>> p = parslepy.Parselet.from_jsonstring(parsley_string)
256 |         >>> type(p)
257 |         <class 'parslepy.base.Parselet'>
258 |         >>>
259 | 
260 |         :param string s: a Parsley script as a JSON string
261 |         :rtype: :class:`.Parselet`
262 | 
263 |         Other arguments: same as for :class:`.Parselet` constructor
264 |         """
265 | 
266 |         return cls._from_jsonlines(s.split("\n"),
267 |             selector_handler=selector_handler, strict=strict, debug=debug)
268 | 
269 |     @classmethod
270 |     def _from_jsonlines(cls, lines, selector_handler=None, strict=False, debug=False):
271 |         """
272 |         Interpret input lines as a JSON Parsley script.
273 |         Python-style comment lines are skipped.
274 |         """
275 | 
276 |         return cls(json.loads(
277 |             "\n".join([l for l in lines if not cls.REGEX_COMMENT_LINE.match(l)])
278 |             ), selector_handler=selector_handler, strict=strict, debug=debug)
279 | 
280 |     def parse(self, fp, parser=None, context=None):
281 |         """
282 |         Parse an HTML or XML document and
283 |         return the extracted object following the Parsley rules given at instantiation.
284 | 
285 |         :param fp: file-like object containing an HTML or XML document, or URL or filename
286 |         :param parser: *lxml.etree._FeedParser* instance (optional); defaults to lxml.etree.HTMLParser()
287 |         :param context: user-supplied context that will be passed to custom XPath extensions (as first argument)
288 |         :rtype: Python :class:`dict` object with mapped extracted content
289 |         :raises: :class:`.NonMatchingNonOptionalKey`
290 | 
291 |         To parse from a string, use the :meth:`~base.Parselet.parse_fromstring` method instead.
292 | 
293 |         Note that the fp parameter is passed directly
294 |         to `lxml.etree.parse <http://lxml.de/api/lxml.etree-module.html#parse>`_,
295 |         so you can also give it a URL, and lxml will download it for you.
296 |         (Also see `<http://lxml.de/tutorial.html#the-parse-function>`_.)
297 |         """
298 | 
299 |         if parser is None:
300 |             parser = lxml.etree.HTMLParser()
301 |         doc = lxml.etree.parse(fp, parser=parser).getroot()
302 |         return self.extract(doc, context=context)
303 | 
304 |     def parse_fromstring(self, s, parser=None, context=None):
305 |         """
306 |         Parse an HTML or XML document and
307 |         return the extracted object following the Parsley rules given at instantiation.
308 | 
309 |         :param string s: an HTML or XML document as a string
310 |         :param parser: *lxml.etree._FeedParser* instance (optional); defaults to lxml.etree.HTMLParser()
311 |         :param context: user-supplied context that will be passed to custom XPath extensions (as first argument)
312 |         :rtype: Python :class:`dict` object with mapped extracted content
313 |         :raises: :class:`.NonMatchingNonOptionalKey`
314 | 
315 |         """
316 |         if parser is None:
317 |             parser = lxml.etree.HTMLParser()
318 |         doc = lxml.etree.fromstring(s, parser=parser)
319 |         return self.extract(doc, context=context)
320 | 
321 |     def compile(self):
322 |         """
323 |         Build the abstract Parsley tree starting from the root node
324 |         (recursive)
325 |         """
326 |         if not isinstance(self.parselet, dict):
327 |             raise ValueError("Parselet must be a dict of some sort. Or use .from_jsonstring(), " \
328 |                 ".from_jsonfile(), .from_yamlstring(), or .from_yamlfile()")
329 |         self.parselet_tree = self._compile(self.parselet)
330 | 
331 |     VALID_KEY_CHARS = "\w-"
332 |     SUPPORTED_OPERATORS = "?"   # "!" not supported for now
333 |     REGEX_PARSELET_KEY = re.compile(
334 |         "^(?P<key>[%(validkeychars)s]+)(?P<operator>[%(suppop)s])?(\((?P<scope>.+)\))?$" % {
335 |             'validkeychars': VALID_KEY_CHARS,
336 |             'suppop': SUPPORTED_OPERATORS}
337 |     )
338 |     def _compile(self, parselet_node, level=0):
339 |         """
340 |         Build part of the abstract Parsley extraction tree
341 | 
342 |         Arguments:
343 |         parselet_node (dict) -- part of the Parsley tree to compile
344 |         (can be the root dict/node)
345 |         level (int) -- current recursion depth (used for debug)
346 |         """
347 | 
348 |         if self.DEBUG:
349 |             debug_offset = "".join([" " for x in range(level)])
350 | 
351 |         if self.DEBUG:
352 |             print(debug_offset, "%s::compile(%s)" % (
353 |                 self.__class__.__name__, parselet_node))
354 | 
355 |         if isinstance(parselet_node, dict):
356 |             parselet_tree = ParsleyNode()
357 |             for k, v in list(parselet_node.items()):
358 | 
359 |                 # we parse the key raw elements but without much
360 |                 # interpretation (which is done by the SelectorHandler)
361 |                 try:
362 |                     m = self.REGEX_PARSELET_KEY.match(k)
363 |                     if not m:
364 |                         if self.DEBUG:
365 |                             print(debug_offset, "could not parse key", k)
366 |                         raise InvalidKeySyntax(k)
367 |                 except:
368 |                     raise InvalidKeySyntax("Key %s is not valid" % k)
369 | 
370 |                 key = m.group('key')
371 |                 # by default, fields are required
372 |                 key_required = True
373 |                 operator = m.group('operator')
374 |                 if operator == '?':
375 |                     key_required = False
376 |                 # FIXME: "!" operator not supported (complete array)
377 |                 scope = m.group('scope')
378 | 
379 |                 # example: get list of H3 tags
380 |                 # { "titles": ["h3"] }
381 |                 # FIXME: should we support multiple selectors in list?
382 |                 #        e.g.
{ "titles": ["h1", "h2", "h3", "h4"] } 383 | if isinstance(v, (list, tuple)): 384 | v = v[0] 385 | iterate = True 386 | else: 387 | iterate = False 388 | 389 | # keys in the abstract Parsley trees are of type `ParsleyContext` 390 | try: 391 | parsley_context = ParsleyContext( 392 | key, 393 | operator=operator, 394 | required=key_required, 395 | scope=self.selector_handler.make(scope) if scope else None, 396 | iterate=iterate) 397 | except SyntaxError: 398 | if self.DEBUG: 399 | print("Invalid scope:", k, scope) 400 | raise 401 | 402 | if self.DEBUG: 403 | print(debug_offset, "current context:", parsley_context) 404 | 405 | # go deeper in the Parsley tree... 406 | try: 407 | child_tree = self._compile(v, level=level+1) 408 | except SyntaxError: 409 | if self.DEBUG: 410 | print("Invalid value: ", v) 411 | raise 412 | except: 413 | raise 414 | 415 | if self.DEBUG: 416 | print(debug_offset, "child tree:", child_tree) 417 | 418 | parselet_tree[parsley_context] = child_tree 419 | 420 | return parselet_tree 421 | 422 | # a string leaf should match some kind of selector, 423 | # let the selector handler deal with it 424 | elif isstr(parselet_node): 425 | return self.selector_handler.make(parselet_node) 426 | else: 427 | raise ValueError( 428 | "Unsupported type(%s) for Parselet node <%s>" % ( 429 | type(parselet_node), parselet_node)) 430 | 431 | def extract(self, document, context=None): 432 | """ 433 | Extract values as a dict object following the structure 434 | of the Parsley script (recursive) 435 | 436 | :param document: lxml-parsed document 437 | :param context: user-supplied context that will be passed to custom XPath extensions (as first argument) 438 | :rtype: Python *dict* object with mapped extracted content 439 | :raises: :class:`.NonMatchingNonOptionalKey` 440 | 441 | >>> import lxml.etree 442 | >>> import parslepy 443 | >>> html = ''' 444 | ... 445 | ... 446 | ... 447 | ... Sample document to test parslepy 448 | ... 449 | ... 450 | ... 451 | ...
<h1 id="main">What’s new</h1>
452 | ... 457 | ... 458 | ... 459 | ... ''' 460 | >>> html_parser = lxml.etree.HTMLParser() 461 | >>> doc = lxml.etree.fromstring(html, parser=html_parser) 462 | >>> doc 463 | 464 | >>> rules = { 465 | ... "headingcss": "#main", 466 | ... "headingxpath": "//h1[@id='main']" 467 | ... } 468 | >>> p = parslepy.Parselet(rules) 469 | >>> p.extract(doc) 470 | {'headingcss': u'What\u2019s new', 'headingxpath': u'What\u2019s new'} 471 | 472 | """ 473 | if context: 474 | self.selector_handler.context = context 475 | return self._extract(self.parselet_tree, document) 476 | 477 | def _extract(self, parselet_node, document, level=0): 478 | """ 479 | Extract values at this document node level 480 | using the parselet_node instructions: 481 | - go deeper in tree 482 | - or call selector handler in case of a terminal selector leaf 483 | """ 484 | 485 | if self.DEBUG: 486 | debug_offset = "".join([" " for x in range(level)]) 487 | 488 | # we must go deeper in the Parsley tree 489 | if isinstance(parselet_node, ParsleyNode): 490 | 491 | # default output 492 | output = {} 493 | 494 | # process all children 495 | for ctx, v in list(parselet_node.items()): 496 | if self.DEBUG: 497 | print(debug_offset, "context:", ctx, v) 498 | extracted=None 499 | try: 500 | # scoped-extraction: 501 | # extraction should be done deeper in the document tree 502 | if ctx.scope: 503 | extracted = [] 504 | selected = self.selector_handler.select(document, ctx.scope) 505 | if selected: 506 | for i, elem in enumerate(selected, start=1): 507 | parse_result = self._extract(v, elem, level=level+1) 508 | 509 | if isinstance(parse_result, (list, tuple)): 510 | extracted.extend(parse_result) 511 | else: 512 | extracted.append(parse_result) 513 | 514 | # if we're not in an array, 515 | # we only care about the first iteration 516 | if not ctx.iterate: 517 | break 518 | 519 | if self.DEBUG: 520 | print(debug_offset, 521 | "parsed %d elements in scope (%s)" % (i, ctx.scope)) 522 | 523 | # local extraction 524 | else: 525 | extracted = self._extract(v, document, level=level+1) 526 | 527 | except NonMatchingNonOptionalKey as e: 528 | if self.DEBUG: 529 | print(debug_offset, str(e)) 530 | if not ctx.required or not self.STRICT_MODE: 531 | output[ctx.key] = {} 532 | else: 533 | raise 534 | except Exception as e: 535 | if self.DEBUG: 536 | print(str(e)) 537 | raise 538 | 539 | # replace empty-list result when not looping by empty dict 540 | if ( isinstance(extracted, list) 541 | and not extracted 542 | and not ctx.iterate): 543 | extracted = {} 544 | 545 | # keep only the first element if we're not in an array 546 | if self.KEEP_ONLY_FIRST_ELEMENT_IF_LIST: 547 | try: 548 | if ( isinstance(extracted, list) 549 | and extracted 550 | and not ctx.iterate): 551 | 552 | if self.DEBUG: 553 | print(debug_offset, "keep only 1st element") 554 | extracted = extracted[0] 555 | 556 | except Exception as e: 557 | if self.DEBUG: 558 | print(str(e)) 559 | print(debug_offset, "error getting first element") 560 | 561 | # extraction for a required key gave nothing 562 | if ( self.STRICT_MODE 563 | and ctx.required 564 | and extracted is None): 565 | raise NonMatchingNonOptionalKey( 566 | 'key "%s" is required but yield nothing\nCurrent path: %s/(%s)\n' % ( 567 | ctx.key, 568 | document.getroottree().getpath(document),v 569 | ) 570 | ) 571 | 572 | # special key to extract a selector-defined level deeper 573 | # but still output at same level 574 | # this can be useful for breaking up long selectors 575 | # or when you need to mix XPath and CSS selectors 576 | # 
e.g. 577 | # { 578 | # "something(#content div.main)": { 579 | # "--(.//div[re:test(@class, 'style\d{3,6}')])": { 580 | # "title": "h1", 581 | # "subtitle": "h2" 582 | # } 583 | # } 584 | # } 585 | # 586 | if ctx.key == self.SPECIAL_LEVEL_KEY: 587 | if isinstance(extracted, dict): 588 | output.update(extracted) 589 | elif isinstance(extracted, list): 590 | if extracted: 591 | raise RuntimeError( 592 | "could not merge non-empty list at higher level") 593 | else: 594 | #empty list, dont bother? 595 | pass 596 | else: 597 | # required keys are handled above 598 | if extracted is not None: 599 | output[ctx.key] = extracted 600 | else: 601 | # do not add this optional key/value pair in the output 602 | pass 603 | 604 | return output 605 | 606 | # a leaf/Selector node 607 | elif isinstance(parselet_node, Selector): 608 | return self.selector_handler.extract(document, parselet_node) 609 | 610 | else: 611 | # FIXME: can this happen? 612 | # if selector handler returned None at compile time, 613 | # probably yes 614 | pass 615 | 616 | def keys(self): 617 | """ 618 | Return a list of 1st level keys of the output data model 619 | 620 | >>> import parslepy 621 | >>> rules = { 622 | ... "headingcss": "#main", 623 | ... "headingxpath": "//h1[@id='main']" 624 | ... } 625 | >>> p = parslepy.Parselet(rules) 626 | >>> sorted(p.keys()) 627 | ['headingcss', 'headingxpath'] 628 | 629 | """ 630 | return self._keys(self.parselet_tree) 631 | 632 | def _keys(self, parselet_node): 633 | keys = [] 634 | if isinstance(parselet_node, ParsleyNode): 635 | for ctx, v in list(parselet_node.items()): 636 | if ctx.key == self.SPECIAL_LEVEL_KEY: 637 | keys.extend(self._keys(v)) 638 | else: 639 | keys.append(ctx.key) 640 | return keys 641 | 642 | # alias 643 | Parslet = Parselet 644 | 645 | 646 | if __name__ == "__main__": 647 | import doctest 648 | doctest.testmod() 649 | -------------------------------------------------------------------------------- /parslepy/funcs.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from __future__ import unicode_literals 4 | import re 5 | import lxml.etree 6 | #import traceback 7 | 8 | # ---------------------------------------------------------------------- 9 | 10 | try: 11 | unicode # Python 2.x 12 | def lxml_element2string(element, method="text", with_tail=False): 13 | return lxml.etree.tostring(element, method=method, 14 | encoding=unicode, with_tail=with_tail) 15 | except NameError: # Python 3.x 16 | def lxml_element2string(element, method="text", with_tail=False): 17 | return lxml.etree.tostring(element, method=method, 18 | encoding=str, with_tail=with_tail) 19 | except: 20 | raise 21 | 22 | def extract_text(element, keep_nl=False, with_tail=False): 23 | return remove_multiple_whitespaces( 24 | lxml_element2string(element, method="text", with_tail=with_tail), 25 | keep_nl=keep_nl).strip() 26 | 27 | def extract_html(element, with_tail=False): 28 | return lxml_element2string(element, method="html", with_tail=with_tail) 29 | 30 | def extract_xml(element, with_tail=False): 31 | return lxml_element2string(element, method="xml", with_tail=with_tail) 32 | 33 | REGEX_NEWLINE = re.compile(r'\n') 34 | REGEX_WHITESPACE = re.compile(r'\s+', re.UNICODE) 35 | def remove_multiple_whitespaces(input_string, keep_nl=False): 36 | 37 | if keep_nl: 38 | lines = REGEX_NEWLINE.split(input_string) 39 | return "\n".join([remove_multiple_whitespaces(l) for l in lines]) 40 | else: 41 | return REGEX_WHITESPACE.sub(" ", input_string).strip() 
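As a quick illustration of the text helpers defined just above in parslepy/funcs.py, here is a minimal sketch (it assumes parslepy and lxml are installed; the sample markup is made up):

```python
import lxml.etree

from parslepy.funcs import extract_text, remove_multiple_whitespaces

# extract_text() serializes an element to text and collapses whitespace runs
doc = lxml.etree.fromstring("<p>Hello   <b>world</b>,\n  again</p>")
print(extract_text(doc))
# -> 'Hello world, again'

# keep_nl=True collapses spaces per line but preserves newlines
print(remove_multiple_whitespaces("first  line\n   second    line", keep_nl=True))
# -> 'first line\nsecond line'
```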
42 | 43 | 44 | def format_alter_htmltags(tree, tags=[], replacement=" "): 45 | regex_repl_start = re.compile(r'^\s*%s' % replacement, re.UNICODE) 46 | context = lxml.etree.iterwalk(tree, events=("end", )) 47 | tag_set = set(tags) 48 | for action, elem in context: 49 | if elem.tag not in tag_set: 50 | continue 51 | if elem.tail is None: 52 | elem.tail = replacement 53 | elif not regex_repl_start.search(elem.tail): 54 | elem.tail = "%s%s" % (replacement, elem.tail) 55 | return tree 56 | 57 | 58 | HTML_BLOCK_ELEMENTS = [ 59 | 'address', 60 | 'article', 61 | 'aside', 62 | 'audio', 63 | 'blockquote', 64 | 'br', 65 | 'canvas', 66 | 'div', 67 | 'dl', 'dd', 'dt', 68 | 'fieldset', 69 | 'figcaption', 70 | 'figure', 71 | 'footer', 72 | 'form', 73 | 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 74 | 'header', 75 | 'hgroup', 76 | 'hr', 77 | 'noscript', 78 | 'li', 'ol', 'ul', 79 | 'output', 80 | 'p', 81 | 'pre', 82 | 'section', 83 | 'table', 84 | 'tfoot', 85 | 'video', 86 | ] 87 | def format_htmlblock_tags(tree, replacement="\n"): 88 | return format_alter_htmltags(tree, 89 | tags=HTML_BLOCK_ELEMENTS, 90 | replacement=replacement) 91 | 92 | 93 | def elements2text(nodes, with_tail=True): 94 | return [extract_text(e, with_tail=with_tail) for e in nodes] 95 | 96 | 97 | def elements2textnl(nodes, with_tail=True, replacement="\n"): 98 | return [extract_text( 99 | format_htmlblock_tags(e, replacement=replacement), 100 | with_tail=with_tail, 101 | keep_nl=True) 102 | for e in nodes] 103 | 104 | def elements2html(nodes): 105 | return [extract_html(e) for e in nodes] 106 | 107 | def elements2xml(nodes): 108 | return [extract_xml(e) for e in nodes] 109 | 110 | # ---------------------------------------------------------------------- 111 | 112 | def test_listitems_type(itemlist, checktype): 113 | return all([isinstance(i, checktype) for i in itemlist]) 114 | 115 | def check_listitems_types(itemlist): 116 | return list(set([type(i) for i in itemlist])) 117 | 118 | def apply2elements(elements, element_func, notelement_func=None): 119 | ltype = check_listitems_types(elements) 120 | if ltype == [lxml.etree._Element]: 121 | return element_func(elements) 122 | elif notelement_func: 123 | return notelement_func(elements) 124 | else: 125 | return elements 126 | 127 | #def apply2element(element, element_func, notelement_func=None): 128 | #if type(element) == lxml.etree._Element: 129 | #return element_func(element) 130 | #elif notelement_func: 131 | #return notelement_func(element) 132 | #else: 133 | #return element 134 | 135 | try: 136 | unicode # Python 2.x 137 | def xpathtostring(context, nodes, with_tail=True, *args): 138 | return apply2elements( 139 | nodes, 140 | element_func=lambda nodes: elements2text( 141 | nodes, with_tail=with_tail), 142 | notelement_func=lambda nodes: [ 143 | remove_multiple_whitespaces(unicode(s)) 144 | for s in nodes], 145 | ) 146 | 147 | except NameError: # Python 3.x 148 | def xpathtostring(context, nodes, with_tail=True, *args): 149 | return apply2elements( 150 | nodes, 151 | element_func=lambda nodes: elements2text( 152 | nodes, with_tail=with_tail), 153 | notelement_func=lambda nodes: [ 154 | remove_multiple_whitespaces(str(s)) 155 | for s in nodes], 156 | ) 157 | 158 | def xpathtostringnl(context, nodes, with_tail=True, replacement="\n", *args): 159 | return apply2elements(nodes, 160 | element_func=lambda nodes: elements2textnl( 161 | nodes, with_tail=with_tail, replacement=replacement)) 162 | 163 | def xpathtohtml(context, nodes): 164 | return apply2elements(nodes, 165 | element_func=lambda nodes: 
elements2html(nodes))
166 | 
167 | def xpathtoxml(context, nodes):
168 |     return apply2elements(nodes,
169 |         element_func=lambda nodes: elements2xml(nodes))
170 | 
171 | try:
172 |     unicode         # Python 2.x
173 |     def xpathstrip(context, nodes, stripchars=None, with_tail=True, *args):
174 |         if test_listitems_type(nodes, lxml.etree._Element):
175 |             return [s.strip(stripchars)
176 |                 for s in elements2text(
177 |                     nodes, with_tail=with_tail)]
178 |         else:
179 |             return [unicode(s).strip(stripchars) for s in nodes]
180 | 
181 | except NameError:   # Python 3.x
182 |     def xpathstrip(context, nodes, stripchars=None, with_tail=True, *args):
183 |         if test_listitems_type(nodes, lxml.etree._Element):
184 |             return [s.strip(stripchars)
185 |                 for s in elements2text(
186 |                     nodes, with_tail=with_tail)]
187 |         else:
188 |             return [str(s).strip(stripchars) for s in nodes]
189 | 
190 | 
191 | def xpathattrname(context, attributes, *args):
192 |     return [a.attrname for a in attributes]
193 | 
-------------------------------------------------------------------------------- /parslepy/selectors.py: --------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import re
3 | import copy
4 | 
5 | import lxml.cssselect
6 | import lxml.etree
7 | 
8 | import parslepy.funcs
9 | 
10 | 
11 | class Selector(object):
12 |     """
13 |     Class of objects returned by :class:`.SelectorHandler` instances'
14 |     (and subclasses) :meth:`~.SelectorHandler.make` method.
15 |     """
16 | 
17 |     def __init__(self, selector):
18 |         self.selector = selector
19 | 
20 |     def __repr__(self):
21 |         return "<Selector: %s>" % self.selector
22 | 
23 | 
24 | class SelectorHandler(object):
25 |     """
26 |     Called when building abstract Parsley trees
27 |     and when extracting object values during the actual parsing
28 |     of documents
29 | 
30 |     This should be subclassed to implement the selector processing logic
31 |     you need for your Parsley handling.
32 | 
33 |     All 3 methods, :meth:`~.SelectorHandler.make`, :meth:`~.SelectorHandler.select`
34 |     and :meth:`~.SelectorHandler.extract` MUST be overridden
35 |     """
36 | 
37 |     DEBUG = False
38 | 
39 |     def __init__(self, debug=False):
40 |         if debug:
41 |             self.DEBUG = True
42 | 
43 |     def make(self, selection_string):
44 |         """
45 |         Interpret a selection_string as a selector
46 |         for elements or element attributes in a (semi-)structured document.
47 |         In case of XPath selectors, this can also be a function call.
48 | 
49 |         :param selection_string: a string representing a selector
50 |         :rtype: :class:`.Selector`
51 |         """
52 | 
53 |         raise NotImplementedError
54 | 
55 |     def select(self, document, selector):
56 |         """
57 |         Apply the selector on the document
58 | 
59 |         :param document: lxml-parsed document
60 |         :param selector: input :class:`.Selector` to apply on the document
61 |         :rtype: lxml.etree.Element list
62 |         """
63 | 
64 |         raise NotImplementedError
65 | 
66 |     def extract(self, document, selector):
67 |         """
68 |         Apply the selector on the document
69 |         and return a value for the matching elements (text content or
70 |         element attributes)
71 | 
72 |         :param document: lxml-parsed document
73 |         :param selector: input :class:`.Selector` to apply on the document
74 |         :rtype: depends on the selector (string, boolean value, ...)
75 | 
76 |         Return value can be single- or multi-valued.
77 |         """
78 | 
79 |         raise NotImplementedError
80 | 
81 | 
82 | class XPathSelectorHandler(SelectorHandler):
83 |     """
84 |     This selector only accepts XPath selectors.
85 | 86 | It understands what lxml.etree.XPath understands, that is XPath 1.0 87 | expressions 88 | """ 89 | 90 | EXPECTED_NON_ELEMENT_TYPES = [ 91 | bool, 92 | int, 93 | float, 94 | str, 95 | ] 96 | try: 97 | unicode # Python 2.x 98 | EXPECTED_NON_ELEMENT_TYPES.append(unicode) 99 | except NameError: 100 | pass 101 | 102 | LOCAL_NAMESPACE = 'local-parslepy' 103 | LOCAL_XPATH_EXTENSIONS = { 104 | (LOCAL_NAMESPACE, 'text') : parslepy.funcs.xpathtostring, 105 | (LOCAL_NAMESPACE, 'textnl') : parslepy.funcs.xpathtostringnl, 106 | 107 | # aliases 108 | (LOCAL_NAMESPACE, 'str') : parslepy.funcs.xpathtostring, 109 | (LOCAL_NAMESPACE, 'strnl') : parslepy.funcs.xpathtostringnl, 110 | (LOCAL_NAMESPACE, 'nl') : parslepy.funcs.xpathtostringnl, 111 | 112 | (LOCAL_NAMESPACE, 'html') : parslepy.funcs.xpathtohtml, 113 | (LOCAL_NAMESPACE, 'xml') : parslepy.funcs.xpathtoxml, 114 | (LOCAL_NAMESPACE, 'strip') : parslepy.funcs.xpathstrip, 115 | 116 | (LOCAL_NAMESPACE, 'attrname') : parslepy.funcs.xpathattrname, 117 | (LOCAL_NAMESPACE, 'attrnames') : parslepy.funcs.xpathattrname, # alias that's probably a better fit 118 | } 119 | EXSLT_NAMESPACES={ 120 | 'date': 'http://exslt.org/dates-and-times', 121 | 'math': 'http://exslt.org/math', 122 | 're': 'http://exslt.org/regular-expressions', 123 | 'set': 'http://exslt.org/sets', 124 | 'str': 'http://exslt.org/strings', 125 | } 126 | _extension_router = {} 127 | 128 | SMART_STRINGS = False 129 | SMART_STRINGS_FUNCTIONS = [ 130 | (LOCAL_NAMESPACE, 'attrname'), 131 | (LOCAL_NAMESPACE, 'attrnames'), 132 | ] 133 | 134 | _selector_cache = {} 135 | 136 | def __init__(self, namespaces=None, extensions=None, context=None, debug=False): 137 | """ 138 | :param namespaces: namespace mapping as :class:`dict` 139 | :param extensions: extension :class:`dict` 140 | :param context: user-context passed to XPath extension functions 141 | 142 | `namespaces` and `extensions` dicts should have the same format 143 | as for `lxml`_: 144 | see http://lxml.de/xpathxslt.html#namespaces-and-prefixes 145 | and ``_ 146 | 147 | Extension functions have a slightly different signature than 148 | pure-lxml extension functions: they must expect a user-context 149 | as first argument; all other arguments are the same as for 150 | `lxml` extensions. 151 | 152 | `context` will be passed as first argument to extension functions 153 | registered through `extensions`. 
154 | Alternative: user-context can also be passed to :meth:`parslepy.base.Parselet.parse` 155 | 156 | """ 157 | 158 | super(XPathSelectorHandler, self).__init__(debug=debug) 159 | 160 | # support EXSLT extensions 161 | self.namespaces = copy.copy(self.EXSLT_NAMESPACES) 162 | 163 | # add local XPath extension functions 164 | self._add_parsley_ns(self.namespaces) 165 | self.extensions = copy.copy(self.LOCAL_XPATH_EXTENSIONS) 166 | 167 | # add user-defined extensions 168 | self._user_extensions = None 169 | self.context = context 170 | if namespaces: 171 | self.namespaces.update(namespaces) 172 | if extensions: 173 | self._user_extensions = extensions 174 | self._process_extensions(extensions) 175 | 176 | # some functions need smart_strings=True 177 | self._set_smart_strings_regexps() 178 | 179 | def _test_smart_strings_needed(self, selector): 180 | return any([r.search(selector) 181 | for r in self.smart_strings_regexps]) 182 | 183 | def _get_smart_strings_regexps(self, ns, fname): 184 | # find out what prefixes match the supplied namespace 185 | prefix_matches = [] 186 | for prefix, namespace in self.namespaces.items(): 187 | if namespace == ns: 188 | prefix_matches.append(prefix) 189 | 190 | return [re.compile("%s:%s\(" % (p, fname)) for p in prefix_matches] 191 | 192 | def _set_smart_strings_regexps(self): 193 | self.smart_strings_regexps = [] 194 | # smart_strings for built-in extensions 195 | for (ns, fname) in self.SMART_STRINGS_FUNCTIONS: 196 | self.smart_strings_regexps.extend( 197 | self._get_smart_strings_regexps(ns, fname)) 198 | 199 | # smart_strings for user_defined extensions 200 | if self._user_extensions: 201 | for (ns, fname) in self._user_extensions: 202 | self.smart_strings_regexps.extend( 203 | self._get_smart_strings_regexps(ns, fname)) 204 | 205 | def _make_xpathextension(self, ns, fname): 206 | def xpath_ext(*args): 207 | return self._extension_router[(ns, fname)](self.context, *args) 208 | 209 | extension_name = str("xpext_%s_%d" % (fname, hash(ns))) 210 | xpath_ext.__doc__ = "docstring for %s" % extension_name 211 | xpath_ext.__name__ = extension_name 212 | setattr(self, xpath_ext.__name__, xpath_ext) 213 | 214 | return xpath_ext 215 | 216 | def _process_extensions(self, extensions): 217 | for (ns, fname), func in extensions.items(): 218 | self._extension_router[(ns, fname)] = func 219 | self.extensions[(ns, fname)] = self._make_xpathextension(ns=ns, fname=fname) 220 | 221 | @classmethod 222 | def _add_parsley_ns(cls, namespace_dict): 223 | """ 224 | Extend XPath evaluation with Parsley extensions' namespace 225 | """ 226 | 227 | namespace_dict.update({ 228 | 'parslepy' : cls.LOCAL_NAMESPACE, 229 | 'parsley' : cls.LOCAL_NAMESPACE, 230 | }) 231 | return namespace_dict 232 | 233 | def make(self, selection): 234 | """ 235 | XPath expression can also use EXSLT functions (as long as they are 236 | understood by libxslt) 237 | """ 238 | 239 | cached = self._selector_cache.get(selection) 240 | if cached: 241 | return cached 242 | 243 | try: 244 | selector = lxml.etree.XPath(selection, 245 | namespaces = self.namespaces, 246 | extensions = self.extensions, 247 | smart_strings=(self.SMART_STRINGS 248 | or self._test_smart_strings_needed(selection)), 249 | ) 250 | 251 | except lxml.etree.XPathSyntaxError as syntax_error: 252 | syntax_error.msg += ": %s" % selection 253 | raise syntax_error 254 | 255 | except Exception as e: 256 | if self.DEBUG: 257 | print(repr(e), selection) 258 | raise 259 | 260 | # wrap it/cache it 261 | self._selector_cache[selection] = Selector(selector) 
262 | return self._selector_cache[selection] 263 | 264 | @classmethod 265 | def select(cls, document, selector): 266 | try: 267 | return selector.selector(document) 268 | except Exception as e: 269 | if cls.DEBUG: 270 | print(str(e)) 271 | return 272 | 273 | def extract(self, document, selector, debug_offset=''): 274 | """ 275 | Try and convert matching Elements to unicode strings. 276 | 277 | If this fails, the selector evaluation probably already 278 | returned some string(s) of some sort, or boolean value, 279 | or int/float, so return that instead. 280 | """ 281 | selected = self.select(document, selector) 282 | if selected is not None: 283 | 284 | if isinstance(selected, (list, tuple)): 285 | 286 | # FIXME: return None or return empty list? 287 | if not len(selected): 288 | return 289 | 290 | return [self._extract_single(m) for m in selected] 291 | 292 | else: 293 | return self._extract_single(selected) 294 | 295 | # selector did not match anything 296 | else: 297 | if self.DEBUG: 298 | print(debug_offset, "selector did not match anything; return None") 299 | return None 300 | 301 | def _default_element_extract(self, element): 302 | """ 303 | Overridable method to change how matching Elements 304 | are represented in output 305 | """ 306 | 307 | return parslepy.funcs.extract_text(element) 308 | 309 | def _extract_single(self, retval): 310 | # XPath compiled expressions (and CSSSelect translations) 311 | # can return different types 312 | # See http://lxml.de/xpathxslt.html#xpath-return-values 313 | # - True or False, when the XPath expression 314 | # has a boolean result 315 | # - a float, when the XPath expression has a numeric result 316 | # (integer or float) 317 | # - a 'smart' string (as described below), 318 | # when the XPath expression has a string result. 319 | # - a list of items, when the XPath expression has a list as result. 320 | # The items may include Elements 321 | # (also comments and processing instructions), 322 | # strings and tuples. 
323 | # 324 | # Note that in the default implementation, 325 | # smart strings are disabled 326 | if type(retval) == lxml.etree._Element: 327 | return self._default_element_extract(retval) 328 | 329 | elif type(retval) == lxml.etree._Comment: 330 | return self._default_element_extract(retval) 331 | 332 | elif isinstance(retval, tuple(self.EXPECTED_NON_ELEMENT_TYPES)): 333 | return retval 334 | 335 | else: 336 | raise Warning("unusual type %s" % type(retval)) 337 | return retval 338 | 339 | try: 340 | from cssselect import HTMLTranslator 341 | from cssselect.xpath import _unicode_safe_getattr, XPathExpr 342 | 343 | class CssTranslator(HTMLTranslator): 344 | 345 | def xpath_pseudo_element(self, xpath, pseudo_element): 346 | 347 | try: 348 | 349 | from cssselect.parser import FunctionalPseudoElement 350 | from cssselect.xpath import _unicode_safe_getattr, XPathExpr 351 | 352 | if isinstance(pseudo_element, FunctionalPseudoElement): 353 | method = 'xpath_%s_functional_pseudo_element' % ( 354 | pseudo_element.name.replace('-', '_')) 355 | method = _unicode_safe_getattr(self, method, None) 356 | if not method: 357 | raise ExpressionError( 358 | "The functional pseudo-element ::%s() is unknown" 359 | % pseudo_element.name) 360 | xpath = method(xpath, pseudo_element.arguments) 361 | else: 362 | method = 'xpath_%s_simple_pseudo_element' % ( 363 | pseudo_element.replace('-', '_')) 364 | method = _unicode_safe_getattr(self, method, None) 365 | if not method: 366 | raise ExpressionError( 367 | "The pseudo-element ::%s is unknown" 368 | % pseudo_element) 369 | xpath = method(xpath) 370 | 371 | except ImportError: 372 | pass 373 | 374 | return xpath 375 | 376 | # functional pseudo-element: 377 | # element's attribute by name 378 | def xpath_attr_functional_pseudo_element(self, xpath, arguments): 379 | attribute_name = arguments[0].value 380 | other = XPathExpr('@%s' % attribute_name, '', ) 381 | return xpath.join('/', other) 382 | 383 | # pseudo-element: 384 | # element's text() nodes 385 | def xpath_text_simple_pseudo_element(self, xpath): 386 | other = XPathExpr('text()', '', ) 387 | return xpath.join('/', other) 388 | 389 | # pseudo-element: 390 | # element's comment() nodes 391 | def xpath_comment_simple_pseudo_element(self, xpath): 392 | other = XPathExpr('comment()', '', ) 393 | return xpath.join('/', other) 394 | 395 | css_translator = CssTranslator() 396 | def css_to_xpath(css): 397 | return css_translator.css_to_xpath(css) 398 | 399 | except ImportError: 400 | 401 | def css_to_xpath(css): 402 | return lxml.cssselect.css_to_xpath(css) 403 | 404 | 405 | class DefaultSelectorHandler(XPathSelectorHandler): 406 | """ 407 | Default selector logic, loosely based on the original 408 | `Parsley` implementation. 
409 | 
410 |     This handler understands what cssselect and lxml.etree.XPath understand,
411 |     that is (roughly) XPath 1.0 and CSS3 for things that don't need browser context
412 |     """
413 | 
414 |     # newer lxml versions (>3) raise SelectorSyntaxError (directly from cssselect)
415 |     # for invalid CSS selectors
416 |     # but older lxml (2.3.8 for example) have cssselect included
417 |     # and for some selectors raise AssertionError and TypeError instead
418 |     CSSSELECT_SYNTAXERROR_EXCEPTIONS = set([
419 |         # we could use lxml.cssselect.SelectorError (parent class for both),
420 |         # but for lxml<3, they're not related
421 |         lxml.cssselect.SelectorSyntaxError,
422 |         # for unsupported pseudo-class or XPath namespaces prefix syntax
423 |         lxml.cssselect.ExpressionError,
424 |     ])
425 |     # this is to add AssertionError and TypeError if lxml < 3.0.0
426 |     for s in ('#a.', '//h1'):
427 |         try:
428 |             lxml.cssselect.CSSSelector(s)
429 |         except Exception as e:
430 |             CSSSELECT_SYNTAXERROR_EXCEPTIONS.add(type(e))
431 | 
432 |     # example: "a img @src" (fetch the 'src' attribute of an IMG tag)
433 |     # other example: "im|img @im|src" when using namespace prefixes
434 |     REGEX_ENDING_ATTRIBUTE = re.compile(r'^(?P<expr>.+)\s+(?P<attr>@[\:|\w_\d-]+)$')
435 |     def make(self, selection):
436 |         """
437 |         Scopes and selectors are tested in this order:
438 |         * is this a CSS selector with an appended @something attribute?
439 |         * is this a regular CSS selector?
440 |         * is this an XPath expression?
441 | 
442 |         XPath expression can also use EXSLT functions (as long as they are
443 |         understood by libxslt)
444 |         """
445 |         cached = self._selector_cache.get(selection)
446 |         if cached:
447 |             return cached
448 | 
449 |         namespaces = self.EXSLT_NAMESPACES
450 |         self._add_parsley_ns(namespaces)
451 |         try:
452 |             # CSS with attribute?
(non-standard but convenient) 453 | # CSS selector cannot select attributes 454 | # this " @" syntax is a Parsley extension 455 | # construct CSS selector and append attribute to XPath expression 456 | m = self.REGEX_ENDING_ATTRIBUTE.match(selection) 457 | if m: 458 | # the selector should be a regular CSS selector 459 | cssxpath = css_to_xpath(m.group("expr")) 460 | 461 | # if "|" is used for namespace prefix reference, 462 | # convert it to XPath prefix syntax 463 | attribute = m.group("attr").replace('|', ':') 464 | 465 | cssxpath = "%s/%s" % (cssxpath, attribute) 466 | else: 467 | cssxpath = css_to_xpath(selection) 468 | 469 | selector = lxml.etree.XPath( 470 | cssxpath, 471 | namespaces = self.namespaces, 472 | extensions = self.extensions, 473 | smart_strings=(self.SMART_STRINGS 474 | or self._test_smart_strings_needed(selection)), 475 | ) 476 | 477 | except tuple(self.CSSSELECT_SYNTAXERROR_EXCEPTIONS) as syntax_error: 478 | if self.DEBUG: 479 | print(repr(syntax_error), selection) 480 | print("Try interpreting as XPath selector") 481 | try: 482 | selector = lxml.etree.XPath(selection, 483 | namespaces = self.namespaces, 484 | extensions = self.extensions, 485 | smart_strings=(self.SMART_STRINGS 486 | or self._test_smart_strings_needed(selection)), 487 | ) 488 | 489 | except lxml.etree.XPathSyntaxError as syntax_error: 490 | syntax_error.msg += ": %s" % selection 491 | raise syntax_error 492 | 493 | except Exception as e: 494 | if self.DEBUG: 495 | print(repr(e), selection) 496 | raise 497 | 498 | # for exception when trying to convert @ syntax 499 | except lxml.etree.XPathSyntaxError as syntax_error: 500 | syntax_error.msg += ": %s" % selection 501 | raise syntax_error 502 | 503 | except Exception as e: 504 | if self.DEBUG: 505 | print(repr(e), selection) 506 | raise 507 | 508 | # wrap it/cache it 509 | self._selector_cache[selection] = Selector(selector) 510 | return self._selector_cache[selection] 511 | -------------------------------------------------------------------------------- /parslepy/utils/README.md: -------------------------------------------------------------------------------- 1 | ### Tools for Scrapy framework ### 2 | 3 | * `parslepy.utils.scrapytools.ParsleyItemClassLoader` 4 | * `parslepy.utils.scrapytools.ParsleyItemLoaderConfig` 5 | * `parslepy.utils.scrapytools.ParsleyImplicitItemClassLoader`: EXPERIMENTAL, TO BE DOCUMENTED 6 | 7 | Provide your Parsley script at the command line: 8 | 9 | ``` 10 | $ scrapy crawl MySpider -a parseletfile=myparselet.let.yml 11 | ``` 12 | 13 | with a Scrapy spider similar to this: 14 | ```python 15 | from mycrawler.items import MyItem 16 | import parslepy 17 | 18 | from scrapy.contrib.loader import ItemLoader 19 | from scrapy.contrib.loader.processor import TakeFirst 20 | from parslepy.utils.scrapytools import ParsleyItemClassLoader, ParsleyItemLoaderConfig 21 | 22 | class MyItemLoader(ItemLoader): 23 | default_output_processor = TakeFirst() 24 | 25 | 26 | class MySpider(BaseSpider): 27 | name = "MySpider" 28 | allowed_domains = ["example.com"] 29 | start_urls = ["http://www.example.com/index.html"] 30 | 31 | def __init__(self, parseletfile=None): 32 | 33 | if parseletfile: 34 | with open(parseletfile) as yamlfp: 35 | self.parselet = parslepy.Parselet.from_yamlfile(yamlfp) 36 | 37 | def parse(self, response): 38 | 39 | loader = ParsleyItemClassLoader( 40 | parselet=self.parselet, 41 | configs=[ 42 | ParsleyItemLoaderConfig( 43 | MyItem, 44 | MyItemLoader) 45 | ], 46 | response=response) 47 | for i in loader.iter_items(response): 48 | 
yield i
49 | ```
50 | 
-------------------------------------------------------------------------------- /parslepy/utils/__init__.py: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/redapple/parslepy/a8bc4c0592824459629018c8f4c6ae3dad6cc3cc/parslepy/utils/__init__.py
-------------------------------------------------------------------------------- /parslepy/utils/scrapytools.py: --------------------------------------------------------------------------------
1 | import io
2 | from scrapy.contrib.loader import ItemLoader
3 | from scrapy.item import Item, Field
4 | from scrapy.http import Request
5 | import urllib.parse
6 | import pprint
7 | 
8 | class ParsleyItemLoaderConfig(object):
9 | 
10 |     def __init__(self, item_class=None, item_loader_class=None, iter_item_key=None):
11 |         self.item_class = item_class
12 |         self.item_loader_class = item_loader_class
13 |         self.iter_item_key = iter_item_key
14 | 
15 |     def __repr__(self):
16 |         return "<ParsleyItemLoaderConfig: item_class=%s; item_loader_class=%s; iter_item_key=%s>" % (
17 |             self.item_class, self.item_loader_class, self.iter_item_key)
18 | 
19 | 
20 | class ParsleyRequestConfig(object):
21 | 
22 |     def __init__(self, iter_request_key=None, url_getter=None, callback=None):
23 |         if url_getter:
24 |             self.url_getter = url_getter
25 |         else:
26 |             self.url_getter = lambda u: u
27 |         self.iter_request_key = iter_request_key
28 |         self.callback = callback
29 | 
30 |     def __repr__(self):
31 |         return "<ParsleyRequestConfig: iter_request_key=%s; url_getter=%s; callback=%s>" % (
32 |             self.iter_request_key, self.url_getter, self.callback)
33 | 
34 | 
35 | class ParsleyItemClassLoader(object):
36 |     def __init__(self, parselet, configs, response=None, **context):
37 | 
38 |         self.configs = configs
39 |         self.parselet = parselet
40 |         self.response = response
41 |         self.extracted = None
42 |         self.context = context
43 | 
44 |     def _extract(self, response=None):
45 |         self.extracted = self.parselet.parse(
46 |             io.BytesIO(response.body))
47 | 
48 | 
49 |     def iter_items(self, response=None):
50 |         if self.extracted is None:
51 |             self._extract(response or self.response)
52 | 
53 |         for config in self.configs:
54 |             if config.iter_item_key is None:
55 |                 loader = config.item_loader_class(config.item_class(),
56 |                     **self.context)
57 |                 loader.add_value(None, self.extracted)
58 |                 yield loader.load_item()
59 |             else:
60 |                 for item_value in self.extracted.get(config.iter_item_key) or self.extracted:
61 |                     loader = config.item_loader_class(config.item_class(),
62 |                         **self.context)
63 |                     loader.add_value(None, item_value)
64 |                     yield loader.load_item()
65 | 
66 | 
67 | class ParsleyImplicitItemClassLoader(object):
68 |     def __init__(self, parselet, configs=None, response=None, **context):
69 | 
70 |         self.configs = configs
71 |         self.parselet = parselet
72 |         self.response = response
73 |         self.extracted = None
74 |         self.context = context
75 | 
76 |     def _generate_item_classes(self, extracted):
77 |         for config in self.configs:
78 |             if config.iter_item_key:
79 |                 keys = [
80 |                     k
81 |                     for e in extracted.get(config.iter_item_key)
82 |                     for k in list(e.keys())
83 |                 ]
84 |                 class_name = "%sClass" % config.iter_item_key.capitalize()
85 |             else:
86 |                 keys = list(extracted.keys())
87 |                 class_name = "CustomClass"
88 | 
89 |             if keys:
90 |                 print(("keys:", set(keys)))
91 |                 config.item_class = type(
92 |                     class_name,
93 |                     (Item,),
94 |                     dict([(k, Field()) for k in set(keys)]))
95 | 
96 |     def _parse(self, response=None):
97 |         return self.parselet.parse(
98 |             io.BytesIO(response.body))
99 | 
100 |     def iter_items(self, response=None):
101 |         extracted = self._parse(response or self.response)
102 | 
103 |         # generate Item classes based on Parsley structure
104 |         self._generate_item_classes(extracted)
105 | 
106 |         for config in self.configs:
107 |             if not config.item_class:
108 |                 continue
109 |             if config.iter_item_key is None:
110 |                 yield config.item_class(**extracted)
111 |             else:
112 |                 #print extracted
113 |                 for item_value in extracted.get(config.iter_item_key):
114 |                     yield config.item_class(**item_value)
115 |         del extracted
116 | 
117 |     def iter_requests(self, response=None, iter_request_key=None, get_url_function=None, request_callback=None):
118 | 
119 |         extracted = self._parse(response or self.response)
120 | 
121 |         if get_url_function is None:
122 |             get_url_function = lambda x: x
123 | 
124 |         #pprint.pprint(self.extracted)
125 |         for request_info in extracted.get(iter_request_key):
126 |             yield Request(
127 |                 url=urllib.parse.urljoin(
128 |                     response.url,
129 |                     get_url_function(request_info)),
130 |                 callback=request_callback)
131 |         del extracted
132 | 
133 | 
134 | class ParsleyLoader(object):
135 |     def __init__(self, parselet, response=None, **context):
136 |         self.parselet = parselet
137 |         self.response = response
138 |         self.extracted = None
139 |         self.context = context
140 | 
141 |     def _infer_item_class(self, extracted, config):
142 |         if config.iter_item_key:
143 |             keys = [
144 |                 k
145 |                 for e in extracted.get(config.iter_item_key)
146 |                 for k in list(e.keys())
147 |             ]
148 |             class_name = "%sClass" % config.iter_item_key.capitalize()
149 |         else:
150 |             keys = list(extracted.keys())
151 |             class_name = "CustomClass"
152 | 
153 |         if keys:
154 |             return type(class_name,
155 |                 (Item,),
156 |                 dict([(k, Field()) for k in set(keys)]))
157 | 
158 |     def _parse(self, response=None):
159 |         return self.parselet.parse(
160 |             io.BytesIO(response.body))
161 | 
162 |     def iter_items(self, config, response=None):
163 | 
164 |         if not isinstance(config, ParsleyItemLoaderConfig):
165 |             raise ValueError("You must provide a ParsleyItemLoaderConfig instance")
166 | 
167 |         # FIXME: should this be cached?
168 |         extracted = self._parse(response or self.response)
169 | 
170 |         if not config.item_class:
171 |             # generate Item classes based on Parsley structure
172 |             item_class = self._infer_item_class(extracted, config)
173 |         else:
174 |             item_class = config.item_class
175 | 
176 |         if not item_class:
177 |             return
178 | 
179 |         # FIXME: if item_loader_class is not None,
180 |         # we should use it
181 |         if config.iter_item_key is None:
182 |             yield item_class(**extracted)
183 |         else:
184 |             itemdata = extracted.get(config.iter_item_key)
185 |             if itemdata:
186 |                 for item_value in itemdata:
187 |                     yield item_class(**item_value)
188 |         del extracted
189 | 
190 |     def _load_item(self, data, config, **context):
191 |         if config.item_loader_class:
192 |             loader = config.item_loader_class(config.item_class(),
193 |                 **context)
194 |             loader.add_value(None, data)
195 |             return loader.load_item()
196 | 
197 | 
198 |     def iter_requests(self, config=None, response=None):
199 | 
200 |         if not isinstance(config, ParsleyRequestConfig):
201 |             raise ValueError("You must provide a ParsleyRequestConfig instance")
202 | 
203 |         # FIXME: should this be cached?
204 |         extracted = self._parse(response or self.response)
205 |         reqdata = extracted.get(config.iter_request_key)
206 |         if reqdata:
207 |             for request_data in reqdata:
208 |                 nurl = urllib.parse.urljoin(
209 |                     response.url,
210 |                     config.url_getter(request_data))
211 |                 if nurl:
212 |                     yield Request(
213 |                         url=nurl,
214 |                         callback=config.callback)
215 |         del extracted
216 | 
-------------------------------------------------------------------------------- /requirements-extra.txt: --------------------------------------------------------------------------------
1 | pyyaml
2 | 
-------------------------------------------------------------------------------- /requirements.txt: --------------------------------------------------------------------------------
1 | cssselect
2 | lxml
3 | 
-------------------------------------------------------------------------------- /run_parslepy.py: --------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | 
4 | import optparse
5 | import pprint
6 | import parslepy
7 | import lxml.html
8 | 
9 | def main():
10 | 
11 |     parser = optparse.OptionParser()
12 |     parser.add_option("--debug", dest="debug", action="store_true", help="debug mode", default=False)
13 |     parser.add_option("--url", dest="url", help="fetch this URL", default=None)
14 |     parser.add_option("--file", dest="inputfile", help="parse this HTML file", default=None)
15 |     parser.add_option("--script", dest="parselet", help="Parsley script filename", default=None)
16 | 
17 |     (options, args) = parser.parse_args()
18 | 
19 |     if not options.parselet:
20 |         print("You must provide a Parsley script")
21 |         return
22 |     if not options.url and not options.inputfile:
23 |         print("You must provide a URL or an input file")
24 |         return
25 | 
26 |     with open(options.parselet) as fp:
27 | 
28 |         extractor = parslepy.Parselet.from_jsonfile(fp, debug=options.debug)
29 |         output = extractor.parse(options.url or options.inputfile)
30 |         pprint.pprint(output)
31 | 
32 | if __name__ == '__main__':
33 |     main()
34 | 
35 | 
-------------------------------------------------------------------------------- /setup.cfg: --------------------------------------------------------------------------------
1 | [wheel]
2 | universal = 1
-------------------------------------------------------------------------------- /setup.py: --------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | 
3 | from distutils.core import setup
4 | 
5 | setup(name='parslepy',
6 |     version='0.3.0',
7 |     description='Parsley extraction library using lxml',
8 |     long_description="""
9 | ========
10 | Parslepy
11 | ========
12 | 
13 | Parslepy lets you extract content from HTML and XML documents
14 | where extraction rules are defined using a JSON object
15 | or equivalent Python dict,
16 | where keys are names you want to assign to extracted content,
17 | and values are CSS selectors or XPath expressions.
18 | 
19 | Parslepy is an implementation of the Parsley extraction
20 | language defined `here <https://github.com/fizx/parsley>`_,
21 | using lxml and cssselect.
22 | 
23 | You can nest objects, generate lists of objects, and (to
24 | a certain extent) mix CSS and XPath.
25 | 
26 | Parslepy understands what lxml and cssselect understand,
27 | which is roughly CSS3 selectors and XPath 1.0 expressions.
28 | 
29 | Documentation & examples
30 | ========================
31 | 
32 | See https://github.com/redapple/parslepy/wiki#usage
33 | """,
34 |     author='Paul Tremberth',
35 |     author_email='paul.tremberth@gmail.com',
36 |     packages=['parslepy'],
37 |     requires=['lxml', 'cssselect'],
38 |     install_requires=[
39 |         "lxml >= 2.3",
40 |         "cssselect",
41 |     ],
42 |     classifiers = [
43 |         'Topic :: Software Development :: Libraries :: Python Modules',
44 |         'Topic :: Text Processing :: Markup :: HTML',
45 |         'Topic :: Text Processing :: Markup :: XML',
46 |         'Operating System :: POSIX :: Linux',
47 |         'Programming Language :: Python :: 2.7',
48 |         'Programming Language :: Python :: 3.5',
49 |         'Programming Language :: Python :: 3.6',
50 |         'License :: OSI Approved :: MIT License',
51 |         'Development Status :: 3 - Alpha',
52 |         'Intended Audience :: Developers',
53 |     ],
54 |     url = 'https://github.com/redapple/parslepy',
55 | )
56 | 
-------------------------------------------------------------------------------- /tests/__init__.py: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/redapple/parslepy/a8bc4c0592824459629018c8f4c6ae3dad6cc3cc/tests/__init__.py
-------------------------------------------------------------------------------- /tests/data/creativecommons.org__licenses__by__3.0.html: --------------------------------------------------------------------------------
[HTML test fixture: a saved copy of the creativecommons.org "Attribution 3.0 Unported (CC BY 3.0)" license deed page. The tag-stripped rendering of this fixture is not reproduced here.]
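To tie the pieces together, here is a minimal sketch of the same flow as run_parslepy.py above, but as a library call (the parselet file name is hypothetical; any of the examples/*.let.json scripts would do):

```python
import pprint

import parslepy

# hypothetical parselet file, e.g. examples/engadget_css.let.json
with open("myparselet.let.json") as fp:
    extractor = parslepy.Parselet.from_jsonfile(fp, debug=False)

# lxml.etree.parse() accepts URLs and file names, so Parselet.parse() does too
output = extractor.parse("http://www.example.com/index.html")
pprint.pprint(output)
```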

349 | 350 |

351 | Learn more. 352 |

353 | 354 |
355 |
356 | 357 |
358 |
359 | What are "Publicity Rights"? 360 |
361 |
362 |

363 | Publicity 364 | rights allow individuals to control how their voice, 365 | image or likeness is used for commercial purposes in 366 | public. If a CC-licensed work includes the voice or 367 | image of anyone other than the licensor, a user of the 368 | work may need to get permission from those individuals 369 | before using the work for commercial purposes. 370 |

371 | 372 |

373 | Learn more. 374 |

375 |
376 |
377 | 378 |
379 | 380 | 381 |
382 |
383 | 384 |
385 | 576 | 577 | 590 | 595 | 596 | 597 | -------------------------------------------------------------------------------- /tests/data/parselet.json: -------------------------------------------------------------------------------- 1 | { "title": "h1", "link": "a @href"} -------------------------------------------------------------------------------- /tests/data/parselet.yml: -------------------------------------------------------------------------------- 1 | --- 2 | title: h1 3 | link: a @href 4 | -------------------------------------------------------------------------------- /tests/data/validator.w3.org.html: -------------------------------------------------------------------------------- 1 | 3 | 4 | 5 | 6 | 7 | The W3C Markup Validation Service 8 | 9 | 10 | 11 | 14 | 15 | 16 | 17 | 19 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 34 | 35 | 36 | 37 | 38 |
39 | 44 |
45 | 46 |
Validate by URI 47 |
48 |

49 | Validate a document online: 50 |

51 |

52 | 53 | 54 |

55 |
56 | Show More Options 57 |
58 | 59 | 60 | 61 | 62 | 65 | 112 | 113 | 114 | 115 | 118 | 149 | 150 | 151 | 152 | 153 | 154 | 155 | 156 | 157 | 159 | 160 | 161 | 162 | 163 | 164 | 166 | 167 | 168 | 169 | 170 |
63 | 64 | 66 | 110 | 111 |
116 | 117 | 119 | 148 |
158 |
165 |
171 | 172 | 173 |
174 |
175 | 176 | 177 |

178 | 179 |

180 | 181 |
182 |
183 | 184 |
Validate by File Upload 185 |
186 |

Upload a document for validation:

187 |

188 |

189 |
190 | Show More Options 191 |
192 | 193 | 194 | 195 | 198 | 245 | 246 | 247 | 248 | 251 | 282 | 283 | 284 | 285 | 286 | 287 | 288 | 289 | 290 | 292 | 293 | 294 | 295 | 296 | 297 | 299 | 300 | 301 | 302 | 303 |
196 | 197 | 199 | 243 | 244 |
249 | 250 | 252 | 281 |
291 |
298 |
304 | 305 | 306 |
307 |
308 | 309 |

310 | 311 |

312 | 313 |
314 |

Note: file upload may not work with Internet 315 | Explorer on some versions of Windows XP Service Pack 2, see our 316 | information page 317 | on the W3C QA Website.

318 | 319 |
320 |
Validate by direct input 321 |
322 |

:
323 | 324 | 325 | 330 |

331 |
332 | Show More Options 333 |
334 | 335 | 336 | 337 | 339 | 340 | 341 | 342 | 343 | 372 | 373 | 374 | 376 | 377 | 378 | 381 | 382 | 383 | 384 | 385 | 386 | 387 | 389 | 390 | 391 | 392 | 393 | 394 | 396 | 397 | 398 | 399 | 400 | 401 |
338 |
375 |
Use Doctype: 379 | 380 |
388 |
395 |
402 |
403 |
404 | 405 | 406 |

407 | 408 |

409 | 410 | 411 |
412 |
413 |
414 |
415 |
416 |

417 | This validator checks the 418 | markup validity 419 | of Web documents in HTML, XHTML, SMIL, MathML, etc. 420 | If you wish to validate specific content such as 421 | RSS/Atom feeds or 422 | CSS stylesheets, 423 | MobileOK content, 424 | or to find broken links, 425 | there are other validators and tools available. 426 | As an alternative you can also try our non-DTD-based validator. 427 |

428 |
429 | 430 |
431 | 432 | 433 | W3C Validator
434 | Suite Logo 435 | 436 | 437 | 438 | NEW - 439 | W3C offers a beta release of a new service providing you an 440 | integrated validation report on your entire web site. 441 |
442 | Try it now 443 | to quickly identify those portions of your web site that may 444 | benefit from attention. 445 |
446 |
447 | 448 |
449 | 450 | 451 | 452 | 459 | 460 | 461 | 462 | 496 | 497 | 498 | 499 | 500 | -------------------------------------------------------------------------------- /tests/requirements.txt: -------------------------------------------------------------------------------- 1 | nose 2 | -------------------------------------------------------------------------------- /tests/test_parslepy_compile.py: -------------------------------------------------------------------------------- 1 | import parslepy 2 | from parslepy.base import InvalidKeySyntax 3 | from nose.tools import * 4 | from lxml.etree import XPathSyntaxError 5 | from .tools import * 6 | 7 | class TestKeySyntax(object): 8 | 9 | with_valid_keys = ( 10 | ('title_big', ('title_big', None, None, None)), 11 | ('title-short(.span)', ('title-short', None, '(.span)', '.span')), 12 | ('title__2?', ('title__2', "?", None, None)), 13 | ('title_big?(#main)', ('title_big', "?", '(#main)', '#main')), 14 | ) 15 | 16 | def test_key_regex(self): 17 | for key, target_results in self.with_valid_keys: 18 | yield self.compare_regex_results, key, target_results 19 | 20 | def compare_regex_results(self, key, results): 21 | m = parslepy.base.Parselet.REGEX_PARSELET_KEY.match(key) 22 | assert_true(m is not None) 23 | assert_tuple_equal(m.groups(), results) 24 | 25 | 26 | with_invalid_keys = ( 27 | ({'title@(': 'h1'}, InvalidKeySyntax), 28 | 29 | ({'#test': 'h1'}, InvalidKeySyntax), 30 | ({'(#test)': 'h1'}, InvalidKeySyntax), 31 | ({'?(#test)': 'h1'}, InvalidKeySyntax), 32 | ({'.test': 'h1'}, InvalidKeySyntax), 33 | 34 | ({'test!': 'h1'}, InvalidKeySyntax), 35 | ({'test#': 'h1'}, InvalidKeySyntax), 36 | ({'test()': 'h1'}, InvalidKeySyntax), 37 | ({'?()': 'h1'}, InvalidKeySyntax), 38 | ({'test??': 'h1'}, InvalidKeySyntax), 39 | ({'test?()': 'h1'}, InvalidKeySyntax), 40 | ({'test~(test)': 'h1'}, InvalidKeySyntax), 41 | 42 | # this does not raise SyntaxError in lxml<3 43 | #({'test(#)': 'h1'}, XPathSyntaxError), 44 | 45 | ({'test(!)': 'h1'}, XPathSyntaxError), 46 | ({'test(.div ~)': 'h1'}, XPathSyntaxError), 47 | ) 48 | 49 | def test_invalid_syntax(self): 50 | for parselet_dict, target_exception in self.with_invalid_keys: 51 | yield self.init_with_invalid_parselet_dict, parselet_dict, target_exception 52 | 53 | def init_with_invalid_parselet_dict(self, parselet_dict, target_exception): 54 | assert_raises(target_exception, parslepy.Parselet, parselet_dict) 55 | 56 | with_invalid_value_type = ( 57 | ({'title': 1}, ValueError), 58 | ({'title': None}, ValueError), 59 | ({'title': (43,)}, ValueError), 60 | ({'title': {44: 45}}, InvalidKeySyntax), 61 | ) 62 | 63 | def test_invalid_value_type(self): 64 | for parselet_dict, target_exception in self.with_invalid_value_type: 65 | yield self.init_with_invalid_parselet_dict, parselet_dict, target_exception 66 | -------------------------------------------------------------------------------- /tests/test_parslepy_extensions.py: -------------------------------------------------------------------------------- 1 | from __future__ import unicode_literals 2 | import parslepy 3 | import parslepy.base 4 | import lxml.cssselect 5 | from nose.tools import * 6 | import io as StringIO 7 | import pprint 8 | import os 9 | from .tools import * 10 | 11 | def compare_extracted_output(root, input_parselet, expected_output, debug=False): 12 | parselet = parslepy.Parselet(input_parselet, strict=True, debug=debug) 13 | extracted = parselet.extract(root) 14 | #pprint.pprint(extracted) 15 | #pprint.pprint(expected_output) 16 | assert_dict_equal(extracted, 
expected_output) 17 | 18 | def test_attrnames(): 19 | parselets = ( 20 | ( 21 | {"images(img)": [{ 22 | "attrnames": ["parslepy:attrname(@*)"], 23 | "attrvals": ["@*"], 24 | }]}, 25 | { 26 | 'images': [ 27 | { 28 | 'attrvals': ['W3C', '110', '61', 'logo', './images/w3c.png'], 29 | 'attrnames': ['alt', 'width', 'height', 'id', 'src'] 30 | }, 31 | { 32 | 'attrvals': ['toggleiconURI', 'toggleicon', './images/arrow-closed.png', 'Show'], 33 | 'attrnames': ['id', 'class', 'src', 'alt'] 34 | }, 35 | { 36 | 'attrvals': ['toggleicon', './images/arrow-closed.png', 'Show'], 37 | 'attrnames': ['class', 'src', 'alt'] 38 | }, 39 | { 40 | 'attrvals': ['toggleicon', './images/arrow-closed.png', 'Show'], 41 | 'attrnames': ['class', 'src', 'alt'] 42 | }, 43 | { 44 | 'attrvals': ['http://www.w3.org/Icons/VSlogo', 'W3C Validator\nSuite Logo'], 45 | 'attrnames': ['src', 'alt'] 46 | }, 47 | { 48 | 'attrvals': ['http://www.w3.org/Icons/WWW/w3c_home_nb', 'W3C', '72', '47'], 49 | 'attrnames': ['src', 'alt', 'width', 'height'] 50 | }, 51 | { 52 | 'attrvals': ['./images/opensource-55x48.png', 'Open-Source', 'We are building certified Open Source/Free Software. - see www.opensource.org', '55', '48'], 53 | 'attrnames': ['src', 'alt', 'title', 'width', 'height'] 54 | }, 55 | { 56 | 'attrvals': ['http://www.w3.org/QA/Tools/I_heart_validator', 'I heart Validator logo', ' Validators Donation Program', '80', '15'], 57 | 'attrnames': ['src', 'alt', 'title', 'width', 'height'] 58 | } 59 | ] 60 | } 61 | ), 62 | ) 63 | hp = lxml.etree.HTMLParser() 64 | dirname = os.path.dirname(os.path.abspath(__file__)) 65 | root = lxml.etree.parse( 66 | open(os.path.join( 67 | dirname, 68 | 'data/validator.w3.org.html')), 69 | parser=hp).getroot() 70 | for input_parselet, expected_output in parselets: 71 | yield compare_extracted_output, root, input_parselet, expected_output 72 | 73 | 74 | def test_strip(): 75 | parselets = ( 76 | ( 77 | # strip bracket from text content 78 | {"selected_option": "parslepy:strip(//select[@id='uri-charset']/option[@selected], '()')"}, 79 | {'selected_option': 'detect automatically'} 80 | ), 81 | ( 82 | # strip brackets from attribute value 83 | {"selected_option": "parslepy:strip(//select[@id='upload-charset']/option[@selected]/@value, '()')"}, 84 | {'selected_option': 'detect automatically'} 85 | ), 86 | ( 87 | # strip '#' from attribute values 88 | {"legend_links(legend.toggletext > a)": ["parslepy:strip(@href, '#')"]}, 89 | {'legend_links': ['validate_by_uri+with_options', 90 | 'validate_by_upload+with_options', 91 | 'validate_by_input+with_options']} 92 | ), 93 | ) 94 | hp = lxml.etree.HTMLParser() 95 | dirname = os.path.dirname(os.path.abspath(__file__)) 96 | root = lxml.etree.parse( 97 | open(os.path.join( 98 | dirname, 99 | 'data/validator.w3.org.html')), 100 | parser=hp).getroot() 101 | for input_parselet, expected_output in parselets: 102 | yield compare_extracted_output, root, input_parselet, expected_output 103 | 104 | 105 | def test_to_content(): 106 | parselets = ( 107 | ( 108 | {"intro": 'parslepy:html(//div[@class="intro"])'}, 109 | {'intro': """
110 |

111 | This validator checks the 112 | markup validity 113 | of Web documents in HTML, XHTML, SMIL, MathML, etc. 114 | If you wish to validate specific content such as 115 | RSS/Atom feeds or 116 | CSS stylesheets, 117 | MobileOK content, 118 | or to find broken links, 119 | there are other validators and tools available. 120 | As an alternative you can also try our non-DTD-based validator. 121 |

122 |
"""}, 123 | ), 124 | ( 125 | {"intro": 'parslepy:text(//div[@class="intro"])'}, 126 | {'intro': 'This validator checks the markup validity of Web documents in HTML, XHTML, SMIL, MathML, etc. If you wish to validate specific content such as RSS/Atom feeds or CSS stylesheets, MobileOK content, or to find broken links, there are other validators and tools available. As an alternative you can also try our non-DTD-based validator.'} 127 | ), 128 | ( 129 | {"intro": 'parslepy:textnl(//div[@class="intro"])'}, 130 | {'intro': """This validator checks the 131 | markup validity 132 | of Web documents in HTML, XHTML, SMIL, MathML, etc. 133 | If you wish to validate specific content such as 134 | RSS/Atom feeds or 135 | CSS stylesheets, 136 | MobileOK content, 137 | or to find broken links, 138 | there are other validators and tools available. 139 | As an alternative you can also try our non-DTD-based validator.""" 140 | } 141 | ), 142 | ) 143 | hp = lxml.etree.HTMLParser() 144 | dirname = os.path.dirname(os.path.abspath(__file__)) 145 | root = lxml.etree.parse( 146 | open(os.path.join( 147 | dirname, 148 | 'data/validator.w3.org.html')), 149 | parser=hp).getroot() 150 | for input_parselet, expected_output in parselets: 151 | yield compare_extracted_output, root, input_parselet, expected_output 152 | 153 | def test_to_xml(): 154 | parselets = ( 155 | ( 156 | {"first": "parslepy:xml(//atom:feed/atom:entry[1]/im:contentType)"}, 157 | {'first': ''} 158 | ), 159 | ) 160 | dirname = os.path.dirname(os.path.abspath(__file__)) 161 | root = lxml.etree.parse( 162 | open(os.path.join( 163 | dirname, 164 | 'data/itunes.topalbums.rss')), 165 | parser=lxml.etree.XMLParser()).getroot() 166 | xsh = parslepy.selectors.XPathSelectorHandler( 167 | namespaces={ 168 | 'atom': 'http://www.w3.org/2005/Atom', 169 | 'im': 'http://itunes.apple.com/rss' 170 | }) 171 | for input_parselet, expected_output in parselets: 172 | parselet = parslepy.Parselet( 173 | input_parselet, selector_handler=xsh, strict=True) 174 | extracted = parselet.extract(root) 175 | assert_dict_equal(extracted, expected_output) 176 | 177 | 178 | def test_userdefined_extensions(): 179 | 180 | def myattrnames(ctx, xpctx, attributes, *args): 181 | #print "myattrnames:", ctx, xpctx, attributes, args 182 | return [a.attrname for a in attributes] 183 | 184 | # extension to built full URLs from @href or @src attributes 185 | try: 186 | import urlparse # Python 2.x 187 | except ImportError: 188 | import urllib.parse as urlparse 189 | 190 | def absurl(ctx, xpctx, attributes, *args): 191 | #print "absurl:", ctx, xpctx, attributes, args 192 | return [urlparse.urljoin(ctx, u) for u in attributes] 193 | 194 | parselets = ( 195 | ( 196 | { 197 | "head_meta(head/meta)": [{ 198 | "attrnames": ["myext:attrnames(@*)"], 199 | "attrvals": ["@*"], 200 | }], 201 | "img_links": ["//img/@src"], 202 | "img_abslinks": ["myext:absurl(//img/@src)"], 203 | }, 204 | { 205 | 'head_meta': [ 206 | {'attrnames': ['http-equiv', 'content'], 207 | 'attrvals': ['Content-Type', 'text/html;charset=utf-8'] 208 | }, 209 | {'attrnames': ['name', 'content'], 210 | 'attrvals': ['keywords', 211 | 'HTML, HyperText Markup Language, Validation,\n W3C Markup Validation Service']}, 212 | {'attrnames': ['name', 'content'], 213 | 'attrvals': ['description', 214 | "W3C's easy-to-use\n markup validation service, based on SGML and XML parsers."]}], 215 | 'img_abslinks': ['http://validator.w3.org/images/w3c.png', 216 | 'http://validator.w3.org/images/arrow-closed.png', 217 | 
'http://validator.w3.org/images/arrow-closed.png',
218 |                  'http://validator.w3.org/images/arrow-closed.png',
219 |                  'http://www.w3.org/Icons/VSlogo',
220 |                  'http://www.w3.org/Icons/WWW/w3c_home_nb',
221 |                  'http://validator.w3.org/images/opensource-55x48.png',
222 |                  'http://www.w3.org/QA/Tools/I_heart_validator'],
223 |              'img_links': ['./images/w3c.png',
224 |                  './images/arrow-closed.png',
225 |                  './images/arrow-closed.png',
226 |                  './images/arrow-closed.png',
227 |                  'http://www.w3.org/Icons/VSlogo',
228 |                  'http://www.w3.org/Icons/WWW/w3c_home_nb',
229 |                  './images/opensource-55x48.png',
230 |                  'http://www.w3.org/QA/Tools/I_heart_validator']
231 |         }
232 |     ),
233 | )
234 |     mynamespaces = {
235 |         "myext": "myextension"
236 |     }
237 |     myextensions = {
238 |         ("myextension", "absurl"): absurl,
239 |         ("myextension", "attrnames"): myattrnames,
240 |     }
241 | 
242 |     sh = parslepy.DefaultSelectorHandler(
243 |         namespaces=mynamespaces,
244 |         extensions=myextensions)
245 | 
246 |     dirname = os.path.dirname(os.path.abspath(__file__))
247 |     for input_parselet, expected_output in parselets:
248 |         parselet = parslepy.Parselet(
249 |             input_parselet,
250 |             selector_handler=sh, strict=True)
251 |         extracted = parselet.parse(
252 |             os.path.join(dirname, 'data/validator.w3.org.html'),
253 |             context='http://validator.w3.org/')
254 | 
255 |         #pprint.pprint(extracted)
256 |         #pprint.pprint(expected_output)
257 |         assert_dict_equal(extracted, expected_output)
258 | 
--------------------------------------------------------------------------------
/tests/test_parslepy_init.py:
--------------------------------------------------------------------------------
1 | import parslepy
2 | import parslepy.base
3 | import lxml.cssselect
4 | from nose.tools import *
5 | from .tools import *
6 | 
7 | def test_parslepy_init_default():
8 |     parselet_script = {
9 |         "title": "h1",
10 |         "subtitle": "//h2"
11 |     }
12 |     parselet = parslepy.Parselet(parselet_script)
13 | 
14 |     assert_dict_equal(parselet.parselet, parselet_script)
15 | 
16 |     assert_is_instance(parselet.parselet_tree, parslepy.base.ParsleyNode)
17 |     assert_equal(len(parselet.parselet_tree), len(parselet_script), "not the same number of keys")
18 | 
19 |     for k, v in list(parselet.parselet_tree.items()):
20 |         assert_is_instance(k, parslepy.base.ParsleyContext)
21 |         assert_is_instance(v, parslepy.selectors.Selector)
22 | 
23 |     # since we did not provide a selector handler
24 |     assert_is_instance(parselet.selector_handler, parslepy.base.DefaultSelectorHandler)
25 | 
26 | @raises(ValueError)
27 | def test_parslepy_init_invalid_parselet():
28 |     parselet = parslepy.Parselet("{ 'title': 'h1'}")
29 | 
30 | @raises(NotImplementedError)
31 | def test_parslepy_init_selector_handler_error():
32 |     parselet_script = {
33 |         "title": "h1",
34 |         "subtitle": "//h2"
35 |     }
36 |     class MyHandler(parslepy.selectors.SelectorHandler):
37 |         _dummy = True
38 |     mh = MyHandler()
39 |     parselet = parslepy.Parselet(parselet_script, selector_handler=mh)
40 | 
41 | @raises(ValueError)
42 | def test_parslepy_init_wrong_selector_handler():
43 |     parselet_script = {
44 |         "title": "h1",
45 |         "subtitle": "//h2"
46 |     }
47 |     parselet = parslepy.Parselet(parselet_script, selector_handler=lambda s: s)
48 | 
49 | def test_parslepy_init_custom_selector_handler():
50 |     parselet_script = {
51 |         "title": "h1",
52 |         "subtitle": "//h2"
53 |     }
54 |     class MyHandler(parslepy.selectors.SelectorHandler):
55 |         def make(self, selection):
56 |             return parslepy.selectors.Selector(lxml.etree.XPath("body"))
57 | 
58 |         def select(self, document, selector):
59 |             return
None 60 | 61 | def extract(self, document, selector): 62 | return None 63 | 64 | mh = MyHandler() 65 | 66 | parselet = parslepy.Parselet(parselet_script, selector_handler=mh) 67 | assert_is_instance(parselet.selector_handler, MyHandler) 68 | 69 | def test_parslepy_keys(): 70 | parselet_scripts = [ 71 | ( 72 | { 73 | "title": "h1", 74 | "subtitle": "//h2" 75 | }, 76 | ["title", "subtitle"], 77 | ), 78 | ( 79 | { 80 | "--": { 81 | "--(#banner)": { 82 | "--(#title)": { 83 | "--(a span)": { 84 | "title": "." 85 | } 86 | } 87 | } 88 | } 89 | }, 90 | ["title"], 91 | ), 92 | ( 93 | { 94 | "--(#header)": { 95 | "--(#banner)": { 96 | "--(#title)": { 97 | "--(a span)": { 98 | "title": "." 99 | } 100 | } 101 | } 102 | } 103 | }, 104 | ["title"], 105 | ), 106 | ( 107 | { 108 | "--": { 109 | "--(#banner)": { 110 | "--(#title)": { 111 | "--(a span)": { 112 | "title": "." 113 | } 114 | } 115 | } 116 | }, 117 | "links": [".//a/@href"] 118 | }, 119 | ["title", "links"], 120 | ), 121 | ( 122 | { 123 | "title": "h1", 124 | "--(.content)": { 125 | "subtitle": ".//h2" 126 | } 127 | }, 128 | ["title", "subtitle"], 129 | ), 130 | ( 131 | { 132 | "title": "h1", 133 | "--(.content)": { 134 | "title": ".//h2" 135 | }, 136 | "footer": "parslepy:html(.//div[@class='footer'])" 137 | }, 138 | ["title", "footer"], 139 | ), 140 | ] 141 | 142 | for input_parselet, expected_output in parselet_scripts: 143 | parselet = parslepy.Parselet(input_parselet) 144 | assert_equal(set(parselet.keys()), 145 | set(expected_output)) 146 | -------------------------------------------------------------------------------- /tests/test_parslepy_parse.py: -------------------------------------------------------------------------------- 1 | from __future__ import unicode_literals 2 | import parslepy 3 | import parslepy.base 4 | import lxml.cssselect 5 | from nose.tools import * 6 | from .tools import * 7 | import pprint 8 | import os 9 | 10 | 11 | def test_parslepy_xpathparse_xml_file(): 12 | parselet_script = {"id": "//atom:id"} 13 | xsh = parslepy.selectors.XPathSelectorHandler( 14 | namespaces={'atom': 'http://www.w3.org/2005/Atom'} 15 | ) 16 | dirname = os.path.dirname(os.path.abspath(__file__)) 17 | fp = open(os.path.join(dirname, 'data/itunes.topalbums.rss')) 18 | 19 | expected = { 20 | 'id': 'https://itunes.apple.com/us/rss/topalbums/limit=10/explicit=true/xml' 21 | } 22 | 23 | parselet = parslepy.Parselet(parselet_script, selector_handler=xsh) 24 | extracted = parselet.parse(fp, parser=lxml.etree.XMLParser()) 25 | assert_dict_equal(extracted, expected) 26 | 27 | 28 | def test_parslepy_defaultparse_xml_file(): 29 | parselet_script = {"id": "//atom:id"} 30 | dsh = parslepy.selectors.DefaultSelectorHandler( 31 | namespaces={'atom': 'http://www.w3.org/2005/Atom'} 32 | ) 33 | dirname = os.path.dirname(os.path.abspath(__file__)) 34 | fp = open(os.path.join(dirname, 'data/itunes.topalbums.rss')) 35 | 36 | expected = { 37 | 'id': 'https://itunes.apple.com/us/rss/topalbums/limit=10/explicit=true/xml' 38 | } 39 | 40 | parselet = parslepy.Parselet(parselet_script, selector_handler=dsh) 41 | extracted = parselet.parse(fp, parser=lxml.etree.XMLParser()) 42 | assert_dict_equal(extracted, expected) 43 | 44 | 45 | def test_parslepy_defaultparse_xml_file_cssselectors(): 46 | parselet_script = {"id": "atom|id", "imid": "atom|id @im|id"} 47 | dsh = parslepy.selectors.DefaultSelectorHandler( 48 | namespaces={ 49 | 'atom': 'http://www.w3.org/2005/Atom', 50 | 'im': 'http://itunes.apple.com/rss', 51 | } 52 | ) 53 | dirname = 
os.path.dirname(os.path.abspath(__file__)) 54 | fp = open(os.path.join(dirname, 'data/itunes.topalbums.rss')) 55 | 56 | expected = { 57 | 'id': 'https://itunes.apple.com/us/rss/topalbums/limit=10/explicit=true/xml', 58 | 'imid': '647928068', 59 | } 60 | 61 | parselet = parslepy.Parselet(parselet_script, selector_handler=dsh) 62 | extracted = parselet.parse(fp, parser=lxml.etree.XMLParser()) 63 | assert_dict_equal(extracted, expected) 64 | 65 | 66 | xmldoc = b""" 67 | 68 | https://itunes.apple.com/us/rss/topalbums/limit=10/explicit=true/xmliTunes Store: Top Albums2013-06-25T06:27:25-07:00http://itunes.apple.com/favicon.icoiTunes Storehttp://www.apple.com/itunes/Copyright 2008 Apple Inc. 69 | 70 | 2013-06-25T06:27:25-07:00 71 | https://itunes.apple.com/us/album/the-gifted/id647928068?uo=2 72 | The Gifted - Wale 73 | The Gifted 74 | http://a815.phobos.apple.com/us/r30/Features/v4/02/cc/73/02cc7370-693c-f0fe-505b-bb84043ce186/dj.pehmruyt.55x55-70.jpg 75 | http://a1537.phobos.apple.com/us/r30/Features/v4/02/cc/73/02cc7370-693c-f0fe-505b-bb84043ce186/dj.pehmruyt.60x60-50.jpg 76 | http://a976.phobos.apple.com/us/r30/Features/v4/02/cc/73/02cc7370-693c-f0fe-505b-bb84043ce186/dj.pehmruyt.170x170-75.jpg 77 | 78 | 79 | """ 80 | 81 | def test_parslepy_xpathparse_xml_fromstring(): 82 | 83 | parselet_script = { 84 | "--(//atom:feed/atom:entry)": { 85 | "title": "atom:title", 86 | "name": "im:name", 87 | "id": "atom:id/@im:id", 88 | "images(im:image)": [{ 89 | "height": "@height", 90 | "url": ".", 91 | }], 92 | "releasedate": "im:releaseDate", 93 | } 94 | } 95 | xsh = parslepy.selectors.XPathSelectorHandler( 96 | namespaces={ 97 | 'atom': 'http://www.w3.org/2005/Atom', 98 | 'im': 'http://itunes.apple.com/rss', 99 | } 100 | ) 101 | 102 | expected = { 103 | 'id': '647928068', 104 | 'images': [ 105 | { 'height': '55', 106 | 'url': 'http://a815.phobos.apple.com/us/r30/Features/v4/02/cc/73/02cc7370-693c-f0fe-505b-bb84043ce186/dj.pehmruyt.55x55-70.jpg' 107 | }, 108 | { 'height': '60', 109 | 'url': 'http://a1537.phobos.apple.com/us/r30/Features/v4/02/cc/73/02cc7370-693c-f0fe-505b-bb84043ce186/dj.pehmruyt.60x60-50.jpg' 110 | }, 111 | { 'height': '170', 112 | 'url': 'http://a976.phobos.apple.com/us/r30/Features/v4/02/cc/73/02cc7370-693c-f0fe-505b-bb84043ce186/dj.pehmruyt.170x170-75.jpg' 113 | } 114 | ], 115 | 'name': 'The Gifted', 116 | 'title': 'The Gifted - Wale', 117 | } 118 | parselet = parslepy.Parselet(parselet_script, selector_handler=xsh) 119 | extracted = parselet.parse_fromstring(xmldoc, parser=lxml.etree.XMLParser()) 120 | assert_dict_equal(extracted, expected) 121 | 122 | 123 | def test_parslepy_defaultparse_xml_fromstring(): 124 | 125 | parselet_script = { 126 | "--(//atom:feed/atom:entry)": { 127 | "title": "atom:title", 128 | "name": "im:name", 129 | "id": "atom:id/@im:id", 130 | "images(im:image)": [{ 131 | "height": "@height", 132 | "url": ".", 133 | }], 134 | "releasedate": "im:releaseDate", 135 | } 136 | } 137 | dsh = parslepy.selectors.DefaultSelectorHandler( 138 | namespaces={ 139 | 'atom': 'http://www.w3.org/2005/Atom', 140 | 'im': 'http://itunes.apple.com/rss', 141 | } 142 | ) 143 | 144 | expected = { 145 | 'id': '647928068', 146 | 'images': [ 147 | { 'height': '55', 148 | 'url': 'http://a815.phobos.apple.com/us/r30/Features/v4/02/cc/73/02cc7370-693c-f0fe-505b-bb84043ce186/dj.pehmruyt.55x55-70.jpg' 149 | }, 150 | { 'height': '60', 151 | 'url': 'http://a1537.phobos.apple.com/us/r30/Features/v4/02/cc/73/02cc7370-693c-f0fe-505b-bb84043ce186/dj.pehmruyt.60x60-50.jpg' 152 | }, 153 | { 'height': 
'170', 154 | 'url': 'http://a976.phobos.apple.com/us/r30/Features/v4/02/cc/73/02cc7370-693c-f0fe-505b-bb84043ce186/dj.pehmruyt.170x170-75.jpg' 155 | } 156 | ], 157 | 'name': 'The Gifted', 158 | 'title': 'The Gifted - Wale', 159 | } 160 | parselet = parslepy.Parselet(parselet_script, selector_handler=dsh) 161 | extracted = parselet.parse_fromstring(xmldoc, parser=lxml.etree.XMLParser()) 162 | assert_dict_equal(extracted, expected) 163 | 164 | 165 | def test_parslepy_defaultparse_xml_fromstring_cssselectors(): 166 | 167 | parselet_script = { 168 | "--(atom|feed atom|entry)": { 169 | "title": "atom|title", 170 | "name": "im|name", 171 | "id": "atom|id @im|id", 172 | "images(im|image)": [{ 173 | "height": "@height", 174 | "url": ".", 175 | }], 176 | "releasedate": "im|releaseDate", 177 | } 178 | } 179 | dsh = parslepy.selectors.DefaultSelectorHandler( 180 | namespaces={ 181 | 'atom': 'http://www.w3.org/2005/Atom', 182 | 'im': 'http://itunes.apple.com/rss', 183 | } 184 | ) 185 | 186 | expected = { 187 | 'id': '647928068', 188 | 'images': [ 189 | { 'height': '55', 190 | 'url': 'http://a815.phobos.apple.com/us/r30/Features/v4/02/cc/73/02cc7370-693c-f0fe-505b-bb84043ce186/dj.pehmruyt.55x55-70.jpg' 191 | }, 192 | { 'height': '60', 193 | 'url': 'http://a1537.phobos.apple.com/us/r30/Features/v4/02/cc/73/02cc7370-693c-f0fe-505b-bb84043ce186/dj.pehmruyt.60x60-50.jpg' 194 | }, 195 | { 'height': '170', 196 | 'url': 'http://a976.phobos.apple.com/us/r30/Features/v4/02/cc/73/02cc7370-693c-f0fe-505b-bb84043ce186/dj.pehmruyt.170x170-75.jpg' 197 | } 198 | ], 199 | 'name': 'The Gifted', 200 | 'title': 'The Gifted - Wale', 201 | } 202 | parselet = parslepy.Parselet(parselet_script, selector_handler=dsh) 203 | extracted = parselet.parse_fromstring(xmldoc, parser=lxml.etree.XMLParser()) 204 | assert_dict_equal(extracted, expected) 205 | 206 | 207 | 208 | 209 | def test_parslepy_parse_html_file(): 210 | 211 | parselet = parslepy.Parselet({"title": "h1"}) 212 | expected = {'title': 'Markup Validation Service'} 213 | 214 | dirname = os.path.dirname(os.path.abspath(__file__)) 215 | extracted = parselet.parse( 216 | open(os.path.join(dirname, 'data/validator.w3.org.html')) 217 | ) 218 | assert_dict_equal(extracted, expected) 219 | 220 | 221 | def test_parslepy_parse_html_fromstring(): 222 | 223 | htmldoc = """ 225 | 226 | 227 | 228 | The W3C Markup Validation Service 229 | 230 | 231 | 232 | 235 | 236 | 238 | 240 | 241 | 242 | 243 | 244 | 251 | 252 | 253 | """ 254 | 255 | parselet = parslepy.Parselet( 256 | { 257 | "title": "h1", 258 | "pid": "p[id] @id" 259 | }) 260 | expected = { 261 | 'title': 'Markup Validation Service', 262 | 'pid': 'tagline' 263 | } 264 | 265 | extracted = parselet.parse_fromstring(htmldoc) 266 | assert_dict_equal(extracted, expected) 267 | -------------------------------------------------------------------------------- /tests/test_parslepy_parselets.py: -------------------------------------------------------------------------------- 1 | import os 2 | from parslepy.base import Parselet 3 | from nose.tools import assert_dict_equal 4 | 5 | html = '
<html><body><h1>hi</h1><a href="/">click</a></body></html>'
6 | expected = {"title":"hi", "link":"/"}
7 | dirname = os.path.dirname(os.path.abspath(__file__))
8 | 
9 | 
10 | def test_parslepy_from_jsonstring():
11 |     s = '{ "title": "h1", "link": "a @href"}'
12 |     p = Parselet.from_jsonstring(s)
13 |     extracted = p.parse_fromstring(html)
14 |     assert_dict_equal(extracted, expected)
15 | 
16 | 
17 | def test_parslepy_from_yamlstring():
18 |     s = '''---
19 | title: h1
20 | link: a @href
21 | '''
22 |     p = Parselet.from_yamlstring(s)
23 |     extracted = p.parse_fromstring(html)
24 |     assert_dict_equal(extracted, expected)
25 | 
26 | 
27 | def test_parslepy_from_jsonfile():
28 |     with open(os.path.join(dirname, 'data/parselet.json')) as fp:
29 |         p = Parselet.from_jsonfile(fp)
30 |     extracted = p.parse_fromstring(html)
31 |     assert_dict_equal(extracted, expected)
32 | 
33 | 
34 | def test_parslepy_from_yamlfile():
35 |     with open(os.path.join(dirname, 'data/parselet.yml')) as fp:
36 |         p = Parselet.from_yamlfile(fp)
37 |     extracted = p.parse_fromstring(html)
38 |     assert_dict_equal(extracted, expected)
39 | 
--------------------------------------------------------------------------------
/tests/test_parslepy_selector.py:
--------------------------------------------------------------------------------
1 | import parslepy
2 | import parslepy.base
3 | import parslepy.selectors
4 | import lxml.cssselect
5 | from nose.tools import *
6 | from .tools import *
7 | 
8 | class TestInvalidParseletInit(object):
9 |     init_parselets = (
10 |         #{ "title": ".test #"}, # this does not raise SyntaxError in lxml<3
11 |         { "title": "/h1[@]"},
12 |         { "title": "h1", "paragraphs": [".//p[@class,'news']"]},
13 |     )
14 |     def test_invalid_parselet_init(self):
15 |         for parselet in self.init_parselets:
16 |             yield self.init_parselet_expect_syntax_error, parselet
17 | 
18 |     @raises(SyntaxError)
19 |     def init_parselet_expect_syntax_error(self, parselet):
20 |         parslepy.Parselet(parselet)
21 | 
22 | 
23 | class TestDefaultValidSelectors(object):
24 | 
25 |     dsh = parslepy.base.DefaultSelectorHandler()
26 | 
27 |     selectors = [
28 |         ("div.content", lxml.etree.XPath),
29 |         (".content #bogus span.first", lxml.etree.XPath),
30 |         ("div#main", lxml.etree.XPath),
31 |         ("div[@id='main']", lxml.etree.XPath),
32 |         ('div[@id="main"]', lxml.etree.XPath),
33 |         ("div", lxml.etree.XPath),
34 |         ("//div", lxml.etree.XPath),
35 |         ("//a/@href", lxml.etree.XPath),
36 |         ("img @src", lxml.etree.XPath),
37 |         ("table tr[class='main']", lxml.etree.XPath),
38 |         ("tr[2]", lxml.etree.XPath),
39 |     ]
40 | 
41 |     try:
42 |         from cssselect.parser import FunctionalPseudoElement
43 |         selectors.extend([
44 |             ("img::attr(src)", lxml.etree.XPath),
45 |         ])
46 |     except ImportError:
47 |         pass
48 | 
49 |     def test_selector_class(self):
50 |         for selector_string, target_class in self.selectors:
51 |             yield self.compare_selector_class, selector_string, target_class
52 | 
53 |     def compare_selector_class(self, selector_string, target_class):
54 |         s = self.dsh.make(selector_string)
55 |         assert_is_instance(s, parslepy.selectors.Selector)
56 |         assert_is_instance(
57 |             s.selector, target_class,
58 |             "\n%s compiled to '%s' of type %s \n and is not an instance of %s" % (
59 |                 selector_string, s.selector, type(s.selector), target_class)
60 |         )
61 | 
62 | 
63 | class TestDefaultInvalidSelectors(object):
64 | 
65 |     dsh = parslepy.selectors.DefaultSelectorHandler()
66 | 
67 |     invalid_selectors = (
68 |         # these do not raise SyntaxError in lxml<3
69 |         #'# ',
70 |         #'.#',
71 |         #'#t-#'
72 | 
73 |         '#t.',
74 |         './//e',
75 |         './/div class',
76 |         './/div[@class="test]',
77 |         'div[]',
78 |         '.div[id@]',
79 |         'div[@]',
80 |         'span @',
81 |         'span@',
82 |         './/span//',
83 |     )
84 | 
85 |     def test_invalid_css_selectors(self):
86 |         for s in self.invalid_selectors:
87 |             yield self.make_selector_expect_syntax_error, s
88 | 
89 |     @raises(SyntaxError)
90 |     def make_selector_expect_syntax_error(self, s):
91 |         self.dsh.make(s)
92 | 
93 | 
94 | class TestXPathValidSelectors(object):
95 | 
96 |     xsh = parslepy.selectors.XPathSelectorHandler()
97 | 
98 |     selectors = (
99 |         "div.content",
100 |         "span[@id='main']",
101 |         'header[@id="main"]',
102 |         "div",
103 |         "//div",
104 |         "//a/@href",
105 |         "img/@src",
106 |         "./img/@src",
107 |         ".//img/@alt",
108 |         "table/tr[@class='main']",
109 |         '//div[@id="main"]//tr[@class="item"]',
110 |         "tr[2]",
111 |     )
112 | 
113 |     def test_selector_class(self):
114 |         for selector_string in self.selectors:
115 |             yield self.compare_selector_class, selector_string
116 | 
117 |     def compare_selector_class(self, selector_string):
118 |         s = self.xsh.make(selector_string)
119 |         assert_is_instance(s, parslepy.selectors.Selector)
120 |         assert_is_instance(
121 |             s.selector, lxml.etree.XPath,
122 |             "\n%s compiled to '%s' of type %s \n and is not an instance of %s" % (
123 |                 selector_string, s.selector, type(s.selector), lxml.etree.XPath)
124 |         )
125 | 
126 | 
127 | class TestXPathInvalidSelectors(object):
128 | 
129 |     xsh = parslepy.selectors.XPathSelectorHandler()
130 | 
131 |     invalid_selectors = (
132 |         './//e',
133 |         './/div class',
134 |         './/div[@class="test]',
135 |         'div[]',
136 |         '.div[id@]',
137 |         'div[@]',
138 |         'span//',
139 |         'span/@class/',
140 |         './/span//',
141 |     )
142 | 
143 |     def test_invalid_xpath_selectors(self):
144 |         for s in self.invalid_selectors:
145 |             yield self.make_selector_expect_syntax_error, s
146 | 
147 |     @raises(SyntaxError)
148 |     def make_selector_expect_syntax_error(self, s):
149 |         self.xsh.make(s)
150 | 
--------------------------------------------------------------------------------
/tests/tools.py:
--------------------------------------------------------------------------------
1 | # borrowed from python-github2/tests/test_request.py
2 | try:
3 |     from nose.tools import assert_dict_equal
4 | except ImportError:  # for Python < 2.7
5 |     import unittest2
6 |     _binding = unittest2.TestCase('run')
7 |     assert_dict_equal = _binding.assertDictEqual
8 |     assert_is_instance = _binding.assertIsInstance
9 |     assert_tuple_equal = _binding.assertTupleEqual
--------------------------------------------------------------------------------
/tox.ini:
--------------------------------------------------------------------------------
1 | # content of: tox.ini , put in same dir as setup.py
2 | [tox]
3 | envlist = {py27,py35,py36}-lxml{36,37,38,40,41}
4 | 
5 | [travis]
6 | python =
7 |     2.7: py27
8 |     3.5: py35
9 |     3.6: py36
10 | 
11 | [testenv]
12 | basepython = 
13 |     py27: python2.7
14 |     py35: python3.5
15 |     py36: python3.6
16 | 
17 | deps =
18 |     lxml36: lxml>=3.6,<3.7
19 |     lxml37: lxml>=3.7,<3.8
20 |     lxml38: lxml>=3.8,<3.9
21 |     lxml40: lxml>=4.0,<4.1
22 |     lxml41: lxml>=4.1,<4.2
23 |     -rtests/requirements.txt
24 |     -rrequirements-extra.txt
25 | 
26 | commands =
27 |     nosetests {posargs:tests}
28 | 
--------------------------------------------------------------------------------
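Each entry in the envlist above expands to one test environment; assuming the matching interpreter is installed locally, a single combination can be run in isolation, for example:

    tox -e py36-lxml41        # Python 3.6 against lxml 4.1.x only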