  ... <p>Some paragraph</p>
  ... </body>
  ... </html>
  ... """
  >>> rules = {"heading": "h1"}
  >>>
  >>> # default text extraction includes tail text
  ... parslepy.Parselet(rules).parse_fromstring(doc)
  {'heading': u'Some heading Some text'}
  >>>
  >>> # passing false() as 2nd argument means: without tail text
  ... rules = {"heading": "parslepy:text(//h1, false())"}
  >>> parslepy.Parselet(rules).parse_fromstring(doc)
  {'heading': 'Some heading'}
  >>>
  >>> # passing true() as 2nd argument is equivalent to the default text extraction
  >>> rules = {"heading": "parslepy:text(//h1, true())"}
  >>> parslepy.Parselet(rules).parse_fromstring(doc)
  {'heading': 'Some heading Some text'}
  >>>

  See http://lxml.de/tutorial.html#elements-contain-text for details
  on how `lxml`_ handles *text* and *tail* element properties.

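  For reference, here is a minimal sketch (a constructed snippet, not from
  the *parslepy* API) of what `lxml`_ means by the *text* and *tail*
  properties of an element:

  >>> import lxml.etree
  >>> root = lxml.etree.fromstring("<root><h1>Some heading</h1>Some text</root>")
  >>> root[0].text, root[0].tail
  ('Some heading', 'Some text')
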
* ``parslepy:textnl(xpath_expression)``:
  similar to ``parslepy:text()`` but appends `\\n` characters to HTML
  block elements such as ``<p>`` or ``<h1>``

  >>> import parslepy
  >>> doc = """<html>
  ... <head>
  ... <title>Some page title</title>
  ... </head>
  ... <body><h1>Some heading</h1><p>Some paragraph</p><p>with some <span>span inside</span></p>ending now.</body>
  ... </html>
  ... """
  >>> parslepy.Parselet({"heading": "parslepy:text(//body)"}).parse_fromstring(doc)
  {'heading': 'Some headingSome paragraphwith some span insideending now.'}
  >>>
  >>> parslepy.Parselet({"heading": "parslepy:textnl(//body)"}).parse_fromstring(doc)
  {'heading': 'Some heading\nSome paragraph\nwith some span inside\nending now.'}
  >>>


* ``parslepy:html(xpath_expression)``
  returns the HTML content for elements matching *xpath_expression*.
  Internally, this calls `lxml.html.tostring(element)`.

  >>> import parslepy
  >>> doc = """<html>
  ... <head>
  ... <title>Some page title</title>
  ... </head>
  ... <body>
  ... <h1>(Some heading)</h1>
  ... <h2>[some sub-heading]</h2>
  ... </body>
  ... </html>
  ... """
  >>> parslepy.Parselet({"heading": "parslepy:html(//h1)"}).parse_fromstring(doc)
  {'heading': '<h1>(Some heading)</h1>'}
  >>> parslepy.Parselet({"heading": "parslepy:html(//body)"}).parse_fromstring(doc)
  {'heading': '<body>\n<h1>(Some heading)</h1>\n<h2>[some sub-heading]</h2>\n</body>'}
  >>>


* ``parslepy:xml(xpath_expression)``
  returns the XML content for elements matching *xpath_expression*.
  Internally, this calls `lxml.etree.tostring(element)`.

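  For instance, reusing the ``doc`` from the previous example (a sketch;
  the exact serialization produced by `lxml.etree` may differ, e.g. for
  void elements):

  >>> parslepy.Parselet({"heading": "parslepy:xml(//h1)"}).parse_fromstring(doc)
  {'heading': '<h1>(Some heading)</h1>'}
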
* ``parslepy:strip(xpath_expression[, chars])``
  behaves like Python's `strip()` method for strings but for the text
  content of elements matching *xpath_expression*.
  See http://docs.python.org/2/library/string.html#string.strip

  >>> import parslepy
  >>> doc = """<html>
  ... <head>
  ... <title>Some page title</title>
  ... </head>
  ... <body>
  ... <h1>(Some heading)</h1>
  ... <h2>[some sub-heading]</h2>
  ... </body>
  ... </html>
  ... """
  >>> parslepy.Parselet({"heading": "parslepy:strip(//h2, '[')"}).parse_fromstring(doc)
  {'heading': 'some sub-heading]'}
  >>> parslepy.Parselet({"heading": "parslepy:strip(//h2, ']')"}).parse_fromstring(doc)
  {'heading': '[some sub-heading'}
  >>> parslepy.Parselet({"heading": "parslepy:strip(//h2, '[]')"}).parse_fromstring(doc)
  {'heading': 'some sub-heading'}
  >>> parslepy.Parselet({"heading": "parslepy:strip(//h1, '()')"}).parse_fromstring(doc)
  {'heading': 'Some heading'}
  >>>

* ``parslepy:attrname(xpath_expression_matching_attribute)``
  returns the name of an attribute, rather than its value. This works with
  the catch-all-attributes `@*` expression or a specific attribute
  expression like `@class`. It may sound like a useless extension, but it
  can be handy when combined with the catch-all `@*` selector, as in the
  example below:

  >>> img_attributes = {
  ...     "images(img)": [{
  ...         "attr_names": ["parslepy:attrname(@*)"],
  ...         "attr_vals": ["@*"],
  ...     }]
  ... }
  >>> extracted = parslepy.Parselet(img_attributes).parse('http://www.python.org')
  >>> for r in extracted["images"]:
  ...     print dict(zip(r.get("attr_names"), r.get("attr_vals")))
  ...
  {'src': '/images/python-logo.gif', 'alt': 'homepage', 'border': '0', 'id': 'logo'}
  {'src': '/images/trans.gif', 'alt': 'skip to navigation', 'border': '0', 'id': 'skiptonav'}
  {'src': '/images/trans.gif', 'alt': 'skip to content', 'border': '0', 'id': 'skiptocontent'}
  {'width': '116', 'alt': '', 'src': '/images/donate.png', 'title': '', 'height': '42'}
  {'width': '94', 'style': 'align:center', 'src': '/images/worldmap.jpg', 'alt': '[Python resources in languages other than English]', 'height': '46'}
  {'src': '/images/success/Carmanah.png', 'alt': 'success story photo', 'class': 'success'}


User-defined extensions
^^^^^^^^^^^^^^^^^^^^^^^

*parslepy* also lets you define your own XPath extensions, just like
`lxml`_ does, except that the function you register must accept a
user-supplied context object as its first argument. Subsequent arguments
to your extension function are the same as for `lxml`_ extensions, i.e.
an XPath context, followed by the matching elements and whatever
additional parameters your XPath call passes.

The user-supplied context should be passed to :meth:`parslepy.base.Parselet.parse`,
or globally, via an XPathSelectorHandler subclass instance used to
instantiate the Parselet.

Let's illustrate this with a custom extension to make `<img>` @src
attributes "absolute".

Suppose we already have an extraction rule that outputs the `@src` attributes
from `<img>` tags on the Python.org homepage:

>>> import parslepy
>>> import pprint
>>> parselet = parslepy.Parselet({"img_abslinks": ["//img/@src"]})
>>> pprint.pprint(parselet.parse('http://www.python.org'))
{'img_abslinks': ['/images/python-logo.gif',
                  '/images/trans.gif',
                  '/images/trans.gif',
                  '/images/donate.png',
                  '/images/worldmap.jpg',
                  '/images/success/afnic.fr.png']}

We now want to generate full URLs for these images, not relative to
http://www.python.org.

**First we need to define our extension function as a Python function**:

*parslepy*'s extension functions must accept a user-context as their first
argument, then an XPath context, followed by the elements or strings
matching the XPath expression, and finally whatever other parameters are
passed to the function call in the extraction rules.

In our example, we expect `@src` attribute values as input from XPath,
and combine them (via `urlparse.urljoin()`) with a base URL,
i.e. the URL from which the HTML document was fetched.
The base URL will be passed as the user-context, and we will receive it
as the first argument. So the Python extension function may look like this:

>>> import urlparse
>>> def absurl(ctx, xpctx, attributes, *args):
...     # user-context "ctx" will be the URL of the page
...     return [urlparse.urljoin(ctx, u) for u in attributes]
...

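As a quick sanity check, we can call the function directly, passing
``None`` for the XPath context (which ``absurl()`` ignores):

>>> absurl('http://www.python.org', None, ['/images/python-logo.gif'])
['http://www.python.org/images/python-logo.gif']
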
**Then, we need to register this function with parslepy** through
a custom selector handler, with a custom namespace and its prefix:

>>> # choose a prefix and namespace, e.g. "myext" and "local-extensions"
... mynamespaces = {
...     "myext": "local-extensions"
... }
>>> myextensions = {
...     ("local-extensions", "absurl"): absurl,
... }
>>>
>>> import parslepy
>>> sh = parslepy.DefaultSelectorHandler(
...     namespaces=mynamespaces,
...     extensions=myextensions)
>>>

Now we can use this **absurl()** XPath extension within *parslepy* rules,
with the "myext" prefix
(**do not forget to pass your selector handler** to your Parselet instance):

>>> rules = {"img_abslinks": ["myext:absurl(//img/@src)"]}
>>> parselet = parslepy.Parselet(rules, selector_handler=sh)

And finally, run the extraction rules on Python.org's homepage again,
this time with a context argument set to the URL:

>>> import pprint
>>> pprint.pprint(parselet.parse('http://www.python.org',
...     context='http://www.python.org'))
{'img_abslinks': ['http://www.python.org/images/python-logo.gif',
                  'http://www.python.org/images/trans.gif',
                  'http://www.python.org/images/trans.gif',
                  'http://www.python.org/images/donate.png',
                  'http://www.python.org/images/worldmap.jpg',
                  'http://www.python.org/images/success/afnic.fr.png']}
>>>

In this case, it may feel odd to have to pass the URL *twice*, but
parse(*URL*) does not store the URL anywhere; it only processes
the HTML stream from the page.
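
If the repetition bothers you, a small convenience wrapper (a hypothetical
helper, not part of *parslepy*) can pass the URL in both places for you:

>>> def parse_with_base_url(parselet, url):
...     # fetch the page at "url" and pass the same URL as user-context
...     return parselet.parse(url, context=url)
...
>>> results = parse_with_base_url(parselet, 'http://www.python.org')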

More examples
=============

Check out more examples and tutorials at `parsley's wiki at GitHub
<https://github.com/fizx/parsley/wiki>`_.

.. include:: ../CHANGELOG