├── .bumpversion.cfg ├── .gitignore ├── .travis.yml ├── AUTHORS ├── MANIFEST.in ├── NEWS ├── README.rst ├── requirements.txt ├── scrapely ├── __init__.py ├── _htmlpage.c ├── _htmlpage.pyx ├── descriptor.py ├── extraction │ ├── __init__.py │ ├── _similarity.c │ ├── _similarity.pyx │ ├── pageobjects.py │ ├── pageparsing.py │ ├── regionextract.py │ └── similarity.py ├── extractors.py ├── htmlpage.py ├── template.py ├── tool.py └── version.py ├── setup.py ├── tests ├── __init__.py ├── samples │ ├── samples_htmlpage_0.html │ ├── samples_htmlpage_0.json │ ├── samples_htmlpage_1.html │ ├── samples_htmlpage_1.json │ ├── samples_htmlpage_2.html │ ├── samples_htmlpage_2.json │ ├── samples_pageparsing_0.html │ ├── samples_pageparsing_0.json │ ├── samples_scraper_loadstore_0.html │ ├── samples_scraper_loadstore_0.json │ ├── samples_scraper_loadstore_1.html │ └── samples_scraper_loadstore_1.json ├── test_extraction.py ├── test_htmlpage.py ├── test_htmlpage_data.py ├── test_pageparsing.py ├── test_scraper.py └── test_template.py └── tox.ini /.bumpversion.cfg: -------------------------------------------------------------------------------- 1 | [bumpversion] 2 | current_version = 0.13.5 3 | commit = True 4 | tag = True 5 | tag_name = v{new_version} 6 | 7 | [bumpversion:file:setup.py] 8 | 9 | [bumpversion:file:scrapely/version.py] 10 | search = __version__ = '{current_version}' 11 | replace = __version__ = '{new_version}' 12 | 13 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | *.so 3 | .tox 4 | build 5 | dist 6 | scrapely.egg-info 7 | 8 | # coverage reports 9 | .coverage 10 | .coverage.* 11 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | 3 | matrix: 4 | include: 5 | - python: 2.7 6 | env: TOXENV=py27 7 | - python: 3.4 8 | env: TOXENV=py34 9 | - python: pypy 10 | env: TOXENV=pypy 11 | 12 | install: 13 | - pip install cython 14 | - CYTHONIZE=1 python setup.py build 15 | - pip install -U tox 16 | script: tox 17 | 18 | after_success: 19 | - codecov 20 | 21 | notifications: 22 | irc: 23 | use_notice: true 24 | skip_join: true 25 | channels: 26 | - irc.freenode.org#scrapy 27 | deploy: 28 | provider: pypi 29 | distributions: sdist 30 | user: scrapy 31 | password: 32 | secure: KIXp6K9gU7TT7d0CTkDq81s1Uh2qLHBf+b8l0fAlzq1xHeBuWY82nq94yp6KPqBDr868Cf5CwyC6Gnz/HFD93NVZabooTiz0qUAq98fqKQ2n2KVWzaWxL5C0PN4x5P9KfAlXTgFAll1uCsKRa7gvRbW+q/wKAGsGfKDshTxTkAQ= 33 | on: 34 | tags: true 35 | all_branches: true 36 | repo: scrapy/scrapely 37 | condition: "$TOXENV == py27" 38 | -------------------------------------------------------------------------------- /AUTHORS: -------------------------------------------------------------------------------- 1 | Scrapely was originally written by Shane Evans and released as part of the 2 | (larger) Scrapy open source project by Pablo Hoffman. In April of 2011 Scrapely 3 | was taken out of Scrapy, and released as a standalone library, to improve its 4 | reusage and adoption. 
5 | 6 | Here is the list of the main contributors (along with their github users): 7 | 8 | * Shane Evans (shane42) 9 | * Pablo Hoffman (pablohoffman) 10 | * Martin Olveyra (kalessin) 11 | * Daniel Graña (dangra) 12 | * Terry Peng (tpeng) 13 | * Mikhail Korobov (kmike) 14 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include scrapely/*.pyx 2 | include scrapely/extraction/*.pyx 3 | include scrapely/*.c 4 | include scrapely/extraction/*.c 5 | -------------------------------------------------------------------------------- /NEWS: -------------------------------------------------------------------------------- 1 | Scrapely release notes 2 | ====================== 3 | 4 | 0.13.5 (2019-06-18) 5 | ------------------- 6 | 7 | - Update C extensions generated from Cython for python 3.7 8 | - Fix PEP8 violations 9 | 10 | 0.13.4 (2017-05-26) 11 | ------------------- 12 | 13 | - Improved price extraction 14 | - Replaced deprecated functions 15 | 16 | 0.13.3 (2017-01-27) 17 | ------------------- 18 | 19 | - Use 64 bit integers when matching pages 20 | 21 | 0.13.2 (2016-12-21) 22 | ------------------- 23 | 24 | - Add python3 support for `url_to_page` function 25 | 26 | 0.13.1 (2016-12-21) 27 | ------------------- 28 | 29 | - Remove numpy as a mandatory import in setup.py 30 | 31 | 0.13.0 (2016-12-21) 32 | ------------------- 33 | 34 | - Python 3 support; 35 | - fixed incorrect webpage encoding detection; 36 | - usability improvements for scrapely.tool; 37 | - internal cleanups; 38 | - number extractor now supports numbers with a sign. 39 | - add C extension to speed up parsing and extraction 40 | 41 | 0.12.0 (2015-01-26) 42 | ------------------- 43 | 44 | - TemplatePageExtractor can now use multiple top-level extractors; 45 | - internal cleanups; 46 | 47 | 0.11.0 (2014-08-01) 48 | ------------------- 49 | 50 | - HtmlPageParsedRegion can be copied/deepcopied. 51 | 52 | 0.10 (2014-01-14) 53 | ----------------- 54 | 55 | - Several bug fixes and improvements to the IBL extraction logic; 56 | - allow training the Scraper class with an HtmlPage; 57 | - Python 2.5 support is dropped; 58 | - Unicode improvements for scrapely.tool. 59 | 60 | 0.9 (2011-04-19) 61 | ---------------- 62 | 63 | First release of Scrapely. 64 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | ======== 2 | Scrapely 3 | ======== 4 | 5 | .. image:: https://api.travis-ci.org/scrapy/scrapely.svg?branch=master 6 | :target: https://travis-ci.org/scrapy/scrapely 7 | 8 | Scrapely is a library for extracting structured data from HTML pages. Given 9 | some example web pages and the data to be extracted, scrapely constructs a 10 | parser for all similar pages. 11 | 12 | Overview 13 | ======== 14 | 15 | Scrapinghub wrote a nice `blog post`_ explaining how scrapely works and how it's used in Portia_. 16 | 17 | .. _blog post: https://blog.scrapinghub.com/2016/07/07/scrapely-the-brains-behind-portia-spiders/ 18 | .. _Portia: http://portia.readthedocs.io/ 19 | 20 | Installation 21 | ============ 22 | 23 | Scrapely works in Python 2.7 or 3.3+. 24 | It requires numpy and w3lib Python packages. 25 | 26 | To install scrapely on any platform use:: 27 | 28 | pip install scrapely 29 | 30 | If you're using Ubuntu (9.10 or above), you can install scrapely from the 31 | Scrapy Ubuntu repos. 
Just add the Ubuntu repos as described here: 32 | http://doc.scrapy.org/en/latest/topics/ubuntu.html 33 | 34 | And then install scrapely with:: 35 | 36 | aptitude install python-scrapely 37 | 38 | Usage (API) 39 | =========== 40 | 41 | Scrapely has a powerful API, including a template format that can be edited 42 | externally, that you can use to build very capable scrapers. 43 | 44 | What follows is a quick example of the simplest possible usage, that you can 45 | run in a Python shell. 46 | 47 | Start by importing and instantiating the Scraper class:: 48 | 49 | >>> from scrapely import Scraper 50 | >>> s = Scraper() 51 | 52 | Then, proceed to train the scraper by adding some page and the data you expect 53 | to scrape from there (note that all keys and values in the data you pass must 54 | be strings):: 55 | 56 | >>> url1 = 'http://pypi.python.org/pypi/w3lib/1.1' 57 | >>> data = {'name': 'w3lib 1.1', 'author': 'Scrapy project', 'description': 'Library of web-related functions'} 58 | >>> s.train(url1, data) 59 | 60 | Finally, tell the scraper to scrape any other similar page and it will return 61 | the results:: 62 | 63 | >>> url2 = 'http://pypi.python.org/pypi/Django/1.3' 64 | >>> s.scrape(url2) 65 | [{u'author': [u'Django Software Foundation <foundation at djangoproject com>'], 66 | u'description': [u'A high-level Python Web framework that encourages rapid development and clean, pragmatic design.'], 67 | u'name': [u'Django 1.3']}] 68 | 69 | That's it! No xpaths, regular expressions, or hacky python code. 70 | 71 | Usage (command line tool) 72 | ========================= 73 | 74 | There is also a simple script to create and manage Scrapely scrapers. 75 | 76 | It supports a command-line interface, and an interactive prompt. All commands 77 | supported on interactive prompt are also supported in the command-line 78 | interface. 79 | 80 | To enter the interactive prompt type the following without arguments:: 81 | 82 | python -m scrapely.tool myscraper.json 83 | 84 | Example:: 85 | 86 | $ python -m scrapely.tool myscraper.json 87 | scrapely> help 88 | 89 | Documented commands (type help ): 90 | ======================================== 91 | a al s ta td tl 92 | 93 | scrapely> 94 | 95 | To create a scraper and add a template:: 96 | 97 | scrapely> ta http://pypi.python.org/pypi/w3lib/1.1 98 | [0] http://pypi.python.org/pypi/w3lib/1.1 99 | 100 | This is equivalent as typing the following in one command:: 101 | 102 | python -m scrapely.tool myscraper.json ta http://pypi.python.org/pypi/w3lib/1.1 103 | 104 | To list available templates from a scraper:: 105 | 106 | scrapely> tl 107 | [0] http://pypi.python.org/pypi/w3lib/1.1 108 | 109 | To add a new annotation, you usually test the selection criteria first:: 110 | 111 | scrapely> t 0 w3lib 1.1 112 | [0] u'

w3lib 1.1

' 113 | [1] u'Python Package Index : w3lib 1.1' 114 | 115 | You can also quote the text, if you need to specify an arbitrary number of 116 | spaces, for example:: 117 | 118 | scrapely> t 0 "w3lib 1.1" 119 | 120 | You can refine by position. To take the one in position [0]:: 121 | 122 | scrapely> a 0 w3lib 1.1 -n 0 123 | [0] u'

w3lib 1.1

' 124 | 125 | To annotate some fields on the template:: 126 | 127 | scrapely> a 0 w3lib 1.1 -n 0 -f name 128 | [new] (name) u'

w3lib 1.1

' 129 | scrapely> a 0 Scrapy project -n 0 -f author 130 | [new] u'Scrapy project' 131 | 132 | To list annotations on a template:: 133 | 134 | scrapely> al 0 135 | [0-0] (name) u'

w3lib 1.1

' 136 | [0-1] (author) u'Scrapy project' 137 | 138 | To scrape another similar page with the already added templates:: 139 | 140 | scrapely> s http://pypi.python.org/pypi/Django/1.3 141 | [{u'author': [u'Django Software Foundation'], u'name': [u'Django 1.3']}] 142 | 143 | 144 | Tests 145 | ===== 146 | 147 | `tox`_ is the preferred way to run tests. Just run: ``tox`` from the root 148 | directory. 149 | 150 | Support 151 | ======= 152 | 153 | * Mailing list: https://groups.google.com/forum/#!forum/scrapely 154 | * IRC: `scrapy@freenode`_ 155 | 156 | Scrapely is created and maintained by the Scrapy group, so you can get help 157 | through the usual support channels described in the `Scrapy community`_ page. 158 | 159 | Architecture 160 | ============ 161 | 162 | Unlike most scraping libraries, Scrapely doesn't work with DOM trees or xpaths 163 | so it doesn't depend on libraries such as lxml or libxml2. Instead, it uses 164 | an internal pure-python parser, which can accept poorly formed HTML. The HTML is 165 | converted into an array of token ids, which is used for matching the items to 166 | be extracted. 167 | 168 | Scrapely extraction is based upon the Instance Based Learning algorithm [1]_ 169 | and the matched items are combined into complex objects (it supports nested and 170 | repeated objects), using a tree of parsers, inspired by A Hierarchical 171 | Approach to Wrapper Induction [2]_. 172 | 173 | .. [1] `Yanhong Zhai , Bing Liu, Extracting Web Data Using Instance-Based Learning, World Wide Web, v.10 n.2, p.113-132, June 2007 `_ 174 | 175 | .. [2] `Ion Muslea , Steve Minton , Craig Knoblock, A hierarchical approach to wrapper induction, Proceedings of the third annual conference on Autonomous Agents, p.190-197, April 1999, Seattle, Washington, United States `_ 176 | 177 | Known Issues 178 | ============ 179 | 180 | The training implementation is currently very simple and is only provided for 181 | references purposes, to make it easier to test Scrapely and play with it. On 182 | the other hand, the extraction code is reliable and production-ready. So, if 183 | you want to use Scrapely in production, you should use train() with caution and 184 | make sure it annotates the area of the page you intended. 185 | 186 | Alternatively, you can use the Scrapely command line tool to annotate pages, 187 | which provides more manual control for higher accuracy. 188 | 189 | How does Scrapely relate to `Scrapy`_? 190 | ====================================== 191 | 192 | Despite the similarity in their names, Scrapely and `Scrapy`_ are quite 193 | different things. The only similarity they share is that they both depend on 194 | `w3lib`_, and they are both maintained by the same group of developers (which 195 | is why both are hosted on the `same Github account`_). 196 | 197 | Scrapy is an application framework for building web crawlers, while Scrapely is 198 | a library for extracting structured data from HTML pages. If anything, Scrapely 199 | is more similar to `BeautifulSoup`_ or `lxml`_ than Scrapy. 200 | 201 | Scrapely doesn't depend on Scrapy nor the other way around. In fact, it is 202 | quite common to use Scrapy without Scrapely, and viceversa. 203 | 204 | If you are looking for a complete crawler-scraper solution, there is (at least) 205 | one project called `Slybot`_ that integrates both, but you can definitely use 206 | Scrapely on other web crawlers since it's just a library. 
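
For example, if your own crawler has already downloaded a page body, you can
wrap it in an HtmlPage and feed it to an existing scraper directly, instead of
letting Scrapely fetch the URL itself. The sketch below is illustrative only:
it assumes the scraper was previously saved with Scraper.tofile(), that the
page body is available as a unicode string, and that the download() helper and
the url keyword argument stand in for whatever your crawler provides::

    >>> from scrapely import Scraper
    >>> from scrapely.htmlpage import HtmlPage
    >>> with open('scraper.json') as f:
    ...     s = Scraper.fromfile(f)       # templates saved earlier with s.tofile(f)
    >>> body = download('http://example.com/item')   # hypothetical fetch done by your crawler
    >>> page = HtmlPage(url='http://example.com/item', body=body)
    >>> s.scrape_page(page)
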
207 | 208 | Scrapy has a builtin extraction mechanism called `selectors`_ which (unlike 209 | Scrapely) is based on XPaths. 210 | 211 | 212 | License 213 | ======= 214 | 215 | Scrapely library is licensed under the BSD license. 216 | 217 | .. _Scrapy: http://scrapy.org/ 218 | .. _w3lib: https://github.com/scrapy/w3lib 219 | .. _BeautifulSoup: http://www.crummy.com/software/BeautifulSoup/ 220 | .. _lxml: http://lxml.de/ 221 | .. _same Github account: https://github.com/scrapy 222 | .. _slybot: https://github.com/scrapy/slybot 223 | .. _selectors: http://doc.scrapy.org/en/latest/topics/selectors.html 224 | .. _nose: http://readthedocs.org/docs/nose/en/latest/ 225 | .. _scrapy@freenode: http://webchat.freenode.net/?channels=scrapy 226 | .. _Scrapy community: http://scrapy.org/community/ 227 | .. _tox: https://pypi.python.org/pypi/tox 228 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | numpy 2 | w3lib 3 | six -------------------------------------------------------------------------------- /scrapely/__init__.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | from w3lib.util import str_to_unicode 4 | 5 | from scrapely.htmlpage import HtmlPage, page_to_dict, url_to_page 6 | from scrapely.template import TemplateMaker, best_match 7 | from scrapely.extraction import InstanceBasedLearningExtractor 8 | from scrapely.version import __version__ 9 | 10 | 11 | class Scraper(object): 12 | 13 | def __init__(self, templates=None): 14 | """Initialize an empty scraper.""" 15 | self._templates = templates or [] 16 | self._ex = None 17 | 18 | @classmethod 19 | def fromfile(cls, file): 20 | """Initialize a scraper from a file previously stored by tofile() 21 | method. 22 | """ 23 | templates = [HtmlPage(**x) for x in json.load(file)['templates']] 24 | return cls(templates) 25 | 26 | def tofile(self, file): 27 | """Store the scraper into the given file-like object""" 28 | tpls = [page_to_dict(x) for x in self._templates] 29 | json.dump({'templates': tpls}, file) 30 | 31 | def add_template(self, template): 32 | self._templates.append(template) 33 | self._ex = None 34 | 35 | def train_from_htmlpage(self, htmlpage, data): 36 | assert data, "Cannot train with empty data" 37 | tm = TemplateMaker(htmlpage) 38 | for field, values in data.items(): 39 | if (isinstance(values, (bytes, str)) or 40 | not hasattr(values, '__iter__')): 41 | values = [values] 42 | for value in values: 43 | value = str_to_unicode(value, htmlpage.encoding) 44 | tm.annotate(field, best_match(value)) 45 | self.add_template(tm.get_template()) 46 | 47 | def train(self, url, data, encoding=None): 48 | page = url_to_page(url, encoding) 49 | self.train_from_htmlpage(page, data) 50 | 51 | def scrape(self, url, encoding=None): 52 | page = url_to_page(url, encoding) 53 | return self.scrape_page(page) 54 | 55 | def scrape_page(self, page): 56 | if self._ex is None: 57 | self._ex = InstanceBasedLearningExtractor((t, None) for t in 58 | self._templates) 59 | return self._ex.extract(page)[0] 60 | -------------------------------------------------------------------------------- /scrapely/_htmlpage.pyx: -------------------------------------------------------------------------------- 1 | from cpython.version cimport PY_MAJOR_VERSION 2 | import re 3 | 4 | _ATTR = "((?:[^=/<>\s]|/(?!>))+)(?:\s*=(?:\s*\"(.*?)\"|\s*'(.*?)'|([^>\s]+))?)?" 
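# Clarifying note on the attribute pattern above: each findall() match of the
# compiled regexp (see _ATTR_REGEXP below) yields a 4-tuple of groups:
# (name, double-quoted value, single-quoted value, unquoted value).
# For example, the fragment  class="a" id='b' checked  is expected to yield
# [('class', 'a', '', ''), ('id', '', 'b', ''), ('checked', '', '', '')];
# HtmlTag.attributes keeps only the first non-empty value per name (or None).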
5 | _ATTR_REGEXP = re.compile(_ATTR, re.I | re.DOTALL) 6 | 7 | class HtmlTagType(object): 8 | OPEN_TAG = 1 9 | CLOSE_TAG = 2 10 | UNPAIRED_TAG = 3 11 | 12 | 13 | class HtmlDataFragment(object): 14 | __slots__ = ('start', 'end', 'is_text_content') 15 | 16 | def __init__(self, start, end, is_text_content=False): 17 | self.start = start 18 | self.end = end 19 | self.is_text_content = is_text_content 20 | 21 | def __str__(self): 22 | return "" % (self.start, self.end, self.is_text_content) 23 | 24 | def __repr__(self): 25 | return str(self) 26 | 27 | 28 | class HtmlTag(HtmlDataFragment): 29 | __slots__ = ('tag_type', 'tag', '_attributes', '_attr_text') 30 | 31 | def __init__(self, tag_type, tag, attr_text, start, end): 32 | HtmlDataFragment.__init__(self, start, end) 33 | self.tag_type = tag_type 34 | self.tag = tag 35 | if isinstance(attr_text, dict): 36 | self._attributes = attr_text 37 | self._attr_text = None 38 | else: # defer loading attributes until necessary 39 | self._attributes = {} 40 | self._attr_text = attr_text 41 | 42 | @property 43 | def attributes(self): 44 | if not self._attributes and self._attr_text: 45 | for attr_match in _ATTR_REGEXP.findall(self._attr_text): 46 | name = attr_match[0].lower() 47 | values = [v for v in attr_match[1:] if v] 48 | # According to HTML spec if attribute name is repeated only the 49 | # first one is taken into account 50 | if name not in self._attributes: 51 | self._attributes[name] = values[0] if values else None 52 | return self._attributes 53 | 54 | def __str__(self): 55 | return "" % (self.tag, ', '.join(sorted\ 56 | (["%s: %s" % (k, repr(v)) for k, v in self.attributes.items()])), self.tag_type, self.start, self.end) 57 | 58 | def __repr__(self): 59 | return str(self) 60 | 61 | 62 | cdef class CommentParser: 63 | cdef int start 64 | cdef int end 65 | cdef int open_state, open_count 66 | cdef int close_state, close_count 67 | cdef int inside_comment 68 | 69 | def __init__(self): 70 | self.start = -1 71 | self.end = -1 72 | self.reset() 73 | 74 | cdef void reset(self): 75 | self.open_state = 1 76 | self.close_state = 1 77 | self.open_count = 0 78 | self.close_count = 0 79 | 80 | cdef int parse(self, Py_UCS4 c, int i): 81 | if ((self.open_state == 1 and c == u'<') or 82 | (self.open_state == 2 and c == u'!') or 83 | (self.open_state == 3 and c == u'-') or 84 | (self.open_state == 4 and c == u'-')): 85 | self.open_state += 1 86 | else: 87 | # Handle comment 88 | if self.open_state == 3 and c == u'>': 89 | self.inside_comment = False 90 | self.reset() 91 | self.start, self.end = i - 2, i 92 | return True 93 | self.open_state = 1 94 | if self.open_state == 5: 95 | if self.open_count == 0: 96 | self.start = i - 3 97 | self.open_state = 1 98 | self.open_count = 1 99 | self.inside_comment = True 100 | 101 | if self.close_count < self.open_count: 102 | if self.close_state == 1: 103 | if c == u'-': 104 | self.close_state += 1 105 | elif self.close_state == 2: 106 | if c == u'-': 107 | self.close_state += 1 108 | else: 109 | self.close_state = 1 110 | elif self.close_state == 3: 111 | if c == u'!': 112 | self.close_state = 4 113 | elif c == u'>': 114 | self.close_state = 5 115 | else: 116 | self.close_state = 1 117 | elif self.close_state == 4: 118 | if c == u'>': 119 | self.close_state = 5 120 | else: 121 | self.close_state = 1 122 | 123 | if self.close_state == 5: 124 | self.close_state = 1 125 | self.close_count += 1 126 | if self.close_count >= self.open_count: 127 | self.end = i 128 | self.reset() 129 | self.inside_comment = False 130 | return 
True 131 | return False 132 | 133 | 134 | cdef class ScriptParser: 135 | cdef int start 136 | cdef int end 137 | cdef int state 138 | 139 | def __init__(self): 140 | self.start = -1 141 | self.end = -1 142 | self.state = 1 143 | 144 | cdef int parse(self, Py_UCS4 c, int i): 145 | if self.state == 10: 146 | self.state = 1 147 | if ((self.state == 1 and c == u'<') or 148 | (self.state == 2 and c == u'/') or 149 | (self.state == 3 and c in u'sS') or 150 | (self.state == 4 and c in u'cC') or 151 | (self.state == 5 and c in u'rR') or 152 | (self.state == 6 and c in u'iI') or 153 | (self.state == 7 and c in u'pP') or 154 | (self.state == 8 and c in u'tT') or 155 | (self.state == 9 and c == u'>')): 156 | self.state += 1 157 | else: 158 | self.state = 1 159 | 160 | if self.state == 2: 161 | self.start = i 162 | elif self.state == 10: 163 | self.end = i 164 | 165 | return self.state == 10 166 | 167 | 168 | # directly copied from cython's docs 169 | cdef unicode _ustring(s): 170 | if type(s) is unicode: 171 | # fast path for most common case(s) 172 | return s 173 | elif PY_MAJOR_VERSION < 3 and isinstance(s, bytes): 174 | # only accept byte strings in Python 2.x, not in Py3 175 | return (s).decode('ascii') 176 | elif isinstance(s, unicode): 177 | # an evil cast to might work here in some(!) cases, 178 | # depending on what the further processing does. to be safe, 179 | # we can always create a copy instead 180 | return unicode(s) 181 | else: 182 | raise TypeError('unicode or str expected') 183 | 184 | 185 | cpdef parse_html(s): 186 | cdef int OPEN_TAG = HtmlTagType.OPEN_TAG 187 | cdef int CLOSE_TAG = HtmlTagType.CLOSE_TAG 188 | cdef int UNPAIRED_TAG = HtmlTagType.UNPAIRED_TAG 189 | 190 | cdef unicode text = _ustring(s) 191 | 192 | parsed = [] 193 | comment_parser = CommentParser() 194 | script_parser = ScriptParser() 195 | 196 | cdef int tag_end = -1 # end position of previous tag 197 | cdef int tag_start = -1 # start of current tag 198 | cdef int script = False # True if inside script body 199 | cdef int open_tag = False # True if an open tag symbol has been read 200 | cdef int quote_single = False # True if unpaired single quote 201 | cdef int quote_double = False # True if unpaired double quote 202 | cdef int quoted 203 | 204 | cdef int reset_tag = True 205 | cdef int slash 206 | cdef int has_attributes 207 | cdef int yield_tag 208 | 209 | cdef unicode tag_name 210 | cdef unicode tag_attributes 211 | cdef Py_UCS4 curr_char 212 | cdef Py_UCS4 prev_char = 0 # previous value of curr_char 213 | cdef int i = 0 214 | for curr_char in text: 215 | if reset_tag: 216 | reset_tag = False 217 | slash = False 218 | has_attributes = False 219 | tag_name = u'' 220 | tag_attributes = u'' 221 | yield_tag = False 222 | 223 | if open_tag or script: 224 | if curr_char == u'"' and not quote_single: 225 | quote_double = not quote_double 226 | if curr_char == u"'" and not quote_double: 227 | quote_single = not quote_single 228 | else: 229 | quote_single = quote_double = False 230 | quoted = quote_double or quote_single 231 | 232 | if not quoted: 233 | if comment_parser.parse(curr_char, i): 234 | if (tag_end + 1) < comment_parser.start: 235 | parsed.append( 236 | HtmlDataFragment(tag_end + 1, comment_parser.start, not script)) 237 | tag_end = comment_parser.end 238 | parsed.append( 239 | HtmlDataFragment(comment_parser.start, tag_end + 1, False)) 240 | reset_tag = True 241 | if (comment_parser.end - comment_parser.start) == 2: 242 | open_tag = False 243 | 244 | if comment_parser.inside_comment: 245 | open_tag = False 
246 | else: 247 | if script: 248 | open_tag = False 249 | if script_parser.parse(curr_char, i): 250 | script = False 251 | if (tag_end + 1) < script_parser.start: 252 | parsed.append( 253 | HtmlDataFragment(tag_end + 1, script_parser.start, False)) 254 | tag_end = script_parser.end 255 | parsed.append( 256 | HtmlTag(CLOSE_TAG, 257 | u'script', u'', script_parser.start, tag_end + 1)) 258 | elif open_tag: 259 | if quoted: 260 | if has_attributes: 261 | tag_attributes += curr_char 262 | elif curr_char == u'<': 263 | tag_end = i - 1 264 | yield_tag = True 265 | elif curr_char == u'>': 266 | if prev_char == u'/': 267 | slash = True 268 | tag_end = i 269 | yield_tag = True 270 | open_tag = False 271 | elif curr_char == u'/': 272 | if prev_char == u'<': 273 | slash = True 274 | elif curr_char.isspace(): 275 | if has_attributes: 276 | if prev_char == u'/': 277 | # feature, bug? Maintain compatilibity with previous 278 | # implementation 279 | tag_attributes += u'/' 280 | tag_attributes += curr_char 281 | elif tag_name: 282 | has_attributes = True 283 | else: 284 | if has_attributes: 285 | tag_attributes += curr_char 286 | else: 287 | tag_name += curr_char.lower() 288 | if yield_tag: 289 | if not slash: 290 | tag_type = OPEN_TAG 291 | elif prev_char != u'/': 292 | tag_type = CLOSE_TAG 293 | else: 294 | tag_type = UNPAIRED_TAG 295 | if tag_name != u'!doctype': 296 | parsed.append( 297 | HtmlTag(tag_type, tag_name, 298 | tag_attributes, tag_start, tag_end + 1)) 299 | if tag_name == u'script': 300 | script = True 301 | if open_tag: 302 | tag_start = i 303 | reset_tag = True 304 | else: 305 | open_tag = False 306 | if curr_char == u'<' and not quoted: 307 | open_tag = True 308 | tag_start = i 309 | if tag_start > tag_end + 1: 310 | parsed.append( 311 | HtmlDataFragment(tag_end + 1, tag_start, True)) 312 | tag_end = tag_start 313 | prev_char = curr_char 314 | i += 1 315 | 316 | if tag_end + 1 < len(text): 317 | parsed.append(HtmlDataFragment(tag_end + 1, len(text), True)) 318 | return parsed 319 | -------------------------------------------------------------------------------- /scrapely/descriptor.py: -------------------------------------------------------------------------------- 1 | """ 2 | Extended types for IBL extraction 3 | """ 4 | from itertools import chain 5 | 6 | from scrapely.extractors import text 7 | 8 | 9 | class FieldDescriptor(object): 10 | """description of a scraped attribute""" 11 | __slots__ = ('name', 'description', 'extractor', 'required') 12 | 13 | def __init__(self, name, description, extractor=text, required=False): 14 | self.name = name 15 | self.description = description 16 | self.extractor = extractor 17 | self.required = required 18 | 19 | def __str__(self): 20 | return "FieldDescriptor(%s)" % self.name 21 | 22 | 23 | class ItemDescriptor(object): 24 | """Simple auto scraping item descriptor. 25 | 26 | This used to describe type-specific operations and may be overridden where 27 | necessary. 
28 | """ 29 | 30 | def __init__(self, name, description, attribute_descriptors): 31 | self.name = name 32 | self.description = description 33 | self.attribute_map = dict((d.name, d) for d in attribute_descriptors) 34 | self._required_attributes = [d.name for d in attribute_descriptors \ 35 | if d.required] 36 | 37 | def validated(self, data): 38 | """Only return the items in the data that are valid""" 39 | return [d for d in data if self._item_validates(d)] 40 | 41 | def _item_validates(self, item): 42 | """simply checks that all mandatory attributes are present""" 43 | variant_attrs = set(chain(* 44 | [v.keys() for v in item.get('variants', [])])) 45 | return item and all([(name in item or name in variant_attrs) \ 46 | for name in self._required_attributes]) 47 | 48 | def get_required_attributes(self): 49 | return self._required_attributes 50 | 51 | def __str__(self): 52 | return "ItemDescriptor(%s)" % self.name 53 | 54 | def copy(self): 55 | attribute_descriptors = [] 56 | for d in self.attribute_map.values(): 57 | attribute_descriptors.append(FieldDescriptor(d.name, d.description, d.extractor, d.required)) 58 | return ItemDescriptor(self.name, self.description, attribute_descriptors) 59 | # return self 60 | -------------------------------------------------------------------------------- /scrapely/extraction/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | IBL module 3 | 4 | This contains an extraction algorithm based on the paper Extracting Web Data 5 | Using Instance-Based Learning by Yanhong Zhai and Bing Liu. 6 | 7 | It defines the InstanceBasedLearningExtractor class, which implements this 8 | extraction algorithm. 9 | 10 | Main departures from the original algorithm: 11 | * there is no limit in prefix or suffix size 12 | * we have "attribute adaptors" that allow generic post processing and may 13 | affect the extraction process. For example, a price field may require a 14 | numeric value to be present. 15 | * tags can be inserted to extract regions not wrapped by html tags. These 16 | regions are then identified using the longest unique character prefix and 17 | suffix. 18 | """ 19 | from operator import itemgetter 20 | from .pageparsing import parse_template, parse_extraction_page 21 | from .pageobjects import TokenDict 22 | from .regionextract import (BasicTypeExtractor, TraceExtractor, RepeatedDataExtractor, 23 | AdjacentVariantExtractor, RecordExtractor, TemplatePageExtractor) 24 | 25 | 26 | class InstanceBasedLearningExtractor(object): 27 | """Implementation of the instance based learning algorithm to 28 | extract data from web pages. 29 | """ 30 | _extractor_classes = [ 31 | RepeatedDataExtractor, 32 | AdjacentVariantExtractor, 33 | RepeatedDataExtractor, 34 | AdjacentVariantExtractor, 35 | RepeatedDataExtractor, 36 | RecordExtractor, 37 | ] 38 | 39 | def __init__(self, td_pairs, trace=False, apply_extrarequired=True): 40 | """Initialise this extractor 41 | 42 | td_pairs is a list of (template, item descriptor) pairs. 43 | 44 | templates should contain a sequence of strings, each containing 45 | annotated html that will be used as templates for extraction. 46 | 47 | Tags surrounding areas to be extracted must contain a 48 | 'data-scrapy-annotate' attribute and the value must be the name 49 | of the attribute. If the tag was inserted and was not present in the 50 | original page, the data-scrapy-generated attribute must be present. 
51 | 52 | item descriptors describe how the item will be extracted from target 53 | page, using the corresponding template. 54 | 55 | if trace is true, the returned extracted data will have a 'trace' 56 | property that contains a trace of the extraction execution. 57 | """ 58 | self.token_dict = TokenDict() 59 | parsed_plus_tdpairs = [(parse_template(self.token_dict, td[0]), td) for td in td_pairs] 60 | parsed_plus_epages = ( 61 | (p, parse_extraction_page(self.token_dict, td[0]), td) 62 | for p, td in parsed_plus_tdpairs if _annotation_count(p) 63 | ) 64 | parsed_tdpairs = map(itemgetter(0, 2), parsed_plus_epages) 65 | 66 | modified_parsed_tdpairs = [] 67 | # apply extra required attributes 68 | for parsed, (t, descriptor) in parsed_tdpairs: 69 | if descriptor is not None and apply_extrarequired: 70 | descriptor = descriptor.copy() 71 | for attr in parsed.extra_required_attrs: 72 | descriptor._required_attributes.append(attr) 73 | # not always is present a descriptor for a given attribute 74 | if attr in descriptor.attribute_map: 75 | # not strictly necessary, but avoid possible inconsistencies for user 76 | descriptor.attribute_map[attr].required = True 77 | modified_parsed_tdpairs.append((parsed, (t, descriptor))) 78 | # templates with more attributes are considered first 79 | sorted_tdpairs = sorted(modified_parsed_tdpairs, 80 | key=lambda x: _annotation_count(x[0]), reverse=True) 81 | self.extraction_trees = [ 82 | self.build_extraction_tree(p, td[1], trace) 83 | for p, td in sorted_tdpairs 84 | ] 85 | self.validated = dict( 86 | (td[0].page_id, td[1].validated if td[1] else self._filter_not_none) 87 | for _, td in sorted_tdpairs 88 | ) 89 | 90 | def build_extraction_tree(self, template, type_descriptor, trace=True): 91 | """Build a tree of region extractors corresponding to the 92 | template 93 | """ 94 | attribute_map = type_descriptor.attribute_map if type_descriptor else None 95 | extractors = BasicTypeExtractor.create(template.annotations, attribute_map) 96 | if trace: 97 | extractors = TraceExtractor.apply(template, extractors) 98 | for cls in self._extractor_classes: 99 | extractors = cls.apply(template, extractors) 100 | if trace: 101 | extractors = TraceExtractor.apply(template, extractors) 102 | 103 | return TemplatePageExtractor(template, extractors) 104 | 105 | def extract(self, html, pref_template_id=None): 106 | """extract data from an html page 107 | 108 | If pref_template_url is specified, the template with that url will be 109 | used first. 
110 | """ 111 | extraction_page = parse_extraction_page(self.token_dict, html) 112 | if pref_template_id is not None: 113 | extraction_trees = sorted(self.extraction_trees, 114 | key=lambda x: x.template.id != pref_template_id) 115 | else: 116 | extraction_trees = self.extraction_trees 117 | 118 | for extraction_tree in extraction_trees: 119 | extracted = extraction_tree.extract(extraction_page) 120 | correctly_extracted = self.validated[extraction_tree.template.id](extracted) 121 | if len(correctly_extracted) > 0: 122 | return correctly_extracted, extraction_tree.template 123 | return None, None 124 | 125 | def __str__(self): 126 | return "InstanceBasedLearningExtractor[\n%s\n]" % \ 127 | (',\n'.join(map(str, self.extraction_trees))) 128 | 129 | @staticmethod 130 | def _filter_not_none(items): 131 | return [d for d in items if d is not None] 132 | 133 | 134 | def _annotation_count(template): 135 | return len(template.annotations) 136 | -------------------------------------------------------------------------------- /scrapely/extraction/_similarity.pyx: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | cimport numpy as np 3 | cimport cython 4 | from cpython.version cimport PY_MAJOR_VERSION 5 | 6 | cdef np_kmp_match_length(np.ndarray[np.int64_t, ndim=1] sequence, 7 | np.ndarray[np.int64_t, ndim=1] pattern, 8 | int start=0, 9 | int end=-1): 10 | """Adaptated from KMP substring search: 11 | http://code.activestate.com/recipes/117214-knuth-morris-pratt-string-matching/ 12 | 13 | The algorithm is modified to return the match length at the given position 14 | """ 15 | ret = [] 16 | cdef int m = len(pattern) 17 | if end == -1: 18 | end = m 19 | # build table of shift amounts 20 | cdef np.ndarray[np.int64_t, ndim=1] shifts = np.ones((m + 1,), dtype=int) 21 | cdef int shift = 1 22 | cdef int pos 23 | for pos in range(m): 24 | while shift <= pos and pattern[pos] != pattern[pos-shift]: 25 | shift += shifts[pos-shift] 26 | shifts[pos+1] = shift 27 | 28 | # do the actual search 29 | cdef int startPos = start 30 | cdef int matchLen = 0 31 | cdef int c 32 | for c in sequence[start:]: 33 | if startPos >= end: 34 | break 35 | while matchLen == m or \ 36 | matchLen >= 0 and pattern[matchLen] != c: 37 | if matchLen > 0: 38 | ret.append((startPos, matchLen)) 39 | startPos += shifts[matchLen] 40 | matchLen -= shifts[matchLen] 41 | matchLen += 1 42 | if matchLen > 0 and startPos < end: 43 | ret.append((startPos, matchLen)) 44 | 45 | return ret 46 | 47 | 48 | cdef u_kmp_match_length(unicode sequence, unicode pattern, int start=0, int end=-1): 49 | """Adaptated from KMP substring search: 50 | http://code.activestate.com/recipes/117214-knuth-morris-pratt-string-matching/ 51 | 52 | The algorithm is modified to return the match length at the given position 53 | """ 54 | ret = [] 55 | cdef int m = len(pattern) 56 | if end == -1: 57 | end = m 58 | # build table of shift amounts 59 | cdef np.ndarray[np.int64_t, ndim=1] shifts = np.ones((m + 1,), dtype=int) 60 | cdef int shift = 1 61 | cdef int pos 62 | for pos in range(m): 63 | while shift <= pos and pattern[pos] != pattern[pos-shift]: 64 | shift += shifts[pos-shift] 65 | shifts[pos+1] = shift 66 | 67 | # do the actual search 68 | cdef int startPos = start 69 | cdef int matchLen = 0 70 | cdef Py_UCS4 c 71 | for c in sequence[start:]: 72 | if startPos >= end: 73 | break 74 | while matchLen == m or \ 75 | matchLen >= 0 and pattern[matchLen] != c: 76 | if matchLen > 0: 77 | ret.append((startPos, matchLen)) 78 | 
startPos += shifts[matchLen] 79 | matchLen -= shifts[matchLen] 80 | matchLen += 1 81 | if matchLen > 0 and startPos < end: 82 | ret.append((startPos, matchLen)) 83 | 84 | return ret 85 | 86 | 87 | cdef np_naive_match_length(np.ndarray[np.int64_t, ndim=1] sequence, 88 | np.ndarray[np.int64_t, ndim=1] pattern, 89 | int start=0, 90 | int end=-1): 91 | ret = [] 92 | cdef int m = len(sequence) 93 | cdef int n = min(m, len(pattern)) 94 | cdef int i 95 | cdef int j 96 | cdef int k 97 | if end == -1: 98 | end = m 99 | else: 100 | end = min(end, m) 101 | for i in range(start, end): 102 | j = 0 103 | k = i 104 | while sequence[k] == pattern[j]: 105 | j += 1 106 | k += 1 107 | if k == m or j == n: 108 | break 109 | if j > 0: 110 | ret.append((i, j)) 111 | return ret 112 | 113 | 114 | cdef u_naive_match_length(unicode sequence, 115 | unicode pattern, int start=0, int end=-1): 116 | ret = [] 117 | cdef int m = len(sequence) 118 | cdef int n = min(m, len(pattern)) 119 | cdef int i 120 | cdef int j 121 | cdef int k 122 | if end == -1: 123 | end = m 124 | else: 125 | end = min(end, m) 126 | for i in range(start, end): 127 | j = 0 128 | k = i 129 | while sequence[k] == pattern[j]: 130 | j += 1 131 | k += 1 132 | if k == m or j == n: 133 | break 134 | if j > 0: 135 | ret.append((i, j)) 136 | return ret 137 | 138 | 139 | cdef unicode _ustring(s): 140 | if type(s) is unicode: 141 | # fast path for most common case(s) 142 | return s 143 | elif PY_MAJOR_VERSION < 3 and isinstance(s, bytes): 144 | # only accept byte strings in Python 2.x, not in Py3 145 | return (s).decode('ascii') 146 | elif isinstance(s, unicode): 147 | # an evil cast to might work here in some(!) cases, 148 | # depending on what the further processing does. to be safe, 149 | # we can always create a copy instead 150 | return unicode(s) 151 | else: 152 | raise TypeError('Expected str or unicode') 153 | 154 | 155 | cpdef naive_match_length(sequence, pattern, int start=0, int end=-1): 156 | if isinstance(sequence, np.ndarray): 157 | if isinstance(pattern, np.ndarray): 158 | return np_naive_match_length(sequence, pattern, start, end) 159 | else: 160 | raise TypeError('Different types for sequence and pattern') 161 | else: 162 | return u_naive_match_length( 163 | _ustring(sequence), _ustring(pattern), start, end) 164 | 165 | cpdef kmp_match_length(sequence, pattern, int start=0, int end=-1): 166 | if isinstance(sequence, np.ndarray): 167 | if isinstance(pattern, np.ndarray): 168 | return np_kmp_match_length(sequence, pattern, start, end) 169 | else: 170 | raise TypeError('Different types for sequence and pattern') 171 | else: 172 | return u_kmp_match_length( 173 | _ustring(sequence), _ustring(pattern), start, end) 174 | -------------------------------------------------------------------------------- /scrapely/extraction/pageobjects.py: -------------------------------------------------------------------------------- 1 | """ 2 | Page objects 3 | 4 | This module contains objects representing pages and parts of pages (e.g. tokens 5 | and annotations) used in the instance based learning algorithm. 
6 | """ 7 | from itertools import chain 8 | from numpy import array, ndarray 9 | 10 | from scrapely.htmlpage import HtmlTagType, HtmlPageRegion, HtmlPageParsedRegion 11 | 12 | 13 | class TokenType(HtmlTagType): 14 | """constants for token types""" 15 | WORD = 0 16 | 17 | 18 | class TokenDict(object): 19 | """Mapping from parse tokens to integers 20 | 21 | >>> d = TokenDict() 22 | >>> d.tokenid('i') 23 | 0 24 | >>> d.tokenid('b') 25 | 1 26 | >>> d.tokenid('i') 27 | 0 28 | 29 | Tokens can be searched for by id 30 | >>> d.find_token(1) 31 | 'b' 32 | 33 | The lower 24 bits store the token reference and the higher bits the type. 34 | """ 35 | 36 | def __init__(self): 37 | self.token_ids = {} 38 | 39 | def tokenid(self, token, token_type=TokenType.WORD): 40 | """create an integer id from the token and token type passed""" 41 | tid = self.token_ids.setdefault(token, len(self.token_ids)) 42 | return tid | (token_type << 24) 43 | 44 | @staticmethod 45 | def token_type(token): 46 | """extract the token type from the token id passed""" 47 | return token >> 24 48 | 49 | def find_token(self, tid): 50 | """Search for a tag with the given ID 51 | 52 | This is O(N) and is only intended for debugging 53 | """ 54 | tid &= 0xFFFFFF 55 | if tid >= len(self.token_ids) or tid < 0: 56 | raise ValueError("tag id %s out of range" % tid) 57 | 58 | for (token, token_id) in self.token_ids.items(): 59 | if token_id == tid: 60 | return token 61 | assert False, "token dictionary is corrupt" 62 | 63 | def token_string(self, tid): 64 | """create a string representation of a token 65 | 66 | This is O(N). 67 | """ 68 | templates = ["%s", "<%s>", "", "<%s/>"] 69 | return templates[tid >> 24] % self.find_token(tid) 70 | 71 | 72 | class PageRegion(object): 73 | """A region in a page, defined by a start and end index""" 74 | 75 | __slots__ = ('start_index', 'end_index') 76 | 77 | def __init__(self, start, end): 78 | self.start_index = start 79 | self.end_index = end 80 | 81 | def __str__(self): 82 | return "%s(%s, %s)" % (self.__class__.__name__, self.start_index, 83 | self.end_index) 84 | 85 | def __repr__(self): 86 | return str(self) 87 | 88 | 89 | class FragmentedHtmlPageRegion(HtmlPageParsedRegion, HtmlPageRegion): 90 | """An HtmlPageRegion consisting of possibly non-contiguous sub-regions""" 91 | def __new__(cls, htmlpage, regions): 92 | text = u''.join(regions) 93 | return HtmlPageRegion.__new__(cls, htmlpage, text) 94 | 95 | def __init__(self, htmlpage, regions): 96 | self.htmlpage = htmlpage 97 | self.regions = regions 98 | 99 | @property 100 | def parsed_fragments(self): 101 | return chain(*(r.parsed_fragments for r in self.regions)) 102 | 103 | 104 | class Page(object): 105 | """Basic representation of a page. 
This consists of a reference to a 106 | dictionary of tokens and an array of raw token ids 107 | """ 108 | 109 | __slots__ = ('token_dict', 'page_tokens', 'htmlpage') 110 | 111 | def __init__(self, htmlpage, token_dict, page_tokens): 112 | self.htmlpage = htmlpage 113 | self.token_dict = token_dict 114 | # use a numpy array because we can index/slice easily and efficiently 115 | if not isinstance(page_tokens, ndarray): 116 | page_tokens = array(page_tokens) 117 | self.page_tokens = page_tokens 118 | 119 | 120 | class TemplatePage(Page): 121 | __slots__ = ('annotations', 'id', 'ignored_regions', 'extra_required_attrs') 122 | 123 | def __init__(self, htmlpage, token_dict, page_tokens, annotations, \ 124 | template_id=None, ignored_regions=None, extra_required=None): 125 | Page.__init__(self, htmlpage, token_dict, page_tokens) 126 | # ensure order is the same as start tag order in the original page 127 | annotations = sorted(annotations, key=lambda x: x.end_index, reverse=True) 128 | self.annotations = sorted(annotations, key=lambda x: x.start_index) 129 | self.id = template_id 130 | self.ignored_regions = [i if isinstance(i, PageRegion) else PageRegion(*i) \ 131 | for i in (ignored_regions or [])] 132 | self.extra_required_attrs = set(extra_required or []) 133 | 134 | def __str__(self): 135 | summary = [] 136 | for index, token in enumerate(self.page_tokens): 137 | text = "%s: %s" % (index, self.token_dict.find_token(token)) 138 | summary.append(text) 139 | return "TemplatePage\n============\nTokens: (index, token)\n%s\nAnnotations: %s\n" % \ 140 | ('\n'.join(summary), '\n'.join(map(str, self.annotations))) 141 | 142 | 143 | class ExtractionPage(Page): 144 | """Parsed data belonging to a web page upon which we wish to perform 145 | extraction. 146 | """ 147 | __slots__ = ('token_page_indexes', ) 148 | 149 | def __init__(self, htmlpage, token_dict, page_tokens, token_page_indexes): 150 | """Construct a new ExtractionPage 151 | 152 | Arguments: 153 | `htmlpage`: The source HtmlPage 154 | `token_dict`: Token Dictionary used for tokenization 155 | `page_tokens': array of page tokens for matching 156 | `token_page_indexes`: indexes of each token in the parsed htmlpage 157 | """ 158 | Page.__init__(self, htmlpage, token_dict, page_tokens) 159 | self.token_page_indexes = token_page_indexes 160 | 161 | def htmlpage_region(self, start_token_index, end_token_index): 162 | """The region in the HtmlPage corresponding to the area defined by 163 | the start_token_index and the end_token_index 164 | 165 | This includes the tokens at the specified indexes 166 | """ 167 | start = self.token_page_indexes[start_token_index] 168 | end = self.token_page_indexes[end_token_index] 169 | return self.htmlpage.subregion(start, end) 170 | 171 | def htmlpage_region_inside(self, start_token_index, end_token_index): 172 | """The region in the HtmlPage corresponding to the area between 173 | the start_token_index and the end_token_index. 
174 | 175 | This excludes the tokens at the specified indexes 176 | """ 177 | start = self.token_page_indexes[start_token_index] + 1 178 | end = self.token_page_indexes[end_token_index] - 1 179 | return self.htmlpage.subregion(start, end) 180 | 181 | def htmlpage_tag(self, token_index): 182 | """The HtmlPage tag at corresponding to the token at token_index""" 183 | return self.htmlpage.parsed_body[self.token_page_indexes[token_index]] 184 | 185 | def __str__(self): 186 | summary = [] 187 | for token, tindex in zip(self.page_tokens, self.token_page_indexes): 188 | text = "%s page[%s]: %s" % (self.token_dict.find_token(token), 189 | tindex, self.htmlpage.parsed_body[tindex]) 190 | summary.append(text) 191 | return "ExtractionPage\n==============\nTokens: %s\n\nRaw text: %s\n\n" \ 192 | % ('\n'.join(summary), self.htmlpage.body) 193 | 194 | 195 | class AnnotationText(object): 196 | __slots__ = ('start_text', 'follow_text') 197 | 198 | def __init__(self, start_text=None, follow_text=None): 199 | self.start_text = start_text 200 | self.follow_text = follow_text 201 | 202 | def __str__(self): 203 | return "AnnotationText(%s..%s)" % \ 204 | (repr(self.start_text), repr(self.follow_text)) 205 | 206 | 207 | class AnnotationTag(PageRegion): 208 | """A tag that annotates part of the document 209 | 210 | It has the following properties: 211 | start_index - index of the token for the opening tag 212 | end_index - index of the token for the closing tag 213 | surrounds_attribute - the attribute name surrounded by this tag 214 | tag_attributes - list of (tag attribute, extracted attribute) tuples 215 | for each item to be extracted from a tag attribute 216 | annotation_text - text prefix and suffix for the attribute to be extracted 217 | metadata - dict with annotation data not used by IBL extractor 218 | """ 219 | __slots__ = ('surrounds_attribute', 'start_index', 'end_index', 220 | 'tag_attributes', 'annotation_text', 'variant_id', 221 | 'metadata') 222 | 223 | def __init__(self, start_index, end_index, surrounds_attribute=None, 224 | annotation_text=None, tag_attributes=None, variant_id=None): 225 | PageRegion.__init__(self, start_index, end_index) 226 | self.surrounds_attribute = surrounds_attribute 227 | self.annotation_text = annotation_text 228 | self.tag_attributes = tag_attributes or [] 229 | self.variant_id = variant_id 230 | self.metadata = {} 231 | 232 | def __str__(self): 233 | return "AnnotationTag(%s)" % ", ".join( 234 | ["%s=%s" % (s, getattr(self, s)) \ 235 | for s in self.__slots__ if getattr(self, s)]) 236 | 237 | def __repr__(self): 238 | return str(self) 239 | 240 | -------------------------------------------------------------------------------- /scrapely/extraction/pageparsing.py: -------------------------------------------------------------------------------- 1 | """ 2 | Page parsing 3 | 4 | Parsing of web pages for extraction task. 
5 | """ 6 | import json 7 | from collections import defaultdict 8 | from numpy import array 9 | 10 | from scrapely.htmlpage import HtmlTagType, HtmlTag, HtmlPage 11 | from scrapely.extraction.pageobjects import (AnnotationTag, 12 | TemplatePage, ExtractionPage, AnnotationText, TokenDict) 13 | 14 | 15 | def parse_strings(template_html, extraction_html): 16 | """Create a template and extraction page from raw strings 17 | 18 | this is useful for testing purposes 19 | """ 20 | t = TokenDict() 21 | template_page = HtmlPage(body=template_html) 22 | extraction_page = HtmlPage(body=extraction_html) 23 | return (parse_template(t, template_page), 24 | parse_extraction_page(t, extraction_page)) 25 | 26 | 27 | def parse_template(token_dict, template_html): 28 | """Create an TemplatePage object by parsing the annotated html""" 29 | parser = TemplatePageParser(token_dict) 30 | parser.feed(template_html) 31 | return parser.to_template() 32 | 33 | 34 | def parse_extraction_page(token_dict, page_html): 35 | """Create an ExtractionPage object by parsing the html""" 36 | parser = ExtractionPageParser(token_dict) 37 | parser.feed(page_html) 38 | return parser.to_extraction_page() 39 | 40 | 41 | class InstanceLearningParser(object): 42 | """Base parser for instance based learning algorithm 43 | 44 | This does not require correct HTML and the parsing method should not alter 45 | the original tag order. It is important that parsing results do not vary. 46 | """ 47 | def __init__(self, token_dict): 48 | self.token_dict = token_dict 49 | self.token_list = [] 50 | 51 | def _add_token(self, token, token_type, start, end): 52 | tid = self.token_dict.tokenid(token, token_type) 53 | self.token_list.append(tid) 54 | 55 | def feed(self, html_page): 56 | self.html_page = html_page 57 | self.previous_element_class = None 58 | for index, data in enumerate(html_page.parsed_body): 59 | if isinstance(data, HtmlTag): 60 | self._add_token(data.tag, data.tag_type, data.start, data.end) 61 | self.handle_tag(data, index) 62 | else: 63 | self.handle_data(data, index) 64 | self.previous_element_class = data.__class__ 65 | 66 | def handle_data(self, html_data_fragment, index): 67 | pass 68 | 69 | def handle_tag(self, html_tag, index): 70 | pass 71 | 72 | 73 | _END_UNPAIREDTAG_TAGS = ["form", "div", "p", "table", "tr", "td"] 74 | _AUTO_CLOSE_TAGS_ON_OPEN = { 75 | # the given keys closes the tags in the list 76 | "p": ["p"], 77 | "option": ["option"], 78 | } 79 | _AUTO_CLOSE_TAGS_ON_CLOSE = { 80 | "select": ["option"], 81 | } 82 | 83 | 84 | class TemplatePageParser(InstanceLearningParser): 85 | """Template parsing for instance based learning algorithm""" 86 | 87 | def __init__(self, token_dict): 88 | InstanceLearningParser.__init__(self, token_dict) 89 | self.annotations = [] 90 | self.ignored_regions = [] 91 | self.extra_required_attrs = [] 92 | self.ignored_tag_stacks = defaultdict(list) 93 | # tag names that have not been completed 94 | self.labelled_tag_stacks = defaultdict(list) 95 | self.replacement_stacks = defaultdict(list) 96 | self.unpairedtag_stack = [] 97 | self.variant_stack = [] 98 | self.prev_data = None 99 | self.last_text_region = None 100 | self.next_tag_index = 0 101 | 102 | def handle_tag(self, html_tag, index): 103 | if self.last_text_region: 104 | self._process_text('') 105 | 106 | if html_tag.tag_type == HtmlTagType.OPEN_TAG: 107 | self._handle_open_tag(html_tag) 108 | elif html_tag.tag_type == HtmlTagType.CLOSE_TAG: 109 | self._handle_close_tag(html_tag) 110 | else: 111 | # the tag is not paired, it can contain 
only attribute annotations 112 | self._handle_unpaired_tag(html_tag) 113 | 114 | @staticmethod 115 | def _read_template_annotation(html_tag): 116 | template_attr = html_tag.attributes.get('data-scrapy-annotate') 117 | if template_attr is None: 118 | return None 119 | unescaped = template_attr.replace('"', '"') 120 | return json.loads(unescaped) 121 | 122 | @staticmethod 123 | def _read_bool_template_attribute(html_tag, attribute): 124 | return html_tag.attributes.get("data-scrapy-" + attribute) == "true" 125 | 126 | def _close_unpaired_tag(self): 127 | self.unpairedtag_stack[0].end_index = self.next_tag_index 128 | self.unpairedtag_stack = [] 129 | 130 | def _handle_unpaired_tag(self, html_tag): 131 | if self._read_bool_template_attribute(html_tag, "ignore") and html_tag.tag == "img": 132 | self.ignored_regions.append((self.next_tag_index, self.next_tag_index + 1)) 133 | elif self._read_bool_template_attribute(html_tag, "ignore-beneath"): 134 | self.ignored_regions.append((self.next_tag_index, None)) 135 | jannotation = self._read_template_annotation(html_tag) 136 | if jannotation: 137 | if self.unpairedtag_stack: 138 | self._close_unpaired_tag() 139 | 140 | annotation = AnnotationTag(self.next_tag_index, self.next_tag_index + 1) 141 | attribute_annotations = jannotation.pop('annotations', {}).items() 142 | content_key = jannotation.pop('text-content', 'content') 143 | for extract_attribute, tag_value in attribute_annotations: 144 | if extract_attribute == content_key: 145 | annotation.surrounds_attribute = tag_value 146 | self.unpairedtag_stack.append(annotation) 147 | else: 148 | annotation.tag_attributes.append((extract_attribute, tag_value)) 149 | self.annotations.append(annotation) 150 | 151 | self.extra_required_attrs.extend(jannotation.pop('required', [])) 152 | variant_id = jannotation.pop('variant', 0) 153 | if variant_id > 0: 154 | annotation.variant_id = variant_id 155 | assert jannotation.pop("generated", False) == False 156 | annotation.metadata = jannotation 157 | 158 | self.next_tag_index += 1 159 | 160 | def _handle_open_tag(self, html_tag): 161 | if self._read_bool_template_attribute(html_tag, "ignore"): 162 | if html_tag.tag == "img": 163 | self.ignored_regions.append((self.next_tag_index, self.next_tag_index + 1)) 164 | else: 165 | self.ignored_regions.append((self.next_tag_index, None)) 166 | self.ignored_tag_stacks[html_tag.tag].append(html_tag) 167 | 168 | elif self.ignored_tag_stacks.get(html_tag.tag): 169 | self.ignored_tag_stacks[html_tag.tag].append(None) 170 | if self._read_bool_template_attribute(html_tag, "ignore-beneath"): 171 | self.ignored_regions.append((self.next_tag_index, None)) 172 | 173 | replacement = html_tag.attributes.pop("data-scrapy-replacement", None) 174 | if replacement: 175 | self.token_list.pop() 176 | self._add_token(replacement, html_tag.tag_type, html_tag.start, html_tag.end) 177 | self.replacement_stacks[html_tag.tag].append(replacement) 178 | elif html_tag.tag in self.replacement_stacks: 179 | self.replacement_stacks[html_tag.tag].append(None) 180 | 181 | if self.unpairedtag_stack: 182 | if html_tag.tag in _END_UNPAIREDTAG_TAGS: 183 | self._close_unpaired_tag() 184 | else: 185 | self.unpairedtag_stack.append(html_tag.tag) 186 | 187 | tagname = replacement or self._update_replacement_stack(html_tag) 188 | self._handle_unclosed_tags(tagname, _AUTO_CLOSE_TAGS_ON_OPEN) 189 | 190 | jannotation = self._read_template_annotation(html_tag) 191 | if not jannotation: 192 | if tagname in self.labelled_tag_stacks: 193 | # add this tag to the stack 
to match correct end tag 194 | self.labelled_tag_stacks[tagname].append(None) 195 | self.next_tag_index += 1 196 | return 197 | 198 | annotation = AnnotationTag(self.next_tag_index, None) 199 | if jannotation.pop('generated', False): 200 | self.token_list.pop() 201 | annotation.start_index -= 1 202 | if self.previous_element_class == HtmlTag: 203 | annotation.annotation_text = AnnotationText('') 204 | else: 205 | annotation.annotation_text = AnnotationText(self.prev_data) 206 | if self._read_bool_template_attribute(html_tag, "ignore") \ 207 | or self._read_bool_template_attribute(html_tag, "ignore-beneath"): 208 | ignored = self.ignored_regions.pop() 209 | self.ignored_regions.append((ignored[0]-1, ignored[1])) 210 | 211 | self.extra_required_attrs.extend(jannotation.pop('required', [])) 212 | 213 | attribute_annotations = jannotation.pop('annotations', {}).items() 214 | content_key = jannotation.pop('text-content', 'content') 215 | for extract_attribute, tag_value in attribute_annotations: 216 | if extract_attribute == content_key: 217 | annotation.surrounds_attribute = tag_value 218 | else: 219 | annotation.tag_attributes.append((extract_attribute, tag_value)) 220 | 221 | variant_id = jannotation.pop('variant', 0) 222 | if variant_id > 0: 223 | if annotation.surrounds_attribute is not None: 224 | self.variant_stack.append(variant_id) 225 | else: 226 | annotation.variant_id = variant_id 227 | 228 | annotation.metadata = jannotation 229 | 230 | if annotation.annotation_text is None: 231 | self.next_tag_index += 1 232 | if self.variant_stack and annotation.variant_id is None: 233 | variant_id = self.variant_stack[-1] 234 | if variant_id == '0': 235 | variant_id = None 236 | annotation.variant_id = variant_id 237 | 238 | # look for a closing tag if the content is important 239 | if annotation.surrounds_attribute: 240 | self.labelled_tag_stacks[tagname].append(annotation) 241 | else: 242 | annotation.end_index = annotation.start_index + 1 243 | self.annotations.append(annotation) 244 | 245 | def _handle_close_tag(self, html_tag): 246 | 247 | if self.unpairedtag_stack: 248 | if html_tag.tag == self.unpairedtag_stack[-1]: 249 | self.unpairedtag_stack.pop() 250 | else: 251 | self._close_unpaired_tag() 252 | 253 | ignored_tags = self.ignored_tag_stacks.get(html_tag.tag) 254 | if ignored_tags is not None: 255 | tag = ignored_tags.pop() 256 | if isinstance(tag, HtmlTag): 257 | for i in range(-1, -len(self.ignored_regions) - 1, -1): 258 | if self.ignored_regions[i][1] is None: 259 | self.ignored_regions[i] = (self.ignored_regions[i][0], self.next_tag_index) 260 | break 261 | if len(ignored_tags) == 0: 262 | del self.ignored_tag_stacks[html_tag.tag] 263 | 264 | tagname = self._update_replacement_stack(html_tag) 265 | self._handle_unclosed_tags(tagname, _AUTO_CLOSE_TAGS_ON_CLOSE) 266 | 267 | labelled_tags = self.labelled_tag_stacks.get(tagname) 268 | if labelled_tags is None: 269 | self.next_tag_index += 1 270 | return 271 | annotation = labelled_tags.pop() 272 | if annotation is None: 273 | self.next_tag_index += 1 274 | else: 275 | annotation.end_index = self.next_tag_index 276 | self.annotations.append(annotation) 277 | if annotation.annotation_text is not None: 278 | self.token_list.pop() 279 | self.last_text_region = annotation 280 | else: 281 | self.next_tag_index += 1 282 | if len(labelled_tags) == 0: 283 | del self.labelled_tag_stacks[tagname] 284 | if annotation.variant_id and self.variant_stack: 285 | prev = self.variant_stack.pop() 286 | if prev != annotation.variant_id: 287 | raise 
ValueError("unbalanced variant annotation tags") 288 | 289 | def _update_replacement_stack(self, html_tag): 290 | replacement = html_tag.tag 291 | if html_tag.tag in self.replacement_stacks: 292 | replacement = self.replacement_stacks[html_tag.tag].pop() 293 | if replacement: 294 | self.token_list.pop() 295 | self._add_token(replacement, html_tag.tag_type, html_tag.start, html_tag.end) 296 | if len(self.replacement_stacks[html_tag.tag]) == 0: 297 | del self.replacement_stacks[html_tag.tag] 298 | return replacement 299 | 300 | def _handle_unclosed_tags(self, tagname, auto_close_tags): 301 | """I.e. can't be a p inside another p. Also, an open p element closes 302 | a previous open p element""" 303 | if tagname in auto_close_tags: 304 | for _close_tag in auto_close_tags[tagname]: 305 | if _close_tag in self.labelled_tag_stacks: 306 | annotation = self.labelled_tag_stacks.pop(_close_tag)[0] 307 | annotation.end_index = self.next_tag_index 308 | self.annotations.append(annotation) 309 | break 310 | return tagname 311 | 312 | def handle_data(self, html_data_fragment, index): 313 | fragment_text = self.html_page.fragment_data(html_data_fragment) 314 | self._process_text(fragment_text) 315 | 316 | def _process_text(self, text): 317 | if self.last_text_region is not None: 318 | self.last_text_region.annotation_text.follow_text = text 319 | self.last_text_region = None 320 | self.prev_data = text 321 | 322 | def to_template(self): 323 | """create a TemplatePage from the data fed to this parser""" 324 | return TemplatePage(self.html_page, self.token_dict, self.token_list, self.annotations, 325 | self.html_page.page_id, self.ignored_regions, self.extra_required_attrs) 326 | 327 | 328 | class ExtractionPageParser(InstanceLearningParser): 329 | """Parse an HTML page for extraction using the instance based learning 330 | algorithm 331 | 332 | This needs to extract the tokens in a similar way to LabelledPageParser, 333 | it needs to also maintain a mapping from token index to the original content 334 | so that once regions are identified, the original content can be extracted. 
335 | """ 336 | def __init__(self, token_dict): 337 | InstanceLearningParser.__init__(self, token_dict) 338 | self._page_token_indexes = [] 339 | 340 | def handle_tag(self, html_tag, index): 341 | self._page_token_indexes.append(index) 342 | 343 | def to_extraction_page(self): 344 | return ExtractionPage(self.html_page, self.token_dict, array(self.token_list), 345 | self._page_token_indexes) 346 | -------------------------------------------------------------------------------- /scrapely/extraction/regionextract.py: -------------------------------------------------------------------------------- 1 | """ 2 | Region Extract 3 | 4 | Custom extraction for regions in a document 5 | """ 6 | import re 7 | import operator 8 | import copy 9 | import pprint 10 | import six 11 | 12 | from itertools import groupby, starmap 13 | 14 | from numpy import array 15 | 16 | from six.moves import zip as izip, xrange, StringIO 17 | 18 | from scrapely.descriptor import FieldDescriptor 19 | from scrapely.htmlpage import HtmlPageRegion 20 | from scrapely.extraction.similarity import ( 21 | similar_region, longest_unique_subsequence, common_prefix) 22 | from scrapely.extraction.pageobjects import ( 23 | AnnotationTag, PageRegion, FragmentedHtmlPageRegion) 24 | 25 | _EXTRACT_HTML = lambda x: x 26 | _DEFAULT_DESCRIPTOR = FieldDescriptor('none', None) 27 | 28 | __all__ = ['BasicTypeExtractor', 29 | 'TraceExtractor', 30 | 'RepeatedDataExtractor', 31 | 'AdjacentVariantExtractor', 32 | 'RecordExtractor', 33 | 'TemplatePageExtractor', 34 | 'TextRegionDataExtractor', 35 | 'attrs2dict', 36 | 'labelled_element'] 37 | 38 | 39 | def _int_cmp(a, op, b): 40 | op = getattr(operator, op) 41 | a = -float('inf') if a is None else a 42 | b = -float('inf') if b is None else b 43 | return op(a, b) 44 | 45 | 46 | def labelled_element(obj): 47 | """ 48 | Returns labelled element of the object (extractor or labelled region) 49 | """ 50 | return getattr(obj, 'annotation', obj) 51 | 52 | 53 | def _compose(f, g): 54 | """given unary functions f and g, return a function that computes f(g(x)) 55 | """ 56 | def _exec(x): 57 | ret = g(x) 58 | return f(ret) if ret is not None else None 59 | return _exec 60 | 61 | 62 | class BasicTypeExtractor(object): 63 | """The BasicTypeExtractor extracts single attributes corresponding to 64 | annotations. 65 | 66 | For example: 67 | >>> from scrapely.extraction.pageparsing import parse_strings 68 | >>> template, page = parse_strings( \ 69 | u'

[annotated template markup elided: a single element whose text "x" is annotated as "name"]', u'[target page markup elided: the matching element contains " a name"]
') 70 | >>> ex = BasicTypeExtractor(template.annotations[0]) 71 | >>> ex.extract(page, 0, 1, None) 72 | [(u'name', u' a name')] 73 | 74 | It supports attribute descriptors 75 | >>> descriptor = FieldDescriptor('name', None, lambda x: x.strip()) 76 | >>> ex = BasicTypeExtractor(template.annotations[0], {'name': descriptor}) 77 | >>> ex.extract(page, 0, 1, None) 78 | [(u'name', u'a name')] 79 | 80 | It supports ignoring regions 81 | >>> template, page = parse_strings(\ 82 | u'
[annotated template markup elided: text "x xx", annotated as "name"]',\ 83 | u'[target page markup elided: " a name" followed by " id-9"]
') 84 | >>> ex = BasicTypeExtractor(template.annotations[0]) 85 | >>> ex.extract(page, 0, 3, [PageRegion(1, 2)]) 86 | [(u'name', u'a name')] 87 | """ 88 | 89 | def __init__(self, annotation, attribute_descriptors=None): 90 | self.annotation = annotation 91 | if attribute_descriptors is None: 92 | attribute_descriptors = {} 93 | 94 | if annotation.surrounds_attribute: 95 | descriptor = attribute_descriptors.get(annotation.surrounds_attribute) 96 | if descriptor: 97 | self.content_validate = descriptor.extractor 98 | else: 99 | self.content_validate = _EXTRACT_HTML 100 | self.extract = self._extract_content 101 | 102 | if annotation.tag_attributes: 103 | self.tag_data = [] 104 | for (tag_attr, extraction_attr) in annotation.tag_attributes: 105 | descriptor = attribute_descriptors.get(extraction_attr) 106 | extractf = descriptor.extractor if descriptor else _EXTRACT_HTML 107 | self.tag_data.append((extractf, tag_attr, extraction_attr)) 108 | 109 | self.extract = self._extract_both if \ 110 | annotation.surrounds_attribute else self._extract_attribute 111 | 112 | def _extract_both(self, page, start_index, end_index, ignored_regions=None, **kwargs): 113 | return self._extract_content(page, start_index, end_index, ignored_regions) + \ 114 | self._extract_attribute(page, start_index, end_index, ignored_regions) 115 | 116 | def _extract_content(self, extraction_page, start_index, end_index, ignored_regions=None, **kwargs): 117 | """extract content between annotation indexes""" 118 | if ignored_regions and (_int_cmp(start_index, 'le', ignored_regions[0].start_index) and 119 | _int_cmp(end_index, 'ge', ignored_regions[-1].end_index)): 120 | starts = [start_index] + [i.end_index for i in ignored_regions if i.end_index is not None] 121 | ends = [i.start_index for i in ignored_regions] 122 | if starts[-1] is not None: 123 | ends.append(end_index) 124 | included_regions = izip(starts, ends) 125 | if ends[0] is None: 126 | included_regions.next() 127 | regions = starmap(extraction_page.htmlpage_region_inside, included_regions) 128 | region = FragmentedHtmlPageRegion(extraction_page.htmlpage, list(regions)) 129 | else: 130 | region = extraction_page.htmlpage_region_inside(start_index, end_index) 131 | validated = self.content_validate(region) 132 | return [(self.annotation.surrounds_attribute, validated)] if validated else [] 133 | 134 | def _extract_attribute(self, extraction_page, start_index, end_index, ignored_regions=None, **kwargs): 135 | data = [] 136 | for (f, ta, ea) in self.tag_data: 137 | tag_value = extraction_page.htmlpage_tag(start_index).attributes.get(ta) 138 | if tag_value: 139 | region = HtmlPageRegion(extraction_page.htmlpage, tag_value) 140 | extracted = f(region) 141 | if extracted is not None: 142 | data.append((ea, extracted)) 143 | return data 144 | 145 | @classmethod 146 | def create(cls, annotations, attribute_descriptors=None): 147 | """Create a list of basic extractors from the given annotations 148 | and attribute descriptors 149 | """ 150 | if attribute_descriptors is None: 151 | attribute_descriptors = {} 152 | return [cls._create_basic_extractor(annotation, attribute_descriptors) \ 153 | for annotation in annotations \ 154 | if annotation.surrounds_attribute or annotation.tag_attributes] 155 | 156 | @staticmethod 157 | def _create_basic_extractor(annotation, attribute_descriptors): 158 | """Create a basic type extractor for the annotation""" 159 | text_region = annotation.annotation_text 160 | if text_region is not None: 161 | region_extract = 
TextRegionDataExtractor(text_region.start_text, 162 | text_region.follow_text).extract 163 | # copy attribute_descriptors and add the text extractor 164 | descriptor_copy = dict(attribute_descriptors) 165 | attr_descr = descriptor_copy.get(annotation.surrounds_attribute, 166 | _DEFAULT_DESCRIPTOR) 167 | attr_descr = copy.copy(attr_descr) 168 | attr_descr.extractor = _compose(attr_descr.extractor, region_extract) 169 | descriptor_copy[annotation.surrounds_attribute] = attr_descr 170 | attribute_descriptors = descriptor_copy 171 | return BasicTypeExtractor(annotation, attribute_descriptors) 172 | 173 | def extracted_item(self): 174 | """key used to identify the item extracted""" 175 | return (self.annotation.surrounds_attribute, self.annotation.tag_attributes) 176 | 177 | def __repr__(self): 178 | return str(self) 179 | 180 | def __str__(self): 181 | messages = ['BasicTypeExtractor('] 182 | if self.annotation.surrounds_attribute: 183 | messages.append(self.annotation.surrounds_attribute) 184 | if self.content_validate != _EXTRACT_HTML: 185 | messages += [', extracted with \'', 186 | self.content_validate.__name__, '\''] 187 | 188 | if self.annotation.tag_attributes: 189 | if self.annotation.surrounds_attribute: 190 | messages.append(';') 191 | for (f, ta, ea) in self.tag_data: 192 | messages += [ea, ': tag attribute "', ta, '"'] 193 | if f != _EXTRACT_HTML: 194 | messages += [', validated by ', str(f)] 195 | messages.append(", template[%s:%s])" % \ 196 | (self.annotation.start_index, self.annotation.end_index)) 197 | return ''.join(messages) 198 | 199 | 200 | class RepeatedDataExtractor(object): 201 | """Data extractor for handling repeated data""" 202 | 203 | def __init__(self, prefix, suffix, extractors): 204 | self.prefix = array(prefix) 205 | self.suffix = array(suffix) 206 | self.extractor = copy.copy(extractors[0]) 207 | self.annotation = copy.copy(self.extractor.annotation) 208 | self.annotation.end_index = extractors[-1].annotation.end_index 209 | 210 | def extract(self, page, start_index, end_index, ignored_regions, **kwargs): 211 | """repeatedly find regions bounded by the repeated 212 | prefix and suffix and extract them 213 | """ 214 | prefixlen = len(self.prefix) 215 | suffixlen = len(self.suffix) 216 | index = max(0, start_index - prefixlen) 217 | max_index = min(len(page.page_tokens) - suffixlen, end_index + len(self.suffix)) 218 | max_start_index = max_index - prefixlen 219 | extracted = [] 220 | while index <= max_start_index: 221 | prefix_end = index + prefixlen 222 | if (page.page_tokens[index:prefix_end] == self.prefix).all(): 223 | for peek in xrange(prefix_end, max_index + 1): 224 | if (page.page_tokens[peek:peek + suffixlen] \ 225 | == self.suffix).all(): 226 | extracted += self.extractor.extract(page, 227 | prefix_end - 1, peek, ignored_regions, suffix_max_length=suffixlen) 228 | index = max(peek, index + 1) 229 | break 230 | else: 231 | break 232 | else: 233 | index += 1 234 | return extracted 235 | 236 | @staticmethod 237 | def apply(template, extractors): 238 | tokens = template.page_tokens 239 | output_extractors = [] 240 | group_key = lambda x: (x.extracted_item(), x.annotation.variant_id) 241 | for extr_key, extraction_group in groupby(extractors, group_key): 242 | extraction_group = list(extraction_group) 243 | if extr_key is None or len(extraction_group) == 1: 244 | output_extractors += extraction_group 245 | continue 246 | 247 | separating_tokens = [ \ 248 | tokens[x.annotation.end_index:y.annotation.start_index+1] \ 249 | for (x, y) in 
zip(extraction_group[:-1], extraction_group[1:])] 250 | 251 | # calculate the common prefix 252 | group_start = extraction_group[0].annotation.start_index 253 | prefix_start = max(0, group_start - len(separating_tokens[0])) 254 | first_prefix = tokens[prefix_start:group_start+1] 255 | prefixes = [first_prefix] + separating_tokens 256 | prefix_pattern = list(reversed( 257 | common_prefix(*map(reversed, prefixes)))) 258 | 259 | # calculate the common suffix 260 | group_end = extraction_group[-1].annotation.end_index 261 | last_suffix = tokens[group_end:group_end + \ 262 | len(separating_tokens[-1])] 263 | suffixes = separating_tokens + [last_suffix] 264 | suffix_pattern = common_prefix(*suffixes) 265 | 266 | # create a repeated data extractor, if there is a suitable 267 | # prefix and suffix. (TODO: tune this heuristic) 268 | matchlen = len(prefix_pattern) + len(suffix_pattern) 269 | if matchlen >= len(separating_tokens): 270 | group_extractor = RepeatedDataExtractor(prefix_pattern, 271 | suffix_pattern, extraction_group) 272 | output_extractors.append(group_extractor) 273 | else: 274 | output_extractors += extraction_group 275 | return output_extractors 276 | 277 | def extracted_item(self): 278 | """key used to identify the item extracted""" 279 | return self.extractor.extracted_item() 280 | 281 | def __repr__(self): 282 | return "Repeat(%r)" % self.extractor 283 | 284 | def __str__(self): 285 | return "Repeat(%s)" % self.extractor 286 | 287 | 288 | class TransposedDataExtractor(object): 289 | """ """ 290 | pass 291 | 292 | 293 | _namef = operator.itemgetter(0) 294 | _valuef = operator.itemgetter(1) 295 | def attrs2dict(attributes): 296 | """convert a list of attributes (name, value) tuples 297 | into a dict of lists. 298 | 299 | For example: 300 | >>> l = [('name', 'sofa'), ('colour', 'red'), ('colour', 'green')] 301 | >>> attrs2dict(l) == {'name': ['sofa'], 'colour': ['red', 'green']} 302 | True 303 | """ 304 | grouped_data = groupby(sorted(attributes, key=_namef), _namef) 305 | return dict((name, list(map(_valuef, data))) for (name, data) in grouped_data) 306 | 307 | 308 | class RecordExtractor(object): 309 | """The RecordExtractor will extract records given annotations. 310 | 311 | It looks for a similar region in the target document, using the ibl 312 | similarity algorithm. The annotations are partitioned by the first similar 313 | region found and searched recursively. 314 | 315 | Records are represented as dicts mapping attribute names to lists 316 | containing their values. 317 | 318 | For example: 319 | >>> from scrapely.extraction.pageparsing import parse_strings 320 | >>> template, page = parse_strings( \ 321 | u'

[annotated template markup elided: an element whose text "x" is annotated as "name"]' + \ 322 | u'[a second annotated element whose text "y" is annotated as "description"]', \ 323 | u'[target page markup elided: elements containing "name" and "description"]
') 324 | >>> basic_extractors = list(map(BasicTypeExtractor, template.annotations)) 325 | >>> ex = RecordExtractor.apply(template, basic_extractors)[0] 326 | >>> ex.extract(page) == [{u'description': [u'description'], u'name': [u'name']}] 327 | True 328 | """ 329 | 330 | def __init__(self, extractors, template_tokens): 331 | """Construct a RecordExtractor for the given annotations and their 332 | corresponding region extractors 333 | """ 334 | self.extractors = extractors 335 | self.template_tokens = template_tokens 336 | self.template_ignored_regions = [] 337 | start_index = min(e.annotation.start_index for e in extractors) 338 | end_index = max(e.annotation.end_index for e in extractors) 339 | self.annotation = AnnotationTag(start_index, end_index) 340 | self.best_match = longest_unique_subsequence 341 | 342 | def extract(self, page, start_index=0, end_index=None, ignored_regions=None, **kwargs): 343 | """extract data from an extraction page 344 | 345 | The region in the page to be extracted from may be specified using 346 | start_index and end_index 347 | """ 348 | if ignored_regions is None: 349 | ignored_regions = [] 350 | extractors = sorted(self.extractors + ignored_regions, key=lambda x: labelled_element(x).start_index) 351 | _, _, attributes = self._doextract(page, extractors, start_index, end_index, **kwargs) 352 | # collect variant data, maintaining the order of variants 353 | variant_ids = []; variants = {}; items = [] 354 | for k, v in attributes: 355 | if isinstance(k, six.integer_types): 356 | if k in variants: 357 | variants[k] += v 358 | else: 359 | variant_ids.append(k) 360 | variants[k] = v 361 | else: 362 | items.append((k, v)) 363 | 364 | variant_records = [('variants', attrs2dict(variants[vid])) \ 365 | for vid in variant_ids] 366 | items += variant_records 367 | return [attrs2dict(items)] 368 | 369 | def _doextract(self, page, extractors, start_index, end_index, nested_regions=None, ignored_regions=None, **kwargs): 370 | """Carry out extraction of records using the given annotations 371 | in the page tokens bounded by start_index and end_index 372 | """ 373 | # reorder extractors leaving nested ones for the end and separating 374 | # ignore regions 375 | nested_regions = nested_regions or [] 376 | ignored_regions = ignored_regions or [] 377 | current_extractor, following_extractors = extractors[0], extractors[1:] 378 | while (following_extractors and 379 | _int_cmp(labelled_element(following_extractors[0]).start_index, 'lt', 380 | labelled_element(current_extractor).end_index)): 381 | ex = following_extractors.pop(0) 382 | labelled = labelled_element(ex) 383 | if (isinstance(labelled, AnnotationTag) or 384 | (nested_regions and 385 | _int_cmp(labelled_element(nested_regions[-1]).start_index, 'lt', labelled.start_index) and 386 | _int_cmp(labelled.start_index, 'lt', labelled_element(nested_regions[-1]).end_index))): 387 | nested_regions.append(ex) 388 | else: 389 | ignored_regions.append(ex) 390 | extracted_data = [] 391 | # end_index is inclusive, but similar_region treats it as exclusive 392 | end_index_exclusive = None if end_index is None else end_index + 1 393 | labelled = labelled_element(current_extractor) 394 | score, pindex, sindex = \ 395 | similar_region(page.page_tokens, self.template_tokens, 396 | labelled, start_index, end_index_exclusive, self.best_match, **kwargs) 397 | if score > 0: 398 | if isinstance(labelled, AnnotationTag): 399 | similar_ignored_regions = [] 400 | start = pindex 401 | for i in ignored_regions: 402 | s, p, e = 
similar_region(page.page_tokens, self.template_tokens, 403 | i, start, sindex, self.best_match, **kwargs) 404 | if s > 0: 405 | similar_ignored_regions.append(PageRegion(p, e)) 406 | start = e or start 407 | extracted_data = current_extractor.extract(page, pindex, sindex, similar_ignored_regions, **kwargs) 408 | if extracted_data: 409 | if current_extractor.annotation.variant_id: 410 | extracted_data = [(current_extractor.annotation.variant_id, extracted_data)] 411 | 412 | if nested_regions: 413 | _, _, nested_data = self._doextract(page, nested_regions, pindex, sindex, **kwargs) 414 | extracted_data += nested_data 415 | if following_extractors: 416 | _, _, following_data = self._doextract(page, following_extractors, sindex or start_index, end_index, **kwargs) 417 | extracted_data += following_data 418 | 419 | elif following_extractors: 420 | end_index, _, following_data = self._doextract(page, following_extractors, start_index, end_index, **kwargs) 421 | if end_index is not None: 422 | pindex, sindex, extracted_data = self._doextract(page, [current_extractor], start_index, end_index - 1, nested_regions, ignored_regions, **kwargs) 423 | extracted_data += following_data 424 | elif nested_regions: 425 | _, _, nested_data = self._doextract(page, nested_regions, start_index, end_index, **kwargs) 426 | extracted_data += nested_data 427 | return pindex, sindex, extracted_data 428 | 429 | @classmethod 430 | def apply(cls, template, extractors): 431 | return [cls(extractors, template.page_tokens)] 432 | 433 | def extracted_item(self): 434 | return [self.__class__.__name__] + \ 435 | sorted((e.extracted_item() for e in self.extractors), 436 | key=lambda x: '' if x[0] is None else x[0]) 437 | 438 | def __repr__(self): 439 | return str(self) 440 | 441 | def __str__(self): 442 | stream = StringIO() 443 | pprint.pprint(self.extractors, stream) 444 | stream.seek(0) 445 | template_data = stream.read() 446 | if template_data: 447 | return "%s[\n%s\n]" % (self.__class__.__name__, template_data) 448 | return "%s[none]" % (self.__class__.__name__) 449 | 450 | 451 | class AdjacentVariantExtractor(RecordExtractor): 452 | """Extractor for variants 453 | 454 | This simply extends the RecordExtractor to output data in a "variants" 455 | attribute. 456 | 457 | The "apply" method will only apply to variants whose items are all adjacent and 458 | it will appear as one record so that it can be handled by the RepeatedDataExtractor. 
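For instance, when a template has two adjacent variant annotations, the variant attributes
end up grouped under a single "variants" key of the record (field names and values below
are illustrative):

    {'name': [u'Sofa'],
     'variants': [{'colour': [u'red'], 'price': [u'100.00']},
                  {'colour': [u'green'], 'price': [u'110.00']}]}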
459 | """ 460 | 461 | def extract(self, page, start_index=0, end_index=None, ignored_regions=None, **kwargs): 462 | records = RecordExtractor.extract(self, page, start_index, end_index, ignored_regions, **kwargs) 463 | return [('variants', r['variants'][0]) for r in records if r] 464 | 465 | @classmethod 466 | def apply(cls, template, extractors): 467 | adjacent_variants = set([]) 468 | variantf = lambda x: x.annotation.variant_id 469 | for vid, egroup in groupby(extractors, variantf): 470 | if not vid: 471 | continue 472 | if vid in adjacent_variants: 473 | adjacent_variants.remove(vid) 474 | else: 475 | adjacent_variants.add(vid) 476 | new_extractors = [] 477 | for variant, group_seq in groupby(extractors, variantf): 478 | group_seq = list(group_seq) 479 | if variant in adjacent_variants: 480 | record_extractor = AdjacentVariantExtractor(group_seq, template.page_tokens) 481 | new_extractors.append(record_extractor) 482 | else: 483 | new_extractors += group_seq 484 | return new_extractors 485 | 486 | def __repr__(self): 487 | return str(self) 488 | 489 | 490 | class TraceExtractor(object): 491 | """Extractor that wraps other extractors and prints an execution 492 | trace of the extraction process to aid debugging 493 | """ 494 | 495 | def __init__(self, traced, template): 496 | self.traced = traced 497 | self.annotation = traced.annotation 498 | tstart = traced.annotation.start_index 499 | tend = traced.annotation.end_index 500 | self.tprefix = " ".join([template.token_dict.token_string(t) 501 | for t in template.page_tokens[tstart-4:tstart+1]]) 502 | self.tsuffix = " ".join([template.token_dict.token_string(t) 503 | for t in template.page_tokens[tend:tend+5]]) 504 | 505 | def summarize_trace(self, page, start, end, ret): 506 | text_start = page.htmlpage.parsed_body[page.token_page_indexes[start]].start 507 | text_end = page.htmlpage.parsed_body[page.token_page_indexes[end or -1]].end 508 | page_snippet = "(...%s)%s(%s...)" % ( 509 | page.htmlpage.body[text_start-50:text_start].replace('\n', ' '), 510 | page.htmlpage.body[text_start:text_end], 511 | page.htmlpage.body[text_end:text_end+50].replace('\n', ' ')) 512 | pre_summary = "\nstart %s page[%s:%s]\n" % (self.traced.__class__.__name__, start, end) 513 | post_summary = """ 514 | %s page[%s:%s] 515 | 516 | html 517 | %s 518 | 519 | annotation 520 | ...%s 521 | %s 522 | %s... 
523 | 524 | extracted 525 | %s 526 | """ % (self.traced.__class__.__name__, start, end, page_snippet, 527 | self.tprefix, self.annotation, self.tsuffix, [r for r in ret if 'trace' not in r]) 528 | return pre_summary, post_summary 529 | 530 | def extract(self, page, start, end, ignored_regions, **kwargs): 531 | ret = self.traced.extract(page, start, end, ignored_regions, **kwargs) 532 | if not ret: 533 | return [] 534 | 535 | # handle records by inserting a trace and combining with variant traces 536 | if len(ret) == 1 and isinstance(ret[0], dict): 537 | item = ret[0] 538 | trace = item.pop('trace', []) 539 | variants = item.get('variants', ()) 540 | for variant in variants: 541 | trace += variant.pop('trace', []) 542 | pre_summary, post_summary = self.summarize_trace(page, start, end, ret) 543 | item['trace'] = [pre_summary] + trace + [post_summary] 544 | return ret 545 | 546 | pre_summary, post_summary = self.summarize_trace(page, start, end, ret) 547 | return [('trace', pre_summary)] + ret + [('trace', post_summary)] 548 | 549 | @staticmethod 550 | def apply(template, extractors): 551 | output = [] 552 | for extractor in extractors: 553 | if not isinstance(extractor, TraceExtractor): 554 | extractor = TraceExtractor(extractor, template) 555 | output.append(extractor) 556 | return output 557 | 558 | def extracted_item(self): 559 | return self.traced.extracted_item() 560 | 561 | def __repr__(self): 562 | return "Trace(%s)" % repr(self.traced) 563 | 564 | 565 | class TemplatePageExtractor(object): 566 | """Top level extractor for a template page""" 567 | 568 | def __init__(self, template, extractors): 569 | self.extractors = extractors 570 | self.template = template 571 | 572 | def extract(self, page, start_index=0, end_index=None): 573 | items = [] 574 | for extractor in self.extractors: 575 | items.extend(extractor.extract(page, start_index, end_index, self.template.ignored_regions)) 576 | return [self._merge_list_dicts(items)] 577 | 578 | def _merge_list_dicts(self, dicts): 579 | res = {} 580 | for d in dicts: 581 | res.update(d) 582 | return res 583 | 584 | def __repr__(self): 585 | return repr(self.extractors) 586 | 587 | def __str__(self): 588 | return str(self.extractors) 589 | 590 | 591 | # Based on nltk's WordPunctTokenizer 592 | _tokenize = re.compile(r'\w+|[^\w\s]+', re.UNICODE | re.MULTILINE | re.DOTALL).findall 593 | 594 | class TextRegionDataExtractor(object): 595 | """Data Extractor for extracting text fragments from an annotation page 596 | fragment or string. It extracts based on the longest unique prefix and 597 | suffix. 598 | 599 | for example: 600 | >>> extractor = TextRegionDataExtractor('designed by ', '.') 601 | >>> extractor.extract_text("by Marc Newson.") 602 | 'Marc Newson' 603 | 604 | Both prefix and suffix are optional: 605 | >>> extractor = TextRegionDataExtractor('designed by ') 606 | >>> extractor.extract_text("by Marc Newson.") 607 | 'Marc Newson.' 
608 | >>> extractor = TextRegionDataExtractor(suffix='.') 609 | >>> extractor.extract_text("by Marc Newson.") 610 | 'by Marc Newson' 611 | 612 | It requires a minimum match of at least one word or punctuation character: 613 | >>> extractor = TextRegionDataExtractor('designed by') 614 | >>> extractor.extract_text("y Marc Newson.") is None 615 | True 616 | """ 617 | def __init__(self, prefix=None, suffix=None): 618 | self.prefix = (prefix or '')[::-1] 619 | self.suffix = suffix or '' 620 | self.minprefix = self.minmatch(self.prefix) 621 | self.minsuffix = self.minmatch(self.suffix) 622 | 623 | @staticmethod 624 | def minmatch(matchstring): 625 | """the minimum number of characters that should match in order 626 | to consider it a match for that string. 627 | 628 | This uses the last word of punctuation character 629 | """ 630 | tokens = _tokenize(matchstring or '') 631 | return len(tokens[0]) if tokens else 0 632 | 633 | def extract(self, region): 634 | """Extract a region from the region passed""" 635 | text = self.extract_text(region) 636 | return HtmlPageRegion(region.htmlpage, text) if text else None 637 | 638 | def extract_text(self, text): 639 | """Extract a substring from the text""" 640 | pref_index = 0 641 | if self.minprefix > 0: 642 | rev_idx, plen = longest_unique_subsequence(text[::-1], self.prefix) 643 | if plen is None or plen < self.minprefix: 644 | return None 645 | pref_index = -rev_idx 646 | if self.minsuffix == 0: 647 | return text[pref_index:] 648 | sidx, slen = longest_unique_subsequence(text[pref_index:], self.suffix) 649 | if slen is None or slen < self.minsuffix: 650 | return None 651 | return text[pref_index:pref_index + sidx] 652 | -------------------------------------------------------------------------------- /scrapely/extraction/similarity.py: -------------------------------------------------------------------------------- 1 | """ 2 | Similarity calculation for Instance based extraction algorithm. 3 | """ 4 | from itertools import count 5 | from six.moves import zip as izip, xrange 6 | from operator import itemgetter 7 | from heapq import nlargest 8 | 9 | try: 10 | # For typical use cases (small sequences and patterns) the naive approach 11 | # actually runs faster than KMP algorithm 12 | from . _similarity import naive_match_length 13 | except ImportError: 14 | def naive_match_length(to_search, subsequence, range_start, range_end): 15 | startval = subsequence[0] 16 | return ((i, common_prefix_length(to_search[i:], subsequence)) 17 | for i in xrange(range_start, range_end) 18 | if startval == to_search[i]) 19 | 20 | 21 | def common_prefix_length(a, b): 22 | """Calculate the length of the common prefix in both sequences passed. 
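This helper is the building block of the naive_match_length fallback above: for every
start position whose first token matches, it reports the common prefix length from that
position. A small illustration, assuming the pure-Python fallback is in use:

    list(naive_match_length([6, 3, 2, 4, 3, 2, 5], [2, 4, 3], 0, 7))
    # [(2, 3), (5, 1)]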
23 | 24 | For example, the common prefix in this example is [1, 3] 25 | >>> common_prefix_length([1, 3, 4], [1, 3, 5, 1]) 26 | 2 27 | 28 | If there is no common prefix, 0 is returned 29 | >>> common_prefix_length([1], []) 30 | 0 31 | """ 32 | i = -1 33 | for i, x, y in izip(count(), a, b): 34 | if x != y: 35 | return i 36 | return i + 1 37 | 38 | 39 | def common_prefix(*sequences): 40 | """determine the common prefix of all sequences passed 41 | 42 | For example: 43 | >>> common_prefix('abcdef', 'abc', 'abac') 44 | ['a', 'b'] 45 | """ 46 | prefix = [] 47 | for sample in izip(*sequences): 48 | first = sample[0] 49 | if all(x == first for x in sample[1:]): 50 | prefix.append(first) 51 | else: 52 | break 53 | return prefix 54 | 55 | 56 | def longest_unique_subsequence(to_search, subsequence, range_start=0, 57 | range_end=None): 58 | """Find the longest unique subsequence of items in an array or string. This 59 | searches to_search looking for the longest overlapping 60 | match with subsequence. If the largest match is unique (there is no other 61 | match of equivalent length), the index and length of match is returned. If 62 | there is no match, (None, None) is returned. 63 | 64 | Please see section 3.2 of Extracting Web Data Using Instance-Based 65 | Learning by Yanhong Zhai and Bing Liu 66 | 67 | For example, the longest match occurs at index 2 and has length 3 68 | >>> import numpy as np 69 | >>> to_search = np.array([6, 3, 2, 4, 3, 2, 5]) 70 | >>> longest_unique_subsequence(to_search, np.array([2, 4, 3])) 71 | (2, 3) 72 | 73 | When there are two equally long subsequences, it does not generate a match 74 | >>> longest_unique_subsequence(to_search, np.array([3, 2])) 75 | (None, None) 76 | 77 | range_start and range_end specify a range in which the match must begin 78 | >>> longest_unique_subsequence(to_search, np.array([3, 2]), 3) 79 | (4, 2) 80 | >>> longest_unique_subsequence(to_search, np.array([3, 2]), 0, 2) 81 | (1, 2) 82 | """ 83 | if range_end is None: 84 | range_end = len(to_search) 85 | matches = naive_match_length(to_search, subsequence, range_start, range_end) 86 | best2 = nlargest(2, matches, key=itemgetter(1)) 87 | # if there is a single unique best match, return that 88 | if len(best2) == 1 or len(best2) == 2 and best2[0][1] != best2[1][1]: 89 | return best2[0][0], best2[0][1] 90 | return None, None 91 | 92 | 93 | def first_longest_subsequence(to_search, subsequence, range_start=0, range_end=None): 94 | """Find the first longest subsequence of the items in a list or array. 95 | 96 | range_start and range_end specify a range in which the match must begin. 
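Unlike longest_unique_subsequence above, a tie between equally long matches is not
treated as a failure here; the earliest candidate is preferred. similar_region below
accepts the matching strategy as its best_match argument, so either function can be
plugged in (RecordExtractor defaults to the unique variant). A hedged sketch of swapping
the strategy, with illustrative variable names:

    score, start, end = similar_region(page_tokens, template_tokens, annotation,
                                       best_match=first_longest_subsequence)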
97 | 98 | For example, the longest match occurs at index 2 and has length 3 99 | >>> to_search = [6, 3, 2, 4, 3, 2, 5] 100 | >>> first_longest_subsequence(to_search, [2, 4, 3]) 101 | (2, 3) 102 | 103 | When there are two equally long subsequences, it return the nearest one) 104 | >>> first_longest_subsequence(to_search, [3, 2]) 105 | (1, 2) 106 | 107 | >>> first_longest_subsequence([], [3, 2]) 108 | (None, None) 109 | """ 110 | startval = subsequence[0] 111 | if range_end is None: 112 | range_end = len(to_search) 113 | 114 | # the comparison to startval ensures only matches of length >= 1 and 115 | # reduces the number of calls to the common_length function 116 | matches = [(i, common_prefix_length(to_search[i:], subsequence)) 117 | for i in xrange(range_start, range_end) if startval == to_search[i]] 118 | 119 | if not matches: 120 | return None, None 121 | # secondary sort on position and prefer the smaller one (near) 122 | return max(matches, key=lambda x: (x[1], -x[0])) 123 | 124 | 125 | def similar_region(extracted_tokens, template_tokens, labelled_region, 126 | range_start=0, range_end=None, best_match=longest_unique_subsequence, **kwargs): 127 | """Given a labelled section in a template, identify a similar region 128 | in the extracted tokens. 129 | 130 | The start and end index of the similar region in the extracted tokens 131 | is returned. 132 | 133 | This will return a tuple containing: 134 | (match score, start index, end index) 135 | where match score is the sum of the length of the matching prefix and 136 | suffix. If there is no unique match, (0, None, None) will be returned. 137 | 138 | start_index and end_index specify a range in which the match must begin 139 | """ 140 | data_length = len(extracted_tokens) 141 | if range_end is None: 142 | range_end = data_length 143 | # calculate the prefix score by finding a longest subsequence in 144 | # reverse order 145 | reverse_prefix = template_tokens[labelled_region.start_index::-1] 146 | reverse_tokens = extracted_tokens[::-1] 147 | (rpi, pscore) = best_match(reverse_tokens, reverse_prefix, 148 | data_length - range_end, data_length - range_start) 149 | 150 | # None means nothing extracted. Index 0 means there cannot be a suffix. 151 | if not rpi: 152 | return 0, None, None 153 | 154 | # convert to an index from the start instead of in reverse 155 | prefix_index = len(extracted_tokens) - rpi - 1 156 | 157 | if labelled_region.end_index is None: 158 | return pscore, prefix_index, None 159 | elif kwargs.get("suffix_max_length", None) == 0: 160 | return pscore, prefix_index, range_start + 1 161 | 162 | suffix = template_tokens[labelled_region.end_index:] 163 | 164 | # if it's not a paired tag, use the best match between prefix & suffix 165 | if labelled_region.start_index == labelled_region.end_index: 166 | (match_index, sscore) = best_match(extracted_tokens, 167 | suffix, prefix_index, range_end) 168 | if match_index == prefix_index: 169 | return (pscore + sscore, prefix_index, match_index) 170 | elif pscore > sscore: 171 | return pscore, prefix_index, prefix_index 172 | elif sscore > pscore: 173 | return sscore, match_index, match_index 174 | return 0, None, None 175 | 176 | # calculate the suffix match on the tokens following the prefix. We could 177 | # consider the whole page and require a good match. 
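    # Note: the (pscore + sscore, prefix_index, match_index) tuple built below
    # combines the evidence from both sides of the labelled region.  Callers
    # such as RecordExtractor._doextract only accept the location when the
    # combined score is positive, roughly:
    #
    #     score, pindex, sindex = similar_region(page.page_tokens,
    #         self.template_tokens, labelled, start_index,
    #         end_index_exclusive, self.best_match, **kwargs)
    #     if score > 0:
    #         ...  # extract between pindex and sindex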
178 | (match_index, sscore) = best_match(extracted_tokens, 179 | suffix, prefix_index + 1, range_end) 180 | if match_index is None: 181 | return 0, None, None 182 | return (pscore + sscore, prefix_index, match_index) 183 | -------------------------------------------------------------------------------- /scrapely/extractors.py: -------------------------------------------------------------------------------- 1 | """ 2 | Extractors collection 3 | """ 4 | 5 | import re 6 | 7 | from six.moves.urllib.parse import urlparse, urlunparse 8 | from six import unichr 9 | 10 | from w3lib.html import replace_entities, remove_comments 11 | from w3lib.url import safe_url_string 12 | 13 | from scrapely.htmlpage import HtmlPage, HtmlTag, HtmlTagType 14 | 15 | _NUMERIC_ENTITIES = re.compile("&#([0-9]+)(?:;|\s)", re.U) 16 | _PRICE_NUMBER_RE = re.compile('(?:^|[^a-zA-Z0-9])(\d+(?:\.\d+)?)(?:$|[^a-zA-Z0-9])') 17 | _NUMBER_RE = re.compile('(-?\d+(?:\.\d+)?)') 18 | _DECIMAL_RE = re.compile(r'(-?\d[\d\,\.]*)', re.U | re.M) 19 | 20 | _IMAGES = ( 21 | 'mng', 'pct', 'bmp', 'gif', 'jpg', 'jpeg', 'png', 'pst', 'psp', 'tif', 22 | 'tiff', 'ai', 'drw', 'dxf', 'eps', 'ps', 'svg', 23 | ) 24 | 25 | _IMAGES_TYPES = '|'.join(_IMAGES) 26 | _CSS_IMAGERE = re.compile("background(?:-image)?\s*:\s*url\((.*?)\)", re.I) 27 | _BASE_PATH_RE = "/?(?:[^/]+/)*(?:.+%s)" 28 | _IMAGE_PATH_RE = re.compile(_BASE_PATH_RE % '\.(?:%s)' % _IMAGES_TYPES, re.I) 29 | _GENERIC_PATH_RE = re.compile(_BASE_PATH_RE % '', re.I) 30 | _WS = re.compile("\s+", re.U) 31 | 32 | # tags to keep (only for attributes with markup) 33 | _TAGS_TO_KEEP = frozenset(['br', 'p', 'big', 'em', 'small', 'strong', 'sub', 34 | 'sup', 'ins', 'del', 'code', 'kbd', 'samp', 'tt', 'var', 'pre', 'listing', 35 | 'plaintext', 'abbr', 'acronym', 'address', 'bdo', 'blockquote', 'q', 36 | 'cite', 'dfn', 'table', 'tr', 'th', 'td', 'tbody', 'ul', 'ol', 'li', 'dl', 37 | 'dd', 'dt']) 38 | 39 | # tag names to be replaced by other tag names (overrides tags_to_keep) 40 | _TAGS_TO_REPLACE = { 41 | 'h1': 'strong', 42 | 'h2': 'strong', 43 | 'h3': 'strong', 44 | 'h4': 'strong', 45 | 'h5': 'strong', 46 | 'h6': 'strong', 47 | 'b' : 'strong', 48 | 'i' : 'em', 49 | } 50 | # tags whoose content will be completely removed (recursively) 51 | # (overrides tags_to_keep and tags_to_replace) 52 | _TAGS_TO_PURGE = ('script', 'style', 'img', 'input') 53 | # tags that are automatically closed in HTML4 and HTML5 54 | _VOID_TAGS = frozenset([ 55 | 'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'keygen', 56 | 'link', 'meta', 'param', 'source', 'track', 'wbr' 57 | ]) 58 | 59 | 60 | def htmlregion(text): 61 | """convenience function to make an html region from text. 62 | This is useful for testing 63 | """ 64 | return HtmlPage(body=text).subregion() 65 | 66 | 67 | def notags(region, tag_replace=u' '): 68 | """Removes all html tags""" 69 | fragments = getattr(region, 'parsed_fragments', None) 70 | if fragments is None: 71 | return region 72 | page = region.htmlpage 73 | data = [page.fragment_data(f) for f in fragments if not isinstance(f, HtmlTag)] 74 | return tag_replace.join(data) 75 | 76 | 77 | def text(region): 78 | """Converts HTML to text. There is no attempt at formatting other than 79 | removing excessive whitespace, 80 | 81 | For example: 82 | >>> t = lambda s: text(htmlregion(s)) 83 | >>> t(u'

test

') 84 | u'test' 85 | 86 | Leading and trailing whitespace are removed 87 | >>> t(u'

test

') 88 | u'test' 89 | 90 | Comments are removed 91 | >>> t(u'test me') 92 | u'test me' 93 | 94 | Text between script tags is ignored 95 | >>> t(u"scripts are ignored") 96 | u'scripts are ignored' 97 | 98 | HTML entities are converted to text 99 | >>> t(u"only £42") 100 | u'only \\xa342' 101 | 102 | >>> t(u"

The text

is here

") 103 | u'The text is here' 104 | """ 105 | text = replace_entities(region.text_content, encoding=region.htmlpage.encoding) 106 | return _WS.sub(u' ', text).strip() 107 | 108 | 109 | def safehtml(region, allowed_tags=_TAGS_TO_KEEP, replace_tags=_TAGS_TO_REPLACE, 110 | tags_to_purge=_TAGS_TO_PURGE): 111 | """Creates an HTML subset, using a whitelist of HTML tags. 112 | 113 | The HTML generated is safe for display on a website,without escaping and 114 | should not cause formatting problems. 115 | 116 | Behaviour can be customized through the following keyword arguments: 117 | allowed_tags is a set of tags that are allowed 118 | replace_tags is a mapping of tags to alternative tags to substitute. 119 | tags_to_purge are tags that, if encountered, all content between the 120 | opening and closing tag is removed. 121 | 122 | For example: 123 | >>> t = lambda s, keep=_TAGS_TO_KEEP: safehtml(htmlregion(s), keep) 124 | >>> t(u'test test') 125 | u'test test' 126 | 127 | Some tags, like script, are completely removed 128 | >>> t(u'test') 129 | u'test' 130 | 131 | replace_tags define tags that are converted. By default all headers, bold 132 | and indenting are converted to strong and em. 133 | >>> t(u'

header

test bold indent') 134 | u'header test bold indent' 135 | 136 | tags_to_purge defines the tags that have enclosing content removed: 137 | >>> t(u'

test

') 138 | u'

test

' 139 | 140 | Comments are stripped, but entities are not converted 141 | >>> t(u' only £42') 142 | u'only £42' 143 | 144 | Paired tags are closed 145 | >>> t(u'

test') 146 | u'

test

' 147 | 148 | >>> t(u'

test
test

') 149 | u'

test
test

' 150 | 151 | Include or exclude tags that you want 152 | >>> t(u'Keep and
tags') 153 | u'Keep and tags' 154 | >>> tags = set(list(_TAGS_TO_KEEP)[:] + ['meta', 'hr']) 155 | >>> t(u'Keep and
tags', tags) 156 | u'Keep and
tags
' 157 | 158 | Handle void tags when purged 159 | >>> t(u'Keep content around img tag') 160 | u'Keep content around img tag' 161 | 162 | """ 163 | tagstack = [] 164 | 165 | def _process_tag(tag): 166 | tagstr = replace_tags.get(tag.tag, tag.tag) 167 | if tagstr not in allowed_tags: 168 | return 169 | if tag.tag_type == HtmlTagType.OPEN_TAG: 170 | if tag.tag not in _VOID_TAGS: 171 | tagstack.append(tagstr) 172 | return u"<%s>" % tagstr 173 | elif tag.tag_type == HtmlTagType.CLOSE_TAG: 174 | try: 175 | last = tagstack.pop() 176 | # common case of matching tag 177 | if last == tagstr: 178 | return u"" % last 179 | # output all preceeding tags (if present) 180 | revtags = tagstack[::-1] 181 | tindex = revtags.index(tagstr) 182 | del tagstack[-tindex-1:] 183 | return u"" % (last, u">" % tag.tag 190 | chunks = list(_process_markup(region, lambda text: text, 191 | _process_tag, tags_to_purge)) + ["" % t for t in reversed(tagstack)] 192 | return u''.join(chunks).strip() 193 | 194 | 195 | def _process_markup(region, textf, tagf, tags_to_purge=_TAGS_TO_PURGE): 196 | fragments = getattr(region, 'parsed_fragments', None) 197 | if fragments is None: 198 | yield textf(region) 199 | return 200 | fiter = iter(fragments) 201 | for fragment in fiter: 202 | if isinstance(fragment, HtmlTag): 203 | # skip forward to closing script tags 204 | tag = fragment.tag 205 | if tag in tags_to_purge: 206 | # if opening, keep going until closed 207 | if (fragment.tag_type == HtmlTagType.OPEN_TAG and 208 | tag not in _VOID_TAGS): 209 | for probe in fiter: 210 | if isinstance(probe, HtmlTag) and \ 211 | probe.tag == tag and \ 212 | probe.tag_type == HtmlTagType.CLOSE_TAG: 213 | break 214 | else: 215 | output = tagf(fragment) 216 | if output: 217 | yield output 218 | else: 219 | text = region.htmlpage.fragment_data(fragment) 220 | text = remove_comments(text) 221 | text = textf(text) 222 | if text: 223 | yield text 224 | 225 | 226 | def html(pageregion): 227 | """A page region is already html, so this is the identity function""" 228 | return pageregion 229 | 230 | 231 | def contains_any_numbers(txt): 232 | """text that must contain at least one number 233 | >>> contains_any_numbers('foo') 234 | >>> contains_any_numbers('$67 at 15% discount') 235 | '$67 at 15% discount' 236 | """ 237 | if _NUMBER_RE.search(txt) is not None: 238 | return txt 239 | 240 | 241 | def contains_prices(txt): 242 | """text must contain a number that is not joined to text""" 243 | if _PRICE_NUMBER_RE.findall(txt) is not None: 244 | return txt 245 | 246 | 247 | def contains_numbers(txt, count=1): 248 | """Must contain a certain amount of numbers 249 | 250 | >>> contains_numbers('foo', 2) 251 | >>> contains_numbers('this 1 has 2 numbers', 2) 252 | 'this 1 has 2 numbers' 253 | """ 254 | numbers = _NUMBER_RE.findall(txt) 255 | if len(numbers) == count: 256 | return txt 257 | 258 | 259 | def extract_number(txt): 260 | """Extract a numeric value. 261 | 262 | This will fail if more than one numeric value is present. 
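These extractor callables are what attribute descriptors wrap, so a value for which they
return None is simply dropped during extraction. A minimal sketch, with an illustrative
field name:

    from scrapely.descriptor import FieldDescriptor
    # only keep values that contain exactly one number
    weight = FieldDescriptor('weight', None, extract_number)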
263 | 264 | >>> extract_number(' -45.3') 265 | '-45.3' 266 | >>> extract_number(' +45.3') 267 | '45.3' 268 | >>> extract_number(' 45.3') 269 | '45.3' 270 | >>> extract_number(' 45.3, 7') 271 | 272 | It will handle unescaped entities: 273 | >>> extract_number(u'£129.99') 274 | u'129.99' 275 | """ 276 | txt = _NUMERIC_ENTITIES.sub(lambda m: unichr(int(m.groups()[0])), txt) 277 | numbers = _NUMBER_RE.findall(txt) 278 | if len(numbers) == 1: 279 | return numbers[0] 280 | 281 | 282 | def extract_price(txt): 283 | """ 284 | Extracts numbers making some price format specific assumptions 285 | 286 | >>> extract_price('asdf 234,234.45sdf ') 287 | '234234.45' 288 | >>> extract_price('234,23') 289 | '234.23' 290 | >>> extract_price('234,230') 291 | '234230' 292 | >>> extract_price('asdf 2234 sdf ') 293 | '2234' 294 | >>> extract_price('947') 295 | '947' 296 | >>> extract_price('-200,069,000,006.565456') 297 | '-200069000006.565456' 298 | >>> extract_price('1,000,000') 299 | '1000000' 300 | >>> extract_price('1,000,000.00') 301 | '1000000.00' 302 | >>> extract_price('1,000') 303 | '1000' 304 | >>> extract_price('1000,00') 305 | '1000.00' 306 | >>> extract_price('1,000.00') 307 | '1000.00' 308 | >>> extract_price('500,000.00') 309 | '500000.00' 310 | >>> extract_price('500.000,00') 311 | '500000.00' 312 | >>> extract_price('-500,000.00') 313 | '-500000.00' 314 | >>> extract_price('500 000,00') 315 | '500000.00' 316 | >>> extract_price(u'£129.99') 317 | u'129.99' 318 | >>> extract_price('adsfg') 319 | >>> extract_price('stained, linseed oil finish, clear glas doors') 320 | >>> extract_price('') 321 | """ 322 | txt = _NUMERIC_ENTITIES.sub(lambda m: unichr(int(m.groups()[0])), txt) 323 | txt = txt.replace(' ', '') 324 | m = _DECIMAL_RE.search(txt) 325 | POINT, COMMA = 0, 1 326 | decimal_separator = POINT 327 | 328 | if m: 329 | value = m.group(1) 330 | last_point_idx = value.rfind('.') 331 | last_comma_idx = value.rfind(',') 332 | 333 | # If a number has both separators take the last one 334 | if last_point_idx > 0 and last_comma_idx > 0: 335 | if last_comma_idx > last_point_idx: 336 | decimal_separator = COMMA 337 | # If a number has only commas check the last one 338 | elif last_comma_idx > 0: 339 | first_comma_idx = value.find(',') 340 | if (first_comma_idx == last_comma_idx and 341 | len(value) - last_comma_idx <= 3): 342 | decimal_separator = COMMA 343 | 344 | if decimal_separator == POINT: 345 | value = value.replace(',', '') 346 | else: 347 | value = value.replace('.', '') 348 | return value.replace(',', '.') 349 | 350 | 351 | def url(txt): 352 | """convert text to a url 353 | 354 | this is quite conservative, since relative urls are supported 355 | """ 356 | txt = txt.strip("\t\r\n '\"") 357 | if txt: 358 | return txt 359 | 360 | 361 | def image_url(txt): 362 | """convert text to a url 363 | 364 | this is quite conservative, since relative urls are supported 365 | Example: 366 | 367 | >>> image_url('') 368 | 369 | >>> image_url(' ') 370 | 371 | >>> image_url(' \\n\\n ') 372 | 373 | >>> image_url('foo-bar.jpg') 374 | ['foo-bar.jpg'] 375 | >>> image_url('/images/main_logo12.gif') 376 | ['/images/main_logo12.gif'] 377 | >>> image_url("http://www.image.com/image.jpg") 378 | ['http://www.image.com/image.jpg'] 379 | >>> image_url("http://www.domain.com/path1/path2/path3/image.jpg") 380 | ['http://www.domain.com/path1/path2/path3/image.jpg'] 381 | >>> image_url("/path1/path2/path3/image.jpg") 382 | ['/path1/path2/path3/image.jpg'] 383 | >>> image_url("path1/path2/image.jpg") 384 | 
['path1/path2/image.jpg'] 385 | >>> image_url("background-image : url(http://www.site.com/path1/path2/image.jpg)") 386 | ['http://www.site.com/path1/path2/image.jpg'] 387 | >>> image_url("background-image : url('http://www.site.com/path1/path2/image.jpg')") 388 | ['http://www.site.com/path1/path2/image.jpg'] 389 | >>> image_url('background-image : url("http://www.site.com/path1/path2/image.jpg")') 390 | ['http://www.site.com/path1/path2/image.jpg'] 391 | >>> image_url("background : url(http://www.site.com/path1/path2/image.jpg)") 392 | ['http://www.site.com/path1/path2/image.jpg'] 393 | >>> image_url("background : url('http://www.site.com/path1/path2/image.jpg')") 394 | ['http://www.site.com/path1/path2/image.jpg'] 395 | >>> image_url('background : url("http://www.site.com/path1/path2/image.jpg")') 396 | ['http://www.site.com/path1/path2/image.jpg'] 397 | >>> image_url('/getimage.php?image=totalgardens/outbbq2_400.jpg&type=prod&resizeto=350') 398 | ['/getimage.php?image=totalgardens/outbbq2_400.jpg&type=prod&resizeto=350'] 399 | >>> image_url('http://www.site.com/getimage.php?image=totalgardens/outbbq2_400.jpg&type=prod&resizeto=350') 400 | ['http://www.site.com/getimage.php?image=totalgardens/outbbq2_400.jpg&type=prod&resizeto=350'] 401 | >>> image_url('http://s7d4.scene7.com/is/image/Kohler/jaa03267?hei=425&wid=457&op_usm=2,1,2,1&qlt=80') 402 | ['http://s7d4.scene7.com/is/image/Kohler/jaa03267?hei=425&wid=457&op_usm=2,1,2,1&qlt=80'] 403 | >>> image_url('../image.aspx?thumb=true&boxSize=175&img=Unknoportrait[1].jpg') 404 | ['../image.aspx?thumb=true&boxSize=175&img=Unknoportrait[1].jpg'] 405 | >>> image_url('http://www.sundancecatalog.com/mgen/catalog/test.ms?args=%2245932|MERIDIAN+PENDANT|.jpg%22&is=336,336,0xffffff') 406 | ['http://www.sundancecatalog.com/mgen/catalog/test.ms?args=%2245932|MERIDIAN+PENDANT|.jpg%22&is=336,336,0xffffff'] 407 | >>> image_url('http://www.site.com/image.php') 408 | ['http://www.site.com/image.php'] 409 | >>> image_url('background-image:URL(http://s7d5.scene7.com/is/image/wasserstrom/165133?wid=227&hei=227&defaultImage=noimage_wasserstrom)') 410 | ['http://s7d5.scene7.com/is/image/wasserstrom/165133?wid=227&hei=227&defaultImage=noimage_wasserstrom'] 411 | 412 | """ 413 | imgurl = extract_image_url(txt) 414 | return [safe_url_string(replace_entities(url(imgurl)))] if imgurl else None 415 | 416 | 417 | def extract_image_url(txt): 418 | txt = url(txt) 419 | imgurl = None 420 | if txt: 421 | # check if the text is style content 422 | m = _CSS_IMAGERE.search(txt) 423 | txt = m.groups()[0] if m else txt 424 | parsed = urlparse(txt) 425 | path = None 426 | m = _IMAGE_PATH_RE.search(parsed.path) 427 | if m: 428 | path = m.group() 429 | elif parsed.query: 430 | m = _GENERIC_PATH_RE.search(parsed.path) 431 | if m: 432 | path = m.group() 433 | if path is not None: 434 | parsed = list(parsed) 435 | parsed[2] = path 436 | imgurl = urlunparse(parsed) 437 | if not imgurl: 438 | imgurl = txt 439 | return imgurl 440 | -------------------------------------------------------------------------------- /scrapely/htmlpage.py: -------------------------------------------------------------------------------- 1 | """ 2 | htmlpage 3 | 4 | Container objects for representing html pages and their parts in the IBL 5 | system. This encapsulates page related information and prevents parsing 6 | multiple times. 
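A small usage sketch (the markup below is illustrative):

    from scrapely.htmlpage import HtmlPage, HtmlTag

    page = HtmlPage(body=u'<p class="x">some text</p>')
    for fragment in page.parsed_body:
        # fragments are HtmlTag / HtmlDataFragment objects holding offsets
        # into page.body; fragment_data() recovers the original slice
        chunk = page.fragment_data(fragment)
        if isinstance(fragment, HtmlTag):
            attrs = fragment.attributes  # e.g. {'class': 'x'}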
7 | """ 8 | import hashlib 9 | import six 10 | 11 | from six.moves.urllib.request import urlopen 12 | from copy import deepcopy 13 | from w3lib.encoding import html_to_unicode 14 | try: 15 | from . import _htmlpage 16 | parse_html = _htmlpage.parse_html 17 | HtmlDataFragment = _htmlpage.HtmlDataFragment 18 | HtmlTag = _htmlpage.HtmlTag 19 | HtmlTagType = _htmlpage.HtmlTagType 20 | except ImportError: 21 | import re 22 | from collections import OrderedDict 23 | 24 | class HtmlTagType(object): 25 | OPEN_TAG = 1 26 | CLOSE_TAG = 2 27 | UNPAIRED_TAG = 3 28 | 29 | class HtmlDataFragment(object): 30 | __slots__ = ('start', 'end', 'is_text_content') 31 | 32 | def __init__(self, start, end, is_text_content=False): 33 | self.start = start 34 | self.end = end 35 | self.is_text_content = is_text_content 36 | 37 | def __str__(self): 38 | return "" % ( 39 | self.start, self.end, self.is_text_content) 40 | 41 | def __repr__(self): 42 | return str(self) 43 | 44 | class HtmlTag(HtmlDataFragment): 45 | __slots__ = ('tag_type', 'tag', '_attributes', '_attr_text') 46 | 47 | def __init__(self, tag_type, tag, attr_text, start, end): 48 | HtmlDataFragment.__init__(self, start, end) 49 | self.tag_type = tag_type 50 | self.tag = tag 51 | if isinstance(attr_text, dict): 52 | self._attributes = attr_text 53 | self._attr_text = None 54 | else: # defer loading attributes until necessary 55 | self._attributes = OrderedDict() 56 | self._attr_text = attr_text 57 | 58 | @property 59 | def attributes(self): 60 | if not self._attributes and self._attr_text: 61 | for attr_match in _ATTR_REGEXP.findall(self._attr_text): 62 | name = attr_match[0].lower() 63 | values = [v for v in attr_match[1:] if v] 64 | # According to HTML spec if attribute name is repeated only 65 | # the first one is taken into account 66 | if name not in self._attributes: 67 | self._attributes[name] = values[0] if values else None 68 | return self._attributes 69 | 70 | def __str__(self): 71 | attributes = ', '.join( 72 | sorted(["%s: %s" % (k, repr(v)) 73 | for k, v in self.attributes.items()])) 74 | return "" % ( 75 | self.tag, attributes, self.tag_type, self.start, self.end) 76 | 77 | def __repr__(self): 78 | return str(self) 79 | 80 | _ATTR = ("((?:[^=/<>\s]|/(?!>))+)(?:\s*=(?:\s*\"(.*?)\"|\s*'(.*?)'|" 81 | "([^>\s]+))?)?") 82 | _TAG = "<(\/?)(\w+(?::\w+)?)((?:\s*" + _ATTR + ")+\s*|\s*)(\/?)>?" 83 | _DOCTYPE = r"" 84 | _SCRIPT = "()(.*?)()" 85 | _COMMENT = "( 44 | 45 | 46 | 52 | 53 | 55 | 56 |
57 | 58 | 59 | 62 | 63 | 64 | 67 | 69 | 70 | 71 | 98 | 169 | 177 | 178 | 179 | 182 | 186 | 187 |
60 |

61 | retrosixty

65 |

66 | retrosixty

  68 |
72 |

73 | 74 | Home

75 | 76 | About Us

77 | 78 | Shipping

79 | 80 | Links

81 | 82 | Contact

  83 |

84 | 85 | Furniture

86 | 87 | Lighting

88 | 89 | Technology

90 | 91 | Ceramics

92 | 93 | Art

94 | 95 | Misc. Items

96 | 97 | Contemporary

99 |

100 | 101 | Lighting..

102 |

103 | Please click the thumbnails for larger 104 | images and the back button to return to the Lighting index.

105 | 106 | 107 | 122 | 160 | 161 |
108 |

109 |  

110 | 111 | 112 |

113 | 114 |

115 | 116 |

117 | 118 |

119 |  

120 |  

121 |  

123 |

124 | Designer: 125 | 126 | 127 | Charlotte Perriand    

128 | Manufacturer: 129 | 130 | Philips, Netherlands 131 |  

132 | Description: 133 | 134 | 135 | 136 | A Perriand designed 'infraphil' infrared heat lamp 137 | designed in c1960s. This example is in good vintage 138 | condition with some minor wear as one would expect. 139 | Original Philips sticker intact, although it has some 140 | wear as pictured. 141 |

142 | 143 | As with all electrical items we always 144 | recommend having them tested by a professional prior to 145 | use although it is in full working order. The lamp can 146 | be used as a table lamp, or mounted on the wall - full 147 | adjustable...

148 | Price: £60

149 | Size: 150 | 151 | N/A 152 |     153 |    

154 | Shipping: 155 | 156 | £7 to mainland UK. 157 | Please enquire for other locations.

158 | Ref #: 0642

159 |  

162 | 163 | 164 |

165 | 166 | 167 | << BACK

168 |
170 |

171 | retrosixty

172 |

173 | retrosixty

174 |

175 | retrosixty

176 |
180 |

181 | retrosixty

183 |

184 | Site Layout, Design & 185 | Content Copyright 2006-09 - retrosixty.co.uk

188 |
189 | 190 | -------------------------------------------------------------------------------- /tests/samples/samples_pageparsing_0.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "surrounds_attribute": "name", 4 | "annotation_text": null, 5 | "match_common_prefix": false, 6 | "surrounds_variant": null, 7 | "variant_id": null, 8 | "tag_attributes": [], 9 | "end_index": 133, 10 | "start_index": 132, 11 | "metadata": {} 12 | }, 13 | { 14 | "surrounds_attribute": null, 15 | "annotation_text": null, 16 | "match_common_prefix": false, 17 | "surrounds_variant": null, 18 | "variant_id": null, 19 | "tag_attributes": [ 20 | [ 21 | "src", 22 | "image_urls" 23 | ] 24 | ], 25 | "end_index": 142, 26 | "start_index": 141, 27 | "metadata": {} 28 | }, 29 | { 30 | "surrounds_attribute": null, 31 | "annotation_text": null, 32 | "match_common_prefix": false, 33 | "surrounds_variant": null, 34 | "variant_id": null, 35 | "tag_attributes": [ 36 | [ 37 | "src", 38 | "image_urls" 39 | ] 40 | ], 41 | "end_index": 149, 42 | "start_index": 148, 43 | "metadata": {} 44 | }, 45 | { 46 | "surrounds_attribute": "description", 47 | "annotation_text": null, 48 | "match_common_prefix": false, 49 | "surrounds_variant": null, 50 | "variant_id": null, 51 | "tag_attributes": [], 52 | "end_index": 207, 53 | "start_index": 161, 54 | "metadata": {} 55 | }, 56 | { 57 | "surrounds_attribute": "price", 58 | "annotation_text": null, 59 | "match_common_prefix": false, 60 | "surrounds_variant": null, 61 | "variant_id": null, 62 | "tag_attributes": [], 63 | "end_index": 258, 64 | "start_index": 257, 65 | "metadata": {} 66 | }, 67 | { 68 | "surrounds_attribute": "features", 69 | "annotation_text": null, 70 | "match_common_prefix": false, 71 | "surrounds_variant": null, 72 | "variant_id": null, 73 | "tag_attributes": [], 74 | "end_index": 421, 75 | "start_index": 324, 76 | "metadata": {} 77 | } 78 | ] 79 | -------------------------------------------------------------------------------- /tests/samples/samples_scraper_loadstore_0.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapy/scrapely/31b5881bed01a99be2b65b30b9c81ad65a517eaf/tests/samples/samples_scraper_loadstore_0.html -------------------------------------------------------------------------------- /tests/samples/samples_scraper_loadstore_0.json: -------------------------------------------------------------------------------- 1 | { 2 | "price": "340", 3 | "designer": "Tom Dixon", 4 | "name": "Copper Shade by Tom Dixon" 5 | } 6 | -------------------------------------------------------------------------------- /tests/samples/samples_scraper_loadstore_1.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapy/scrapely/31b5881bed01a99be2b65b30b9c81ad65a517eaf/tests/samples/samples_scraper_loadstore_1.html -------------------------------------------------------------------------------- /tests/samples/samples_scraper_loadstore_1.json: -------------------------------------------------------------------------------- 1 | { 2 | "price": "229.00", 3 | "designer": "Artemide", 4 | "name": "Mesmeri Halo Chrome" 5 | } 6 | -------------------------------------------------------------------------------- /tests/test_htmlpage.py: -------------------------------------------------------------------------------- 1 | """ 2 | htmlpage.py tests 3 | """ 4 | import os 5 | import copy 6 | import json 7 | from 
unittest import TestCase 8 | 9 | from scrapely.htmlpage import ( 10 | parse_html, HtmlTag, HtmlDataFragment, HtmlPage, url_to_page 11 | ) 12 | from .test_htmlpage_data import * 13 | from . import iter_samples 14 | BASE_PATH = os.path.abspath(os.path.dirname(__file__)) 15 | 16 | 17 | def _encode_element(el): 18 | """ 19 | jsonize parse element 20 | """ 21 | if isinstance(el, HtmlTag): 22 | return {"tag": el.tag, "attributes": el.attributes, 23 | "start": el.start, "end": el.end, "tag_type": el.tag_type} 24 | if isinstance(el, HtmlDataFragment): 25 | return {"start": el.start, "end": el.end, "is_text_content": el.is_text_content} 26 | raise TypeError 27 | 28 | 29 | def _decode_element(dct): 30 | """ 31 | dejsonize parse element 32 | """ 33 | if "tag" in dct: 34 | return HtmlTag(dct["tag_type"], dct["tag"], 35 | dct["attributes"], dct["start"], dct["end"]) 36 | if "start" in dct: 37 | return HtmlDataFragment(dct["start"], dct["end"], dct.get("is_text_content", True)) 38 | return dct 39 | 40 | 41 | class TestParseHtml(TestCase): 42 | """Test for parse_html""" 43 | def _test_sample(self, source, expected_parsed, samplecount=None): 44 | parsed = parse_html(source) 45 | count_element = 0 46 | count_expected = 0 47 | for element in parsed: 48 | if type(element) == HtmlTag: 49 | count_element += 1 50 | expected = expected_parsed.pop(0) 51 | if type(expected) == HtmlTag: 52 | count_expected += 1 53 | element_text = source[element.start:element.end] 54 | expected_text = source[expected.start:expected.end] 55 | if element.start != expected.start or element.end != expected.end: 56 | errstring = "[%s,%s] %s != [%s,%s] %s" % (element.start, \ 57 | element.end, element_text, expected.start, \ 58 | expected.end, expected_text) 59 | if samplecount is not None: 60 | errstring += " (sample %d)" % samplecount 61 | assert False, errstring 62 | if type(element) != type(expected): 63 | errstring = "(%s) %s != (%s) %s for text\n%s" % (count_element, \ 64 | repr(type(element)), count_expected, repr(type(expected)), element_text) 65 | if samplecount is not None: 66 | errstring += " (sample %d)" % samplecount 67 | assert False, errstring 68 | if type(element) == HtmlTag: 69 | self.assertEqual(element.tag, expected.tag) 70 | self.assertEqual(element.attributes, expected.attributes) 71 | self.assertEqual(element.tag_type, expected.tag_type) 72 | if type(element) == HtmlDataFragment: 73 | msg = "Got: %s Expected: %s in sample: %d [%d:%d] (%s)" % \ 74 | (element.is_text_content, expected.is_text_content, samplecount, element.start, element.end, repr(element_text)) \ 75 | if samplecount is not None else None 76 | self.assertEqual(element.is_text_content, expected.is_text_content, msg) 77 | 78 | if expected_parsed: 79 | errstring = "Expected %s" % repr(expected_parsed) 80 | if samplecount is not None: 81 | errstring += " (sample %d)" % samplecount 82 | assert False, errstring 83 | 84 | def test_parse(self): 85 | """simple parse_html test""" 86 | parsed = [_decode_element(d) for d in PARSED] 87 | sample = {"source": PAGE, "parsed": parsed} 88 | self._test_sample(PAGE, parsed) 89 | 90 | def test_site_samples(self): 91 | """test parse_html from real cases""" 92 | for i, (source, parsed) in enumerate( 93 | iter_samples('htmlpage', object_hook=_decode_element)): 94 | self._test_sample(source, parsed, i) 95 | 96 | def test_bad(self): 97 | """test parsing of bad html layout""" 98 | parsed = [_decode_element(d) for d in PARSED2] 99 | self._test_sample(PAGE2, parsed) 100 | 101 | def test_comments(self): 102 | """test parsing of 
tags inside comments""" 103 | parsed = [_decode_element(d) for d in PARSED3] 104 | self._test_sample(PAGE3, parsed) 105 | 106 | def test_script_text(self): 107 | """test parsing of tags inside scripts""" 108 | parsed = [_decode_element(d) for d in PARSED4] 109 | self._test_sample(PAGE4, parsed) 110 | 111 | def test_sucessive(self): 112 | """test parsing of successive cleaned elements""" 113 | parsed = [_decode_element(d) for d in PARSED5] 114 | self._test_sample(PAGE5, parsed) 115 | 116 | def test_sucessive2(self): 117 | """test parsing of successive cleaned elements (variant 2)""" 118 | parsed = [_decode_element(d) for d in PARSED6] 119 | self._test_sample(PAGE6, parsed) 120 | 121 | def test_special_cases(self): 122 | """some special cases tests""" 123 | parsed = list(parse_html("<meta http-equiv='Pragma' content='no-cache'/>")) 124 | self.assertEqual(parsed[0].attributes, {'content': 'no-cache', 'http-equiv': 'Pragma'}) 125 | parsed = list(parse_html("<html xmlns='http://www.w3.org/1999/xhtml' xml:lang='en' lang='en'>")) 126 | self.assertEqual(parsed[0].attributes, {'xmlns': 'http://www.w3.org/1999/xhtml', 'xml:lang': 'en', 'lang': 'en'}) 127 | parsed = list(parse_html("<img src='http://images.play.com/banners/SAM550a.jpg' align='left' hspace='5' / >")) 128 | self.assertEqual(parsed[0].attributes, {'src': 'http://images.play.com/banners/SAM550a.jpg', \ 129 | 'align': 'left', 'hspace': '5', '/': None}) 130 | 131 | def test_no_ending_body(self): 132 | """Test case when no ending body nor html elements are present""" 133 | parsed = [_decode_element(d) for d in PARSED7] 134 | self._test_sample(PAGE7, parsed) 135 | 136 | def test_malformed(self): 137 | """Test parsing of some malformed cases""" 138 | parsed = [_decode_element(d) for d in PARSED8] 139 | self._test_sample(PAGE8, parsed) 140 | 141 | def test_malformed2(self): 142 | """Test case when attributes are not separated by space (still recognizable because of quotes)""" 143 | parsed = [_decode_element(d) for d in PARSED9] 144 | self._test_sample(PAGE9, parsed) 145 | 146 | def test_malformed3(self): 147 | """Test case where attributes are repeated (should take first attribute, according to spec)""" 148 | parsed = [_decode_element(d) for d in PARSED10] 149 | self._test_sample(PAGE10, parsed) 150 | 151 | def test_empty_subregion(self): 152 | htmlpage = HtmlPage(body=u"") 153 | self.assertEqual(htmlpage.subregion(), u"") 154 | 155 | def test_ignore_xml_declaration(self): 156 | """Ignore xml declarations inside html""" 157 | parsed = list(parse_html(u"
<p>The text</p><?xml version='1.0'?><p>is here</p>
")) 158 | self.assertFalse(parsed[3].is_text_content) 159 | 160 | def test_copy(self): 161 | """Test copy/deepcopy""" 162 | page = HtmlPage(url='http://www.example.com', body=PAGE) 163 | region = page.subregion(10, 15) 164 | 165 | regioncopy = copy.copy(region) 166 | self.assertEqual(regioncopy.start_index, 10) 167 | self.assertEqual(regioncopy.end_index, 15) 168 | self.assertFalse(region is regioncopy) 169 | self.assertTrue(region.htmlpage is regioncopy.htmlpage) 170 | 171 | regiondeepcopy = copy.deepcopy(region) 172 | self.assertEqual(regiondeepcopy.start_index, 10) 173 | self.assertEqual(regiondeepcopy.end_index, 15) 174 | self.assertFalse(region is regiondeepcopy) 175 | self.assertFalse(region.htmlpage is regiondeepcopy.htmlpage) 176 | 177 | def test_load_page_from_url(self): 178 | filepath = os.path.join(BASE_PATH, 'samples/samples_htmlpage_0') 179 | url = 'file://{}.{}'.format(filepath, 'html') 180 | page = url_to_page(url) 181 | parsed = json.load(open('{}.{}'.format(filepath, 'json'))) 182 | parsed = [_decode_element(d) for d in parsed] 183 | self.assertEqual(page.url, url) 184 | self._test_sample(page.body, parsed, 1) 185 | -------------------------------------------------------------------------------- /tests/test_htmlpage_data.py: -------------------------------------------------------------------------------- 1 | PAGE = u""" 2 | 4 | 5 |