├── .bumpversion.cfg ├── .gitignore ├── .travis.yml ├── AUTHORS ├── MANIFEST.in ├── NEWS ├── README.rst ├── requirements.txt ├── scrapely ├── __init__.py ├── _htmlpage.c ├── _htmlpage.pyx ├── descriptor.py ├── extraction │ ├── __init__.py │ ├── _similarity.c │ ├── _similarity.pyx │ ├── pageobjects.py │ ├── pageparsing.py │ ├── regionextract.py │ └── similarity.py ├── extractors.py ├── htmlpage.py ├── template.py ├── tool.py └── version.py ├── setup.py ├── tests ├── __init__.py ├── samples │ ├── samples_htmlpage_0.html │ ├── samples_htmlpage_0.json │ ├── samples_htmlpage_1.html │ ├── samples_htmlpage_1.json │ ├── samples_htmlpage_2.html │ ├── samples_htmlpage_2.json │ ├── samples_pageparsing_0.html │ ├── samples_pageparsing_0.json │ ├── samples_scraper_loadstore_0.html │ ├── samples_scraper_loadstore_0.json │ ├── samples_scraper_loadstore_1.html │ └── samples_scraper_loadstore_1.json ├── test_extraction.py ├── test_htmlpage.py ├── test_htmlpage_data.py ├── test_pageparsing.py ├── test_scraper.py └── test_template.py └── tox.ini /.bumpversion.cfg: -------------------------------------------------------------------------------- 1 | [bumpversion] 2 | current_version = 0.13.5 3 | commit = True 4 | tag = True 5 | tag_name = v{new_version} 6 | 7 | [bumpversion:file:setup.py] 8 | 9 | [bumpversion:file:scrapely/version.py] 10 | search = __version__ = '{current_version}' 11 | replace = __version__ = '{new_version}' 12 | 13 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | *.so 3 | .tox 4 | build 5 | dist 6 | scrapely.egg-info 7 | 8 | # coverage reports 9 | .coverage 10 | .coverage.* 11 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | 3 | matrix: 4 | include: 5 | - python: 2.7 6 | env: TOXENV=py27 7 | - python: 3.4 8 | env: TOXENV=py34 9 | - python: pypy 10 | env: TOXENV=pypy 11 | 12 | install: 13 | - pip install cython 14 | - CYTHONIZE=1 python setup.py build 15 | - pip install -U tox 16 | script: tox 17 | 18 | after_success: 19 | - codecov 20 | 21 | notifications: 22 | irc: 23 | use_notice: true 24 | skip_join: true 25 | channels: 26 | - irc.freenode.org#scrapy 27 | deploy: 28 | provider: pypi 29 | distributions: sdist 30 | user: scrapy 31 | password: 32 | secure: KIXp6K9gU7TT7d0CTkDq81s1Uh2qLHBf+b8l0fAlzq1xHeBuWY82nq94yp6KPqBDr868Cf5CwyC6Gnz/HFD93NVZabooTiz0qUAq98fqKQ2n2KVWzaWxL5C0PN4x5P9KfAlXTgFAll1uCsKRa7gvRbW+q/wKAGsGfKDshTxTkAQ= 33 | on: 34 | tags: true 35 | all_branches: true 36 | repo: scrapy/scrapely 37 | condition: "$TOXENV == py27" 38 | -------------------------------------------------------------------------------- /AUTHORS: -------------------------------------------------------------------------------- 1 | Scrapely was originally written by Shane Evans and released as part of the 2 | (larger) Scrapy open source project by Pablo Hoffman. In April of 2011 Scrapely 3 | was taken out of Scrapy, and released as a standalone library, to improve its 4 | reusage and adoption. 
5 | 6 | Here is the list of the main contributors (along with their github users): 7 | 8 | * Shane Evans (shane42) 9 | * Pablo Hoffman (pablohoffman) 10 | * Martin Olveyra (kalessin) 11 | * Daniel Graña (dangra) 12 | * Terry Peng (tpeng) 13 | * Mikhail Korobov (kmike) 14 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include scrapely/*.pyx 2 | include scrapely/extraction/*.pyx 3 | include scrapely/*.c 4 | include scrapely/extraction/*.c 5 | -------------------------------------------------------------------------------- /NEWS: -------------------------------------------------------------------------------- 1 | Scrapely release notes 2 | ====================== 3 | 4 | 0.13.5 (2019-06-18) 5 | ------------------- 6 | 7 | - Update C extensions generated from Cython for python 3.7 8 | - Fix PEP8 violations 9 | 10 | 0.13.4 (2017-05-26) 11 | ------------------- 12 | 13 | - Improved price extraction 14 | - Replaced deprecated functions 15 | 16 | 0.13.3 (2017-01-27) 17 | ------------------- 18 | 19 | - Use 64 bit integers when matching pages 20 | 21 | 0.13.2 (2016-12-21) 22 | ------------------- 23 | 24 | - Add python3 support for `url_to_page` function 25 | 26 | 0.13.1 (2016-12-21) 27 | ------------------- 28 | 29 | - Remove numpy as a mandatory import in setup.py 30 | 31 | 0.13.0 (2016-12-21) 32 | ------------------- 33 | 34 | - Python 3 support; 35 | - fixed incorrect webpage encoding detection; 36 | - usability improvements for scrapely.tool; 37 | - internal cleanups; 38 | - number extractor now supports numbers with a sign. 39 | - add C extension to speed up parsing and extraction 40 | 41 | 0.12.0 (2015-01-26) 42 | ------------------- 43 | 44 | - TemplatePageExtractor can now use multiple top-level extractors; 45 | - internal cleanups; 46 | 47 | 0.11.0 (2014-08-01) 48 | ------------------- 49 | 50 | - HtmlPageParsedRegion can be copied/deepcopied. 51 | 52 | 0.10 (2014-01-14) 53 | ----------------- 54 | 55 | - Several bug fixes and improvements to the IBL extraction logic; 56 | - allow training the Scraper class with an HtmlPage; 57 | - Python 2.5 support is dropped; 58 | - Unicode improvements for scrapely.tool. 59 | 60 | 0.9 (2011-04-19) 61 | ---------------- 62 | 63 | First release of Scrapely. 64 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | ======== 2 | Scrapely 3 | ======== 4 | 5 | .. image:: https://api.travis-ci.org/scrapy/scrapely.svg?branch=master 6 | :target: https://travis-ci.org/scrapy/scrapely 7 | 8 | Scrapely is a library for extracting structured data from HTML pages. Given 9 | some example web pages and the data to be extracted, scrapely constructs a 10 | parser for all similar pages. 11 | 12 | Overview 13 | ======== 14 | 15 | Scrapinghub wrote a nice `blog post`_ explaining how scrapely works and how it's used in Portia_. 16 | 17 | .. _blog post: https://blog.scrapinghub.com/2016/07/07/scrapely-the-brains-behind-portia-spiders/ 18 | .. _Portia: http://portia.readthedocs.io/ 19 | 20 | Installation 21 | ============ 22 | 23 | Scrapely works in Python 2.7 or 3.3+. 24 | It requires numpy and w3lib Python packages. 25 | 26 | To install scrapely on any platform use:: 27 | 28 | pip install scrapely 29 | 30 | If you're using Ubuntu (9.10 or above), you can install scrapely from the 31 | Scrapy Ubuntu repos. 
Just add the Ubuntu repos as described here: 32 | http://doc.scrapy.org/en/latest/topics/ubuntu.html 33 | 34 | And then install scrapely with:: 35 | 36 | aptitude install python-scrapely 37 | 38 | Usage (API) 39 | =========== 40 | 41 | Scrapely has a powerful API, including a template format that can be edited 42 | externally, that you can use to build very capable scrapers. 43 | 44 | What follows is a quick example of the simplest possible usage, that you can 45 | run in a Python shell. 46 | 47 | Start by importing and instantiating the Scraper class:: 48 | 49 | >>> from scrapely import Scraper 50 | >>> s = Scraper() 51 | 52 | Then, proceed to train the scraper by adding some page and the data you expect 53 | to scrape from there (note that all keys and values in the data you pass must 54 | be strings):: 55 | 56 | >>> url1 = 'http://pypi.python.org/pypi/w3lib/1.1' 57 | >>> data = {'name': 'w3lib 1.1', 'author': 'Scrapy project', 'description': 'Library of web-related functions'} 58 | >>> s.train(url1, data) 59 | 60 | Finally, tell the scraper to scrape any other similar page and it will return 61 | the results:: 62 | 63 | >>> url2 = 'http://pypi.python.org/pypi/Django/1.3' 64 | >>> s.scrape(url2) 65 | [{u'author': [u'Django Software Foundation <foundation at djangoproject com>'], 66 | u'description': [u'A high-level Python Web framework that encourages rapid development and clean, pragmatic design.'], 67 | u'name': [u'Django 1.3']}] 68 | 69 | That's it! No xpaths, regular expressions, or hacky python code. 70 | 71 | Usage (command line tool) 72 | ========================= 73 | 74 | There is also a simple script to create and manage Scrapely scrapers. 75 | 76 | It supports a command-line interface, and an interactive prompt. All commands 77 | supported on interactive prompt are also supported in the command-line 78 | interface. 79 | 80 | To enter the interactive prompt type the following without arguments:: 81 | 82 | python -m scrapely.tool myscraper.json 83 | 84 | Example:: 85 | 86 | $ python -m scrapely.tool myscraper.json 87 | scrapely> help 88 | 89 | Documented commands (type help ): 90 | ======================================== 91 | a al s ta td tl 92 | 93 | scrapely> 94 | 95 | To create a scraper and add a template:: 96 | 97 | scrapely> ta http://pypi.python.org/pypi/w3lib/1.1 98 | [0] http://pypi.python.org/pypi/w3lib/1.1 99 | 100 | This is equivalent as typing the following in one command:: 101 | 102 | python -m scrapely.tool myscraper.json ta http://pypi.python.org/pypi/w3lib/1.1 103 | 104 | To list available templates from a scraper:: 105 | 106 | scrapely> tl 107 | [0] http://pypi.python.org/pypi/w3lib/1.1 108 | 109 | To add a new annotation, you usually test the selection criteria first:: 110 | 111 | scrapely> t 0 w3lib 1.1 112 | [0] u'

w3lib 1.1

' 113 | [1] u'Python Package Index : w3lib 1.1' 114 | 115 | You can also quote the text, if you need to specify an arbitrary number of 116 | spaces, for example:: 117 | 118 | scrapely> t 0 "w3lib 1.1" 119 | 120 | You can refine by position. To take the one in position [0]:: 121 | 122 | scrapely> a 0 w3lib 1.1 -n 0 123 | [0] u'

w3lib 1.1

' 124 | 125 | To annotate some fields on the template:: 126 | 127 | scrapely> a 0 w3lib 1.1 -n 0 -f name 128 | [new] (name) u'

w3lib 1.1

' 129 | scrapely> a 0 Scrapy project -n 0 -f author 130 | [new] u'Scrapy project' 131 | 132 | To list annotations on a template:: 133 | 134 | scrapely> al 0 135 | [0-0] (name) u'

w3lib 1.1

' 136 | [0-1] (author) u'Scrapy project' 137 | 138 | To scrape another similar page with the already added templates:: 139 | 140 | scrapely> s http://pypi.python.org/pypi/Django/1.3 141 | [{u'author': [u'Django Software Foundation'], u'name': [u'Django 1.3']}] 142 | 143 | 144 | Tests 145 | ===== 146 | 147 | `tox`_ is the preferred way to run tests. Just run: ``tox`` from the root 148 | directory. 149 | 150 | Support 151 | ======= 152 | 153 | * Mailing list: https://groups.google.com/forum/#!forum/scrapely 154 | * IRC: `scrapy@freenode`_ 155 | 156 | Scrapely is created and maintained by the Scrapy group, so you can get help 157 | through the usual support channels described in the `Scrapy community`_ page. 158 | 159 | Architecture 160 | ============ 161 | 162 | Unlike most scraping libraries, Scrapely doesn't work with DOM trees or xpaths 163 | so it doesn't depend on libraries such as lxml or libxml2. Instead, it uses 164 | an internal pure-python parser, which can accept poorly formed HTML. The HTML is 165 | converted into an array of token ids, which is used for matching the items to 166 | be extracted. 167 | 168 | Scrapely extraction is based upon the Instance Based Learning algorithm [1]_ 169 | and the matched items are combined into complex objects (it supports nested and 170 | repeated objects), using a tree of parsers, inspired by A Hierarchical 171 | Approach to Wrapper Induction [2]_. 172 | 173 | .. [1] `Yanhong Zhai , Bing Liu, Extracting Web Data Using Instance-Based Learning, World Wide Web, v.10 n.2, p.113-132, June 2007 `_ 174 | 175 | .. [2] `Ion Muslea , Steve Minton , Craig Knoblock, A hierarchical approach to wrapper induction, Proceedings of the third annual conference on Autonomous Agents, p.190-197, April 1999, Seattle, Washington, United States `_ 176 | 177 | Known Issues 178 | ============ 179 | 180 | The training implementation is currently very simple and is only provided for 181 | references purposes, to make it easier to test Scrapely and play with it. On 182 | the other hand, the extraction code is reliable and production-ready. So, if 183 | you want to use Scrapely in production, you should use train() with caution and 184 | make sure it annotates the area of the page you intended. 185 | 186 | Alternatively, you can use the Scrapely command line tool to annotate pages, 187 | which provides more manual control for higher accuracy. 188 | 189 | How does Scrapely relate to `Scrapy`_? 190 | ====================================== 191 | 192 | Despite the similarity in their names, Scrapely and `Scrapy`_ are quite 193 | different things. The only similarity they share is that they both depend on 194 | `w3lib`_, and they are both maintained by the same group of developers (which 195 | is why both are hosted on the `same Github account`_). 196 | 197 | Scrapy is an application framework for building web crawlers, while Scrapely is 198 | a library for extracting structured data from HTML pages. If anything, Scrapely 199 | is more similar to `BeautifulSoup`_ or `lxml`_ than Scrapy. 200 | 201 | Scrapely doesn't depend on Scrapy nor the other way around. In fact, it is 202 | quite common to use Scrapy without Scrapely, and viceversa. 203 | 204 | If you are looking for a complete crawler-scraper solution, there is (at least) 205 | one project called `Slybot`_ that integrates both, but you can definitely use 206 | Scrapely on other web crawlers since it's just a library. 
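
For example, if your own crawler has already downloaded a page body, you can
wrap it in an HtmlPage and feed it to an existing scraper directly, instead of
letting Scrapely fetch the URL itself. The sketch below is illustrative only:
it assumes the scraper was previously saved with Scraper.tofile(), that the
page body is available as a unicode string, and that the download() helper and
the url keyword argument stand in for whatever your crawler provides::

    >>> from scrapely import Scraper
    >>> from scrapely.htmlpage import HtmlPage
    >>> with open('scraper.json') as f:
    ...     s = Scraper.fromfile(f)       # templates saved earlier with s.tofile(f)
    >>> body = download('http://example.com/item')   # hypothetical fetch done by your crawler
    >>> page = HtmlPage(url='http://example.com/item', body=body)
    >>> s.scrape_page(page)
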
207 | 208 | Scrapy has a builtin extraction mechanism called `selectors`_ which (unlike 209 | Scrapely) is based on XPaths. 210 | 211 | 212 | License 213 | ======= 214 | 215 | Scrapely library is licensed under the BSD license. 216 | 217 | .. _Scrapy: http://scrapy.org/ 218 | .. _w3lib: https://github.com/scrapy/w3lib 219 | .. _BeautifulSoup: http://www.crummy.com/software/BeautifulSoup/ 220 | .. _lxml: http://lxml.de/ 221 | .. _same Github account: https://github.com/scrapy 222 | .. _slybot: https://github.com/scrapy/slybot 223 | .. _selectors: http://doc.scrapy.org/en/latest/topics/selectors.html 224 | .. _nose: http://readthedocs.org/docs/nose/en/latest/ 225 | .. _scrapy@freenode: http://webchat.freenode.net/?channels=scrapy 226 | .. _Scrapy community: http://scrapy.org/community/ 227 | .. _tox: https://pypi.python.org/pypi/tox 228 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | numpy 2 | w3lib 3 | six -------------------------------------------------------------------------------- /scrapely/__init__.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | from w3lib.util import str_to_unicode 4 | 5 | from scrapely.htmlpage import HtmlPage, page_to_dict, url_to_page 6 | from scrapely.template import TemplateMaker, best_match 7 | from scrapely.extraction import InstanceBasedLearningExtractor 8 | from scrapely.version import __version__ 9 | 10 | 11 | class Scraper(object): 12 | 13 | def __init__(self, templates=None): 14 | """Initialize an empty scraper.""" 15 | self._templates = templates or [] 16 | self._ex = None 17 | 18 | @classmethod 19 | def fromfile(cls, file): 20 | """Initialize a scraper from a file previously stored by tofile() 21 | method. 22 | """ 23 | templates = [HtmlPage(**x) for x in json.load(file)['templates']] 24 | return cls(templates) 25 | 26 | def tofile(self, file): 27 | """Store the scraper into the given file-like object""" 28 | tpls = [page_to_dict(x) for x in self._templates] 29 | json.dump({'templates': tpls}, file) 30 | 31 | def add_template(self, template): 32 | self._templates.append(template) 33 | self._ex = None 34 | 35 | def train_from_htmlpage(self, htmlpage, data): 36 | assert data, "Cannot train with empty data" 37 | tm = TemplateMaker(htmlpage) 38 | for field, values in data.items(): 39 | if (isinstance(values, (bytes, str)) or 40 | not hasattr(values, '__iter__')): 41 | values = [values] 42 | for value in values: 43 | value = str_to_unicode(value, htmlpage.encoding) 44 | tm.annotate(field, best_match(value)) 45 | self.add_template(tm.get_template()) 46 | 47 | def train(self, url, data, encoding=None): 48 | page = url_to_page(url, encoding) 49 | self.train_from_htmlpage(page, data) 50 | 51 | def scrape(self, url, encoding=None): 52 | page = url_to_page(url, encoding) 53 | return self.scrape_page(page) 54 | 55 | def scrape_page(self, page): 56 | if self._ex is None: 57 | self._ex = InstanceBasedLearningExtractor((t, None) for t in 58 | self._templates) 59 | return self._ex.extract(page)[0] 60 | -------------------------------------------------------------------------------- /scrapely/_htmlpage.pyx: -------------------------------------------------------------------------------- 1 | from cpython.version cimport PY_MAJOR_VERSION 2 | import re 3 | 4 | _ATTR = "((?:[^=/<>\s]|/(?!>))+)(?:\s*=(?:\s*\"(.*?)\"|\s*'(.*?)'|([^>\s]+))?)?" 
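# Clarifying note on the attribute pattern above: each findall() match of the
# compiled regexp (see _ATTR_REGEXP below) yields a 4-tuple of groups:
# (name, double-quoted value, single-quoted value, unquoted value).
# For example, the fragment  class="a" id='b' checked  is expected to yield
# [('class', 'a', '', ''), ('id', '', 'b', ''), ('checked', '', '', '')];
# HtmlTag.attributes keeps only the first non-empty value per name (or None).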
5 | _ATTR_REGEXP = re.compile(_ATTR, re.I | re.DOTALL) 6 | 7 | class HtmlTagType(object): 8 | OPEN_TAG = 1 9 | CLOSE_TAG = 2 10 | UNPAIRED_TAG = 3 11 | 12 | 13 | class HtmlDataFragment(object): 14 | __slots__ = ('start', 'end', 'is_text_content') 15 | 16 | def __init__(self, start, end, is_text_content=False): 17 | self.start = start 18 | self.end = end 19 | self.is_text_content = is_text_content 20 | 21 | def __str__(self): 22 | return "" % (self.start, self.end, self.is_text_content) 23 | 24 | def __repr__(self): 25 | return str(self) 26 | 27 | 28 | class HtmlTag(HtmlDataFragment): 29 | __slots__ = ('tag_type', 'tag', '_attributes', '_attr_text') 30 | 31 | def __init__(self, tag_type, tag, attr_text, start, end): 32 | HtmlDataFragment.__init__(self, start, end) 33 | self.tag_type = tag_type 34 | self.tag = tag 35 | if isinstance(attr_text, dict): 36 | self._attributes = attr_text 37 | self._attr_text = None 38 | else: # defer loading attributes until necessary 39 | self._attributes = {} 40 | self._attr_text = attr_text 41 | 42 | @property 43 | def attributes(self): 44 | if not self._attributes and self._attr_text: 45 | for attr_match in _ATTR_REGEXP.findall(self._attr_text): 46 | name = attr_match[0].lower() 47 | values = [v for v in attr_match[1:] if v] 48 | # According to HTML spec if attribute name is repeated only the 49 | # first one is taken into account 50 | if name not in self._attributes: 51 | self._attributes[name] = values[0] if values else None 52 | return self._attributes 53 | 54 | def __str__(self): 55 | return "" % (self.tag, ', '.join(sorted\ 56 | (["%s: %s" % (k, repr(v)) for k, v in self.attributes.items()])), self.tag_type, self.start, self.end) 57 | 58 | def __repr__(self): 59 | return str(self) 60 | 61 | 62 | cdef class CommentParser: 63 | cdef int start 64 | cdef int end 65 | cdef int open_state, open_count 66 | cdef int close_state, close_count 67 | cdef int inside_comment 68 | 69 | def __init__(self): 70 | self.start = -1 71 | self.end = -1 72 | self.reset() 73 | 74 | cdef void reset(self): 75 | self.open_state = 1 76 | self.close_state = 1 77 | self.open_count = 0 78 | self.close_count = 0 79 | 80 | cdef int parse(self, Py_UCS4 c, int i): 81 | if ((self.open_state == 1 and c == u'<') or 82 | (self.open_state == 2 and c == u'!') or 83 | (self.open_state == 3 and c == u'-') or 84 | (self.open_state == 4 and c == u'-')): 85 | self.open_state += 1 86 | else: 87 | # Handle comment 88 | if self.open_state == 3 and c == u'>': 89 | self.inside_comment = False 90 | self.reset() 91 | self.start, self.end = i - 2, i 92 | return True 93 | self.open_state = 1 94 | if self.open_state == 5: 95 | if self.open_count == 0: 96 | self.start = i - 3 97 | self.open_state = 1 98 | self.open_count = 1 99 | self.inside_comment = True 100 | 101 | if self.close_count < self.open_count: 102 | if self.close_state == 1: 103 | if c == u'-': 104 | self.close_state += 1 105 | elif self.close_state == 2: 106 | if c == u'-': 107 | self.close_state += 1 108 | else: 109 | self.close_state = 1 110 | elif self.close_state == 3: 111 | if c == u'!': 112 | self.close_state = 4 113 | elif c == u'>': 114 | self.close_state = 5 115 | else: 116 | self.close_state = 1 117 | elif self.close_state == 4: 118 | if c == u'>': 119 | self.close_state = 5 120 | else: 121 | self.close_state = 1 122 | 123 | if self.close_state == 5: 124 | self.close_state = 1 125 | self.close_count += 1 126 | if self.close_count >= self.open_count: 127 | self.end = i 128 | self.reset() 129 | self.inside_comment = False 130 | return 
True 131 | return False 132 | 133 | 134 | cdef class ScriptParser: 135 | cdef int start 136 | cdef int end 137 | cdef int state 138 | 139 | def __init__(self): 140 | self.start = -1 141 | self.end = -1 142 | self.state = 1 143 | 144 | cdef int parse(self, Py_UCS4 c, int i): 145 | if self.state == 10: 146 | self.state = 1 147 | if ((self.state == 1 and c == u'<') or 148 | (self.state == 2 and c == u'/') or 149 | (self.state == 3 and c in u'sS') or 150 | (self.state == 4 and c in u'cC') or 151 | (self.state == 5 and c in u'rR') or 152 | (self.state == 6 and c in u'iI') or 153 | (self.state == 7 and c in u'pP') or 154 | (self.state == 8 and c in u'tT') or 155 | (self.state == 9 and c == u'>')): 156 | self.state += 1 157 | else: 158 | self.state = 1 159 | 160 | if self.state == 2: 161 | self.start = i 162 | elif self.state == 10: 163 | self.end = i 164 | 165 | return self.state == 10 166 | 167 | 168 | # directly copied from cython's docs 169 | cdef unicode _ustring(s): 170 | if type(s) is unicode: 171 | # fast path for most common case(s) 172 | return s 173 | elif PY_MAJOR_VERSION < 3 and isinstance(s, bytes): 174 | # only accept byte strings in Python 2.x, not in Py3 175 | return (s).decode('ascii') 176 | elif isinstance(s, unicode): 177 | # an evil cast to might work here in some(!) cases, 178 | # depending on what the further processing does. to be safe, 179 | # we can always create a copy instead 180 | return unicode(s) 181 | else: 182 | raise TypeError('unicode or str expected') 183 | 184 | 185 | cpdef parse_html(s): 186 | cdef int OPEN_TAG = HtmlTagType.OPEN_TAG 187 | cdef int CLOSE_TAG = HtmlTagType.CLOSE_TAG 188 | cdef int UNPAIRED_TAG = HtmlTagType.UNPAIRED_TAG 189 | 190 | cdef unicode text = _ustring(s) 191 | 192 | parsed = [] 193 | comment_parser = CommentParser() 194 | script_parser = ScriptParser() 195 | 196 | cdef int tag_end = -1 # end position of previous tag 197 | cdef int tag_start = -1 # start of current tag 198 | cdef int script = False # True if inside script body 199 | cdef int open_tag = False # True if an open tag symbol has been read 200 | cdef int quote_single = False # True if unpaired single quote 201 | cdef int quote_double = False # True if unpaired double quote 202 | cdef int quoted 203 | 204 | cdef int reset_tag = True 205 | cdef int slash 206 | cdef int has_attributes 207 | cdef int yield_tag 208 | 209 | cdef unicode tag_name 210 | cdef unicode tag_attributes 211 | cdef Py_UCS4 curr_char 212 | cdef Py_UCS4 prev_char = 0 # previous value of curr_char 213 | cdef int i = 0 214 | for curr_char in text: 215 | if reset_tag: 216 | reset_tag = False 217 | slash = False 218 | has_attributes = False 219 | tag_name = u'' 220 | tag_attributes = u'' 221 | yield_tag = False 222 | 223 | if open_tag or script: 224 | if curr_char == u'"' and not quote_single: 225 | quote_double = not quote_double 226 | if curr_char == u"'" and not quote_double: 227 | quote_single = not quote_single 228 | else: 229 | quote_single = quote_double = False 230 | quoted = quote_double or quote_single 231 | 232 | if not quoted: 233 | if comment_parser.parse(curr_char, i): 234 | if (tag_end + 1) < comment_parser.start: 235 | parsed.append( 236 | HtmlDataFragment(tag_end + 1, comment_parser.start, not script)) 237 | tag_end = comment_parser.end 238 | parsed.append( 239 | HtmlDataFragment(comment_parser.start, tag_end + 1, False)) 240 | reset_tag = True 241 | if (comment_parser.end - comment_parser.start) == 2: 242 | open_tag = False 243 | 244 | if comment_parser.inside_comment: 245 | open_tag = False 
246 | else: 247 | if script: 248 | open_tag = False 249 | if script_parser.parse(curr_char, i): 250 | script = False 251 | if (tag_end + 1) < script_parser.start: 252 | parsed.append( 253 | HtmlDataFragment(tag_end + 1, script_parser.start, False)) 254 | tag_end = script_parser.end 255 | parsed.append( 256 | HtmlTag(CLOSE_TAG, 257 | u'script', u'', script_parser.start, tag_end + 1)) 258 | elif open_tag: 259 | if quoted: 260 | if has_attributes: 261 | tag_attributes += curr_char 262 | elif curr_char == u'<': 263 | tag_end = i - 1 264 | yield_tag = True 265 | elif curr_char == u'>': 266 | if prev_char == u'/': 267 | slash = True 268 | tag_end = i 269 | yield_tag = True 270 | open_tag = False 271 | elif curr_char == u'/': 272 | if prev_char == u'<': 273 | slash = True 274 | elif curr_char.isspace(): 275 | if has_attributes: 276 | if prev_char == u'/': 277 | # feature, bug? Maintain compatilibity with previous 278 | # implementation 279 | tag_attributes += u'/' 280 | tag_attributes += curr_char 281 | elif tag_name: 282 | has_attributes = True 283 | else: 284 | if has_attributes: 285 | tag_attributes += curr_char 286 | else: 287 | tag_name += curr_char.lower() 288 | if yield_tag: 289 | if not slash: 290 | tag_type = OPEN_TAG 291 | elif prev_char != u'/': 292 | tag_type = CLOSE_TAG 293 | else: 294 | tag_type = UNPAIRED_TAG 295 | if tag_name != u'!doctype': 296 | parsed.append( 297 | HtmlTag(tag_type, tag_name, 298 | tag_attributes, tag_start, tag_end + 1)) 299 | if tag_name == u'script': 300 | script = True 301 | if open_tag: 302 | tag_start = i 303 | reset_tag = True 304 | else: 305 | open_tag = False 306 | if curr_char == u'<' and not quoted: 307 | open_tag = True 308 | tag_start = i 309 | if tag_start > tag_end + 1: 310 | parsed.append( 311 | HtmlDataFragment(tag_end + 1, tag_start, True)) 312 | tag_end = tag_start 313 | prev_char = curr_char 314 | i += 1 315 | 316 | if tag_end + 1 < len(text): 317 | parsed.append(HtmlDataFragment(tag_end + 1, len(text), True)) 318 | return parsed 319 | -------------------------------------------------------------------------------- /scrapely/descriptor.py: -------------------------------------------------------------------------------- 1 | """ 2 | Extended types for IBL extraction 3 | """ 4 | from itertools import chain 5 | 6 | from scrapely.extractors import text 7 | 8 | 9 | class FieldDescriptor(object): 10 | """description of a scraped attribute""" 11 | __slots__ = ('name', 'description', 'extractor', 'required') 12 | 13 | def __init__(self, name, description, extractor=text, required=False): 14 | self.name = name 15 | self.description = description 16 | self.extractor = extractor 17 | self.required = required 18 | 19 | def __str__(self): 20 | return "FieldDescriptor(%s)" % self.name 21 | 22 | 23 | class ItemDescriptor(object): 24 | """Simple auto scraping item descriptor. 25 | 26 | This used to describe type-specific operations and may be overridden where 27 | necessary. 
28 | """ 29 | 30 | def __init__(self, name, description, attribute_descriptors): 31 | self.name = name 32 | self.description = description 33 | self.attribute_map = dict((d.name, d) for d in attribute_descriptors) 34 | self._required_attributes = [d.name for d in attribute_descriptors \ 35 | if d.required] 36 | 37 | def validated(self, data): 38 | """Only return the items in the data that are valid""" 39 | return [d for d in data if self._item_validates(d)] 40 | 41 | def _item_validates(self, item): 42 | """simply checks that all mandatory attributes are present""" 43 | variant_attrs = set(chain(* 44 | [v.keys() for v in item.get('variants', [])])) 45 | return item and all([(name in item or name in variant_attrs) \ 46 | for name in self._required_attributes]) 47 | 48 | def get_required_attributes(self): 49 | return self._required_attributes 50 | 51 | def __str__(self): 52 | return "ItemDescriptor(%s)" % self.name 53 | 54 | def copy(self): 55 | attribute_descriptors = [] 56 | for d in self.attribute_map.values(): 57 | attribute_descriptors.append(FieldDescriptor(d.name, d.description, d.extractor, d.required)) 58 | return ItemDescriptor(self.name, self.description, attribute_descriptors) 59 | # return self 60 | -------------------------------------------------------------------------------- /scrapely/extraction/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | IBL module 3 | 4 | This contains an extraction algorithm based on the paper Extracting Web Data 5 | Using Instance-Based Learning by Yanhong Zhai and Bing Liu. 6 | 7 | It defines the InstanceBasedLearningExtractor class, which implements this 8 | extraction algorithm. 9 | 10 | Main departures from the original algorithm: 11 | * there is no limit in prefix or suffix size 12 | * we have "attribute adaptors" that allow generic post processing and may 13 | affect the extraction process. For example, a price field may require a 14 | numeric value to be present. 15 | * tags can be inserted to extract regions not wrapped by html tags. These 16 | regions are then identified using the longest unique character prefix and 17 | suffix. 18 | """ 19 | from operator import itemgetter 20 | from .pageparsing import parse_template, parse_extraction_page 21 | from .pageobjects import TokenDict 22 | from .regionextract import (BasicTypeExtractor, TraceExtractor, RepeatedDataExtractor, 23 | AdjacentVariantExtractor, RecordExtractor, TemplatePageExtractor) 24 | 25 | 26 | class InstanceBasedLearningExtractor(object): 27 | """Implementation of the instance based learning algorithm to 28 | extract data from web pages. 29 | """ 30 | _extractor_classes = [ 31 | RepeatedDataExtractor, 32 | AdjacentVariantExtractor, 33 | RepeatedDataExtractor, 34 | AdjacentVariantExtractor, 35 | RepeatedDataExtractor, 36 | RecordExtractor, 37 | ] 38 | 39 | def __init__(self, td_pairs, trace=False, apply_extrarequired=True): 40 | """Initialise this extractor 41 | 42 | td_pairs is a list of (template, item descriptor) pairs. 43 | 44 | templates should contain a sequence of strings, each containing 45 | annotated html that will be used as templates for extraction. 46 | 47 | Tags surrounding areas to be extracted must contain a 48 | 'data-scrapy-annotate' attribute and the value must be the name 49 | of the attribute. If the tag was inserted and was not present in the 50 | original page, the data-scrapy-generated attribute must be present. 
51 | 52 | item descriptors describe how the item will be extracted from target 53 | page, using the corresponding template. 54 | 55 | if trace is true, the returned extracted data will have a 'trace' 56 | property that contains a trace of the extraction execution. 57 | """ 58 | self.token_dict = TokenDict() 59 | parsed_plus_tdpairs = [(parse_template(self.token_dict, td[0]), td) for td in td_pairs] 60 | parsed_plus_epages = ( 61 | (p, parse_extraction_page(self.token_dict, td[0]), td) 62 | for p, td in parsed_plus_tdpairs if _annotation_count(p) 63 | ) 64 | parsed_tdpairs = map(itemgetter(0, 2), parsed_plus_epages) 65 | 66 | modified_parsed_tdpairs = [] 67 | # apply extra required attributes 68 | for parsed, (t, descriptor) in parsed_tdpairs: 69 | if descriptor is not None and apply_extrarequired: 70 | descriptor = descriptor.copy() 71 | for attr in parsed.extra_required_attrs: 72 | descriptor._required_attributes.append(attr) 73 | # not always is present a descriptor for a given attribute 74 | if attr in descriptor.attribute_map: 75 | # not strictly necessary, but avoid possible inconsistencies for user 76 | descriptor.attribute_map[attr].required = True 77 | modified_parsed_tdpairs.append((parsed, (t, descriptor))) 78 | # templates with more attributes are considered first 79 | sorted_tdpairs = sorted(modified_parsed_tdpairs, 80 | key=lambda x: _annotation_count(x[0]), reverse=True) 81 | self.extraction_trees = [ 82 | self.build_extraction_tree(p, td[1], trace) 83 | for p, td in sorted_tdpairs 84 | ] 85 | self.validated = dict( 86 | (td[0].page_id, td[1].validated if td[1] else self._filter_not_none) 87 | for _, td in sorted_tdpairs 88 | ) 89 | 90 | def build_extraction_tree(self, template, type_descriptor, trace=True): 91 | """Build a tree of region extractors corresponding to the 92 | template 93 | """ 94 | attribute_map = type_descriptor.attribute_map if type_descriptor else None 95 | extractors = BasicTypeExtractor.create(template.annotations, attribute_map) 96 | if trace: 97 | extractors = TraceExtractor.apply(template, extractors) 98 | for cls in self._extractor_classes: 99 | extractors = cls.apply(template, extractors) 100 | if trace: 101 | extractors = TraceExtractor.apply(template, extractors) 102 | 103 | return TemplatePageExtractor(template, extractors) 104 | 105 | def extract(self, html, pref_template_id=None): 106 | """extract data from an html page 107 | 108 | If pref_template_url is specified, the template with that url will be 109 | used first. 
110 | """ 111 | extraction_page = parse_extraction_page(self.token_dict, html) 112 | if pref_template_id is not None: 113 | extraction_trees = sorted(self.extraction_trees, 114 | key=lambda x: x.template.id != pref_template_id) 115 | else: 116 | extraction_trees = self.extraction_trees 117 | 118 | for extraction_tree in extraction_trees: 119 | extracted = extraction_tree.extract(extraction_page) 120 | correctly_extracted = self.validated[extraction_tree.template.id](extracted) 121 | if len(correctly_extracted) > 0: 122 | return correctly_extracted, extraction_tree.template 123 | return None, None 124 | 125 | def __str__(self): 126 | return "InstanceBasedLearningExtractor[\n%s\n]" % \ 127 | (',\n'.join(map(str, self.extraction_trees))) 128 | 129 | @staticmethod 130 | def _filter_not_none(items): 131 | return [d for d in items if d is not None] 132 | 133 | 134 | def _annotation_count(template): 135 | return len(template.annotations) 136 | -------------------------------------------------------------------------------- /scrapely/extraction/_similarity.pyx: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | cimport numpy as np 3 | cimport cython 4 | from cpython.version cimport PY_MAJOR_VERSION 5 | 6 | cdef np_kmp_match_length(np.ndarray[np.int64_t, ndim=1] sequence, 7 | np.ndarray[np.int64_t, ndim=1] pattern, 8 | int start=0, 9 | int end=-1): 10 | """Adaptated from KMP substring search: 11 | http://code.activestate.com/recipes/117214-knuth-morris-pratt-string-matching/ 12 | 13 | The algorithm is modified to return the match length at the given position 14 | """ 15 | ret = [] 16 | cdef int m = len(pattern) 17 | if end == -1: 18 | end = m 19 | # build table of shift amounts 20 | cdef np.ndarray[np.int64_t, ndim=1] shifts = np.ones((m + 1,), dtype=int) 21 | cdef int shift = 1 22 | cdef int pos 23 | for pos in range(m): 24 | while shift <= pos and pattern[pos] != pattern[pos-shift]: 25 | shift += shifts[pos-shift] 26 | shifts[pos+1] = shift 27 | 28 | # do the actual search 29 | cdef int startPos = start 30 | cdef int matchLen = 0 31 | cdef int c 32 | for c in sequence[start:]: 33 | if startPos >= end: 34 | break 35 | while matchLen == m or \ 36 | matchLen >= 0 and pattern[matchLen] != c: 37 | if matchLen > 0: 38 | ret.append((startPos, matchLen)) 39 | startPos += shifts[matchLen] 40 | matchLen -= shifts[matchLen] 41 | matchLen += 1 42 | if matchLen > 0 and startPos < end: 43 | ret.append((startPos, matchLen)) 44 | 45 | return ret 46 | 47 | 48 | cdef u_kmp_match_length(unicode sequence, unicode pattern, int start=0, int end=-1): 49 | """Adaptated from KMP substring search: 50 | http://code.activestate.com/recipes/117214-knuth-morris-pratt-string-matching/ 51 | 52 | The algorithm is modified to return the match length at the given position 53 | """ 54 | ret = [] 55 | cdef int m = len(pattern) 56 | if end == -1: 57 | end = m 58 | # build table of shift amounts 59 | cdef np.ndarray[np.int64_t, ndim=1] shifts = np.ones((m + 1,), dtype=int) 60 | cdef int shift = 1 61 | cdef int pos 62 | for pos in range(m): 63 | while shift <= pos and pattern[pos] != pattern[pos-shift]: 64 | shift += shifts[pos-shift] 65 | shifts[pos+1] = shift 66 | 67 | # do the actual search 68 | cdef int startPos = start 69 | cdef int matchLen = 0 70 | cdef Py_UCS4 c 71 | for c in sequence[start:]: 72 | if startPos >= end: 73 | break 74 | while matchLen == m or \ 75 | matchLen >= 0 and pattern[matchLen] != c: 76 | if matchLen > 0: 77 | ret.append((startPos, matchLen)) 78 | 
startPos += shifts[matchLen] 79 | matchLen -= shifts[matchLen] 80 | matchLen += 1 81 | if matchLen > 0 and startPos < end: 82 | ret.append((startPos, matchLen)) 83 | 84 | return ret 85 | 86 | 87 | cdef np_naive_match_length(np.ndarray[np.int64_t, ndim=1] sequence, 88 | np.ndarray[np.int64_t, ndim=1] pattern, 89 | int start=0, 90 | int end=-1): 91 | ret = [] 92 | cdef int m = len(sequence) 93 | cdef int n = min(m, len(pattern)) 94 | cdef int i 95 | cdef int j 96 | cdef int k 97 | if end == -1: 98 | end = m 99 | else: 100 | end = min(end, m) 101 | for i in range(start, end): 102 | j = 0 103 | k = i 104 | while sequence[k] == pattern[j]: 105 | j += 1 106 | k += 1 107 | if k == m or j == n: 108 | break 109 | if j > 0: 110 | ret.append((i, j)) 111 | return ret 112 | 113 | 114 | cdef u_naive_match_length(unicode sequence, 115 | unicode pattern, int start=0, int end=-1): 116 | ret = [] 117 | cdef int m = len(sequence) 118 | cdef int n = min(m, len(pattern)) 119 | cdef int i 120 | cdef int j 121 | cdef int k 122 | if end == -1: 123 | end = m 124 | else: 125 | end = min(end, m) 126 | for i in range(start, end): 127 | j = 0 128 | k = i 129 | while sequence[k] == pattern[j]: 130 | j += 1 131 | k += 1 132 | if k == m or j == n: 133 | break 134 | if j > 0: 135 | ret.append((i, j)) 136 | return ret 137 | 138 | 139 | cdef unicode _ustring(s): 140 | if type(s) is unicode: 141 | # fast path for most common case(s) 142 | return s 143 | elif PY_MAJOR_VERSION < 3 and isinstance(s, bytes): 144 | # only accept byte strings in Python 2.x, not in Py3 145 | return (s).decode('ascii') 146 | elif isinstance(s, unicode): 147 | # an evil cast to might work here in some(!) cases, 148 | # depending on what the further processing does. to be safe, 149 | # we can always create a copy instead 150 | return unicode(s) 151 | else: 152 | raise TypeError('Expected str or unicode') 153 | 154 | 155 | cpdef naive_match_length(sequence, pattern, int start=0, int end=-1): 156 | if isinstance(sequence, np.ndarray): 157 | if isinstance(pattern, np.ndarray): 158 | return np_naive_match_length(sequence, pattern, start, end) 159 | else: 160 | raise TypeError('Different types for sequence and pattern') 161 | else: 162 | return u_naive_match_length( 163 | _ustring(sequence), _ustring(pattern), start, end) 164 | 165 | cpdef kmp_match_length(sequence, pattern, int start=0, int end=-1): 166 | if isinstance(sequence, np.ndarray): 167 | if isinstance(pattern, np.ndarray): 168 | return np_kmp_match_length(sequence, pattern, start, end) 169 | else: 170 | raise TypeError('Different types for sequence and pattern') 171 | else: 172 | return u_kmp_match_length( 173 | _ustring(sequence), _ustring(pattern), start, end) 174 | -------------------------------------------------------------------------------- /scrapely/extraction/pageobjects.py: -------------------------------------------------------------------------------- 1 | """ 2 | Page objects 3 | 4 | This module contains objects representing pages and parts of pages (e.g. tokens 5 | and annotations) used in the instance based learning algorithm. 
6 | """ 7 | from itertools import chain 8 | from numpy import array, ndarray 9 | 10 | from scrapely.htmlpage import HtmlTagType, HtmlPageRegion, HtmlPageParsedRegion 11 | 12 | 13 | class TokenType(HtmlTagType): 14 | """constants for token types""" 15 | WORD = 0 16 | 17 | 18 | class TokenDict(object): 19 | """Mapping from parse tokens to integers 20 | 21 | >>> d = TokenDict() 22 | >>> d.tokenid('i') 23 | 0 24 | >>> d.tokenid('b') 25 | 1 26 | >>> d.tokenid('i') 27 | 0 28 | 29 | Tokens can be searched for by id 30 | >>> d.find_token(1) 31 | 'b' 32 | 33 | The lower 24 bits store the token reference and the higher bits the type. 34 | """ 35 | 36 | def __init__(self): 37 | self.token_ids = {} 38 | 39 | def tokenid(self, token, token_type=TokenType.WORD): 40 | """create an integer id from the token and token type passed""" 41 | tid = self.token_ids.setdefault(token, len(self.token_ids)) 42 | return tid | (token_type << 24) 43 | 44 | @staticmethod 45 | def token_type(token): 46 | """extract the token type from the token id passed""" 47 | return token >> 24 48 | 49 | def find_token(self, tid): 50 | """Search for a tag with the given ID 51 | 52 | This is O(N) and is only intended for debugging 53 | """ 54 | tid &= 0xFFFFFF 55 | if tid >= len(self.token_ids) or tid < 0: 56 | raise ValueError("tag id %s out of range" % tid) 57 | 58 | for (token, token_id) in self.token_ids.items(): 59 | if token_id == tid: 60 | return token 61 | assert False, "token dictionary is corrupt" 62 | 63 | def token_string(self, tid): 64 | """create a string representation of a token 65 | 66 | This is O(N). 67 | """ 68 | templates = ["%s", "<%s>", "", "<%s/>"] 69 | return templates[tid >> 24] % self.find_token(tid) 70 | 71 | 72 | class PageRegion(object): 73 | """A region in a page, defined by a start and end index""" 74 | 75 | __slots__ = ('start_index', 'end_index') 76 | 77 | def __init__(self, start, end): 78 | self.start_index = start 79 | self.end_index = end 80 | 81 | def __str__(self): 82 | return "%s(%s, %s)" % (self.__class__.__name__, self.start_index, 83 | self.end_index) 84 | 85 | def __repr__(self): 86 | return str(self) 87 | 88 | 89 | class FragmentedHtmlPageRegion(HtmlPageParsedRegion, HtmlPageRegion): 90 | """An HtmlPageRegion consisting of possibly non-contiguous sub-regions""" 91 | def __new__(cls, htmlpage, regions): 92 | text = u''.join(regions) 93 | return HtmlPageRegion.__new__(cls, htmlpage, text) 94 | 95 | def __init__(self, htmlpage, regions): 96 | self.htmlpage = htmlpage 97 | self.regions = regions 98 | 99 | @property 100 | def parsed_fragments(self): 101 | return chain(*(r.parsed_fragments for r in self.regions)) 102 | 103 | 104 | class Page(object): 105 | """Basic representation of a page. 
This consists of a reference to a 106 | dictionary of tokens and an array of raw token ids 107 | """ 108 | 109 | __slots__ = ('token_dict', 'page_tokens', 'htmlpage') 110 | 111 | def __init__(self, htmlpage, token_dict, page_tokens): 112 | self.htmlpage = htmlpage 113 | self.token_dict = token_dict 114 | # use a numpy array because we can index/slice easily and efficiently 115 | if not isinstance(page_tokens, ndarray): 116 | page_tokens = array(page_tokens) 117 | self.page_tokens = page_tokens 118 | 119 | 120 | class TemplatePage(Page): 121 | __slots__ = ('annotations', 'id', 'ignored_regions', 'extra_required_attrs') 122 | 123 | def __init__(self, htmlpage, token_dict, page_tokens, annotations, \ 124 | template_id=None, ignored_regions=None, extra_required=None): 125 | Page.__init__(self, htmlpage, token_dict, page_tokens) 126 | # ensure order is the same as start tag order in the original page 127 | annotations = sorted(annotations, key=lambda x: x.end_index, reverse=True) 128 | self.annotations = sorted(annotations, key=lambda x: x.start_index) 129 | self.id = template_id 130 | self.ignored_regions = [i if isinstance(i, PageRegion) else PageRegion(*i) \ 131 | for i in (ignored_regions or [])] 132 | self.extra_required_attrs = set(extra_required or []) 133 | 134 | def __str__(self): 135 | summary = [] 136 | for index, token in enumerate(self.page_tokens): 137 | text = "%s: %s" % (index, self.token_dict.find_token(token)) 138 | summary.append(text) 139 | return "TemplatePage\n============\nTokens: (index, token)\n%s\nAnnotations: %s\n" % \ 140 | ('\n'.join(summary), '\n'.join(map(str, self.annotations))) 141 | 142 | 143 | class ExtractionPage(Page): 144 | """Parsed data belonging to a web page upon which we wish to perform 145 | extraction. 146 | """ 147 | __slots__ = ('token_page_indexes', ) 148 | 149 | def __init__(self, htmlpage, token_dict, page_tokens, token_page_indexes): 150 | """Construct a new ExtractionPage 151 | 152 | Arguments: 153 | `htmlpage`: The source HtmlPage 154 | `token_dict`: Token Dictionary used for tokenization 155 | `page_tokens': array of page tokens for matching 156 | `token_page_indexes`: indexes of each token in the parsed htmlpage 157 | """ 158 | Page.__init__(self, htmlpage, token_dict, page_tokens) 159 | self.token_page_indexes = token_page_indexes 160 | 161 | def htmlpage_region(self, start_token_index, end_token_index): 162 | """The region in the HtmlPage corresponding to the area defined by 163 | the start_token_index and the end_token_index 164 | 165 | This includes the tokens at the specified indexes 166 | """ 167 | start = self.token_page_indexes[start_token_index] 168 | end = self.token_page_indexes[end_token_index] 169 | return self.htmlpage.subregion(start, end) 170 | 171 | def htmlpage_region_inside(self, start_token_index, end_token_index): 172 | """The region in the HtmlPage corresponding to the area between 173 | the start_token_index and the end_token_index. 
174 | 175 | This excludes the tokens at the specified indexes 176 | """ 177 | start = self.token_page_indexes[start_token_index] + 1 178 | end = self.token_page_indexes[end_token_index] - 1 179 | return self.htmlpage.subregion(start, end) 180 | 181 | def htmlpage_tag(self, token_index): 182 | """The HtmlPage tag at corresponding to the token at token_index""" 183 | return self.htmlpage.parsed_body[self.token_page_indexes[token_index]] 184 | 185 | def __str__(self): 186 | summary = [] 187 | for token, tindex in zip(self.page_tokens, self.token_page_indexes): 188 | text = "%s page[%s]: %s" % (self.token_dict.find_token(token), 189 | tindex, self.htmlpage.parsed_body[tindex]) 190 | summary.append(text) 191 | return "ExtractionPage\n==============\nTokens: %s\n\nRaw text: %s\n\n" \ 192 | % ('\n'.join(summary), self.htmlpage.body) 193 | 194 | 195 | class AnnotationText(object): 196 | __slots__ = ('start_text', 'follow_text') 197 | 198 | def __init__(self, start_text=None, follow_text=None): 199 | self.start_text = start_text 200 | self.follow_text = follow_text 201 | 202 | def __str__(self): 203 | return "AnnotationText(%s..%s)" % \ 204 | (repr(self.start_text), repr(self.follow_text)) 205 | 206 | 207 | class AnnotationTag(PageRegion): 208 | """A tag that annotates part of the document 209 | 210 | It has the following properties: 211 | start_index - index of the token for the opening tag 212 | end_index - index of the token for the closing tag 213 | surrounds_attribute - the attribute name surrounded by this tag 214 | tag_attributes - list of (tag attribute, extracted attribute) tuples 215 | for each item to be extracted from a tag attribute 216 | annotation_text - text prefix and suffix for the attribute to be extracted 217 | metadata - dict with annotation data not used by IBL extractor 218 | """ 219 | __slots__ = ('surrounds_attribute', 'start_index', 'end_index', 220 | 'tag_attributes', 'annotation_text', 'variant_id', 221 | 'metadata') 222 | 223 | def __init__(self, start_index, end_index, surrounds_attribute=None, 224 | annotation_text=None, tag_attributes=None, variant_id=None): 225 | PageRegion.__init__(self, start_index, end_index) 226 | self.surrounds_attribute = surrounds_attribute 227 | self.annotation_text = annotation_text 228 | self.tag_attributes = tag_attributes or [] 229 | self.variant_id = variant_id 230 | self.metadata = {} 231 | 232 | def __str__(self): 233 | return "AnnotationTag(%s)" % ", ".join( 234 | ["%s=%s" % (s, getattr(self, s)) \ 235 | for s in self.__slots__ if getattr(self, s)]) 236 | 237 | def __repr__(self): 238 | return str(self) 239 | 240 | -------------------------------------------------------------------------------- /scrapely/extraction/pageparsing.py: -------------------------------------------------------------------------------- 1 | """ 2 | Page parsing 3 | 4 | Parsing of web pages for extraction task. 
5 | """ 6 | import json 7 | from collections import defaultdict 8 | from numpy import array 9 | 10 | from scrapely.htmlpage import HtmlTagType, HtmlTag, HtmlPage 11 | from scrapely.extraction.pageobjects import (AnnotationTag, 12 | TemplatePage, ExtractionPage, AnnotationText, TokenDict) 13 | 14 | 15 | def parse_strings(template_html, extraction_html): 16 | """Create a template and extraction page from raw strings 17 | 18 | this is useful for testing purposes 19 | """ 20 | t = TokenDict() 21 | template_page = HtmlPage(body=template_html) 22 | extraction_page = HtmlPage(body=extraction_html) 23 | return (parse_template(t, template_page), 24 | parse_extraction_page(t, extraction_page)) 25 | 26 | 27 | def parse_template(token_dict, template_html): 28 | """Create an TemplatePage object by parsing the annotated html""" 29 | parser = TemplatePageParser(token_dict) 30 | parser.feed(template_html) 31 | return parser.to_template() 32 | 33 | 34 | def parse_extraction_page(token_dict, page_html): 35 | """Create an ExtractionPage object by parsing the html""" 36 | parser = ExtractionPageParser(token_dict) 37 | parser.feed(page_html) 38 | return parser.to_extraction_page() 39 | 40 | 41 | class InstanceLearningParser(object): 42 | """Base parser for instance based learning algorithm 43 | 44 | This does not require correct HTML and the parsing method should not alter 45 | the original tag order. It is important that parsing results do not vary. 46 | """ 47 | def __init__(self, token_dict): 48 | self.token_dict = token_dict 49 | self.token_list = [] 50 | 51 | def _add_token(self, token, token_type, start, end): 52 | tid = self.token_dict.tokenid(token, token_type) 53 | self.token_list.append(tid) 54 | 55 | def feed(self, html_page): 56 | self.html_page = html_page 57 | self.previous_element_class = None 58 | for index, data in enumerate(html_page.parsed_body): 59 | if isinstance(data, HtmlTag): 60 | self._add_token(data.tag, data.tag_type, data.start, data.end) 61 | self.handle_tag(data, index) 62 | else: 63 | self.handle_data(data, index) 64 | self.previous_element_class = data.__class__ 65 | 66 | def handle_data(self, html_data_fragment, index): 67 | pass 68 | 69 | def handle_tag(self, html_tag, index): 70 | pass 71 | 72 | 73 | _END_UNPAIREDTAG_TAGS = ["form", "div", "p", "table", "tr", "td"] 74 | _AUTO_CLOSE_TAGS_ON_OPEN = { 75 | # the given keys closes the tags in the list 76 | "p": ["p"], 77 | "option": ["option"], 78 | } 79 | _AUTO_CLOSE_TAGS_ON_CLOSE = { 80 | "select": ["option"], 81 | } 82 | 83 | 84 | class TemplatePageParser(InstanceLearningParser): 85 | """Template parsing for instance based learning algorithm""" 86 | 87 | def __init__(self, token_dict): 88 | InstanceLearningParser.__init__(self, token_dict) 89 | self.annotations = [] 90 | self.ignored_regions = [] 91 | self.extra_required_attrs = [] 92 | self.ignored_tag_stacks = defaultdict(list) 93 | # tag names that have not been completed 94 | self.labelled_tag_stacks = defaultdict(list) 95 | self.replacement_stacks = defaultdict(list) 96 | self.unpairedtag_stack = [] 97 | self.variant_stack = [] 98 | self.prev_data = None 99 | self.last_text_region = None 100 | self.next_tag_index = 0 101 | 102 | def handle_tag(self, html_tag, index): 103 | if self.last_text_region: 104 | self._process_text('') 105 | 106 | if html_tag.tag_type == HtmlTagType.OPEN_TAG: 107 | self._handle_open_tag(html_tag) 108 | elif html_tag.tag_type == HtmlTagType.CLOSE_TAG: 109 | self._handle_close_tag(html_tag) 110 | else: 111 | # the tag is not paired, it can contain 
only attribute annotations 112 | self._handle_unpaired_tag(html_tag) 113 | 114 | @staticmethod 115 | def _read_template_annotation(html_tag): 116 | template_attr = html_tag.attributes.get('data-scrapy-annotate') 117 | if template_attr is None: 118 | return None 119 | unescaped = template_attr.replace('"', '"') 120 | return json.loads(unescaped) 121 | 122 | @staticmethod 123 | def _read_bool_template_attribute(html_tag, attribute): 124 | return html_tag.attributes.get("data-scrapy-" + attribute) == "true" 125 | 126 | def _close_unpaired_tag(self): 127 | self.unpairedtag_stack[0].end_index = self.next_tag_index 128 | self.unpairedtag_stack = [] 129 | 130 | def _handle_unpaired_tag(self, html_tag): 131 | if self._read_bool_template_attribute(html_tag, "ignore") and html_tag.tag == "img": 132 | self.ignored_regions.append((self.next_tag_index, self.next_tag_index + 1)) 133 | elif self._read_bool_template_attribute(html_tag, "ignore-beneath"): 134 | self.ignored_regions.append((self.next_tag_index, None)) 135 | jannotation = self._read_template_annotation(html_tag) 136 | if jannotation: 137 | if self.unpairedtag_stack: 138 | self._close_unpaired_tag() 139 | 140 | annotation = AnnotationTag(self.next_tag_index, self.next_tag_index + 1) 141 | attribute_annotations = jannotation.pop('annotations', {}).items() 142 | content_key = jannotation.pop('text-content', 'content') 143 | for extract_attribute, tag_value in attribute_annotations: 144 | if extract_attribute == content_key: 145 | annotation.surrounds_attribute = tag_value 146 | self.unpairedtag_stack.append(annotation) 147 | else: 148 | annotation.tag_attributes.append((extract_attribute, tag_value)) 149 | self.annotations.append(annotation) 150 | 151 | self.extra_required_attrs.extend(jannotation.pop('required', [])) 152 | variant_id = jannotation.pop('variant', 0) 153 | if variant_id > 0: 154 | annotation.variant_id = variant_id 155 | assert jannotation.pop("generated", False) == False 156 | annotation.metadata = jannotation 157 | 158 | self.next_tag_index += 1 159 | 160 | def _handle_open_tag(self, html_tag): 161 | if self._read_bool_template_attribute(html_tag, "ignore"): 162 | if html_tag.tag == "img": 163 | self.ignored_regions.append((self.next_tag_index, self.next_tag_index + 1)) 164 | else: 165 | self.ignored_regions.append((self.next_tag_index, None)) 166 | self.ignored_tag_stacks[html_tag.tag].append(html_tag) 167 | 168 | elif self.ignored_tag_stacks.get(html_tag.tag): 169 | self.ignored_tag_stacks[html_tag.tag].append(None) 170 | if self._read_bool_template_attribute(html_tag, "ignore-beneath"): 171 | self.ignored_regions.append((self.next_tag_index, None)) 172 | 173 | replacement = html_tag.attributes.pop("data-scrapy-replacement", None) 174 | if replacement: 175 | self.token_list.pop() 176 | self._add_token(replacement, html_tag.tag_type, html_tag.start, html_tag.end) 177 | self.replacement_stacks[html_tag.tag].append(replacement) 178 | elif html_tag.tag in self.replacement_stacks: 179 | self.replacement_stacks[html_tag.tag].append(None) 180 | 181 | if self.unpairedtag_stack: 182 | if html_tag.tag in _END_UNPAIREDTAG_TAGS: 183 | self._close_unpaired_tag() 184 | else: 185 | self.unpairedtag_stack.append(html_tag.tag) 186 | 187 | tagname = replacement or self._update_replacement_stack(html_tag) 188 | self._handle_unclosed_tags(tagname, _AUTO_CLOSE_TAGS_ON_OPEN) 189 | 190 | jannotation = self._read_template_annotation(html_tag) 191 | if not jannotation: 192 | if tagname in self.labelled_tag_stacks: 193 | # add this tag to the stack 
to match correct end tag 194 | self.labelled_tag_stacks[tagname].append(None) 195 | self.next_tag_index += 1 196 | return 197 | 198 | annotation = AnnotationTag(self.next_tag_index, None) 199 | if jannotation.pop('generated', False): 200 | self.token_list.pop() 201 | annotation.start_index -= 1 202 | if self.previous_element_class == HtmlTag: 203 | annotation.annotation_text = AnnotationText('') 204 | else: 205 | annotation.annotation_text = AnnotationText(self.prev_data) 206 | if self._read_bool_template_attribute(html_tag, "ignore") \ 207 | or self._read_bool_template_attribute(html_tag, "ignore-beneath"): 208 | ignored = self.ignored_regions.pop() 209 | self.ignored_regions.append((ignored[0]-1, ignored[1])) 210 | 211 | self.extra_required_attrs.extend(jannotation.pop('required', [])) 212 | 213 | attribute_annotations = jannotation.pop('annotations', {}).items() 214 | content_key = jannotation.pop('text-content', 'content') 215 | for extract_attribute, tag_value in attribute_annotations: 216 | if extract_attribute == content_key: 217 | annotation.surrounds_attribute = tag_value 218 | else: 219 | annotation.tag_attributes.append((extract_attribute, tag_value)) 220 | 221 | variant_id = jannotation.pop('variant', 0) 222 | if variant_id > 0: 223 | if annotation.surrounds_attribute is not None: 224 | self.variant_stack.append(variant_id) 225 | else: 226 | annotation.variant_id = variant_id 227 | 228 | annotation.metadata = jannotation 229 | 230 | if annotation.annotation_text is None: 231 | self.next_tag_index += 1 232 | if self.variant_stack and annotation.variant_id is None: 233 | variant_id = self.variant_stack[-1] 234 | if variant_id == '0': 235 | variant_id = None 236 | annotation.variant_id = variant_id 237 | 238 | # look for a closing tag if the content is important 239 | if annotation.surrounds_attribute: 240 | self.labelled_tag_stacks[tagname].append(annotation) 241 | else: 242 | annotation.end_index = annotation.start_index + 1 243 | self.annotations.append(annotation) 244 | 245 | def _handle_close_tag(self, html_tag): 246 | 247 | if self.unpairedtag_stack: 248 | if html_tag.tag == self.unpairedtag_stack[-1]: 249 | self.unpairedtag_stack.pop() 250 | else: 251 | self._close_unpaired_tag() 252 | 253 | ignored_tags = self.ignored_tag_stacks.get(html_tag.tag) 254 | if ignored_tags is not None: 255 | tag = ignored_tags.pop() 256 | if isinstance(tag, HtmlTag): 257 | for i in range(-1, -len(self.ignored_regions) - 1, -1): 258 | if self.ignored_regions[i][1] is None: 259 | self.ignored_regions[i] = (self.ignored_regions[i][0], self.next_tag_index) 260 | break 261 | if len(ignored_tags) == 0: 262 | del self.ignored_tag_stacks[html_tag.tag] 263 | 264 | tagname = self._update_replacement_stack(html_tag) 265 | self._handle_unclosed_tags(tagname, _AUTO_CLOSE_TAGS_ON_CLOSE) 266 | 267 | labelled_tags = self.labelled_tag_stacks.get(tagname) 268 | if labelled_tags is None: 269 | self.next_tag_index += 1 270 | return 271 | annotation = labelled_tags.pop() 272 | if annotation is None: 273 | self.next_tag_index += 1 274 | else: 275 | annotation.end_index = self.next_tag_index 276 | self.annotations.append(annotation) 277 | if annotation.annotation_text is not None: 278 | self.token_list.pop() 279 | self.last_text_region = annotation 280 | else: 281 | self.next_tag_index += 1 282 | if len(labelled_tags) == 0: 283 | del self.labelled_tag_stacks[tagname] 284 | if annotation.variant_id and self.variant_stack: 285 | prev = self.variant_stack.pop() 286 | if prev != annotation.variant_id: 287 | raise 
ValueError("unbalanced variant annotation tags") 288 | 289 | def _update_replacement_stack(self, html_tag): 290 | replacement = html_tag.tag 291 | if html_tag.tag in self.replacement_stacks: 292 | replacement = self.replacement_stacks[html_tag.tag].pop() 293 | if replacement: 294 | self.token_list.pop() 295 | self._add_token(replacement, html_tag.tag_type, html_tag.start, html_tag.end) 296 | if len(self.replacement_stacks[html_tag.tag]) == 0: 297 | del self.replacement_stacks[html_tag.tag] 298 | return replacement 299 | 300 | def _handle_unclosed_tags(self, tagname, auto_close_tags): 301 | """I.e. can't be a p inside another p. Also, an open p element closes 302 | a previous open p element""" 303 | if tagname in auto_close_tags: 304 | for _close_tag in auto_close_tags[tagname]: 305 | if _close_tag in self.labelled_tag_stacks: 306 | annotation = self.labelled_tag_stacks.pop(_close_tag)[0] 307 | annotation.end_index = self.next_tag_index 308 | self.annotations.append(annotation) 309 | break 310 | return tagname 311 | 312 | def handle_data(self, html_data_fragment, index): 313 | fragment_text = self.html_page.fragment_data(html_data_fragment) 314 | self._process_text(fragment_text) 315 | 316 | def _process_text(self, text): 317 | if self.last_text_region is not None: 318 | self.last_text_region.annotation_text.follow_text = text 319 | self.last_text_region = None 320 | self.prev_data = text 321 | 322 | def to_template(self): 323 | """create a TemplatePage from the data fed to this parser""" 324 | return TemplatePage(self.html_page, self.token_dict, self.token_list, self.annotations, 325 | self.html_page.page_id, self.ignored_regions, self.extra_required_attrs) 326 | 327 | 328 | class ExtractionPageParser(InstanceLearningParser): 329 | """Parse an HTML page for extraction using the instance based learning 330 | algorithm 331 | 332 | This needs to extract the tokens in a similar way to LabelledPageParser, 333 | it needs to also maintain a mapping from token index to the original content 334 | so that once regions are identified, the original content can be extracted. 
335 | """ 336 | def __init__(self, token_dict): 337 | InstanceLearningParser.__init__(self, token_dict) 338 | self._page_token_indexes = [] 339 | 340 | def handle_tag(self, html_tag, index): 341 | self._page_token_indexes.append(index) 342 | 343 | def to_extraction_page(self): 344 | return ExtractionPage(self.html_page, self.token_dict, array(self.token_list), 345 | self._page_token_indexes) 346 | -------------------------------------------------------------------------------- /scrapely/extraction/regionextract.py: -------------------------------------------------------------------------------- 1 | """ 2 | Region Extract 3 | 4 | Custom extraction for regions in a document 5 | """ 6 | import re 7 | import operator 8 | import copy 9 | import pprint 10 | import six 11 | 12 | from itertools import groupby, starmap 13 | 14 | from numpy import array 15 | 16 | from six.moves import zip as izip, xrange, StringIO 17 | 18 | from scrapely.descriptor import FieldDescriptor 19 | from scrapely.htmlpage import HtmlPageRegion 20 | from scrapely.extraction.similarity import ( 21 | similar_region, longest_unique_subsequence, common_prefix) 22 | from scrapely.extraction.pageobjects import ( 23 | AnnotationTag, PageRegion, FragmentedHtmlPageRegion) 24 | 25 | _EXTRACT_HTML = lambda x: x 26 | _DEFAULT_DESCRIPTOR = FieldDescriptor('none', None) 27 | 28 | __all__ = ['BasicTypeExtractor', 29 | 'TraceExtractor', 30 | 'RepeatedDataExtractor', 31 | 'AdjacentVariantExtractor', 32 | 'RecordExtractor', 33 | 'TemplatePageExtractor', 34 | 'TextRegionDataExtractor', 35 | 'attrs2dict', 36 | 'labelled_element'] 37 | 38 | 39 | def _int_cmp(a, op, b): 40 | op = getattr(operator, op) 41 | a = -float('inf') if a is None else a 42 | b = -float('inf') if b is None else b 43 | return op(a, b) 44 | 45 | 46 | def labelled_element(obj): 47 | """ 48 | Returns labelled element of the object (extractor or labelled region) 49 | """ 50 | return getattr(obj, 'annotation', obj) 51 | 52 | 53 | def _compose(f, g): 54 | """given unary functions f and g, return a function that computes f(g(x)) 55 | """ 56 | def _exec(x): 57 | ret = g(x) 58 | return f(ret) if ret is not None else None 59 | return _exec 60 | 61 | 62 | class BasicTypeExtractor(object): 63 | """The BasicTypeExtractor extracts single attributes corresponding to 64 | annotations. 65 | 66 | For example: 67 | >>> from scrapely.extraction.pageparsing import parse_strings 68 | >>> template, page = parse_strings( \ 69 | u'

[annotated template markup elided: a single element whose text "x" is annotated as "name"]', u'[target page markup elided: the matching element contains " a name"]
') 70 | >>> ex = BasicTypeExtractor(template.annotations[0]) 71 | >>> ex.extract(page, 0, 1, None) 72 | [(u'name', u' a name')] 73 | 74 | It supports attribute descriptors 75 | >>> descriptor = FieldDescriptor('name', None, lambda x: x.strip()) 76 | >>> ex = BasicTypeExtractor(template.annotations[0], {'name': descriptor}) 77 | >>> ex.extract(page, 0, 1, None) 78 | [(u'name', u'a name')] 79 | 80 | It supports ignoring regions 81 | >>> template, page = parse_strings(\ 82 | u'
[annotated template markup elided: text "x xx", annotated as "name"]',\ 83 | u'[target page markup elided: " a name" followed by " id-9"]
') 84 | >>> ex = BasicTypeExtractor(template.annotations[0]) 85 | >>> ex.extract(page, 0, 3, [PageRegion(1, 2)]) 86 | [(u'name', u'a name')] 87 | """ 88 | 89 | def __init__(self, annotation, attribute_descriptors=None): 90 | self.annotation = annotation 91 | if attribute_descriptors is None: 92 | attribute_descriptors = {} 93 | 94 | if annotation.surrounds_attribute: 95 | descriptor = attribute_descriptors.get(annotation.surrounds_attribute) 96 | if descriptor: 97 | self.content_validate = descriptor.extractor 98 | else: 99 | self.content_validate = _EXTRACT_HTML 100 | self.extract = self._extract_content 101 | 102 | if annotation.tag_attributes: 103 | self.tag_data = [] 104 | for (tag_attr, extraction_attr) in annotation.tag_attributes: 105 | descriptor = attribute_descriptors.get(extraction_attr) 106 | extractf = descriptor.extractor if descriptor else _EXTRACT_HTML 107 | self.tag_data.append((extractf, tag_attr, extraction_attr)) 108 | 109 | self.extract = self._extract_both if \ 110 | annotation.surrounds_attribute else self._extract_attribute 111 | 112 | def _extract_both(self, page, start_index, end_index, ignored_regions=None, **kwargs): 113 | return self._extract_content(page, start_index, end_index, ignored_regions) + \ 114 | self._extract_attribute(page, start_index, end_index, ignored_regions) 115 | 116 | def _extract_content(self, extraction_page, start_index, end_index, ignored_regions=None, **kwargs): 117 | """extract content between annotation indexes""" 118 | if ignored_regions and (_int_cmp(start_index, 'le', ignored_regions[0].start_index) and 119 | _int_cmp(end_index, 'ge', ignored_regions[-1].end_index)): 120 | starts = [start_index] + [i.end_index for i in ignored_regions if i.end_index is not None] 121 | ends = [i.start_index for i in ignored_regions] 122 | if starts[-1] is not None: 123 | ends.append(end_index) 124 | included_regions = izip(starts, ends) 125 | if ends[0] is None: 126 | included_regions.next() 127 | regions = starmap(extraction_page.htmlpage_region_inside, included_regions) 128 | region = FragmentedHtmlPageRegion(extraction_page.htmlpage, list(regions)) 129 | else: 130 | region = extraction_page.htmlpage_region_inside(start_index, end_index) 131 | validated = self.content_validate(region) 132 | return [(self.annotation.surrounds_attribute, validated)] if validated else [] 133 | 134 | def _extract_attribute(self, extraction_page, start_index, end_index, ignored_regions=None, **kwargs): 135 | data = [] 136 | for (f, ta, ea) in self.tag_data: 137 | tag_value = extraction_page.htmlpage_tag(start_index).attributes.get(ta) 138 | if tag_value: 139 | region = HtmlPageRegion(extraction_page.htmlpage, tag_value) 140 | extracted = f(region) 141 | if extracted is not None: 142 | data.append((ea, extracted)) 143 | return data 144 | 145 | @classmethod 146 | def create(cls, annotations, attribute_descriptors=None): 147 | """Create a list of basic extractors from the given annotations 148 | and attribute descriptors 149 | """ 150 | if attribute_descriptors is None: 151 | attribute_descriptors = {} 152 | return [cls._create_basic_extractor(annotation, attribute_descriptors) \ 153 | for annotation in annotations \ 154 | if annotation.surrounds_attribute or annotation.tag_attributes] 155 | 156 | @staticmethod 157 | def _create_basic_extractor(annotation, attribute_descriptors): 158 | """Create a basic type extractor for the annotation""" 159 | text_region = annotation.annotation_text 160 | if text_region is not None: 161 | region_extract = 
TextRegionDataExtractor(text_region.start_text, 162 | text_region.follow_text).extract 163 | # copy attribute_descriptors and add the text extractor 164 | descriptor_copy = dict(attribute_descriptors) 165 | attr_descr = descriptor_copy.get(annotation.surrounds_attribute, 166 | _DEFAULT_DESCRIPTOR) 167 | attr_descr = copy.copy(attr_descr) 168 | attr_descr.extractor = _compose(attr_descr.extractor, region_extract) 169 | descriptor_copy[annotation.surrounds_attribute] = attr_descr 170 | attribute_descriptors = descriptor_copy 171 | return BasicTypeExtractor(annotation, attribute_descriptors) 172 | 173 | def extracted_item(self): 174 | """key used to identify the item extracted""" 175 | return (self.annotation.surrounds_attribute, self.annotation.tag_attributes) 176 | 177 | def __repr__(self): 178 | return str(self) 179 | 180 | def __str__(self): 181 | messages = ['BasicTypeExtractor('] 182 | if self.annotation.surrounds_attribute: 183 | messages.append(self.annotation.surrounds_attribute) 184 | if self.content_validate != _EXTRACT_HTML: 185 | messages += [', extracted with \'', 186 | self.content_validate.__name__, '\''] 187 | 188 | if self.annotation.tag_attributes: 189 | if self.annotation.surrounds_attribute: 190 | messages.append(';') 191 | for (f, ta, ea) in self.tag_data: 192 | messages += [ea, ': tag attribute "', ta, '"'] 193 | if f != _EXTRACT_HTML: 194 | messages += [', validated by ', str(f)] 195 | messages.append(", template[%s:%s])" % \ 196 | (self.annotation.start_index, self.annotation.end_index)) 197 | return ''.join(messages) 198 | 199 | 200 | class RepeatedDataExtractor(object): 201 | """Data extractor for handling repeated data""" 202 | 203 | def __init__(self, prefix, suffix, extractors): 204 | self.prefix = array(prefix) 205 | self.suffix = array(suffix) 206 | self.extractor = copy.copy(extractors[0]) 207 | self.annotation = copy.copy(self.extractor.annotation) 208 | self.annotation.end_index = extractors[-1].annotation.end_index 209 | 210 | def extract(self, page, start_index, end_index, ignored_regions, **kwargs): 211 | """repeatedly find regions bounded by the repeated 212 | prefix and suffix and extract them 213 | """ 214 | prefixlen = len(self.prefix) 215 | suffixlen = len(self.suffix) 216 | index = max(0, start_index - prefixlen) 217 | max_index = min(len(page.page_tokens) - suffixlen, end_index + len(self.suffix)) 218 | max_start_index = max_index - prefixlen 219 | extracted = [] 220 | while index <= max_start_index: 221 | prefix_end = index + prefixlen 222 | if (page.page_tokens[index:prefix_end] == self.prefix).all(): 223 | for peek in xrange(prefix_end, max_index + 1): 224 | if (page.page_tokens[peek:peek + suffixlen] \ 225 | == self.suffix).all(): 226 | extracted += self.extractor.extract(page, 227 | prefix_end - 1, peek, ignored_regions, suffix_max_length=suffixlen) 228 | index = max(peek, index + 1) 229 | break 230 | else: 231 | break 232 | else: 233 | index += 1 234 | return extracted 235 | 236 | @staticmethod 237 | def apply(template, extractors): 238 | tokens = template.page_tokens 239 | output_extractors = [] 240 | group_key = lambda x: (x.extracted_item(), x.annotation.variant_id) 241 | for extr_key, extraction_group in groupby(extractors, group_key): 242 | extraction_group = list(extraction_group) 243 | if extr_key is None or len(extraction_group) == 1: 244 | output_extractors += extraction_group 245 | continue 246 | 247 | separating_tokens = [ \ 248 | tokens[x.annotation.end_index:y.annotation.start_index+1] \ 249 | for (x, y) in 
zip(extraction_group[:-1], extraction_group[1:])] 250 | 251 | # calculate the common prefix 252 | group_start = extraction_group[0].annotation.start_index 253 | prefix_start = max(0, group_start - len(separating_tokens[0])) 254 | first_prefix = tokens[prefix_start:group_start+1] 255 | prefixes = [first_prefix] + separating_tokens 256 | prefix_pattern = list(reversed( 257 | common_prefix(*map(reversed, prefixes)))) 258 | 259 | # calculate the common suffix 260 | group_end = extraction_group[-1].annotation.end_index 261 | last_suffix = tokens[group_end:group_end + \ 262 | len(separating_tokens[-1])] 263 | suffixes = separating_tokens + [last_suffix] 264 | suffix_pattern = common_prefix(*suffixes) 265 | 266 | # create a repeated data extractor, if there is a suitable 267 | # prefix and suffix. (TODO: tune this heuristic) 268 | matchlen = len(prefix_pattern) + len(suffix_pattern) 269 | if matchlen >= len(separating_tokens): 270 | group_extractor = RepeatedDataExtractor(prefix_pattern, 271 | suffix_pattern, extraction_group) 272 | output_extractors.append(group_extractor) 273 | else: 274 | output_extractors += extraction_group 275 | return output_extractors 276 | 277 | def extracted_item(self): 278 | """key used to identify the item extracted""" 279 | return self.extractor.extracted_item() 280 | 281 | def __repr__(self): 282 | return "Repeat(%r)" % self.extractor 283 | 284 | def __str__(self): 285 | return "Repeat(%s)" % self.extractor 286 | 287 | 288 | class TransposedDataExtractor(object): 289 | """ """ 290 | pass 291 | 292 | 293 | _namef = operator.itemgetter(0) 294 | _valuef = operator.itemgetter(1) 295 | def attrs2dict(attributes): 296 | """convert a list of attributes (name, value) tuples 297 | into a dict of lists. 298 | 299 | For example: 300 | >>> l = [('name', 'sofa'), ('colour', 'red'), ('colour', 'green')] 301 | >>> attrs2dict(l) == {'name': ['sofa'], 'colour': ['red', 'green']} 302 | True 303 | """ 304 | grouped_data = groupby(sorted(attributes, key=_namef), _namef) 305 | return dict((name, list(map(_valuef, data))) for (name, data) in grouped_data) 306 | 307 | 308 | class RecordExtractor(object): 309 | """The RecordExtractor will extract records given annotations. 310 | 311 | It looks for a similar region in the target document, using the ibl 312 | similarity algorithm. The annotations are partitioned by the first similar 313 | region found and searched recursively. 314 | 315 | Records are represented as dicts mapping attribute names to lists 316 | containing their values. 317 | 318 | For example: 319 | >>> from scrapely.extraction.pageparsing import parse_strings 320 | >>> template, page = parse_strings( \ 321 | u'

[annotated template markup elided: an element whose text "x" is annotated as "name"]' + \ 322 | u'[a second annotated element whose text "y" is annotated as "description"]', \ 323 | u'[target page markup elided: elements containing "name" and "description"]
') 324 | >>> basic_extractors = list(map(BasicTypeExtractor, template.annotations)) 325 | >>> ex = RecordExtractor.apply(template, basic_extractors)[0] 326 | >>> ex.extract(page) == [{u'description': [u'description'], u'name': [u'name']}] 327 | True 328 | """ 329 | 330 | def __init__(self, extractors, template_tokens): 331 | """Construct a RecordExtractor for the given annotations and their 332 | corresponding region extractors 333 | """ 334 | self.extractors = extractors 335 | self.template_tokens = template_tokens 336 | self.template_ignored_regions = [] 337 | start_index = min(e.annotation.start_index for e in extractors) 338 | end_index = max(e.annotation.end_index for e in extractors) 339 | self.annotation = AnnotationTag(start_index, end_index) 340 | self.best_match = longest_unique_subsequence 341 | 342 | def extract(self, page, start_index=0, end_index=None, ignored_regions=None, **kwargs): 343 | """extract data from an extraction page 344 | 345 | The region in the page to be extracted from may be specified using 346 | start_index and end_index 347 | """ 348 | if ignored_regions is None: 349 | ignored_regions = [] 350 | extractors = sorted(self.extractors + ignored_regions, key=lambda x: labelled_element(x).start_index) 351 | _, _, attributes = self._doextract(page, extractors, start_index, end_index, **kwargs) 352 | # collect variant data, maintaining the order of variants 353 | variant_ids = []; variants = {}; items = [] 354 | for k, v in attributes: 355 | if isinstance(k, six.integer_types): 356 | if k in variants: 357 | variants[k] += v 358 | else: 359 | variant_ids.append(k) 360 | variants[k] = v 361 | else: 362 | items.append((k, v)) 363 | 364 | variant_records = [('variants', attrs2dict(variants[vid])) \ 365 | for vid in variant_ids] 366 | items += variant_records 367 | return [attrs2dict(items)] 368 | 369 | def _doextract(self, page, extractors, start_index, end_index, nested_regions=None, ignored_regions=None, **kwargs): 370 | """Carry out extraction of records using the given annotations 371 | in the page tokens bounded by start_index and end_index 372 | """ 373 | # reorder extractors leaving nested ones for the end and separating 374 | # ignore regions 375 | nested_regions = nested_regions or [] 376 | ignored_regions = ignored_regions or [] 377 | current_extractor, following_extractors = extractors[0], extractors[1:] 378 | while (following_extractors and 379 | _int_cmp(labelled_element(following_extractors[0]).start_index, 'lt', 380 | labelled_element(current_extractor).end_index)): 381 | ex = following_extractors.pop(0) 382 | labelled = labelled_element(ex) 383 | if (isinstance(labelled, AnnotationTag) or 384 | (nested_regions and 385 | _int_cmp(labelled_element(nested_regions[-1]).start_index, 'lt', labelled.start_index) and 386 | _int_cmp(labelled.start_index, 'lt', labelled_element(nested_regions[-1]).end_index))): 387 | nested_regions.append(ex) 388 | else: 389 | ignored_regions.append(ex) 390 | extracted_data = [] 391 | # end_index is inclusive, but similar_region treats it as exclusive 392 | end_index_exclusive = None if end_index is None else end_index + 1 393 | labelled = labelled_element(current_extractor) 394 | score, pindex, sindex = \ 395 | similar_region(page.page_tokens, self.template_tokens, 396 | labelled, start_index, end_index_exclusive, self.best_match, **kwargs) 397 | if score > 0: 398 | if isinstance(labelled, AnnotationTag): 399 | similar_ignored_regions = [] 400 | start = pindex 401 | for i in ignored_regions: 402 | s, p, e = 
similar_region(page.page_tokens, self.template_tokens, 403 | i, start, sindex, self.best_match, **kwargs) 404 | if s > 0: 405 | similar_ignored_regions.append(PageRegion(p, e)) 406 | start = e or start 407 | extracted_data = current_extractor.extract(page, pindex, sindex, similar_ignored_regions, **kwargs) 408 | if extracted_data: 409 | if current_extractor.annotation.variant_id: 410 | extracted_data = [(current_extractor.annotation.variant_id, extracted_data)] 411 | 412 | if nested_regions: 413 | _, _, nested_data = self._doextract(page, nested_regions, pindex, sindex, **kwargs) 414 | extracted_data += nested_data 415 | if following_extractors: 416 | _, _, following_data = self._doextract(page, following_extractors, sindex or start_index, end_index, **kwargs) 417 | extracted_data += following_data 418 | 419 | elif following_extractors: 420 | end_index, _, following_data = self._doextract(page, following_extractors, start_index, end_index, **kwargs) 421 | if end_index is not None: 422 | pindex, sindex, extracted_data = self._doextract(page, [current_extractor], start_index, end_index - 1, nested_regions, ignored_regions, **kwargs) 423 | extracted_data += following_data 424 | elif nested_regions: 425 | _, _, nested_data = self._doextract(page, nested_regions, start_index, end_index, **kwargs) 426 | extracted_data += nested_data 427 | return pindex, sindex, extracted_data 428 | 429 | @classmethod 430 | def apply(cls, template, extractors): 431 | return [cls(extractors, template.page_tokens)] 432 | 433 | def extracted_item(self): 434 | return [self.__class__.__name__] + \ 435 | sorted((e.extracted_item() for e in self.extractors), 436 | key=lambda x: '' if x[0] is None else x[0]) 437 | 438 | def __repr__(self): 439 | return str(self) 440 | 441 | def __str__(self): 442 | stream = StringIO() 443 | pprint.pprint(self.extractors, stream) 444 | stream.seek(0) 445 | template_data = stream.read() 446 | if template_data: 447 | return "%s[\n%s\n]" % (self.__class__.__name__, template_data) 448 | return "%s[none]" % (self.__class__.__name__) 449 | 450 | 451 | class AdjacentVariantExtractor(RecordExtractor): 452 | """Extractor for variants 453 | 454 | This simply extends the RecordExtractor to output data in a "variants" 455 | attribute. 456 | 457 | The "apply" method will only apply to variants whose items are all adjacent and 458 | it will appear as one record so that it can be handled by the RepeatedDataExtractor. 
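For instance, when a template has two adjacent variant annotations, the variant attributes
end up grouped under a single "variants" key of the record (field names and values below
are illustrative):

    {'name': [u'Sofa'],
     'variants': [{'colour': [u'red'], 'price': [u'100.00']},
                  {'colour': [u'green'], 'price': [u'110.00']}]}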
459 | """ 460 | 461 | def extract(self, page, start_index=0, end_index=None, ignored_regions=None, **kwargs): 462 | records = RecordExtractor.extract(self, page, start_index, end_index, ignored_regions, **kwargs) 463 | return [('variants', r['variants'][0]) for r in records if r] 464 | 465 | @classmethod 466 | def apply(cls, template, extractors): 467 | adjacent_variants = set([]) 468 | variantf = lambda x: x.annotation.variant_id 469 | for vid, egroup in groupby(extractors, variantf): 470 | if not vid: 471 | continue 472 | if vid in adjacent_variants: 473 | adjacent_variants.remove(vid) 474 | else: 475 | adjacent_variants.add(vid) 476 | new_extractors = [] 477 | for variant, group_seq in groupby(extractors, variantf): 478 | group_seq = list(group_seq) 479 | if variant in adjacent_variants: 480 | record_extractor = AdjacentVariantExtractor(group_seq, template.page_tokens) 481 | new_extractors.append(record_extractor) 482 | else: 483 | new_extractors += group_seq 484 | return new_extractors 485 | 486 | def __repr__(self): 487 | return str(self) 488 | 489 | 490 | class TraceExtractor(object): 491 | """Extractor that wraps other extractors and prints an execution 492 | trace of the extraction process to aid debugging 493 | """ 494 | 495 | def __init__(self, traced, template): 496 | self.traced = traced 497 | self.annotation = traced.annotation 498 | tstart = traced.annotation.start_index 499 | tend = traced.annotation.end_index 500 | self.tprefix = " ".join([template.token_dict.token_string(t) 501 | for t in template.page_tokens[tstart-4:tstart+1]]) 502 | self.tsuffix = " ".join([template.token_dict.token_string(t) 503 | for t in template.page_tokens[tend:tend+5]]) 504 | 505 | def summarize_trace(self, page, start, end, ret): 506 | text_start = page.htmlpage.parsed_body[page.token_page_indexes[start]].start 507 | text_end = page.htmlpage.parsed_body[page.token_page_indexes[end or -1]].end 508 | page_snippet = "(...%s)%s(%s...)" % ( 509 | page.htmlpage.body[text_start-50:text_start].replace('\n', ' '), 510 | page.htmlpage.body[text_start:text_end], 511 | page.htmlpage.body[text_end:text_end+50].replace('\n', ' ')) 512 | pre_summary = "\nstart %s page[%s:%s]\n" % (self.traced.__class__.__name__, start, end) 513 | post_summary = """ 514 | %s page[%s:%s] 515 | 516 | html 517 | %s 518 | 519 | annotation 520 | ...%s 521 | %s 522 | %s... 
523 | 524 | extracted 525 | %s 526 | """ % (self.traced.__class__.__name__, start, end, page_snippet, 527 | self.tprefix, self.annotation, self.tsuffix, [r for r in ret if 'trace' not in r]) 528 | return pre_summary, post_summary 529 | 530 | def extract(self, page, start, end, ignored_regions, **kwargs): 531 | ret = self.traced.extract(page, start, end, ignored_regions, **kwargs) 532 | if not ret: 533 | return [] 534 | 535 | # handle records by inserting a trace and combining with variant traces 536 | if len(ret) == 1 and isinstance(ret[0], dict): 537 | item = ret[0] 538 | trace = item.pop('trace', []) 539 | variants = item.get('variants', ()) 540 | for variant in variants: 541 | trace += variant.pop('trace', []) 542 | pre_summary, post_summary = self.summarize_trace(page, start, end, ret) 543 | item['trace'] = [pre_summary] + trace + [post_summary] 544 | return ret 545 | 546 | pre_summary, post_summary = self.summarize_trace(page, start, end, ret) 547 | return [('trace', pre_summary)] + ret + [('trace', post_summary)] 548 | 549 | @staticmethod 550 | def apply(template, extractors): 551 | output = [] 552 | for extractor in extractors: 553 | if not isinstance(extractor, TraceExtractor): 554 | extractor = TraceExtractor(extractor, template) 555 | output.append(extractor) 556 | return output 557 | 558 | def extracted_item(self): 559 | return self.traced.extracted_item() 560 | 561 | def __repr__(self): 562 | return "Trace(%s)" % repr(self.traced) 563 | 564 | 565 | class TemplatePageExtractor(object): 566 | """Top level extractor for a template page""" 567 | 568 | def __init__(self, template, extractors): 569 | self.extractors = extractors 570 | self.template = template 571 | 572 | def extract(self, page, start_index=0, end_index=None): 573 | items = [] 574 | for extractor in self.extractors: 575 | items.extend(extractor.extract(page, start_index, end_index, self.template.ignored_regions)) 576 | return [self._merge_list_dicts(items)] 577 | 578 | def _merge_list_dicts(self, dicts): 579 | res = {} 580 | for d in dicts: 581 | res.update(d) 582 | return res 583 | 584 | def __repr__(self): 585 | return repr(self.extractors) 586 | 587 | def __str__(self): 588 | return str(self.extractors) 589 | 590 | 591 | # Based on nltk's WordPunctTokenizer 592 | _tokenize = re.compile(r'\w+|[^\w\s]+', re.UNICODE | re.MULTILINE | re.DOTALL).findall 593 | 594 | class TextRegionDataExtractor(object): 595 | """Data Extractor for extracting text fragments from an annotation page 596 | fragment or string. It extracts based on the longest unique prefix and 597 | suffix. 598 | 599 | for example: 600 | >>> extractor = TextRegionDataExtractor('designed by ', '.') 601 | >>> extractor.extract_text("by Marc Newson.") 602 | 'Marc Newson' 603 | 604 | Both prefix and suffix are optional: 605 | >>> extractor = TextRegionDataExtractor('designed by ') 606 | >>> extractor.extract_text("by Marc Newson.") 607 | 'Marc Newson.' 
608 | >>> extractor = TextRegionDataExtractor(suffix='.') 609 | >>> extractor.extract_text("by Marc Newson.") 610 | 'by Marc Newson' 611 | 612 | It requires a minimum match of at least one word or punctuation character: 613 | >>> extractor = TextRegionDataExtractor('designed by') 614 | >>> extractor.extract_text("y Marc Newson.") is None 615 | True 616 | """ 617 | def __init__(self, prefix=None, suffix=None): 618 | self.prefix = (prefix or '')[::-1] 619 | self.suffix = suffix or '' 620 | self.minprefix = self.minmatch(self.prefix) 621 | self.minsuffix = self.minmatch(self.suffix) 622 | 623 | @staticmethod 624 | def minmatch(matchstring): 625 | """the minimum number of characters that should match in order 626 | to consider it a match for that string. 627 | 628 | This uses the last word of punctuation character 629 | """ 630 | tokens = _tokenize(matchstring or '') 631 | return len(tokens[0]) if tokens else 0 632 | 633 | def extract(self, region): 634 | """Extract a region from the region passed""" 635 | text = self.extract_text(region) 636 | return HtmlPageRegion(region.htmlpage, text) if text else None 637 | 638 | def extract_text(self, text): 639 | """Extract a substring from the text""" 640 | pref_index = 0 641 | if self.minprefix > 0: 642 | rev_idx, plen = longest_unique_subsequence(text[::-1], self.prefix) 643 | if plen is None or plen < self.minprefix: 644 | return None 645 | pref_index = -rev_idx 646 | if self.minsuffix == 0: 647 | return text[pref_index:] 648 | sidx, slen = longest_unique_subsequence(text[pref_index:], self.suffix) 649 | if slen is None or slen < self.minsuffix: 650 | return None 651 | return text[pref_index:pref_index + sidx] 652 | -------------------------------------------------------------------------------- /scrapely/extraction/similarity.py: -------------------------------------------------------------------------------- 1 | """ 2 | Similarity calculation for Instance based extraction algorithm. 3 | """ 4 | from itertools import count 5 | from six.moves import zip as izip, xrange 6 | from operator import itemgetter 7 | from heapq import nlargest 8 | 9 | try: 10 | # For typical use cases (small sequences and patterns) the naive approach 11 | # actually runs faster than KMP algorithm 12 | from . _similarity import naive_match_length 13 | except ImportError: 14 | def naive_match_length(to_search, subsequence, range_start, range_end): 15 | startval = subsequence[0] 16 | return ((i, common_prefix_length(to_search[i:], subsequence)) 17 | for i in xrange(range_start, range_end) 18 | if startval == to_search[i]) 19 | 20 | 21 | def common_prefix_length(a, b): 22 | """Calculate the length of the common prefix in both sequences passed. 
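This helper is the building block of the naive_match_length fallback above: for every
start position whose first token matches, it reports the common prefix length from that
position. A small illustration, assuming the pure-Python fallback is in use:

    list(naive_match_length([6, 3, 2, 4, 3, 2, 5], [2, 4, 3], 0, 7))
    # [(2, 3), (5, 1)]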
23 | 24 | For example, the common prefix in this example is [1, 3] 25 | >>> common_prefix_length([1, 3, 4], [1, 3, 5, 1]) 26 | 2 27 | 28 | If there is no common prefix, 0 is returned 29 | >>> common_prefix_length([1], []) 30 | 0 31 | """ 32 | i = -1 33 | for i, x, y in izip(count(), a, b): 34 | if x != y: 35 | return i 36 | return i + 1 37 | 38 | 39 | def common_prefix(*sequences): 40 | """determine the common prefix of all sequences passed 41 | 42 | For example: 43 | >>> common_prefix('abcdef', 'abc', 'abac') 44 | ['a', 'b'] 45 | """ 46 | prefix = [] 47 | for sample in izip(*sequences): 48 | first = sample[0] 49 | if all(x == first for x in sample[1:]): 50 | prefix.append(first) 51 | else: 52 | break 53 | return prefix 54 | 55 | 56 | def longest_unique_subsequence(to_search, subsequence, range_start=0, 57 | range_end=None): 58 | """Find the longest unique subsequence of items in an array or string. This 59 | searches to_search looking for the longest overlapping 60 | match with subsequence. If the largest match is unique (there is no other 61 | match of equivalent length), the index and length of match is returned. If 62 | there is no match, (None, None) is returned. 63 | 64 | Please see section 3.2 of Extracting Web Data Using Instance-Based 65 | Learning by Yanhong Zhai and Bing Liu 66 | 67 | For example, the longest match occurs at index 2 and has length 3 68 | >>> import numpy as np 69 | >>> to_search = np.array([6, 3, 2, 4, 3, 2, 5]) 70 | >>> longest_unique_subsequence(to_search, np.array([2, 4, 3])) 71 | (2, 3) 72 | 73 | When there are two equally long subsequences, it does not generate a match 74 | >>> longest_unique_subsequence(to_search, np.array([3, 2])) 75 | (None, None) 76 | 77 | range_start and range_end specify a range in which the match must begin 78 | >>> longest_unique_subsequence(to_search, np.array([3, 2]), 3) 79 | (4, 2) 80 | >>> longest_unique_subsequence(to_search, np.array([3, 2]), 0, 2) 81 | (1, 2) 82 | """ 83 | if range_end is None: 84 | range_end = len(to_search) 85 | matches = naive_match_length(to_search, subsequence, range_start, range_end) 86 | best2 = nlargest(2, matches, key=itemgetter(1)) 87 | # if there is a single unique best match, return that 88 | if len(best2) == 1 or len(best2) == 2 and best2[0][1] != best2[1][1]: 89 | return best2[0][0], best2[0][1] 90 | return None, None 91 | 92 | 93 | def first_longest_subsequence(to_search, subsequence, range_start=0, range_end=None): 94 | """Find the first longest subsequence of the items in a list or array. 95 | 96 | range_start and range_end specify a range in which the match must begin. 
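Unlike longest_unique_subsequence above, a tie between equally long matches is not
treated as a failure here; the earliest candidate is preferred. similar_region below
accepts the matching strategy as its best_match argument, so either function can be
plugged in (RecordExtractor defaults to the unique variant). A hedged sketch of swapping
the strategy, with illustrative variable names:

    score, start, end = similar_region(page_tokens, template_tokens, annotation,
                                       best_match=first_longest_subsequence)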
97 | 98 | For example, the longest match occurs at index 2 and has length 3 99 | >>> to_search = [6, 3, 2, 4, 3, 2, 5] 100 | >>> first_longest_subsequence(to_search, [2, 4, 3]) 101 | (2, 3) 102 | 103 | When there are two equally long subsequences, it return the nearest one) 104 | >>> first_longest_subsequence(to_search, [3, 2]) 105 | (1, 2) 106 | 107 | >>> first_longest_subsequence([], [3, 2]) 108 | (None, None) 109 | """ 110 | startval = subsequence[0] 111 | if range_end is None: 112 | range_end = len(to_search) 113 | 114 | # the comparison to startval ensures only matches of length >= 1 and 115 | # reduces the number of calls to the common_length function 116 | matches = [(i, common_prefix_length(to_search[i:], subsequence)) 117 | for i in xrange(range_start, range_end) if startval == to_search[i]] 118 | 119 | if not matches: 120 | return None, None 121 | # secondary sort on position and prefer the smaller one (near) 122 | return max(matches, key=lambda x: (x[1], -x[0])) 123 | 124 | 125 | def similar_region(extracted_tokens, template_tokens, labelled_region, 126 | range_start=0, range_end=None, best_match=longest_unique_subsequence, **kwargs): 127 | """Given a labelled section in a template, identify a similar region 128 | in the extracted tokens. 129 | 130 | The start and end index of the similar region in the extracted tokens 131 | is returned. 132 | 133 | This will return a tuple containing: 134 | (match score, start index, end index) 135 | where match score is the sum of the length of the matching prefix and 136 | suffix. If there is no unique match, (0, None, None) will be returned. 137 | 138 | start_index and end_index specify a range in which the match must begin 139 | """ 140 | data_length = len(extracted_tokens) 141 | if range_end is None: 142 | range_end = data_length 143 | # calculate the prefix score by finding a longest subsequence in 144 | # reverse order 145 | reverse_prefix = template_tokens[labelled_region.start_index::-1] 146 | reverse_tokens = extracted_tokens[::-1] 147 | (rpi, pscore) = best_match(reverse_tokens, reverse_prefix, 148 | data_length - range_end, data_length - range_start) 149 | 150 | # None means nothing extracted. Index 0 means there cannot be a suffix. 151 | if not rpi: 152 | return 0, None, None 153 | 154 | # convert to an index from the start instead of in reverse 155 | prefix_index = len(extracted_tokens) - rpi - 1 156 | 157 | if labelled_region.end_index is None: 158 | return pscore, prefix_index, None 159 | elif kwargs.get("suffix_max_length", None) == 0: 160 | return pscore, prefix_index, range_start + 1 161 | 162 | suffix = template_tokens[labelled_region.end_index:] 163 | 164 | # if it's not a paired tag, use the best match between prefix & suffix 165 | if labelled_region.start_index == labelled_region.end_index: 166 | (match_index, sscore) = best_match(extracted_tokens, 167 | suffix, prefix_index, range_end) 168 | if match_index == prefix_index: 169 | return (pscore + sscore, prefix_index, match_index) 170 | elif pscore > sscore: 171 | return pscore, prefix_index, prefix_index 172 | elif sscore > pscore: 173 | return sscore, match_index, match_index 174 | return 0, None, None 175 | 176 | # calculate the suffix match on the tokens following the prefix. We could 177 | # consider the whole page and require a good match. 
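    # Note: the (pscore + sscore, prefix_index, match_index) tuple built below
    # combines the evidence from both sides of the labelled region.  Callers
    # such as RecordExtractor._doextract only accept the location when the
    # combined score is positive, roughly:
    #
    #     score, pindex, sindex = similar_region(page.page_tokens,
    #         self.template_tokens, labelled, start_index,
    #         end_index_exclusive, self.best_match, **kwargs)
    #     if score > 0:
    #         ...  # extract between pindex and sindex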
178 | (match_index, sscore) = best_match(extracted_tokens, 179 | suffix, prefix_index + 1, range_end) 180 | if match_index is None: 181 | return 0, None, None 182 | return (pscore + sscore, prefix_index, match_index) 183 | -------------------------------------------------------------------------------- /scrapely/extractors.py: -------------------------------------------------------------------------------- 1 | """ 2 | Extractors collection 3 | """ 4 | 5 | import re 6 | 7 | from six.moves.urllib.parse import urlparse, urlunparse 8 | from six import unichr 9 | 10 | from w3lib.html import replace_entities, remove_comments 11 | from w3lib.url import safe_url_string 12 | 13 | from scrapely.htmlpage import HtmlPage, HtmlTag, HtmlTagType 14 | 15 | _NUMERIC_ENTITIES = re.compile("&#([0-9]+)(?:;|\s)", re.U) 16 | _PRICE_NUMBER_RE = re.compile('(?:^|[^a-zA-Z0-9])(\d+(?:\.\d+)?)(?:$|[^a-zA-Z0-9])') 17 | _NUMBER_RE = re.compile('(-?\d+(?:\.\d+)?)') 18 | _DECIMAL_RE = re.compile(r'(-?\d[\d\,\.]*)', re.U | re.M) 19 | 20 | _IMAGES = ( 21 | 'mng', 'pct', 'bmp', 'gif', 'jpg', 'jpeg', 'png', 'pst', 'psp', 'tif', 22 | 'tiff', 'ai', 'drw', 'dxf', 'eps', 'ps', 'svg', 23 | ) 24 | 25 | _IMAGES_TYPES = '|'.join(_IMAGES) 26 | _CSS_IMAGERE = re.compile("background(?:-image)?\s*:\s*url\((.*?)\)", re.I) 27 | _BASE_PATH_RE = "/?(?:[^/]+/)*(?:.+%s)" 28 | _IMAGE_PATH_RE = re.compile(_BASE_PATH_RE % '\.(?:%s)' % _IMAGES_TYPES, re.I) 29 | _GENERIC_PATH_RE = re.compile(_BASE_PATH_RE % '', re.I) 30 | _WS = re.compile("\s+", re.U) 31 | 32 | # tags to keep (only for attributes with markup) 33 | _TAGS_TO_KEEP = frozenset(['br', 'p', 'big', 'em', 'small', 'strong', 'sub', 34 | 'sup', 'ins', 'del', 'code', 'kbd', 'samp', 'tt', 'var', 'pre', 'listing', 35 | 'plaintext', 'abbr', 'acronym', 'address', 'bdo', 'blockquote', 'q', 36 | 'cite', 'dfn', 'table', 'tr', 'th', 'td', 'tbody', 'ul', 'ol', 'li', 'dl', 37 | 'dd', 'dt']) 38 | 39 | # tag names to be replaced by other tag names (overrides tags_to_keep) 40 | _TAGS_TO_REPLACE = { 41 | 'h1': 'strong', 42 | 'h2': 'strong', 43 | 'h3': 'strong', 44 | 'h4': 'strong', 45 | 'h5': 'strong', 46 | 'h6': 'strong', 47 | 'b' : 'strong', 48 | 'i' : 'em', 49 | } 50 | # tags whoose content will be completely removed (recursively) 51 | # (overrides tags_to_keep and tags_to_replace) 52 | _TAGS_TO_PURGE = ('script', 'style', 'img', 'input') 53 | # tags that are automatically closed in HTML4 and HTML5 54 | _VOID_TAGS = frozenset([ 55 | 'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'keygen', 56 | 'link', 'meta', 'param', 'source', 'track', 'wbr' 57 | ]) 58 | 59 | 60 | def htmlregion(text): 61 | """convenience function to make an html region from text. 62 | This is useful for testing 63 | """ 64 | return HtmlPage(body=text).subregion() 65 | 66 | 67 | def notags(region, tag_replace=u' '): 68 | """Removes all html tags""" 69 | fragments = getattr(region, 'parsed_fragments', None) 70 | if fragments is None: 71 | return region 72 | page = region.htmlpage 73 | data = [page.fragment_data(f) for f in fragments if not isinstance(f, HtmlTag)] 74 | return tag_replace.join(data) 75 | 76 | 77 | def text(region): 78 | """Converts HTML to text. There is no attempt at formatting other than 79 | removing excessive whitespace, 80 | 81 | For example: 82 | >>> t = lambda s: text(htmlregion(s)) 83 | >>> t(u'

test

') 84 | u'test' 85 | 86 | Leading and trailing whitespace are removed 87 | >>> t(u'

test

') 88 | u'test' 89 | 90 | Comments are removed 91 | >>> t(u'test me') 92 | u'test me' 93 | 94 | Text between script tags is ignored 95 | >>> t(u"scripts are ignored") 96 | u'scripts are ignored' 97 | 98 | HTML entities are converted to text 99 | >>> t(u"only £42") 100 | u'only \\xa342' 101 | 102 | >>> t(u"

The text

is here

") 103 | u'The text is here' 104 | """ 105 | text = replace_entities(region.text_content, encoding=region.htmlpage.encoding) 106 | return _WS.sub(u' ', text).strip() 107 | 108 | 109 | def safehtml(region, allowed_tags=_TAGS_TO_KEEP, replace_tags=_TAGS_TO_REPLACE, 110 | tags_to_purge=_TAGS_TO_PURGE): 111 | """Creates an HTML subset, using a whitelist of HTML tags. 112 | 113 | The HTML generated is safe for display on a website,without escaping and 114 | should not cause formatting problems. 115 | 116 | Behaviour can be customized through the following keyword arguments: 117 | allowed_tags is a set of tags that are allowed 118 | replace_tags is a mapping of tags to alternative tags to substitute. 119 | tags_to_purge are tags that, if encountered, all content between the 120 | opening and closing tag is removed. 121 | 122 | For example: 123 | >>> t = lambda s, keep=_TAGS_TO_KEEP: safehtml(htmlregion(s), keep) 124 | >>> t(u'test test') 125 | u'test test' 126 | 127 | Some tags, like script, are completely removed 128 | >>> t(u'test') 129 | u'test' 130 | 131 | replace_tags define tags that are converted. By default all headers, bold 132 | and indenting are converted to strong and em. 133 | >>> t(u'

header

test bold indent') 134 | u'header test bold indent' 135 | 136 | tags_to_purge defines the tags that have enclosing content removed: 137 | >>> t(u'

test

') 138 | u'

test

' 139 | 140 | Comments are stripped, but entities are not converted 141 | >>> t(u' only £42') 142 | u'only £42' 143 | 144 | Paired tags are closed 145 | >>> t(u'

test') 146 | u'

test

' 147 | 148 | >>> t(u'

test
test

') 149 | u'

test
test

' 150 | 151 | Include or exclude tags that you want 152 | >>> t(u'Keep and
tags') 153 | u'Keep and tags' 154 | >>> tags = set(list(_TAGS_TO_KEEP)[:] + ['meta', 'hr']) 155 | >>> t(u'Keep and
tags', tags) 156 | u'Keep and
tags
' 157 | 158 | Handle void tags when purged 159 | >>> t(u'Keep content around img tag') 160 | u'Keep content around img tag' 161 | 162 | """ 163 | tagstack = [] 164 | 165 | def _process_tag(tag): 166 | tagstr = replace_tags.get(tag.tag, tag.tag) 167 | if tagstr not in allowed_tags: 168 | return 169 | if tag.tag_type == HtmlTagType.OPEN_TAG: 170 | if tag.tag not in _VOID_TAGS: 171 | tagstack.append(tagstr) 172 | return u"<%s>" % tagstr 173 | elif tag.tag_type == HtmlTagType.CLOSE_TAG: 174 | try: 175 | last = tagstack.pop() 176 | # common case of matching tag 177 | if last == tagstr: 178 | return u"" % last 179 | # output all preceeding tags (if present) 180 | revtags = tagstack[::-1] 181 | tindex = revtags.index(tagstr) 182 | del tagstack[-tindex-1:] 183 | return u"" % (last, u">" % tag.tag 190 | chunks = list(_process_markup(region, lambda text: text, 191 | _process_tag, tags_to_purge)) + ["" % t for t in reversed(tagstack)] 192 | return u''.join(chunks).strip() 193 | 194 | 195 | def _process_markup(region, textf, tagf, tags_to_purge=_TAGS_TO_PURGE): 196 | fragments = getattr(region, 'parsed_fragments', None) 197 | if fragments is None: 198 | yield textf(region) 199 | return 200 | fiter = iter(fragments) 201 | for fragment in fiter: 202 | if isinstance(fragment, HtmlTag): 203 | # skip forward to closing script tags 204 | tag = fragment.tag 205 | if tag in tags_to_purge: 206 | # if opening, keep going until closed 207 | if (fragment.tag_type == HtmlTagType.OPEN_TAG and 208 | tag not in _VOID_TAGS): 209 | for probe in fiter: 210 | if isinstance(probe, HtmlTag) and \ 211 | probe.tag == tag and \ 212 | probe.tag_type == HtmlTagType.CLOSE_TAG: 213 | break 214 | else: 215 | output = tagf(fragment) 216 | if output: 217 | yield output 218 | else: 219 | text = region.htmlpage.fragment_data(fragment) 220 | text = remove_comments(text) 221 | text = textf(text) 222 | if text: 223 | yield text 224 | 225 | 226 | def html(pageregion): 227 | """A page region is already html, so this is the identity function""" 228 | return pageregion 229 | 230 | 231 | def contains_any_numbers(txt): 232 | """text that must contain at least one number 233 | >>> contains_any_numbers('foo') 234 | >>> contains_any_numbers('$67 at 15% discount') 235 | '$67 at 15% discount' 236 | """ 237 | if _NUMBER_RE.search(txt) is not None: 238 | return txt 239 | 240 | 241 | def contains_prices(txt): 242 | """text must contain a number that is not joined to text""" 243 | if _PRICE_NUMBER_RE.findall(txt) is not None: 244 | return txt 245 | 246 | 247 | def contains_numbers(txt, count=1): 248 | """Must contain a certain amount of numbers 249 | 250 | >>> contains_numbers('foo', 2) 251 | >>> contains_numbers('this 1 has 2 numbers', 2) 252 | 'this 1 has 2 numbers' 253 | """ 254 | numbers = _NUMBER_RE.findall(txt) 255 | if len(numbers) == count: 256 | return txt 257 | 258 | 259 | def extract_number(txt): 260 | """Extract a numeric value. 261 | 262 | This will fail if more than one numeric value is present. 
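These extractor callables are what attribute descriptors wrap, so a value for which they
return None is simply dropped during extraction. A minimal sketch, with an illustrative
field name:

    from scrapely.descriptor import FieldDescriptor
    # only keep values that contain exactly one number
    weight = FieldDescriptor('weight', None, extract_number)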
263 | 264 | >>> extract_number(' -45.3') 265 | '-45.3' 266 | >>> extract_number(' +45.3') 267 | '45.3' 268 | >>> extract_number(' 45.3') 269 | '45.3' 270 | >>> extract_number(' 45.3, 7') 271 | 272 | It will handle unescaped entities: 273 | >>> extract_number(u'£129.99') 274 | u'129.99' 275 | """ 276 | txt = _NUMERIC_ENTITIES.sub(lambda m: unichr(int(m.groups()[0])), txt) 277 | numbers = _NUMBER_RE.findall(txt) 278 | if len(numbers) == 1: 279 | return numbers[0] 280 | 281 | 282 | def extract_price(txt): 283 | """ 284 | Extracts numbers making some price format specific assumptions 285 | 286 | >>> extract_price('asdf 234,234.45sdf ') 287 | '234234.45' 288 | >>> extract_price('234,23') 289 | '234.23' 290 | >>> extract_price('234,230') 291 | '234230' 292 | >>> extract_price('asdf 2234 sdf ') 293 | '2234' 294 | >>> extract_price('947') 295 | '947' 296 | >>> extract_price('-200,069,000,006.565456') 297 | '-200069000006.565456' 298 | >>> extract_price('1,000,000') 299 | '1000000' 300 | >>> extract_price('1,000,000.00') 301 | '1000000.00' 302 | >>> extract_price('1,000') 303 | '1000' 304 | >>> extract_price('1000,00') 305 | '1000.00' 306 | >>> extract_price('1,000.00') 307 | '1000.00' 308 | >>> extract_price('500,000.00') 309 | '500000.00' 310 | >>> extract_price('500.000,00') 311 | '500000.00' 312 | >>> extract_price('-500,000.00') 313 | '-500000.00' 314 | >>> extract_price('500 000,00') 315 | '500000.00' 316 | >>> extract_price(u'£129.99') 317 | u'129.99' 318 | >>> extract_price('adsfg') 319 | >>> extract_price('stained, linseed oil finish, clear glas doors') 320 | >>> extract_price('') 321 | """ 322 | txt = _NUMERIC_ENTITIES.sub(lambda m: unichr(int(m.groups()[0])), txt) 323 | txt = txt.replace(' ', '') 324 | m = _DECIMAL_RE.search(txt) 325 | POINT, COMMA = 0, 1 326 | decimal_separator = POINT 327 | 328 | if m: 329 | value = m.group(1) 330 | last_point_idx = value.rfind('.') 331 | last_comma_idx = value.rfind(',') 332 | 333 | # If a number has both separators take the last one 334 | if last_point_idx > 0 and last_comma_idx > 0: 335 | if last_comma_idx > last_point_idx: 336 | decimal_separator = COMMA 337 | # If a number has only commas check the last one 338 | elif last_comma_idx > 0: 339 | first_comma_idx = value.find(',') 340 | if (first_comma_idx == last_comma_idx and 341 | len(value) - last_comma_idx <= 3): 342 | decimal_separator = COMMA 343 | 344 | if decimal_separator == POINT: 345 | value = value.replace(',', '') 346 | else: 347 | value = value.replace('.', '') 348 | return value.replace(',', '.') 349 | 350 | 351 | def url(txt): 352 | """convert text to a url 353 | 354 | this is quite conservative, since relative urls are supported 355 | """ 356 | txt = txt.strip("\t\r\n '\"") 357 | if txt: 358 | return txt 359 | 360 | 361 | def image_url(txt): 362 | """convert text to a url 363 | 364 | this is quite conservative, since relative urls are supported 365 | Example: 366 | 367 | >>> image_url('') 368 | 369 | >>> image_url(' ') 370 | 371 | >>> image_url(' \\n\\n ') 372 | 373 | >>> image_url('foo-bar.jpg') 374 | ['foo-bar.jpg'] 375 | >>> image_url('/images/main_logo12.gif') 376 | ['/images/main_logo12.gif'] 377 | >>> image_url("http://www.image.com/image.jpg") 378 | ['http://www.image.com/image.jpg'] 379 | >>> image_url("http://www.domain.com/path1/path2/path3/image.jpg") 380 | ['http://www.domain.com/path1/path2/path3/image.jpg'] 381 | >>> image_url("/path1/path2/path3/image.jpg") 382 | ['/path1/path2/path3/image.jpg'] 383 | >>> image_url("path1/path2/image.jpg") 384 | 
['path1/path2/image.jpg'] 385 | >>> image_url("background-image : url(http://www.site.com/path1/path2/image.jpg)") 386 | ['http://www.site.com/path1/path2/image.jpg'] 387 | >>> image_url("background-image : url('http://www.site.com/path1/path2/image.jpg')") 388 | ['http://www.site.com/path1/path2/image.jpg'] 389 | >>> image_url('background-image : url("http://www.site.com/path1/path2/image.jpg")') 390 | ['http://www.site.com/path1/path2/image.jpg'] 391 | >>> image_url("background : url(http://www.site.com/path1/path2/image.jpg)") 392 | ['http://www.site.com/path1/path2/image.jpg'] 393 | >>> image_url("background : url('http://www.site.com/path1/path2/image.jpg')") 394 | ['http://www.site.com/path1/path2/image.jpg'] 395 | >>> image_url('background : url("http://www.site.com/path1/path2/image.jpg")') 396 | ['http://www.site.com/path1/path2/image.jpg'] 397 | >>> image_url('/getimage.php?image=totalgardens/outbbq2_400.jpg&type=prod&resizeto=350') 398 | ['/getimage.php?image=totalgardens/outbbq2_400.jpg&type=prod&resizeto=350'] 399 | >>> image_url('http://www.site.com/getimage.php?image=totalgardens/outbbq2_400.jpg&type=prod&resizeto=350') 400 | ['http://www.site.com/getimage.php?image=totalgardens/outbbq2_400.jpg&type=prod&resizeto=350'] 401 | >>> image_url('http://s7d4.scene7.com/is/image/Kohler/jaa03267?hei=425&wid=457&op_usm=2,1,2,1&qlt=80') 402 | ['http://s7d4.scene7.com/is/image/Kohler/jaa03267?hei=425&wid=457&op_usm=2,1,2,1&qlt=80'] 403 | >>> image_url('../image.aspx?thumb=true&boxSize=175&img=Unknoportrait[1].jpg') 404 | ['../image.aspx?thumb=true&boxSize=175&img=Unknoportrait[1].jpg'] 405 | >>> image_url('http://www.sundancecatalog.com/mgen/catalog/test.ms?args=%2245932|MERIDIAN+PENDANT|.jpg%22&is=336,336,0xffffff') 406 | ['http://www.sundancecatalog.com/mgen/catalog/test.ms?args=%2245932|MERIDIAN+PENDANT|.jpg%22&is=336,336,0xffffff'] 407 | >>> image_url('http://www.site.com/image.php') 408 | ['http://www.site.com/image.php'] 409 | >>> image_url('background-image:URL(http://s7d5.scene7.com/is/image/wasserstrom/165133?wid=227&hei=227&defaultImage=noimage_wasserstrom)') 410 | ['http://s7d5.scene7.com/is/image/wasserstrom/165133?wid=227&hei=227&defaultImage=noimage_wasserstrom'] 411 | 412 | """ 413 | imgurl = extract_image_url(txt) 414 | return [safe_url_string(replace_entities(url(imgurl)))] if imgurl else None 415 | 416 | 417 | def extract_image_url(txt): 418 | txt = url(txt) 419 | imgurl = None 420 | if txt: 421 | # check if the text is style content 422 | m = _CSS_IMAGERE.search(txt) 423 | txt = m.groups()[0] if m else txt 424 | parsed = urlparse(txt) 425 | path = None 426 | m = _IMAGE_PATH_RE.search(parsed.path) 427 | if m: 428 | path = m.group() 429 | elif parsed.query: 430 | m = _GENERIC_PATH_RE.search(parsed.path) 431 | if m: 432 | path = m.group() 433 | if path is not None: 434 | parsed = list(parsed) 435 | parsed[2] = path 436 | imgurl = urlunparse(parsed) 437 | if not imgurl: 438 | imgurl = txt 439 | return imgurl 440 | -------------------------------------------------------------------------------- /scrapely/htmlpage.py: -------------------------------------------------------------------------------- 1 | """ 2 | htmlpage 3 | 4 | Container objects for representing html pages and their parts in the IBL 5 | system. This encapsulates page related information and prevents parsing 6 | multiple times. 
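A small usage sketch (the markup below is illustrative):

    from scrapely.htmlpage import HtmlPage, HtmlTag

    page = HtmlPage(body=u'<p class="x">some text</p>')
    for fragment in page.parsed_body:
        # fragments are HtmlTag / HtmlDataFragment objects holding offsets
        # into page.body; fragment_data() recovers the original slice
        chunk = page.fragment_data(fragment)
        if isinstance(fragment, HtmlTag):
            attrs = fragment.attributes  # e.g. {'class': 'x'}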
7 | """ 8 | import hashlib 9 | import six 10 | 11 | from six.moves.urllib.request import urlopen 12 | from copy import deepcopy 13 | from w3lib.encoding import html_to_unicode 14 | try: 15 | from . import _htmlpage 16 | parse_html = _htmlpage.parse_html 17 | HtmlDataFragment = _htmlpage.HtmlDataFragment 18 | HtmlTag = _htmlpage.HtmlTag 19 | HtmlTagType = _htmlpage.HtmlTagType 20 | except ImportError: 21 | import re 22 | from collections import OrderedDict 23 | 24 | class HtmlTagType(object): 25 | OPEN_TAG = 1 26 | CLOSE_TAG = 2 27 | UNPAIRED_TAG = 3 28 | 29 | class HtmlDataFragment(object): 30 | __slots__ = ('start', 'end', 'is_text_content') 31 | 32 | def __init__(self, start, end, is_text_content=False): 33 | self.start = start 34 | self.end = end 35 | self.is_text_content = is_text_content 36 | 37 | def __str__(self): 38 | return "" % ( 39 | self.start, self.end, self.is_text_content) 40 | 41 | def __repr__(self): 42 | return str(self) 43 | 44 | class HtmlTag(HtmlDataFragment): 45 | __slots__ = ('tag_type', 'tag', '_attributes', '_attr_text') 46 | 47 | def __init__(self, tag_type, tag, attr_text, start, end): 48 | HtmlDataFragment.__init__(self, start, end) 49 | self.tag_type = tag_type 50 | self.tag = tag 51 | if isinstance(attr_text, dict): 52 | self._attributes = attr_text 53 | self._attr_text = None 54 | else: # defer loading attributes until necessary 55 | self._attributes = OrderedDict() 56 | self._attr_text = attr_text 57 | 58 | @property 59 | def attributes(self): 60 | if not self._attributes and self._attr_text: 61 | for attr_match in _ATTR_REGEXP.findall(self._attr_text): 62 | name = attr_match[0].lower() 63 | values = [v for v in attr_match[1:] if v] 64 | # According to HTML spec if attribute name is repeated only 65 | # the first one is taken into account 66 | if name not in self._attributes: 67 | self._attributes[name] = values[0] if values else None 68 | return self._attributes 69 | 70 | def __str__(self): 71 | attributes = ', '.join( 72 | sorted(["%s: %s" % (k, repr(v)) 73 | for k, v in self.attributes.items()])) 74 | return "" % ( 75 | self.tag, attributes, self.tag_type, self.start, self.end) 76 | 77 | def __repr__(self): 78 | return str(self) 79 | 80 | _ATTR = ("((?:[^=/<>\s]|/(?!>))+)(?:\s*=(?:\s*\"(.*?)\"|\s*'(.*?)'|" 81 | "([^>\s]+))?)?") 82 | _TAG = "<(\/?)(\w+(?::\w+)?)((?:\s*" + _ATTR + ")+\s*|\s*)(\/?)>?" 83 | _DOCTYPE = r"" 84 | _SCRIPT = "()(.*?)()" 85 | _COMMENT = "( 44 | 45 | 46 | 52 | 53 | 55 | 56 |
57 | 58 | 59 | 62 | 63 | 64 | 67 | 69 | 70 | 71 | 98 | 169 | 177 | 178 | 179 | 182 | 186 | 187 |
60 |

61 | retrosixty

65 |

66 | retrosixty

  68 |
72 |

73 | 74 | Home

75 | 76 | About Us

77 | 78 | Shipping

79 | 80 | Links

81 | 82 | Contact

  83 |

84 | 85 | Furniture

86 | 87 | Lighting

88 | 89 | Technology

90 | 91 | Ceramics

92 | 93 | Art

94 | 95 | Misc. Items

96 | 97 | Contemporary

99 |

100 | 101 | Lighting..

102 |

103 | Please click the thumbnails for larger 104 | images and the back button to return to the Lighting index.

105 | 106 | 107 | 122 | 160 | 161 |
108 |

109 |  

110 | 111 | 112 |

113 | 114 |

115 | 116 |

117 | 118 |

119 |  

120 |  

121 |  

123 |

124 | Designer: 125 | 126 | 127 | Charlotte Perriand    

128 | Manufacturer: 129 | 130 | Philips, Netherlands 131 |  

132 | Description: 133 | 134 | 135 | 136 | A Perriand designed 'infraphil' infrared heat lamp 137 | designed in c1960s. This example is in good vintage 138 | condition with some minor wear as one would expect. 139 | Original Philips sticker intact, although it has some 140 | wear as pictured. 141 |

142 | 143 | As with all electrical items we always 144 | recommend having them tested by a professional prior to 145 | use although it is in full working order. The lamp can 146 | be used as a table lamp, or mounted on the wall - full 147 | adjustable...

148 | Price: £60

149 | Size: 150 | 151 | N/A 152 |     153 |    

154 | Shipping: 155 | 156 | £7 to mainland UK. 157 | Please enquire for other locations.

158 | Ref #: 0642

159 |  

162 | 163 | 164 |

165 | 166 | 167 | << BACK

168 |
170 |

171 | retrosixty

172 |

173 | retrosixty

174 |

175 | retrosixty

176 |
180 |

181 | retrosixty

183 |

184 | Site Layout, Design & 185 | Content Copyright 2006-09 - retrosixty.co.uk

188 |
189 | 190 | -------------------------------------------------------------------------------- /tests/samples/samples_pageparsing_0.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "surrounds_attribute": "name", 4 | "annotation_text": null, 5 | "match_common_prefix": false, 6 | "surrounds_variant": null, 7 | "variant_id": null, 8 | "tag_attributes": [], 9 | "end_index": 133, 10 | "start_index": 132, 11 | "metadata": {} 12 | }, 13 | { 14 | "surrounds_attribute": null, 15 | "annotation_text": null, 16 | "match_common_prefix": false, 17 | "surrounds_variant": null, 18 | "variant_id": null, 19 | "tag_attributes": [ 20 | [ 21 | "src", 22 | "image_urls" 23 | ] 24 | ], 25 | "end_index": 142, 26 | "start_index": 141, 27 | "metadata": {} 28 | }, 29 | { 30 | "surrounds_attribute": null, 31 | "annotation_text": null, 32 | "match_common_prefix": false, 33 | "surrounds_variant": null, 34 | "variant_id": null, 35 | "tag_attributes": [ 36 | [ 37 | "src", 38 | "image_urls" 39 | ] 40 | ], 41 | "end_index": 149, 42 | "start_index": 148, 43 | "metadata": {} 44 | }, 45 | { 46 | "surrounds_attribute": "description", 47 | "annotation_text": null, 48 | "match_common_prefix": false, 49 | "surrounds_variant": null, 50 | "variant_id": null, 51 | "tag_attributes": [], 52 | "end_index": 207, 53 | "start_index": 161, 54 | "metadata": {} 55 | }, 56 | { 57 | "surrounds_attribute": "price", 58 | "annotation_text": null, 59 | "match_common_prefix": false, 60 | "surrounds_variant": null, 61 | "variant_id": null, 62 | "tag_attributes": [], 63 | "end_index": 258, 64 | "start_index": 257, 65 | "metadata": {} 66 | }, 67 | { 68 | "surrounds_attribute": "features", 69 | "annotation_text": null, 70 | "match_common_prefix": false, 71 | "surrounds_variant": null, 72 | "variant_id": null, 73 | "tag_attributes": [], 74 | "end_index": 421, 75 | "start_index": 324, 76 | "metadata": {} 77 | } 78 | ] 79 | -------------------------------------------------------------------------------- /tests/samples/samples_scraper_loadstore_0.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapy/scrapely/31b5881bed01a99be2b65b30b9c81ad65a517eaf/tests/samples/samples_scraper_loadstore_0.html -------------------------------------------------------------------------------- /tests/samples/samples_scraper_loadstore_0.json: -------------------------------------------------------------------------------- 1 | { 2 | "price": "340", 3 | "designer": "Tom Dixon", 4 | "name": "Copper Shade by Tom Dixon" 5 | } 6 | -------------------------------------------------------------------------------- /tests/samples/samples_scraper_loadstore_1.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapy/scrapely/31b5881bed01a99be2b65b30b9c81ad65a517eaf/tests/samples/samples_scraper_loadstore_1.html -------------------------------------------------------------------------------- /tests/samples/samples_scraper_loadstore_1.json: -------------------------------------------------------------------------------- 1 | { 2 | "price": "229.00", 3 | "designer": "Artemide", 4 | "name": "Mesmeri Halo Chrome" 5 | } 6 | -------------------------------------------------------------------------------- /tests/test_htmlpage.py: -------------------------------------------------------------------------------- 1 | """ 2 | htmlpage.py tests 3 | """ 4 | import os 5 | import copy 6 | import json 7 | from 
unittest import TestCase 8 | 9 | from scrapely.htmlpage import ( 10 | parse_html, HtmlTag, HtmlDataFragment, HtmlPage, url_to_page 11 | ) 12 | from .test_htmlpage_data import * 13 | from . import iter_samples 14 | BASE_PATH = os.path.abspath(os.path.dirname(__file__)) 15 | 16 | 17 | def _encode_element(el): 18 | """ 19 | jsonize parse element 20 | """ 21 | if isinstance(el, HtmlTag): 22 | return {"tag": el.tag, "attributes": el.attributes, 23 | "start": el.start, "end": el.end, "tag_type": el.tag_type} 24 | if isinstance(el, HtmlDataFragment): 25 | return {"start": el.start, "end": el.end, "is_text_content": el.is_text_content} 26 | raise TypeError 27 | 28 | 29 | def _decode_element(dct): 30 | """ 31 | dejsonize parse element 32 | """ 33 | if "tag" in dct: 34 | return HtmlTag(dct["tag_type"], dct["tag"], 35 | dct["attributes"], dct["start"], dct["end"]) 36 | if "start" in dct: 37 | return HtmlDataFragment(dct["start"], dct["end"], dct.get("is_text_content", True)) 38 | return dct 39 | 40 | 41 | class TestParseHtml(TestCase): 42 | """Test for parse_html""" 43 | def _test_sample(self, source, expected_parsed, samplecount=None): 44 | parsed = parse_html(source) 45 | count_element = 0 46 | count_expected = 0 47 | for element in parsed: 48 | if type(element) == HtmlTag: 49 | count_element += 1 50 | expected = expected_parsed.pop(0) 51 | if type(expected) == HtmlTag: 52 | count_expected += 1 53 | element_text = source[element.start:element.end] 54 | expected_text = source[expected.start:expected.end] 55 | if element.start != expected.start or element.end != expected.end: 56 | errstring = "[%s,%s] %s != [%s,%s] %s" % (element.start, \ 57 | element.end, element_text, expected.start, \ 58 | expected.end, expected_text) 59 | if samplecount is not None: 60 | errstring += " (sample %d)" % samplecount 61 | assert False, errstring 62 | if type(element) != type(expected): 63 | errstring = "(%s) %s != (%s) %s for text\n%s" % (count_element, \ 64 | repr(type(element)), count_expected, repr(type(expected)), element_text) 65 | if samplecount is not None: 66 | errstring += " (sample %d)" % samplecount 67 | assert False, errstring 68 | if type(element) == HtmlTag: 69 | self.assertEqual(element.tag, expected.tag) 70 | self.assertEqual(element.attributes, expected.attributes) 71 | self.assertEqual(element.tag_type, expected.tag_type) 72 | if type(element) == HtmlDataFragment: 73 | msg = "Got: %s Expected: %s in sample: %d [%d:%d] (%s)" % \ 74 | (element.is_text_content, expected.is_text_content, samplecount, element.start, element.end, repr(element_text)) \ 75 | if samplecount is not None else None 76 | self.assertEqual(element.is_text_content, expected.is_text_content, msg) 77 | 78 | if expected_parsed: 79 | errstring = "Expected %s" % repr(expected_parsed) 80 | if samplecount is not None: 81 | errstring += " (sample %d)" % samplecount 82 | assert False, errstring 83 | 84 | def test_parse(self): 85 | """simple parse_html test""" 86 | parsed = [_decode_element(d) for d in PARSED] 87 | sample = {"source": PAGE, "parsed": parsed} 88 | self._test_sample(PAGE, parsed) 89 | 90 | def test_site_samples(self): 91 | """test parse_html from real cases""" 92 | for i, (source, parsed) in enumerate( 93 | iter_samples('htmlpage', object_hook=_decode_element)): 94 | self._test_sample(source, parsed, i) 95 | 96 | def test_bad(self): 97 | """test parsing of bad html layout""" 98 | parsed = [_decode_element(d) for d in PARSED2] 99 | self._test_sample(PAGE2, parsed) 100 | 101 | def test_comments(self): 102 | """test parsing of 
tags inside comments""" 103 | parsed = [_decode_element(d) for d in PARSED3] 104 | self._test_sample(PAGE3, parsed) 105 | 106 | def test_script_text(self): 107 | """test parsing of tags inside scripts""" 108 | parsed = [_decode_element(d) for d in PARSED4] 109 | self._test_sample(PAGE4, parsed) 110 | 111 | def test_sucessive(self): 112 | """test parsing of successive cleaned elements""" 113 | parsed = [_decode_element(d) for d in PARSED5] 114 | self._test_sample(PAGE5, parsed) 115 | 116 | def test_sucessive2(self): 117 | """test parsing of successive cleaned elements (variant 2)""" 118 | parsed = [_decode_element(d) for d in PARSED6] 119 | self._test_sample(PAGE6, parsed) 120 | 121 | def test_special_cases(self): 122 | """some special cases tests""" 123 | parsed = list(parse_html("<meta http-equiv='Pragma' content='no-cache'/>")) 124 | self.assertEqual(parsed[0].attributes, {'content': 'no-cache', 'http-equiv': 'Pragma'}) 125 | parsed = list(parse_html("<html xmlns='http://www.w3.org/1999/xhtml' xml:lang='en' lang='en'>")) 126 | self.assertEqual(parsed[0].attributes, {'xmlns': 'http://www.w3.org/1999/xhtml', 'xml:lang': 'en', 'lang': 'en'}) 127 | parsed = list(parse_html("<img src='http://images.play.com/banners/SAM550a.jpg' align='left' hspace='5' / >")) 128 | self.assertEqual(parsed[0].attributes, {'src': 'http://images.play.com/banners/SAM550a.jpg', \ 129 | 'align': 'left', 'hspace': '5', '/': None}) 130 | 131 | def test_no_ending_body(self): 132 | """Test case when no ending body nor html elements are present""" 133 | parsed = [_decode_element(d) for d in PARSED7] 134 | self._test_sample(PAGE7, parsed) 135 | 136 | def test_malformed(self): 137 | """Test parsing of some malformed cases""" 138 | parsed = [_decode_element(d) for d in PARSED8] 139 | self._test_sample(PAGE8, parsed) 140 | 141 | def test_malformed2(self): 142 | """Test case when attributes are not separated by space (still recognizable because of quotes)""" 143 | parsed = [_decode_element(d) for d in PARSED9] 144 | self._test_sample(PAGE9, parsed) 145 | 146 | def test_malformed3(self): 147 | """Test case where attributes are repeated (should take first attribute, according to spec)""" 148 | parsed = [_decode_element(d) for d in PARSED10] 149 | self._test_sample(PAGE10, parsed) 150 | 151 | def test_empty_subregion(self): 152 | htmlpage = HtmlPage(body=u"") 153 | self.assertEqual(htmlpage.subregion(), u"") 154 | 155 | def test_ignore_xml_declaration(self): 156 | """Ignore xml declarations inside html""" 157 | parsed = list(parse_html(u"
<p>The text</p><?xml version='1.0'?><p>is here</p>
")) 158 | self.assertFalse(parsed[3].is_text_content) 159 | 160 | def test_copy(self): 161 | """Test copy/deepcopy""" 162 | page = HtmlPage(url='http://www.example.com', body=PAGE) 163 | region = page.subregion(10, 15) 164 | 165 | regioncopy = copy.copy(region) 166 | self.assertEqual(regioncopy.start_index, 10) 167 | self.assertEqual(regioncopy.end_index, 15) 168 | self.assertFalse(region is regioncopy) 169 | self.assertTrue(region.htmlpage is regioncopy.htmlpage) 170 | 171 | regiondeepcopy = copy.deepcopy(region) 172 | self.assertEqual(regiondeepcopy.start_index, 10) 173 | self.assertEqual(regiondeepcopy.end_index, 15) 174 | self.assertFalse(region is regiondeepcopy) 175 | self.assertFalse(region.htmlpage is regiondeepcopy.htmlpage) 176 | 177 | def test_load_page_from_url(self): 178 | filepath = os.path.join(BASE_PATH, 'samples/samples_htmlpage_0') 179 | url = 'file://{}.{}'.format(filepath, 'html') 180 | page = url_to_page(url) 181 | parsed = json.load(open('{}.{}'.format(filepath, 'json'))) 182 | parsed = [_decode_element(d) for d in parsed] 183 | self.assertEqual(page.url, url) 184 | self._test_sample(page.body, parsed, 1) 185 | -------------------------------------------------------------------------------- /tests/test_htmlpage_data.py: -------------------------------------------------------------------------------- 1 | PAGE = u""" 2 | 4 | 5 |