",
40 | items: [],
41 | };
42 |
43 | var itemsJson = {
44 | "default": {
45 | "fields": {
46 | "description": {
47 | "required": false,
48 | "type": "safe html",
49 | "vary": false
50 | },
51 | "images": {
52 | "required": true,
53 | "type": "image",
54 | "vary": true
55 | },
56 | }
57 | }
58 | };
--------------------------------------------------------------------------------
/slybot/slybot/baseurl.py:
--------------------------------------------------------------------------------
1 | """
2 | html page utils
3 | """
4 | import urlparse, re
5 | from scrapely.htmlpage import parse_html, HtmlTagType
6 |
7 | ABSURLRE = re.compile("^https?\:\/\/")
8 | DOCTYPERE = re.compile("<!DOCTYPE.*?>", re.S | re.I)
9 |
10 | def _is_abs_url(url):
11 | return bool(ABSURLRE.match(url))
12 |
13 | def insert_base_url(html, base):
14 | """
15 | Inserts the given base url if it does not exist in the html source,
16 | or replaces the existing one if needed
17 | """
18 | baseurl = baseelement = headelement = htmlelement = None
19 | for element in parse_html(html):
20 | if getattr(element, "tag", None) == "base":
21 | baseurl = element.attributes.get("href", None)
22 | baseelement = element
23 | elif getattr(element, "tag", None) == "head" and \
24 | element.tag_type == HtmlTagType.OPEN_TAG:
25 | headelement = element
26 | elif getattr(element, "tag", None) == "html" and \
27 | element.tag_type == HtmlTagType.OPEN_TAG:
28 | htmlelement = element
29 |
30 | if baseurl:
31 | if not _is_abs_url(baseurl):
32 | absurl = urlparse.urljoin(base, baseurl)
33 | # replace original base tag
34 | basetag = '<base href="%s"/>' % absurl
35 | html = html[:baseelement.start] + basetag + html[baseelement.end:]
36 |
37 | else:
38 | # Generate new base element and include
39 | basetag = '<base href="%s"/>' % base
40 | if headelement:
41 | insertpos = headelement.end
42 | else:
43 | if htmlelement:
44 | basetag = "<head>\n%s\n</head>" % basetag
45 | insertpos = htmlelement.end
46 | else:
47 | doctype_match = DOCTYPERE.search(html)
48 | if doctype_match:
49 | insertpos = doctype_match.end()
50 | else:
51 | insertpos = 0
52 | html = html[:insertpos] + basetag + html[insertpos:]
53 |
54 | return html
55 |
56 | def get_base_url(htmlpage):
57 | """Return the base url of the given HtmlPage"""
58 | for element in htmlpage.parsed_body:
59 | if getattr(element, "tag", None) == "base":
60 | return element.attributes.get("href") or htmlpage.url
61 | return htmlpage.url
62 |
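A minimal usage sketch of the two helpers above (not part of the repository; the sample markup and URLs are invented):

    from scrapely.htmlpage import HtmlPage
    from slybot.baseurl import insert_base_url, get_base_url

    raw = u"<html><head><title>t</title></head><body>hi</body></html>"
    fixed = insert_base_url(raw, "http://example.com/section/")
    # 'fixed' now carries '<base href="http://example.com/section/"/>' just
    # after the opening <head> tag

    page = HtmlPage(url="http://example.com/section/", body=fixed)
    print get_base_url(page)   # -> http://example.com/section/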
--------------------------------------------------------------------------------
/slyd/media/js/messages.js:
--------------------------------------------------------------------------------
1 | ASTool.Messages = Ember.Namespace.create({
2 | // Inline help messages.
3 | overlay_blocked_links: 'Enable this option to highlight links not followed at crawl time in red and followed links in green.',
4 | follow_links: 'Links that match any of the regular expressions in this list will be followed (they should also be in the domain of one of the start pages).',
5 | exclude_links: 'Links that match any of the regular expressions in this list will be excluded.',
6 | perform_login: 'Select this option if the site you are crawling requires login credentials.',
7 | template_required: 'This setting is equivalent to marking the fields as required in the item definition, but limiting the scope to this template only.',
8 | extractors: 'With regular expression extractors, the extracted data is matched against the specified expression and replaced by the match group enclosed between parentheses. If there is no match, the field is not extracted. Type extractors override the type specified in the item definition.',
9 | select_item: 'You can choose what item type is extracted by this template using the combobox. You can also create and modify items by clicking on the Edit Items button.',
10 | variant: 'By selecting a variant other than Base(0) in your annotation, the extracted data will be assigned to the special field variants of the base item, which is a list of objects similar to an item.',
11 | ignored_subregions: 'Allows you to define subregions that should be excluded from the extraction process.',
12 | selected_region_ancestors: 'Refine your selection by navigating its ancestors.',
13 | selected_region_children: 'Refine your selection by navigating its children.',
14 | sticky_fields: 'Required attributes are not extracted, but they must be present for a page to match the template.',
15 | annotation_widget: 'Select the attribute you want to extract and an item field to map it to. Choose -just required- to indicate that the template must match a particular feature without generating any extracted data. You can create new fields by clicking the + field button or by selecting the -create new- option from the field combobox.',
16 |
17 | // Other messages.
18 | confirm_change_selection: 'If you select a different region you will lose all current attribute mappings and ignored subregions, proceed anyway?',
19 | });
20 |
--------------------------------------------------------------------------------
/slyd/media/js/controllers/projects-controller.js:
--------------------------------------------------------------------------------
1 | ASTool.ProjectsIndexController = Em.ArrayController.extend(ASTool.BaseControllerMixin, {
2 | needs: ['application'],
3 |
4 | projectSite: null,
5 |
6 | createProjectDisabled: function() {
7 | return Em.isEmpty(this.get('projectSite'));
8 | }.property('projectSite'),
9 |
10 | actions: {
11 |
12 | openProject: function(projectName) {
13 | this.set('slyd.project', projectName);
14 | this.transitionToRoute('project', { id: projectName });
15 | },
16 |
17 | deleteProject: function(projectName) {
18 | if (confirm('Are you sure you want to delete this project? This operation cannot be undone.')) {
19 | this.get('slyd').deleteProject(projectName);
20 | this.removeObject(projectName);
21 | }
22 | },
23 |
24 | createProject: function() {
25 | var newProjectName = this.getUnusedName('new_project', this.get('content'));
26 | this.get('slyd').createProject(newProjectName).then(function() {
27 | this.set('slyd.project', newProjectName);
28 | // Initialize items spec.
29 | this.get('slyd').saveItems([
30 | ASTool.Item.create({ name: 'default', fields: [ ]
31 | })
32 | ]);
33 | // Initialize extractors spec.
34 | this.get('slyd').saveExtractors([]);
35 | // Setup automatic creation of an initial spider.
36 | this.set('controllers.application.siteWizard', this.get('projectSite'));
37 | this.set('projectSite', null);
38 | this.transitionToRoute('project', { id: newProjectName });
39 | }.bind(this));
40 | }
41 | },
42 |
43 | animateProjectSiteInput: function() {
44 | var animateBorderColor = function () {
45 | $('#projectSiteTextField')
46 | .animate({ 'border-color': 'rgba(88,150,220,0.4)', 'background-color': 'rgba(130,210,230,0.1)' }, 1000)
47 | .animate({ 'border-color': '#BBBBBB', 'background-color': '#FFFFFF' }, 1000, animateBorderColor)
48 | };
49 | Em.run.schedule('afterRender', this, function() {
50 | $('#projectSiteTextField')
51 | .hide()
52 | .show('fast', animateBorderColor)
53 | .click(function(e) {
54 | $('#projectSiteTextField').stop(true)
55 | .css({ 'border-color': '#BBBBBB', 'background-color': '#FFFFFF' });
56 | });
57 | });
58 | },
59 |
60 | willEnter: function() {
61 | this.get('documentView').showSpider();
62 | if (Em.isEmpty(this.get('content'))) {
63 | this.animateProjectSiteInput();
64 | }
65 | }
66 | });
67 |
--------------------------------------------------------------------------------
/slybot/slybot/item.py:
--------------------------------------------------------------------------------
1 | import hashlib
2 | from collections import defaultdict
3 |
4 | from scrapy.item import DictItem, Field
5 | from scrapely.descriptor import ItemDescriptor, FieldDescriptor
6 |
7 | from slybot.fieldtypes import FieldTypeManager
8 |
9 | class SlybotItem(DictItem):
10 | # like DictItem.__setitem__ but doesn't check the field is declared
11 | def __setitem__(self, name, value):
12 | self._values[name] = value
13 | @classmethod
14 | def create_iblitem_class(cls, schema):
15 | class IblItem(cls):
16 | fields = defaultdict(dict)
17 | version_fields = []
18 | for _name, _meta in schema['fields'].items():
19 | fields[_name] = Field(_meta)
20 | if not _meta.get("vary", False):
21 | version_fields.append(_name)
22 | version_fields = sorted(version_fields)
23 | return IblItem
24 |
25 | def create_slybot_item_descriptor(schema):
26 | field_type_manager = FieldTypeManager()
27 | descriptors = []
28 | for pname, pdict in schema['fields'].items():
29 | required = pdict['required']
30 | pclass = field_type_manager.type_processor_class(pdict['type'])
31 | processor = pclass()
32 | descriptor = SlybotFieldDescriptor(pname, pname, processor, required)
33 | descriptors.append(descriptor)
34 | return ItemDescriptor("", "", descriptors)
35 |
36 | class SlybotFieldDescriptor(FieldDescriptor):
37 | """Extends the scrapely field descriptor to use slybot fieldtypes and
38 | to be created from a slybot item schema
39 | """
40 |
41 | def __init__(self, name, description, field_type_processor, required=False):
42 | """Create a new SlybotFieldDescriptor with the given name and description.
43 | The field_type_processor is used for extraction and is publicly available
44 | """
45 | FieldDescriptor.__init__(self, name, description,
46 | field_type_processor.extract, required)
47 | # add an adapt method
48 | self.adapt = field_type_processor.adapt
49 |
50 | def create_item_version(item):
51 | """Item version based on hashlib.sha1 algorithm"""
52 | if not item.version_fields:
53 | return
54 | _hash = hashlib.sha1()
55 | for attrname in item.version_fields:
56 | _hash.update(repr(item.get(attrname)))
57 | return _hash.digest()
58 |
59 |
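A hedged sketch of how a schema shaped like the "default" entry of the items JSON near the top of this dump could be fed to the helpers above (field names and values are invented; it assumes the "safe html" and "image" types are registered in FieldTypeManager, as the items spec suggests):

    from slybot.item import SlybotItem, create_slybot_item_descriptor, \
        create_item_version

    schema = {
        "fields": {
            "description": {"required": False, "type": "safe html", "vary": False},
            "images": {"required": True, "type": "image", "vary": True},
        }
    }

    IblItem = SlybotItem.create_iblitem_class(schema)
    descriptor = create_slybot_item_descriptor(schema)

    item = IblItem()
    item["description"] = u"A sample description"
    item["images"] = [u"http://example.com/a.jpg"]
    # only non-varying fields ("description" here) feed the version hash
    print repr(create_item_version(item))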
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | portia
2 | ======
3 | Visual scraping for Scrapy.
4 |
5 |
6 | Overview
7 | ========
8 |
9 | Portia is a tool for visually scraping web sites without any programming knowledge required. Just annotate web pages with a point-and-click editor to indicate what data you want to extract, and Portia will learn how to scrape similar pages
10 | from the site.
11 |
12 | Portia has a web-based UI served by a [Twisted] server, so you can install it on almost any modern platform.
13 |
14 | Requirements
15 | ============
16 |
17 | * Python 2.7
18 | * Works on Linux, Windows, Mac OS X and BSD
19 | * Supported browsers: Latest versions of Chrome (recommended) or Firefox
20 |
21 |
22 | Repository structure
23 | ====================
24 |
25 | There are two main components in this repository, __slyd__ and __slybot__:
26 |
27 | ### slyd
28 |
29 | The visual editor used to create your scraping projects.
30 |
31 | ### slybot
32 |
33 | The Python web crawler that performs the actual site scraping. It's implemented on top of the [Scrapy] web crawling
34 | framework and the [Scrapely] extraction library. It uses projects created with __slyd__ as input.
35 |
36 |
37 | How to install portia
38 | =============================
39 |
40 | The recommended way to install dependencies is to use __virtualenv__ and then do:
41 |
42 | cd slyd
43 | pip install -r requirements.txt
44 |
45 | As __slybot__ is a __slyd__ dependency, it will also get installed.
46 |
47 | Running portia
48 | ==============
49 |
50 | First, you need to start the UI and create a project. Run __slyd__ using:
51 |
52 | cd slyd
53 | twistd -n slyd
54 |
55 | and point your browser to: `http://localhost:9001/static/main.html`
56 |
57 | Choose the site you want to scrape and create a project. Every project is created with a default spider named after the domain of the site you are scraping. When you are ready, you can run your project with __slybot__ to do the actual crawling/extraction.
58 |
59 | Projects created with __slyd__ can be found at:
60 |
61 | slyd/data/projects
62 |
63 | To run one of those projects use:
64 |
65 | portiacrawl project_path spidername
66 |
67 | Where `spidername` should be one of the project spiders. If you don't remember the name of the spider, just use:
68 |
69 | portiacrawl project_path
70 |
71 | and you will get the list of spiders for that project.
72 |
73 |
74 | [Twisted]: https://twistedmatrix.com
75 | [Scrapely]: https://github.com/scrapy/scrapely
76 | [Scrapy]: http://scrapy.org
77 |
--------------------------------------------------------------------------------
/slybot/slybot/tests/test_schema_validation.py:
--------------------------------------------------------------------------------
1 | import re
2 |
3 | from unittest import TestCase
4 | from os.path import dirname, join
5 |
6 | from slybot.validation.schema import get_schema_validator, \
7 | ValidationError, validate_project_schema
8 | from slybot.utils import open_project_from_dir
9 |
10 | _TEST_PROJECT_DIR = join(dirname(__file__), "data/SampleProject")
11 |
12 | class JsonSchemaTest(TestCase):
13 |
14 | def assertRaisesRegexp(self, eclass, pattern, func, *args):
15 | """assertRaisesRegexp is not provided in python versions below 2.7"""
16 | try:
17 | func(*args)
18 | except eclass, e:
19 | m = re.search(pattern, e.message)
20 | if not m:
21 | raise AssertionError('"%s" does not match "%s"' % (pattern, e.message))
22 | else:
23 | raise AssertionError("%s not raised" % eclass.__name__)
24 |
25 | def test_regex_formatting_wrong(self):
26 | obj = {
27 | "0": {
28 | "regular_expression": "Item: (\d+"
29 | }
30 | }
31 | validator = get_schema_validator("extractors")
32 | self.assertRaisesRegexp(ValidationError, "Invalid regular expression",
33 | validator.validate, obj)
34 |
35 | def test_regex_formatting_ok(self):
36 | obj = {
37 | "0": {
38 | "regular_expression": "Item: (\d+)"
39 | }
40 | }
41 | validator = get_schema_validator("extractors")
42 | self.assertEqual(validator.validate(obj), None)
43 |
44 | def test_valid_url(self):
45 | obj = {
46 | "start_urls": ['http://www.example.com/'],
47 | "links_to_follow": "none",
48 | "respect_nofollow": True,
49 | "templates": [],
50 | }
51 | validator = get_schema_validator("spider")
52 | self.assertEqual(validator.validate(obj), None)
53 |
54 | def test_invalid_url(self):
55 | obj = {
56 | "start_urls": ['www.example.com'],
57 | "links_to_follow": "none",
58 | "respect_nofollow": True,
59 | "templates": [],
60 | }
61 | validator = get_schema_validator("spider")
62 | self.assertRaisesRegexp(ValidationError, "Invalid url:", validator.validate, obj)
63 |
64 | def test_test_project(self):
65 | specs = open_project_from_dir(_TEST_PROJECT_DIR)
66 | self.assertTrue(validate_project_schema(specs))
67 |
68 |
--------------------------------------------------------------------------------
/slyd/tests/test_bot.py:
--------------------------------------------------------------------------------
1 | import json
2 | from os.path import join
3 | from twisted.trial import unittest
4 | from twisted.internet.defer import inlineCallbacks
5 | from twisted.web.server import Site
6 | from twisted.web.static import File
7 | from twisted.internet import reactor
8 | from slyd.bot import create_bot_resource
9 | from .utils import TestSite, test_spec_manager
10 | from .settings import RESOURCE_DIR
11 |
12 |
13 | class BotTest(unittest.TestCase):
14 | def setUp(self):
15 | # configure bot resource
16 | sm = test_spec_manager()
17 | self.bot_resource = create_bot_resource(sm)
18 | self.botsite = TestSite(self.bot_resource)
19 |
20 | # configure fake website to crawl
21 | docroot = join(RESOURCE_DIR, 'docroot')
22 | factory = Site(File(docroot))
23 | self.listen_port = reactor.listenTCP(8997, factory)
24 |
25 |
26 | def _fetch(self, url, **params):
27 | req = dict(params)
28 | req.setdefault('request', {})['url'] = url
29 | request_json = json.dumps(req)
30 | return self.botsite.post('fetch', data=request_json)
31 |
32 | @inlineCallbacks
33 | def test_fetch(self):
34 | # test status code
35 | result = yield self._fetch("http://localhost:8997/notexists")
36 | self.assertEqual(result.responseCode, 200)
37 | status = json.loads(result.value())['response']['status']
38 | self.assertEqual(status, 404)
39 |
40 | # get an existing file
41 | test_url = "http://localhost:8997/test.html"
42 | result = yield self._fetch(test_url)
43 | self.assertEqual(result.responseCode, 200)
44 | value = json.loads(result.value())
45 | # expect 200 response and base href added
46 | self.assertEqual(value['response']['status'], 200)
47 | self.assertIn('<base href=', value['page'])
--------------------------------------------------------------------------------
/slybot/slybot/extractors.py:
--------------------------------------------------------------------------------
12 | >>> extractor = create_regex_extractor("(\d+).*(\.\d+)")
13 | >>> extractor(u"The price of this product is <div>45</div><div>.50</div> pounds")
14 | u'45.50'
15 | """
16 | ereg = re.compile(pattern, re.S)
17 | def _extractor(txt):
18 | m = ereg.search(txt)
19 | if m:
20 | return htmlregion(u"".join(filter(None, m.groups() or m.group())))
21 |
22 | _extractor.__name__ = "Regex: %s" % pattern.encode("utf-8")
23 | return _extractor
24 |
25 | class PipelineExtractor:
26 | def __init__(self, *extractors):
27 | self.extractors = extractors
28 |
29 | def __call__(self, value):
30 | for extractor in self.extractors:
31 | value = extractor(value) if value else value
32 | return value
33 |
34 | @property
35 | def __name__(self):
36 | return repr(self.extractors)
37 |
38 |
39 | def apply_extractors(descriptor, template_extractors, extractors):
40 | field_type_manager = FieldTypeManager()
41 |
42 | for field_name, field_extractors in template_extractors.items():
43 | equeue = []
44 | for eid in field_extractors:
45 | extractor_doc = extractors[eid]
46 | if "regular_expression" in extractor_doc:
47 | equeue.append(create_regex_extractor(extractor_doc["regular_expression"]))
48 | elif "type_extractor" in extractor_doc: # overrides default one
49 | descriptor.attribute_map[field_name] = SlybotFieldDescriptor(field_name,
50 | field_name, field_type_manager.type_processor_class(extractor_doc["type_extractor"])())
51 | if not field_name in descriptor.attribute_map:
52 | # if no type extractor is defined, use the text type by default, as it is by far the most commonly used
53 | descriptor.attribute_map[field_name] = SlybotFieldDescriptor(field_name,
54 | field_name, field_type_manager.type_processor_class("text")())
55 |
56 | if equeue:
57 | equeue.insert(0, descriptor.attribute_map[field_name].extractor)
58 | descriptor.attribute_map[field_name].extractor = PipelineExtractor(*equeue)
59 |
60 |
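A hedged sketch of the kind of pipeline apply_extractors assembles, wired up by hand (the pattern and markup are invented, and the import path follows the file header above):

    from scrapely.extractors import htmlregion
    from slybot.fieldtypes import FieldTypeManager
    from slybot.extractors import create_regex_extractor, PipelineExtractor

    # default extraction for the field: the plain "text" type processor
    text_extract = FieldTypeManager().type_processor_class("text")().extract
    # a regex extractor appended afterwards, as apply_extractors does via equeue
    price_regex = create_regex_extractor(u"(\d+\.\d+)")

    pipeline = PipelineExtractor(text_extract, price_regex)
    region = htmlregion(u"Price: <b>45.50</b> pounds")
    print repr(pipeline(region))   # -> u'45.50'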
--------------------------------------------------------------------------------
/slybot/slybot/fieldtypes/text.py:
--------------------------------------------------------------------------------
1 | """
2 | Text types
3 | """
4 | from scrapely.extractors import text as extract_text, safehtml
5 |
6 | class _BaseTextProcessor(object):
7 | """basic text processor, defines identity functions, some of which
8 | are overridden in subclasses
9 | """
10 | def extract(self, text):
11 | """Matches and extracts any string, as it is"""
12 | return text
13 |
14 | def adapt(self, text, htmlpage):
15 | return text
16 |
17 | class RawFieldTypeProcessor(_BaseTextProcessor):
18 | """Extracts the raw data, without processing. Data is escaped for presentation
19 |
20 | >>> from scrapely.extractors import htmlregion
21 | >>> r = RawFieldTypeProcessor()
22 | >>> html = htmlregion(u'<p>test</p>')
23 | >>> r.extract(html)
24 | u'<p>test</p>'
25 | >>> r.adapt(html, None)
26 | u'<p>test</p>'
27 | """
28 | name = 'raw html'
29 | description = 'raw html as it appears in the page'
30 |
31 | class TextFieldTypeProcessor(_BaseTextProcessor):
32 | """Extracts strings, removing all HTML markup
33 |
34 | >>> from scrapely.extractors import htmlregion
35 | >>> p = TextFieldTypeProcessor()
36 | >>> html = htmlregion(u'<br/>')
43 | >>> p.extract(html)
44 | u''
45 | """
46 | name = 'text'
47 | description = 'extracts text from web pages, cleaning all markup'
48 |
49 | def extract(self, htmlregion):
50 | return extract_text(htmlregion.text_content)
51 |
52 |
53 | class SafeHtmlFieldTypeProcessor(_BaseTextProcessor):
54 | """Extracts strings, with only a safe subset of HTML remaining
55 |
56 | Extraction checks for presence of text content, and adapt transforms the HTML
57 | >>> from scrapely.extractors import htmlregion
58 | >>> p = SafeHtmlFieldTypeProcessor()
59 | >>> html = htmlregion(u'