├── slyd ├── slyd │ ├── __init__.py │ ├── settings.py │ ├── resource.py │ ├── utils.py │ ├── tap.py │ ├── html.py │ └── projects.py ├── tests │ ├── __init__.py │ ├── resources │ │ ├── data │ │ │ └── projects │ │ │ │ └── test │ │ │ │ ├── extractors.json │ │ │ │ ├── project.json │ │ │ │ ├── spiders │ │ │ │ ├── mh2.json │ │ │ │ └── mhvillage.json │ │ │ │ └── items.json │ │ └── docroot │ │ │ └── test.html │ ├── settings.py │ ├── mockserver.py │ ├── test_bot.py │ ├── utils.py │ ├── test_projects.py │ └── test_spec.py ├── media │ ├── portia.png │ ├── minispider.png │ ├── images │ │ ├── info.png │ │ ├── Chevron.gif │ │ ├── IconHome.gif │ │ ├── toolbox_show.png │ │ └── ChevronOverlay.png │ ├── css │ │ ├── images │ │ │ ├── animated-overlay.gif │ │ │ ├── ui-icons_FFF_256x240.png │ │ │ ├── ui-icons_2e83ff_256x240.png │ │ │ ├── ui-icons_333333_256x240.png │ │ │ ├── ui-icons_cd0a0a_256x240.png │ │ │ ├── ui-icons_ffffff_256x240.png │ │ │ ├── ui-bg_flat_0_aaaaaa_40x100.png │ │ │ ├── ui-bg_flat_100_444444_40x100.png │ │ │ ├── ui-bg_flat_100_999999_40x100.png │ │ │ ├── ui-bg_flat_100_aaaaaa_40x100.png │ │ │ ├── ui-bg_flat_100_cccccc_40x100.png │ │ │ ├── ui-bg_flat_100_ffffff_40x100.png │ │ │ ├── ui-bg_glass_55_fbf9ee_1x400.png │ │ │ └── ui-bg_glass_95_fef1ec_1x400.png │ │ └── breadcrumb.css │ ├── js │ │ ├── templates │ │ │ ├── application.handlebars │ │ │ ├── inline-textfield.handlebars │ │ │ ├── annotated-document-view.handlebars │ │ │ ├── topbar-project.handlebars │ │ │ ├── topbar-projects.handlebars │ │ │ ├── navigation.handlebars │ │ │ ├── topbar-extraction.handlebars │ │ │ ├── toolbox.handlebars │ │ │ ├── toolbox-projects.handlebars │ │ │ ├── toolbox-items.handlebars │ │ │ ├── toolbox-project.handlebars │ │ │ ├── item.handlebars │ │ │ ├── annotation-widget.handlebars │ │ │ ├── edit-item.handlebars │ │ │ ├── floating-annotation-widget.handlebars │ │ │ ├── topbar-browse.handlebars │ │ │ ├── toolbox-template.handlebars │ │ │ ├── toolbox-annotation.handlebars │ │ │ └── toolbox-spider.handlebars │ │ ├── controllers │ │ │ ├── application-controller.js │ │ │ ├── controllers.js │ │ │ ├── navigation-controller.js │ │ │ ├── items-controller.js │ │ │ ├── projects-controller.js │ │ │ ├── project-controller.js │ │ │ └── annotation-controller.js │ │ ├── app.js │ │ ├── messages.js │ │ ├── jqplugins.js │ │ ├── emberui.js │ │ └── vendor │ │ │ ├── ic-ajax.js │ │ │ └── loading.js │ ├── start.html │ ├── tests │ │ ├── fixtures.js │ │ └── integration_test_helper.js │ └── main.html ├── themes │ └── classic │ │ └── images │ │ ├── animated-overlay.gif │ │ ├── ui-icons_222222_256x240.png │ │ ├── ui-icons_2e83ff_256x240.png │ │ ├── ui-icons_cd0a0a_256x240.png │ │ ├── ui-icons_eaebda_256x240.png │ │ ├── ui-icons_fe412f_256x240.png │ │ ├── ui-bg_flat_0_111111_40x100.png │ │ ├── ui-bg_flat_0_aaaaaa_40x100.png │ │ ├── ui-bg_glass_55_fbf9ee_1x400.png │ │ ├── ui-bg_glass_95_fef1ec_1x400.png │ │ ├── ui-bg_inset-soft_15_404040_1x100.png │ │ ├── ui-bg_highlight-hard_25_262626_1x100.png │ │ ├── ui-bg_highlight-hard_25_b72724_1x100.png │ │ ├── ui-bg_highlight-hard_45_1e1a1a_1x100.png │ │ └── ui-bg_highlight-soft_75_9e9e9e_1x100.png ├── requirements.txt ├── package.json ├── twisted │ └── plugins │ │ └── slyd_plugin.py ├── .gitignore ├── bin │ └── sh2sly ├── karma.conf.js └── README.md ├── slybot ├── debian │ ├── compat │ ├── pyversions │ ├── copyright │ ├── changelog │ ├── rules │ └── control ├── slybot │ ├── validation │ │ ├── __init__.py │ │ ├── schema.py │ │ └── schemas.json │ ├── __init__.py │ ├── tests │ │ ├── data │ │ │ ├── 
test_params.txt │ │ │ └── SampleProject │ │ │ │ ├── project.json │ │ │ │ ├── extractors.json │ │ │ │ └── spiders │ │ │ │ ├── example3.com.json │ │ │ │ ├── any_allowed_domains.json │ │ │ │ ├── example2.com.json │ │ │ │ ├── allowed_domains.json │ │ │ │ ├── cargurus.json │ │ │ │ ├── pinterest.com.json │ │ │ │ ├── seedsofchange.com.json │ │ │ │ ├── example.com.json │ │ │ │ ├── ebay.json │ │ │ │ ├── ebay3.json │ │ │ │ ├── ebay4.json │ │ │ │ └── ebay2.json │ │ ├── test_fieldtypes.py │ │ ├── test_dupefilter.py │ │ ├── test_schema_validation.py │ │ └── test_baseurl.py │ ├── fieldtypes │ │ ├── point.py │ │ ├── images.py │ │ ├── price.py │ │ ├── url.py │ │ ├── number.py │ │ ├── __init__.py │ │ └── text.py │ ├── settings.py │ ├── dupefilter.py │ ├── linkextractor │ │ ├── regex.py │ │ ├── ecsv.py │ │ ├── __init__.py │ │ ├── xml.py │ │ └── base.py │ ├── utils.py │ ├── spidermanager.py │ ├── closespider.py │ ├── baseurl.py │ ├── item.py │ ├── extractors.py │ ├── generic_form.py │ └── spiderlets.py ├── Makefile.buildbot ├── scrapy.cfg ├── .gitignore ├── requirements.txt ├── bin │ ├── slybot │ ├── makedeb │ └── portiacrawl ├── README.rst ├── setup.py └── docs │ ├── index.rst │ ├── spiderlets.rst │ └── make.bat ├── LICENSE └── README.md /slyd/slyd/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /slyd/tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /slybot/debian/compat: -------------------------------------------------------------------------------- 1 | 7 2 | -------------------------------------------------------------------------------- /slybot/debian/pyversions: -------------------------------------------------------------------------------- 1 | 2.5- 2 | -------------------------------------------------------------------------------- /slybot/slybot/validation/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /slybot/Makefile.buildbot: -------------------------------------------------------------------------------- 1 | build: 2 | bin/makedeb 3 | -------------------------------------------------------------------------------- /slybot/slybot/__init__.py: -------------------------------------------------------------------------------- 1 | __version__ = '0.9' 2 | -------------------------------------------------------------------------------- /slybot/slybot/tests/data/test_params.txt: -------------------------------------------------------------------------------- 1 | Cars 2 | Boats -------------------------------------------------------------------------------- /slyd/tests/resources/data/projects/test/extractors.json: -------------------------------------------------------------------------------- 1 | {} -------------------------------------------------------------------------------- /slybot/debian/copyright: -------------------------------------------------------------------------------- 1 | Copyright (C) 2011-2012 Scrapinghub 2 | -------------------------------------------------------------------------------- /slybot/scrapy.cfg: -------------------------------------------------------------------------------- 1 | [settings] 2 | default = slybot.settings 3 | 
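The scrapy.cfg above only tells Scrapy which settings module to load; the slybot-specific values live in slybot/slybot/settings.py further down. A minimal sketch of how that indirection is normally exercised (illustrative only, not part of the repository; assumes Scrapy is installed and the interpreter is started from the directory containing this scrapy.cfg):

# illustrative sketch -- not a repository file
from scrapy.utils.project import get_project_settings

settings = get_project_settings()  # locates scrapy.cfg and imports slybot.settings
print settings.get('SPIDER_MANAGER_CLASS')  # 'slybot.spidermanager.SlybotSpiderManager'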
-------------------------------------------------------------------------------- /slybot/.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | docs/_build 3 | slybot.egg-info/ 4 | -------------------------------------------------------------------------------- /slybot/requirements.txt: -------------------------------------------------------------------------------- 1 | scrapy 2 | scrapely 3 | loginform 4 | lxml 5 | -------------------------------------------------------------------------------- /slyd/media/portia.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gmx/portia/master/slyd/media/portia.png -------------------------------------------------------------------------------- /slyd/media/minispider.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gmx/portia/master/slyd/media/minispider.png -------------------------------------------------------------------------------- /slyd/media/images/info.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gmx/portia/master/slyd/media/images/info.png -------------------------------------------------------------------------------- /slyd/media/images/Chevron.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gmx/portia/master/slyd/media/images/Chevron.gif -------------------------------------------------------------------------------- /slyd/media/images/IconHome.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gmx/portia/master/slyd/media/images/IconHome.gif -------------------------------------------------------------------------------- /slyd/media/images/toolbox_show.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gmx/portia/master/slyd/media/images/toolbox_show.png -------------------------------------------------------------------------------- /slyd/tests/resources/data/projects/test/project.json: -------------------------------------------------------------------------------- 1 | { 2 | "version": "1308771278", 3 | "name": "demo" 4 | } -------------------------------------------------------------------------------- /slybot/slybot/tests/data/SampleProject/project.json: -------------------------------------------------------------------------------- 1 | { 2 | "version": "1", 3 | "name": "SampleProject" 4 | } 5 | -------------------------------------------------------------------------------- /slyd/media/images/ChevronOverlay.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gmx/portia/master/slyd/media/images/ChevronOverlay.png -------------------------------------------------------------------------------- /slyd/media/css/images/animated-overlay.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gmx/portia/master/slyd/media/css/images/animated-overlay.gif -------------------------------------------------------------------------------- /slyd/media/css/images/ui-icons_FFF_256x240.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/gmx/portia/master/slyd/media/css/images/ui-icons_FFF_256x240.png -------------------------------------------------------------------------------- /slyd/themes/classic/images/animated-overlay.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gmx/portia/master/slyd/themes/classic/images/animated-overlay.gif -------------------------------------------------------------------------------- /slyd/media/css/images/ui-icons_2e83ff_256x240.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gmx/portia/master/slyd/media/css/images/ui-icons_2e83ff_256x240.png -------------------------------------------------------------------------------- /slyd/media/css/images/ui-icons_333333_256x240.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gmx/portia/master/slyd/media/css/images/ui-icons_333333_256x240.png -------------------------------------------------------------------------------- /slyd/media/css/images/ui-icons_cd0a0a_256x240.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gmx/portia/master/slyd/media/css/images/ui-icons_cd0a0a_256x240.png -------------------------------------------------------------------------------- /slyd/media/css/images/ui-icons_ffffff_256x240.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gmx/portia/master/slyd/media/css/images/ui-icons_ffffff_256x240.png -------------------------------------------------------------------------------- /slyd/media/css/images/ui-bg_flat_0_aaaaaa_40x100.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gmx/portia/master/slyd/media/css/images/ui-bg_flat_0_aaaaaa_40x100.png -------------------------------------------------------------------------------- /slybot/slybot/tests/data/SampleProject/extractors.json: -------------------------------------------------------------------------------- 1 | { 2 | "4fad3762688f920d76000000": { 3 | "regular_expression": "(\\d+)" 4 | } 5 | } 6 | -------------------------------------------------------------------------------- /slyd/media/css/images/ui-bg_flat_100_444444_40x100.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gmx/portia/master/slyd/media/css/images/ui-bg_flat_100_444444_40x100.png -------------------------------------------------------------------------------- /slyd/media/css/images/ui-bg_flat_100_999999_40x100.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gmx/portia/master/slyd/media/css/images/ui-bg_flat_100_999999_40x100.png -------------------------------------------------------------------------------- /slyd/media/css/images/ui-bg_flat_100_aaaaaa_40x100.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gmx/portia/master/slyd/media/css/images/ui-bg_flat_100_aaaaaa_40x100.png -------------------------------------------------------------------------------- /slyd/media/css/images/ui-bg_flat_100_cccccc_40x100.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/gmx/portia/master/slyd/media/css/images/ui-bg_flat_100_cccccc_40x100.png -------------------------------------------------------------------------------- /slyd/media/css/images/ui-bg_flat_100_ffffff_40x100.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gmx/portia/master/slyd/media/css/images/ui-bg_flat_100_ffffff_40x100.png -------------------------------------------------------------------------------- /slyd/media/css/images/ui-bg_glass_55_fbf9ee_1x400.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gmx/portia/master/slyd/media/css/images/ui-bg_glass_55_fbf9ee_1x400.png -------------------------------------------------------------------------------- /slyd/media/css/images/ui-bg_glass_95_fef1ec_1x400.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gmx/portia/master/slyd/media/css/images/ui-bg_glass_95_fef1ec_1x400.png -------------------------------------------------------------------------------- /slyd/requirements.txt: -------------------------------------------------------------------------------- 1 | twisted 2 | scrapy 3 | loginform 4 | lxml 5 | jsonschema 6 | -e git://github.com/scrapy/scrapely.git#egg=scrapely 7 | -e ../slybot 8 | -------------------------------------------------------------------------------- /slyd/themes/classic/images/ui-icons_222222_256x240.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gmx/portia/master/slyd/themes/classic/images/ui-icons_222222_256x240.png -------------------------------------------------------------------------------- /slyd/themes/classic/images/ui-icons_2e83ff_256x240.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gmx/portia/master/slyd/themes/classic/images/ui-icons_2e83ff_256x240.png -------------------------------------------------------------------------------- /slyd/themes/classic/images/ui-icons_cd0a0a_256x240.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gmx/portia/master/slyd/themes/classic/images/ui-icons_cd0a0a_256x240.png -------------------------------------------------------------------------------- /slyd/themes/classic/images/ui-icons_eaebda_256x240.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gmx/portia/master/slyd/themes/classic/images/ui-icons_eaebda_256x240.png -------------------------------------------------------------------------------- /slyd/themes/classic/images/ui-icons_fe412f_256x240.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gmx/portia/master/slyd/themes/classic/images/ui-icons_fe412f_256x240.png -------------------------------------------------------------------------------- /slyd/themes/classic/images/ui-bg_flat_0_111111_40x100.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gmx/portia/master/slyd/themes/classic/images/ui-bg_flat_0_111111_40x100.png -------------------------------------------------------------------------------- /slyd/themes/classic/images/ui-bg_flat_0_aaaaaa_40x100.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/gmx/portia/master/slyd/themes/classic/images/ui-bg_flat_0_aaaaaa_40x100.png -------------------------------------------------------------------------------- /slyd/media/js/templates/application.handlebars: -------------------------------------------------------------------------------- 1 | {{outlet topbar}} 2 | {{view ASTool.AnnotatedDocumentView}} 3 |
4 | {{outlet main}} 5 |
6 | -------------------------------------------------------------------------------- /slyd/themes/classic/images/ui-bg_glass_55_fbf9ee_1x400.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gmx/portia/master/slyd/themes/classic/images/ui-bg_glass_55_fbf9ee_1x400.png -------------------------------------------------------------------------------- /slyd/themes/classic/images/ui-bg_glass_95_fef1ec_1x400.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gmx/portia/master/slyd/themes/classic/images/ui-bg_glass_95_fef1ec_1x400.png -------------------------------------------------------------------------------- /slyd/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "dependencies": { 3 | "bower": "*", 4 | "karma-ember-preprocessor": "*", 5 | "karma-qunit": "*", 6 | "karma": "0.10.2" 7 | } 8 | } -------------------------------------------------------------------------------- /slybot/bin/slybot: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import os 3 | os.environ['SCRAPY_SETTINGS_MODULE'] = 'slybot.settings' 4 | 5 | from scrapy.cmdline import execute 6 | execute() 7 | -------------------------------------------------------------------------------- /slyd/themes/classic/images/ui-bg_inset-soft_15_404040_1x100.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gmx/portia/master/slyd/themes/classic/images/ui-bg_inset-soft_15_404040_1x100.png -------------------------------------------------------------------------------- /slyd/themes/classic/images/ui-bg_highlight-hard_25_262626_1x100.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gmx/portia/master/slyd/themes/classic/images/ui-bg_highlight-hard_25_262626_1x100.png -------------------------------------------------------------------------------- /slyd/themes/classic/images/ui-bg_highlight-hard_25_b72724_1x100.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gmx/portia/master/slyd/themes/classic/images/ui-bg_highlight-hard_25_b72724_1x100.png -------------------------------------------------------------------------------- /slyd/themes/classic/images/ui-bg_highlight-hard_45_1e1a1a_1x100.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gmx/portia/master/slyd/themes/classic/images/ui-bg_highlight-hard_45_1e1a1a_1x100.png -------------------------------------------------------------------------------- /slyd/themes/classic/images/ui-bg_highlight-soft_75_9e9e9e_1x100.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gmx/portia/master/slyd/themes/classic/images/ui-bg_highlight-soft_75_9e9e9e_1x100.png -------------------------------------------------------------------------------- /slybot/debian/changelog: -------------------------------------------------------------------------------- 1 | python-slybot (0.9) unstable; urgency=low 2 | 3 | * Initial release. 
4 | 5 | -- Scrapinghub Team Wed, 31 Oct 2012 16:32:13 -0300 6 | -------------------------------------------------------------------------------- /slyd/tests/resources/docroot/test.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | test 4 | 5 | 6 | Simple testing page with a link to Scrapy 7 | 8 | -------------------------------------------------------------------------------- /slyd/media/js/templates/inline-textfield.handlebars: -------------------------------------------------------------------------------- 1 | 2 | {{#if view.isEditing}} 3 | 4 | {{view view.textField value=view.value}} 5 | 6 | {{else}} 7 | {{yield}} 8 | {{/if}} 9 | -------------------------------------------------------------------------------- /slyd/twisted/plugins/slyd_plugin.py: -------------------------------------------------------------------------------- 1 | """Registers 'twistd slyd' command.""" 2 | from twisted.application.service import ServiceMaker 3 | 4 | finger = ServiceMaker( 5 | 'slyd', 'slyd.tap', 'A server for creating scrapely spiders', 'slyd') 6 | -------------------------------------------------------------------------------- /slyd/media/start.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /slybot/bin/makedeb: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | version=$(python setup.py --version)-r$(git log --oneline | wc -l)+$(date +%Y%m%d%H%M)${BUILD_CODE:+~$BUILD_CODE} 4 | debchange -m -D unstable --force-distribution -v $version "Automatic build" 5 | debuild --no-lintian -us -uc -b 6 | -------------------------------------------------------------------------------- /slybot/slybot/tests/data/SampleProject/spiders/example3.com.json: -------------------------------------------------------------------------------- 1 | { 2 | "templates": [], 3 | "start_urls": ["http://www.example.com/index.html"], 4 | "exclude_patterns": [], 5 | "follow_patterns": [], 6 | "links_to_follow": "patterns", 7 | "respect_nofollow": true 8 | } 9 | -------------------------------------------------------------------------------- /slyd/.gitignore: -------------------------------------------------------------------------------- 1 | # python 2 | *.py[cod] 3 | 4 | # editor files 5 | *.orig 6 | *.bak 7 | *.swp 8 | *.project 9 | *.sublime-* 10 | 11 | # twisted 12 | dropin.cache 13 | twistd.log 14 | twistd.pid 15 | _trial_temp* 16 | 17 | # local data files 18 | data/* 19 | 20 | # npm files 21 | node_modules/* 22 | npm-debug.log 23 | -------------------------------------------------------------------------------- /slyd/media/js/controllers/application-controller.js: -------------------------------------------------------------------------------- 1 | ASTool.ApplicationController = Em.Controller.extend(ASTool.BaseControllerMixin, { 2 | 3 | currentPathDidChange: function() { 4 | // Always reset the document view when leaving a route. 
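// ('currentPath' is observed below, so this handler runs on every route transition.)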
5 | this.get('documentView').reset(); 6 | }.observes('currentPath'), 7 | }); 8 | -------------------------------------------------------------------------------- /slybot/slybot/fieldtypes/point.py: -------------------------------------------------------------------------------- 1 | 2 | class GeoPointFieldTypeProcessor(object): 3 | """Renders point with tags""" 4 | 5 | name = 'geopoint' 6 | description = 'geo point' 7 | multivalue = True 8 | 9 | def extract(self, value): 10 | return value 11 | 12 | def adapt(self, value, htmlpage): 13 | return value 14 | 15 | -------------------------------------------------------------------------------- /slyd/tests/resources/data/projects/test/spiders/mh2.json: -------------------------------------------------------------------------------- 1 | { 2 | "exclude_patterns": [], 3 | "follow_patterns": [ 4 | ".+MobileHomePark\\.php\\?key=\\d+" 5 | ], 6 | "links_to_follow": "patterns", 7 | "respect_nofollow": true, 8 | "start_urls": [ 9 | "http://www.mhvillage.com/" 10 | ], 11 | "templates": [] 12 | } -------------------------------------------------------------------------------- /slyd/tests/resources/data/projects/test/spiders/mhvillage.json: -------------------------------------------------------------------------------- 1 | { 2 | "templates": [], 3 | "start_urls": [ 4 | "http://www.mhvillage.com/" 5 | ], 6 | "exclude_patterns": [], 7 | "follow_patterns": [ 8 | ".+MobileHomePark\\.php\\?key=\\d+" 9 | ], 10 | "links_to_follow": "patterns", 11 | "respect_nofollow": true 12 | } -------------------------------------------------------------------------------- /slybot/slybot/tests/data/SampleProject/spiders/any_allowed_domains.json: -------------------------------------------------------------------------------- 1 | { 2 | "templates": [], 3 | "start_urls": [ 4 | "http://www.ebay.com/" 5 | ], 6 | "allowed_domains": [], 7 | "exclude_patterns": [], 8 | "respect_nofollow": true, 9 | "follow_patterns": [], 10 | "scrapes": "default", 11 | "links_to_follow": "none" 12 | } 13 | -------------------------------------------------------------------------------- /slyd/media/js/templates/annotated-document-view.handlebars: -------------------------------------------------------------------------------- 1 |
2 | 3 | 4 |
5 |
6 |
7 |
8 |
9 | -------------------------------------------------------------------------------- /slybot/slybot/fieldtypes/images.py: -------------------------------------------------------------------------------- 1 | """ 2 | Images 3 | """ 4 | from scrapely.extractors import extract_image_url 5 | from slybot.fieldtypes.url import UrlFieldTypeProcessor 6 | 7 | class ImagesFieldTypeProcessor(UrlFieldTypeProcessor): 8 | name = 'image' 9 | description = 'extracts image URLs' 10 | 11 | def extract(self, text): 12 | return extract_image_url(text) 13 | 14 | -------------------------------------------------------------------------------- /slybot/slybot/tests/data/SampleProject/spiders/example2.com.json: -------------------------------------------------------------------------------- 1 | { 2 | "templates": [], 3 | "start_urls": [], 4 | "init_requests": [ 5 | { 6 | "type": "start", 7 | "url": "http://www.example.com/index.html" 8 | } 9 | ], 10 | "exclude_patterns": [], 11 | "follow_patterns": [], 12 | "links_to_follow": "patterns", 13 | "respect_nofollow": true 14 | } 15 | -------------------------------------------------------------------------------- /slybot/slybot/tests/data/SampleProject/spiders/allowed_domains.json: -------------------------------------------------------------------------------- 1 | { 2 | "templates": [], 3 | "start_urls": [ 4 | "http://www.ebay.com/sch/ebayadvsearch/?rt=nc" 5 | ], 6 | "allowed_domains": [ 7 | "www.ebay.com", 8 | "www.yahoo.com" 9 | ], 10 | "exclude_patterns": [], 11 | "respect_nofollow": true, 12 | "follow_patterns": [], 13 | "links_to_follow": "none" 14 | } 15 | -------------------------------------------------------------------------------- /slybot/README.rst: -------------------------------------------------------------------------------- 1 | ============== 2 | Slybot crawler 3 | ============== 4 | 5 | Slybot is a Python web crawler for doing web scraping. It's implemented on top of the 6 | `Scrapy`_ web crawling framework and the `Scrapely`_ extraction library. 7 | 8 | The documentation (including installation and usage) can be found at: 9 | http://slybot.readthedocs.org/ 10 | 11 | .. _Scrapely: https://github.com/scrapy/scrapely 12 | .. _Scrapy: http://scrapy.org 13 | -------------------------------------------------------------------------------- /slyd/tests/settings.py: -------------------------------------------------------------------------------- 1 | """ 2 | Slyd test settings 3 | 4 | Imports slyd settings and adds necessary overrides for test setup 5 | """ 6 | from slyd.settings import * 7 | 8 | LOG_LEVEL = 'DEBUG' 9 | 10 | # testing never makes remote requests. A cache may serve stale content. 
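# (overrides HTTPCACHE_ENABLED = True from slyd/settings.py, star-imported above)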
11 | HTTPCACHE_ENABLED = False 12 | 13 | RESOURCE_DIR = join(dirname(__file__), 'resources') 14 | DATA_DIR = join(RESOURCE_DIR, 'data') 15 | SPEC_DATA_DIR = join(DATA_DIR, 'projects') 16 | -------------------------------------------------------------------------------- /slybot/slybot/settings.py: -------------------------------------------------------------------------------- 1 | SPIDER_MANAGER_CLASS = 'slybot.spidermanager.SlybotSpiderManager' 2 | EXTENSIONS = {'slybot.closespider.SlybotCloseSpider': 1} 3 | ITEM_PIPELINES = ['slybot.dupefilter.DupeFilterPipeline'] 4 | SPIDER_MIDDLEWARES = {'slybot.spiderlets.SpiderletsMiddleware': 999} # as close as possible to spider output 5 | SLYDUPEFILTER_ENABLED = True 6 | PROJECT_DIR = 'slybot-project' 7 | 8 | try: 9 | from local_slybot_settings import * 10 | except ImportError: 11 | pass 12 | -------------------------------------------------------------------------------- /slyd/media/js/templates/topbar-project.handlebars: -------------------------------------------------------------------------------- 1 |
2 | 5 |
6 | {{view ASTool.TextField value=spiderPage width='320px' name='spiderPageTextField' placeholder='Enter page URL' action="addSpider"}} 7 | {{view ASTool.ButtonView action="addSpider" icon="ui-icon-circle-plus" label="New spider" class="blue-button" disabled=createSpiderDisabled}} 8 |
9 |
10 | -------------------------------------------------------------------------------- /slybot/debian/rules: -------------------------------------------------------------------------------- 1 | #!/usr/bin/make -f 2 | # -*- makefile -*- 3 | # Sample debian/rules that uses debhelper. 4 | # This file was originally written by Joey Hess and Craig Small. 5 | # As a special exception, when this file is copied by dh-make into a 6 | # dh-make output file, you may use that output file without restriction. 7 | # This special exception was added by Craig Small in version 0.37 of dh-make. 8 | 9 | # Uncomment this to turn on verbose mode. 10 | #export DH_VERBOSE=1 11 | 12 | %: 13 | dh $@ 14 | -------------------------------------------------------------------------------- /slybot/slybot/fieldtypes/price.py: -------------------------------------------------------------------------------- 1 | """ 2 | Price field types 3 | """ 4 | from scrapely import extractors 5 | 6 | class PriceTypeProcessor(object): 7 | """Extracts price from text""" 8 | name = "price" 9 | description = "extracts a price decimal number in the text passed" 10 | 11 | def extract(self, htmlregion): 12 | return extractors.contains_any_numbers(htmlregion.text_content) 13 | 14 | def adapt(self, text, htmlpage): 15 | return extractors.extract_price(text) 16 | 17 | -------------------------------------------------------------------------------- /slyd/media/js/controllers/controllers.js: -------------------------------------------------------------------------------- 1 | ASTool.BaseControllerMixin = Ember.Mixin.create({ 2 | 3 | openAccordion: function(accordionNumber) { 4 | $( ".accordion" ).accordion("option", "active", accordionNumber); 5 | }, 6 | 7 | getUnusedName: function(baseName, usedNames) { 8 | var i = 1; 9 | var newName = baseName; 10 | while(usedNames.any(function(usedName) { 11 | return usedName == newName 12 | })) { 13 | newName = baseName + '_' + i++; 14 | } 15 | return newName; 16 | }, 17 | }); 18 | -------------------------------------------------------------------------------- /slyd/media/js/templates/topbar-projects.handlebars: -------------------------------------------------------------------------------- 1 |
2 | 5 |
6 | {{view ASTool.TextField value=projectSite width='320px' name='projectSiteTextField' placeholder='Enter site URL' action="createProject" id="projectSiteTextField"}} 7 | {{view ASTool.ButtonView action="createProject" icon="ui-icon-circle-plus" label="Start" class="blue-button" disabled=createProjectDisabled}} 8 |
9 |
10 | -------------------------------------------------------------------------------- /slybot/slybot/tests/data/SampleProject/spiders/cargurus.json: -------------------------------------------------------------------------------- 1 | { 2 | "templates": [ 3 | ], 4 | "start_urls": [ 5 | "http://www.cargurus.com/Cars/sitemap.html" 6 | ], 7 | "exclude_patterns": [ 8 | "-Pictures-", 9 | "-Specs-", 10 | "-Price-", 11 | "_v", 12 | "-Videos-" 13 | ], 14 | "follow_patterns": [ 15 | "-Overview-", 16 | "-Reviews-", 17 | "/rss/" 18 | ], 19 | "links_to_follow": "patterns", 20 | "respect_nofollow": false 21 | } 22 | -------------------------------------------------------------------------------- /slyd/media/js/templates/navigation.handlebars: -------------------------------------------------------------------------------- 1 | 19 | 20 | 21 | -------------------------------------------------------------------------------- /slybot/slybot/tests/data/SampleProject/spiders/pinterest.com.json: -------------------------------------------------------------------------------- 1 | { 2 | "templates": [], 3 | "start_urls": [ 4 | "http://pinterest.com/popular/" 5 | ], 6 | "init_requests": [ 7 | { 8 | "username": "test", 9 | "loginurl": "https://pinterest.com/login/", 10 | "password": "testpass", 11 | "type": "login" 12 | } 13 | ], 14 | "exclude_patterns": [], 15 | "respect_nofollow": true, 16 | "follow_patterns": [], 17 | "links_to_follow": "patterns" 18 | } 19 | -------------------------------------------------------------------------------- /slyd/media/js/templates/topbar-extraction.handlebars: -------------------------------------------------------------------------------- 1 |
2 | 5 |
6 | 7 | {{#view ASTool.LabelWithTooltip title=controller.url}} 8 | {{controller.url}} 9 | {{/view}} 10 | 11 | 12 | {{#if showContinueBrowsing}} 13 | {{view ASTool.ButtonView action="continueBrowsing" label="Continue browsing" class="blue-button"}} 14 | {{/if}} 15 | 16 |
17 |
18 | -------------------------------------------------------------------------------- /slybot/slybot/tests/data/SampleProject/spiders/seedsofchange.com.json: -------------------------------------------------------------------------------- 1 | { 2 | "templates": [], 3 | "start_urls": [ 4 | "http://www.seedsofchange.com/garden_center/browse_category.aspx?id=123" 5 | ], 6 | "exclude_patterns": [ 7 | "/tellafriend.aspx.+" 8 | ], 9 | "follow_patterns": [ 10 | "/garden_center/browse_category.aspx.+", 11 | "/garden_center/detailedCategoryDisplay.aspx.+", 12 | "/garden_center/product_details.aspx.+" 13 | ], 14 | "links_to_follow": "patterns", 15 | "respect_nofollow": true 16 | } 17 | -------------------------------------------------------------------------------- /slyd/media/js/templates/toolbox.handlebars: -------------------------------------------------------------------------------- 1 |
2 |
3 |
4 |
5 | 6 |
7 |
8 | 9 | {{view ASTool.PinToolBoxButton action="editAnnotation" class="textless-button pin-button"}} 10 | 11 |
12 | {{yield}} 13 |
14 |
15 |
16 | -------------------------------------------------------------------------------- /slybot/slybot/tests/data/SampleProject/spiders/example.com.json: -------------------------------------------------------------------------------- 1 | { 2 | "templates": [], 3 | "start_urls": ["http://www.example.com/index.html"], 4 | "init_requests": [ 5 | { 6 | "type": "start", 7 | "url": "http://www.example.com/products.csv", 8 | "link_extractor": { 9 | "type": "column", 10 | "value": 1, 11 | "delimiter": "," 12 | } 13 | } 14 | ], 15 | "exclude_patterns": [], 16 | "follow_patterns": [], 17 | "links_to_follow": "patterns", 18 | "respect_nofollow": true 19 | } 20 | -------------------------------------------------------------------------------- /slyd/media/js/templates/toolbox-projects.handlebars: -------------------------------------------------------------------------------- 1 |
2 |
3 |

Open project

4 |
5 | {{#each}} 6 |
7 | {{view ASTool.ButtonView action="openProject" argument=this label=this minWidth='85%' class="light-button"}} 8 | {{view ASTool.ButtonView action="deleteProject" argument=this icon="ui-icon-trash" class="red-button"}} 9 |
10 | {{else}} 11 |
No projects have been created yet.
12 | {{/each}} 13 |
14 |
-------------------------------------------------------------------------------- /slyd/slyd/settings.py: -------------------------------------------------------------------------------- 1 | """Scrapy settings""" 2 | from os.path import join, dirname 3 | 4 | EXTENSIONS = { 5 | 'scrapy.contrib.logstats.LogStats': None, 6 | 'scrapy.webservice.WebService': None, 7 | 'scrapy.telnet.TelnetConsole': None, 8 | 'scrapy.contrib.throttle.AutoThrottle': None 9 | } 10 | 11 | LOG_LEVEL = 'DEBUG' 12 | 13 | # location of slybot projects - assumes a subdir per project 14 | DATA_DIR = join(dirname(dirname(__file__)), 'data') 15 | SPEC_DATA_DIR = join(DATA_DIR, 'projects') 16 | 17 | 18 | # recommended for development - use scrapy to cache http responses 19 | HTTPCACHE_ENABLED = True 20 | HTTPCACHE_DIR = join(DATA_DIR, 'cache') 21 | -------------------------------------------------------------------------------- /slybot/slybot/fieldtypes/url.py: -------------------------------------------------------------------------------- 1 | from urlparse import urljoin 2 | from scrapy.utils.url import safe_download_url 3 | from scrapy.utils.markup import unquote_markup 4 | from slybot.baseurl import get_base_url 5 | 6 | class UrlFieldTypeProcessor(object): 7 | """Renders URLs as links""" 8 | 9 | name = 'url' 10 | description = 'URL' 11 | limit = 80 12 | 13 | def extract(self, text): 14 | return text 15 | 16 | def adapt(self, text, htmlpage): 17 | text = text.encode(htmlpage.encoding) 18 | joined = urljoin(get_base_url(htmlpage).encode(htmlpage.encoding), text) 19 | return safe_download_url(unquote_markup(joined, encoding=htmlpage.encoding)) 20 | 21 | -------------------------------------------------------------------------------- /slybot/slybot/tests/test_fieldtypes.py: -------------------------------------------------------------------------------- 1 | from unittest import TestCase 2 | from scrapely.htmlpage import HtmlPage 3 | 4 | from slybot.fieldtypes import UrlFieldTypeProcessor 5 | 6 | class FieldTypesUrlEncoding(TestCase): 7 | def test_not_standard_chars_in_url(self): 8 | body = u'' 9 | url = u'fotos/produtos/Mam\xe3e noel.jpg' 10 | htmlpage = HtmlPage(url=u"http://www.example.com/", body=body, encoding='cp1252') 11 | processor = UrlFieldTypeProcessor() 12 | self.assertEqual(processor.adapt(url, htmlpage), u'http://www.example.com/fotos/produtos/Mam%C3%A3e%20noel.jpg') 13 | 14 | -------------------------------------------------------------------------------- /slybot/debian/control: -------------------------------------------------------------------------------- 1 | Source: python-slybot 2 | Section: python 3 | Priority: extra 4 | Maintainer: Scrapinghub Team 5 | Build-Depends: debhelper (>= 7), python (>=2.5) 6 | Standards-Version: 3.8.3 7 | Homepage: https://github.com/scrapy/slybot 8 | 9 | Package: python-slybot 10 | Architecture: all 11 | Depends: ${shlibs:Depends}, ${misc:Depends}, ${python:Depends}, 12 | scrapy-0.18 | scrapy-0.20 | scrapy-0.22 | scrapy-0.23, 13 | python-scrapely, 14 | python-loginform, 15 | python-lxml 16 | Description: A web crawler implemented in Python. 17 | Slybot is a Python web crawler for doing web scraping. It's implemented on top 18 | of the Scrapy web crawling framework and the Scrapely extraction library. 19 | -------------------------------------------------------------------------------- /slyd/media/js/templates/toolbox-items.handlebars: -------------------------------------------------------------------------------- 1 |
2 |

Items

3 |
4 |
5 | {{#each model}} 6 | {{view ASTool.EditItemView item=this}} 7 | {{else}} 8 |

No items have been defined yet.

9 | {{/each}} 10 |
11 |
12 | {{view ASTool.ButtonView action="addItem" icon="ui-icon-circle-plus" label="Item" class="blue-button"}} 13 | {{view ASTool.ButtonView action="saveChanges" icon="ui-icon-circle-triangle-w" label="Save changes" class="blue-button"}} 14 | {{view ASTool.ButtonView action="undoChanges" icon="ui-icon-arrowreturnthick-1-w" label="Discard changes" class="red-button"}} 15 |
16 |
17 |
18 | -------------------------------------------------------------------------------- /slyd/media/js/templates/toolbox-project.handlebars: -------------------------------------------------------------------------------- 1 |
2 |
3 | {{#view ASTool.RenameTextField value=name}} 4 | Project {{name}} 5 | {{/view}} 6 |
7 | 8 |

Spiders

9 |
10 |
11 | {{#each}} 12 |
13 | {{view ASTool.ButtonView action="editSpider" argument=this label=this minWidth="82%" class="light-button"}} 14 | {{view ASTool.ButtonView action="deleteSpider" argument=this icon="ui-icon-trash" class="red-button"}} 15 |
16 | {{else}} 17 |

No spiders for this project.

18 | {{/each}} 19 |
20 |
21 |
22 | 23 | -------------------------------------------------------------------------------- /slyd/tests/mockserver.py: -------------------------------------------------------------------------------- 1 | from twisted.web.server import Site 2 | from twisted.web.resource import Resource 3 | 4 | class Root(Resource): 5 | 6 | def __init__(self): 7 | Resource.__init__(self) 8 | self.putChild("status", Status()) 9 | 10 | def getChild(self, name, request): 11 | return self 12 | 13 | def render(self, request): 14 | return 'Slyd mock HTTP server\n' 15 | 16 | # TODO: make PR for scrapy to share code 17 | if __name__ == "__main__": 18 | root = Root() 19 | factory = Site(root) 20 | httpPort = reactor.listenTCP(8998, factory) 21 | 22 | def print_listening(): 23 | httpHost = httpPort.getHost() 24 | print("Mock server running at http://%s:%d/" % ( 25 | httpHost.host, httpHost.port)) 26 | reactor.callWhenRunning(print_listening) 27 | reactor.run() -------------------------------------------------------------------------------- /slybot/slybot/tests/data/SampleProject/spiders/ebay.json: -------------------------------------------------------------------------------- 1 | { 2 | "templates": [], 3 | "start_urls": [ 4 | "http://http://www.ebay.com/sch/ebayadvsearch/?rt=nc" 5 | ], 6 | "init_requests": [ 7 | { 8 | "type": "form", 9 | "form_url": "http://http://www.ebay.com/sch/ebayadvsearch/?rt=nc", 10 | "xpath": "//form[@name='adv_search_from']", 11 | "fields": [ 12 | { 13 | "xpath": ".//*[@name='_nkw']", 14 | "type": "constants", 15 | "value": ["Cars"] 16 | }, 17 | { 18 | "xpath": ".//*[@name='_in_kw']", 19 | "type": "iterate" 20 | } 21 | ] 22 | } 23 | ], 24 | "exclude_patterns": [], 25 | "respect_nofollow": true, 26 | "follow_patterns": [], 27 | "links_to_follow": "none" 28 | } 29 | -------------------------------------------------------------------------------- /slyd/media/js/templates/item.handlebars: -------------------------------------------------------------------------------- 1 |
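{{!-- Field chooser: lists the fields of the current item so one can be picked via the "fieldSelected" action below. --}}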
2 |
Choose an item field
3 |
4 |

Item {{name}}

5 | {{#if fields}} 6 |
7 | 8 | 9 | {{#each fields}} 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | {{/each}} 18 |
Field Type Required Vary
{{name}}
{{type}}
{{#if required}}yes{{else}}no{{/if}}
{{#if vary}}yes{{else}}no{{/if}}
{{view ASTool.ButtonView icon="ui-icon-circle-check" action="fieldSelected" argument=name class="blue-button"}}
19 | {{else}} 20 |
The item has no fields.
21 | {{/if}} 22 |
23 |
-------------------------------------------------------------------------------- /slybot/slybot/tests/data/SampleProject/spiders/ebay3.json: -------------------------------------------------------------------------------- 1 | { 2 | "templates": [], 3 | "start_urls": [ 4 | "http://www.ebay.com/sch/ebayadvsearch/?rt=nc" 5 | ], 6 | "init_requests": [ 7 | { 8 | "type": "form", 9 | "form_url": "http://www.ebay.com/sch/ebayadvsearch/?rt=nc", 10 | "xpath": "//form[@name='adv_search_from']", 11 | "fields": [ 12 | { 13 | "xpath": ".//*[@name='_nkw']", 14 | "type": "constants", 15 | "value": ["{search_string}"] 16 | }, 17 | { 18 | "xpath": ".//*[@name='_in_kw']", 19 | "type": "iterate" 20 | } 21 | ] 22 | } 23 | ], 24 | "exclude_patterns": [], 25 | "respect_nofollow": true, 26 | "follow_patterns": [], 27 | "scrapes": "default", 28 | "links_to_follow": "none" 29 | } 30 | -------------------------------------------------------------------------------- /slybot/slybot/tests/data/SampleProject/spiders/ebay4.json: -------------------------------------------------------------------------------- 1 | { 2 | "templates": [], 3 | "start_urls": [ 4 | "http://www.ebay.com/sch/ebayadvsearch/?rt=nc" 5 | ], 6 | "init_requests": [ 7 | { 8 | "type": "form", 9 | "form_url": "http://www.ebay.com/sch/ebayadvsearch/?rt=nc", 10 | "xpath": "//form[@name='adv_search_from']", 11 | "fields": [ 12 | { 13 | "xpath": ".//*[@name='_nkw']", 14 | "type": "constants", 15 | "value": ["{search_string}"] 16 | }, 17 | { 18 | "xpath": ".//*[@name='_in_kw']", 19 | "type": "iterate" 20 | } 21 | ] 22 | } 23 | ], 24 | "exclude_patterns": [], 25 | "respect_nofollow": true, 26 | "follow_patterns": [], 27 | "scrapes": "default", 28 | "links_to_follow": "none" 29 | } 30 | -------------------------------------------------------------------------------- /slybot/setup.py: -------------------------------------------------------------------------------- 1 | from slybot import __version__ 2 | try: 3 | from setuptools import setup 4 | except ImportError: 5 | from distutils.core import setup 6 | 7 | install_requires = ['Scrapy', 'scrapely', 'loginform', 'lxml'] 8 | tests_requires = ['jsonschema'] + install_requires 9 | 10 | setup(name='slybot', 11 | version=__version__, 12 | license='BSD', 13 | description='Slybot crawler', 14 | author='Scrapy project', 15 | author_email='info@scrapy.org', 16 | url='http://github.com/scrapy/slybot', 17 | packages=['slybot', 'slybot.fieldtypes', 'slybot.tests', 'slybot.linkextractor'], 18 | platforms = ['Any'], 19 | scripts = ['bin/slybot', 'bin/portiacrawl'], 20 | install_requires = install_requires, 21 | tests_requires = tests_requires, 22 | classifiers = ['Development Status :: 4 - Beta', 23 | 'License :: OSI Approved :: BSD License', 24 | 'Operating System :: OS Independent', 25 | 'Programming Language :: Python'] 26 | ) 27 | -------------------------------------------------------------------------------- /slybot/slybot/fieldtypes/number.py: -------------------------------------------------------------------------------- 1 | """ 2 | Numeric data extraction 3 | """ 4 | 5 | from scrapely.extractors import contains_any_numbers, extract_number 6 | 7 | class NumberTypeProcessor(object): 8 | """NumberTypeProcessor 9 | 10 | Extracts a number from text 11 | 12 | >>> from scrapely.extractors import htmlregion 13 | >>> n = NumberTypeProcessor() 14 | >>> n.extract(htmlregion(u"there are no numbers here")) 15 | >>> n.extract(htmlregion(u"foo 34")) 16 | u'foo 34' 17 | >>> n.adapt(u"foo 34", None) 18 | u'34' 19 | 20 | If more than 
one number is present, nothing is extracted 21 | >>> n.adapt(u"34 48", None) is None 22 | True 23 | """ 24 | name = 'number' 25 | description = 'extracts a single number in the text passed' 26 | 27 | def extract(self, htmlregion): 28 | """Only matches and extracts strings with at least one number""" 29 | return contains_any_numbers(htmlregion.text_content) 30 | 31 | def adapt(self, text, htmlpage): 32 | return extract_number(text) 33 | 34 | -------------------------------------------------------------------------------- /slybot/slybot/tests/data/SampleProject/spiders/ebay2.json: -------------------------------------------------------------------------------- 1 | { 2 | "templates": [], 3 | "start_urls": [ 4 | "http://http://www.ebay.com/sch/ebayadvsearch/?rt=nc" 5 | ], 6 | "init_requests": [ 7 | { 8 | "type": "form", 9 | "form_url": "http://http://www.ebay.com/sch/ebayadvsearch/?rt=nc", 10 | "xpath": "//form[@name='adv_search_from']", 11 | "fields": [ 12 | { 13 | "xpath": ".//*[@name='_nkw']", 14 | "type": "inurl", 15 | "value": "file://tmp/test_params.txt" 16 | }, 17 | { 18 | "name": "_nkw2", 19 | "type": "inurl", 20 | "value": "file://tmp/test_params.txt" 21 | }, 22 | { 23 | "xpath": ".//*[@name='_in_kw']", 24 | "type": "iterate" 25 | } 26 | ] 27 | } 28 | ], 29 | "exclude_patterns": [], 30 | "respect_nofollow": true, 31 | "follow_patterns": [], 32 | "scrapes": "default", 33 | "links_to_follow": "none" 34 | } 35 | -------------------------------------------------------------------------------- /slybot/slybot/dupefilter.py: -------------------------------------------------------------------------------- 1 | """ 2 | Duplicates filter middleware for autoscraping 3 | """ 4 | from scrapy.exceptions import NotConfigured 5 | from scrapy.exceptions import DropItem 6 | 7 | from slybot.item import create_item_version 8 | 9 | class DupeFilterPipeline(object): 10 | def __init__(self, settings): 11 | if not settings.getbool('SLYDUPEFILTER_ENABLED'): 12 | raise NotConfigured 13 | self._itemversion_cache = {} 14 | 15 | @classmethod 16 | def from_crawler(cls, crawler): 17 | return cls(crawler.settings) 18 | 19 | def process_item(self, item, spider): 20 | """Checks whether a scrapy item is a dupe, based on version (not vary) 21 | fields of the item class""" 22 | if not hasattr(item, 'version_fields'): 23 | return item 24 | version = create_item_version(item) 25 | if version in self._itemversion_cache: 26 | old_url = self._itemversion_cache[version] 27 | raise DropItem("Duplicate product scraped at <%s>, first one was scraped at <%s>" % (item["url"], old_url)) 28 | self._itemversion_cache[version] = item["url"] 29 | return item 30 | 31 | -------------------------------------------------------------------------------- /slybot/slybot/linkextractor/regex.py: -------------------------------------------------------------------------------- 1 | import re 2 | from scrapy.link import Link 3 | 4 | from .base import BaseLinkExtractor 5 | 6 | # Based on http://blog.mattheworiordan.com/post/13174566389/url-regular-expression-for-links-with-or-without-the 7 | # leaves aside the fragment part, not needed for link extraction 8 | URL_DEFAULT_REGEX = r'(?:[A-Za-z0-9.\-]+|(?:www.|[-;:&=\+\$,\w]+@)[A-Za-z0-9.\-]+)(?:(?:\/[\+~%\/.\w\-_]*)?\??(?:[\-\+=&;%@.\w_]*)(?:#[.\!\/\w]*)?)?' 
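# (This covers only the host/path/query portion of a URL; RegexLinkExtractor below
#  prefixes it with '(?:scheme1|scheme2)://' built from its allowed_schemes.)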
9 | 10 | class RegexLinkExtractor(BaseLinkExtractor): 11 | def __init__(self, regex=None, **kwargs): 12 | super(RegexLinkExtractor, self).__init__(**kwargs) 13 | self.allowed_schemes = filter(lambda x: x and isinstance(x, basestring), self.allowed_schemes) 14 | regex = regex or '(?:%s)://%s' % ('|'.join(self.allowed_schemes), URL_DEFAULT_REGEX) 15 | self.regex = re.compile(regex) 16 | 17 | def _extract_links(self, response): 18 | """First extract regex groups(). If empty, extracts from regex group()""" 19 | for s in self.regex.finditer(response.body): 20 | if s.groups(): 21 | for url in s.groups(): 22 | yield Link(url) 23 | else: 24 | yield Link(s.group()) 25 | 26 | 27 | -------------------------------------------------------------------------------- /slybot/slybot/linkextractor/ecsv.py: -------------------------------------------------------------------------------- 1 | import csv 2 | from cStringIO import StringIO 3 | 4 | from scrapy.link import Link 5 | 6 | from .base import BaseLinkExtractor 7 | 8 | # see http://docs.python.org/2/library/csv.html#csv-fmt-params 9 | _FORMAT_PARAMETERS = ( 10 | ('delimiter', ','), 11 | ('quotechar', '"'), 12 | ('doublequote', True), 13 | ('escapechar', None), 14 | ('lineterminator', '\r\n'), 15 | ('skipinitialspace', False), 16 | ('strict', False), 17 | ) 18 | 19 | class CsvLinkExtractor(BaseLinkExtractor): 20 | def __init__(self, column=0, **kwargs): 21 | self.fmtparams = dict((key, kwargs.pop(key, default)) for key, default in _FORMAT_PARAMETERS) 22 | for key, val in self.fmtparams.items(): 23 | if isinstance(val, unicode): 24 | self.fmtparams[key] = val.encode() 25 | super(CsvLinkExtractor, self).__init__(**kwargs) 26 | self.allowed_schemes = filter(lambda x: x and isinstance(x, basestring), self.allowed_schemes) 27 | self.column = column 28 | 29 | def _extract_links(self, response): 30 | buff = StringIO(response.body) 31 | reader = csv.reader(buff, **self.fmtparams) 32 | for row in reader: 33 | if len(row) > self.column: 34 | yield Link(row[self.column]) 35 | 36 | -------------------------------------------------------------------------------- /slybot/slybot/linkextractor/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Link extraction for auto scraping 3 | """ 4 | from scrapy.utils.misc import load_object 5 | 6 | from .base import BaseLinkExtractor, ALLOWED_SCHEMES 7 | from .html import HtmlLinkExtractor 8 | from .xml import XmlLinkExtractor, RssLinkExtractor, SitemapLinkExtractor, AtomLinkExtractor 9 | from .regex import RegexLinkExtractor 10 | from .ecsv import CsvLinkExtractor 11 | 12 | _TYPE_MAP = ( 13 | # type, class, ignore value 14 | ('regex', RegexLinkExtractor, False), 15 | ('xpath', XmlLinkExtractor, False), 16 | ('column', CsvLinkExtractor, False), 17 | ('html', HtmlLinkExtractor, True), 18 | ('rss', RssLinkExtractor, True), 19 | ('sitemap', SitemapLinkExtractor, True), 20 | ('atom', AtomLinkExtractor, True), 21 | ) 22 | def create_linkextractor_from_specs(specs): 23 | """Return a link extractor instance from specs. By default, return a HtmlLinkExtractor. 
24 | """ 25 | specs = specs.copy() 26 | ltype, value = specs.pop('type'), specs.pop('value') 27 | if ltype == 'module': 28 | cls = load_object(value) 29 | return cls(**specs) 30 | for key, cls, ignore in _TYPE_MAP: 31 | if key == ltype: 32 | if ignore: 33 | return cls(**specs) 34 | return cls(value, **specs) 35 | raise ValueError("Invalid link extractor type specification") 36 | -------------------------------------------------------------------------------- /slyd/media/js/templates/annotation-widget.handlebars: -------------------------------------------------------------------------------- 1 | {{#unless view.creatingField}} 2 |
3 | {{view ASTool.ButtonView action="editAnnotation" argument=this icon="ui-icon-gear" class="blue-button"}} 4 | {{view ASTool.ButtonView action="deleteAnnotation" argument=this icon="ui-icon-trash" class="red-button"}} 5 |
6 | {{#unless view.hasMultipleMappings}} 7 | {{view view.attributeSelect}} 8 | {{view view.fieldSelect}} 9 |
10 |
Attribute value:
11 | {{view.attributeValue}} 12 |
13 | {{else}} 14 |
15 | {{#each view.mappings}} 16 |
17 |
{{this.name}}
18 |
{{this.mappedField}}
19 |
20 | {{/each}} 21 |
22 | {{/unless}} 23 | {{else}} 24 |
{{view ASTool.ButtonView icon="ui-icon-circle-plus" class="blue-button fix_plus_icon" disabled=view.createFieldDisabled}}
25 | 26 | 27 | 28 |
29 | {{view view.fieldTextField}} 30 | {{view view.typeSelect}} 31 |
32 | {{/unless}} -------------------------------------------------------------------------------- /slyd/slyd/resource.py: -------------------------------------------------------------------------------- 1 | import json, errno 2 | from twisted.web.resource import Resource, NoResource, ErrorPage 3 | 4 | 5 | class SlydJsonResource(Resource): 6 | """Base Resource for Slyd Resources 7 | 8 | This sets the content type to JSON and handles errors 9 | """ 10 | 11 | def render(self, request): 12 | request.setResponseCode(200) 13 | request.setHeader('Content-Type', 'application/json') 14 | try: 15 | return Resource.render(self, request) 16 | except IOError as ex: 17 | if ex.errno == errno.ENOENT: 18 | return NoResource().render(request) 19 | else: 20 | raise 21 | except ErrorPage as ex: 22 | return ex.render(request) 23 | 24 | def error(self, request, status, why): 25 | raise ErrorPage(request, status, why) 26 | 27 | def bad_request(self, why): 28 | self.error(400, "Bad Request", why) 29 | 30 | def read_json(self, request): 31 | try: 32 | return json.load(request.content) 33 | except ValueError as ex: 34 | self.bad_request("Error parsing json. %s" % ex.message) 35 | 36 | 37 | class SlydJsonObjectResource(SlydJsonResource): 38 | """Extends SlydJsonResource, converting 39 | the returned data to JSON 40 | """ 41 | 42 | def render(self, request): 43 | resp = SlydJsonResource.render(self, request) 44 | if resp is not None: 45 | return json.dumps(resp) 46 | -------------------------------------------------------------------------------- /slyd/media/js/controllers/navigation-controller.js: -------------------------------------------------------------------------------- 1 | ASTool.NavigationController = Em.Controller.extend({ 2 | 3 | needs: ['application', 'project_index', 'spider_index', 'template_index', 'annotation'], 4 | 5 | currentPathBinding: 'controllers.application.currentPath', 6 | 7 | splittedPath: function() { 8 | return this.get('currentPath').split('.').filter(function(pathElem) { 9 | return pathElem != 'index'; 10 | }); 11 | }.property('currentPath'), 12 | 13 | previousRoutes: function() { 14 | var splitted = this.get('splittedPath'); 15 | return splitted.slice(0, splitted.length - 1).map(function(route) { 16 | return { route: route, label: this.labelForRoute(route) }; 17 | }.bind(this)); 18 | }.property('splittedPath'), 19 | 20 | currentRoute: function() { 21 | var splitted = this.get('splittedPath'); 22 | var route = splitted[splitted.length - 1]; 23 | return { route: route, label: this.labelForRoute(route) }; 24 | }.property('splittedPath'), 25 | 26 | labelForRoute: function(route) { 27 | var controllerName = this.get('needs').find(function(name) { 28 | return name.indexOf(route) == 0; 29 | }); 30 | if (controllerName) { 31 | var controller = this.get('controllers.' 
+ controllerName); 32 | return controller.get('navigationLabel'); 33 | } else { 34 | return ''; 35 | } 36 | }, 37 | 38 | actions: { 39 | 40 | gotoRoute: function(route) { 41 | ASTool.ToolboxViewMixin.expandToolbox = true; 42 | this.transitionToRoute(route); 43 | }, 44 | }, 45 | }); 46 | -------------------------------------------------------------------------------- /slybot/slybot/utils.py: -------------------------------------------------------------------------------- 1 | from urlparse import urlparse 2 | import os 3 | import json 4 | 5 | from scrapely.htmlpage import HtmlPage 6 | 7 | def iter_unique_scheme_hostname(urls): 8 | """Return an iterator of tuples (scheme, hostname) over the given urls, 9 | filtering dupes 10 | """ 11 | scheme_hostname = set() 12 | for x in urls: 13 | p = urlparse(x) 14 | scheme_hostname.add((p.scheme, p.hostname)) 15 | return list(scheme_hostname) 16 | 17 | def open_project_from_dir(project_dir): 18 | specs = {"spiders": {}} 19 | with open(os.path.join(project_dir, "project.json")) as f: 20 | specs["project"] = json.load(f) 21 | with open(os.path.join(project_dir, "items.json")) as f: 22 | specs["items"] = json.load(f) 23 | with open(os.path.join(project_dir, "extractors.json")) as f: 24 | specs["extractors"] = json.load(f) 25 | for fname in os.listdir(os.path.join(project_dir, "spiders")): 26 | if fname.endswith(".json"): 27 | spider_name = os.path.splitext(fname)[0] 28 | with open(os.path.join(project_dir, "spiders", fname)) as f: 29 | try: 30 | specs["spiders"][spider_name] = json.load(f) 31 | except ValueError, e: 32 | raise ValueError("Error parsing spider (invalid JSON): %s: %s" % (fname, e)) 33 | return specs 34 | 35 | def htmlpage_from_response(response): 36 | return HtmlPage(response.url, response.headers, \ 37 | response.body_as_unicode(), encoding=response.encoding) 38 | -------------------------------------------------------------------------------- /slybot/slybot/linkextractor/xml.py: -------------------------------------------------------------------------------- 1 | """ 2 | Link extraction for auto scraping 3 | """ 4 | from scrapy.link import Link 5 | from scrapy.selector import XmlXPathSelector 6 | 7 | from slybot.linkextractor.base import BaseLinkExtractor 8 | 9 | class XmlLinkExtractor(BaseLinkExtractor): 10 | """Link extractor for XML sources""" 11 | def __init__(self, xpath, **kwargs): 12 | self.remove_namespaces = kwargs.pop('remove_namespaces', False) 13 | super(XmlLinkExtractor, self).__init__(**kwargs) 14 | self.xpath = xpath 15 | 16 | def _extract_links(self, response): 17 | xxs = XmlXPathSelector(response) 18 | if self.remove_namespaces: 19 | xxs.remove_namespaces() 20 | for url in xxs.select(self.xpath).extract(): 21 | yield Link(url.encode(response.encoding)) 22 | 23 | class RssLinkExtractor(XmlLinkExtractor): 24 | """Link extraction from RSS feeds""" 25 | def __init__(self, **kwargs): 26 | super(RssLinkExtractor, self).__init__("//item/link/text()", **kwargs) 27 | 28 | class SitemapLinkExtractor(XmlLinkExtractor): 29 | """Link extraction for sitemap.xml feeds""" 30 | def __init__(self, **kwargs): 31 | kwargs['remove_namespaces'] = True 32 | super(SitemapLinkExtractor, self).__init__("//urlset/url/loc/text() | //sitemapindex/sitemap/loc/text()", **kwargs) 33 | 34 | class AtomLinkExtractor(XmlLinkExtractor): 35 | def __init__(self, **kwargs): 36 | kwargs['remove_namespaces'] = True 37 | super(AtomLinkExtractor, self).__init__("//link/@href", **kwargs) 38 | 39 | 
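The XML-based extractors above only differ in the XPath they hand to XmlLinkExtractor. Below is a minimal, hypothetical driver (not part of the repo) that pushes a Scrapy XmlResponse through the RSS XPath and prints the extracted links; it is a sketch only, and assumes BaseLinkExtractor needs no extra constructor arguments:

    # hypothetical usage sketch for slybot.linkextractor.xml (Python 2, like the rest of slybot)
    from scrapy.http import XmlResponse
    from slybot.linkextractor.xml import XmlLinkExtractor

    body = ("<rss><channel>"
            "<item><link>http://example.com/a</link></item>"
            "<item><link>http://example.com/b</link></item>"
            "</channel></rss>")
    response = XmlResponse(url="http://example.com/feed.xml", body=body)
    extractor = XmlLinkExtractor("//item/link/text()")   # same XPath RssLinkExtractor hard-codes
    for link in extractor._extract_links(response):      # normally invoked through the base class
        print link.url  # -> http://example.com/a, then http://example.com/b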
-------------------------------------------------------------------------------- /slyd/media/js/templates/edit-item.handlebars: -------------------------------------------------------------------------------- 1 |
2 |
3 | 4 | {{#view ASTool.InlineTextField value=name}} 5 | Item {{name}} 6 | {{/view}} 7 | 8 | 9 | {{view ASTool.ButtonView action="deleteItem" argument=this icon="ui-icon-trash" class="red-button"}} 10 | 11 |
12 |
13 | {{#if view.item.fields}} 14 | 15 | 16 | {{#each view.item.fields}} 17 | 18 | 23 | 24 | 25 | 26 | 27 | 28 | {{/each}} 29 |
Field Type Required Vary
19 | {{#view ASTool.InlineTextField value=name}} 20 | {{name}} 21 | {{/view}} 22 | {{view ASTool.TypeSelect name="fieldType" value=type itemField=this}}{{view ASTool.CheckBox checked=required name="fieldRequired"}}{{view ASTool.CheckBox checked=vary name="fieldVary"}}{{view ASTool.ButtonView icon="ui-icon-trash" class="red-button" action="deleteField" argument=view.item argument2=this}}
30 | {{else}} 31 |
No fields defined yet.
32 | {{/if}} 33 |
34 | {{view ASTool.ButtonView action="addField" argument=this icon="ui-icon-circle-plus" label="Field" class="blue-button"}} 35 |
36 |
37 |
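The name, type, required and vary controls edited in this template end up in the project's item spec, which slyd saves as items.json. A minimal hand-written example of that shape (the top-level key is the item name; the field names here are purely illustrative, and the 'text' and 'safe html' types are the ones defined in slybot/slybot/fieldtypes/text.py later in this dump):

    {
        "default": {
            "fields": {
                "title": {"type": "text", "required": true, "vary": false},
                "description": {"type": "safe html", "required": false, "vary": false}
            }
        }
    }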
-------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) Scrapinghub. 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without modification, 5 | are permitted provided that the following conditions are met: 6 | 7 | 1. Redistributions of source code must retain the above copyright notice, 8 | this list of conditions and the following disclaimer. 9 | 10 | 2. Redistributions in binary form must reproduce the above copyright 11 | notice, this list of conditions and the following disclaimer in the 12 | documentation and/or other materials provided with the distribution. 13 | 14 | 3. Neither the name of Portia nor the names of its contributors may be used 15 | to endorse or promote products derived from this software without 16 | specific prior written permission. 17 | 18 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 19 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 20 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 21 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR 22 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 23 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 24 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON 25 | ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 26 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 27 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 28 | -------------------------------------------------------------------------------- /slybot/slybot/tests/test_dupefilter.py: -------------------------------------------------------------------------------- 1 | from unittest import TestCase 2 | from os.path import dirname 3 | 4 | from scrapy.http import HtmlResponse 5 | from scrapy.settings import Settings 6 | from scrapy.item import DictItem 7 | from scrapy.exceptions import DropItem 8 | 9 | from slybot.spidermanager import SlybotSpiderManager 10 | from slybot.dupefilter import DupeFilterPipeline 11 | 12 | _PATH = dirname(__file__) 13 | 14 | class DupeFilterTest(TestCase): 15 | smanager = SlybotSpiderManager("%s/data/SampleProject" % _PATH) 16 | 17 | def test_dupefilter(self): 18 | name = "seedsofchange2" 19 | spider = self.smanager.create(name) 20 | spec = self.smanager._specs["spiders"][name] 21 | t1, t2 = spec["templates"] 22 | 23 | dupefilter = DupeFilterPipeline(Settings({"SLYDUPEFILTER_ENABLED": True})) 24 | 25 | response1 = HtmlResponse(url=t1["url"], body=t1["original_body"].encode('utf-8')) 26 | response2 = HtmlResponse(url=t2["url"], body=t2["original_body"].encode('utf-8')) 27 | 28 | result1 = spider.handle_html(response1) 29 | for item1 in result1: 30 | if isinstance(item1, DictItem): 31 | break 32 | 33 | result2 = spider.handle_html(response2) 34 | for item2 in result2: 35 | if isinstance(item2, DictItem): 36 | break 37 | 38 | self.assertEqual(item1, dupefilter.process_item(item1, spider)) 39 | self.assertEqual(item2, dupefilter.process_item(item2, spider)) 40 | 41 | self.assertRaises(DropItem, dupefilter.process_item, item1, spider) 42 | 43 | -------------------------------------------------------------------------------- /slyd/slyd/utils.py: 
-------------------------------------------------------------------------------- 1 | """ 2 | html page utils 3 | """ 4 | from scrapely.htmlpage import HtmlTagType 5 | 6 | 7 | def _quotify(mystr): 8 | """ 9 | quotifies an html tag attribute value. 10 | Assumes then, that any ocurrence of ' or " in the 11 | string is escaped if original string was quoted 12 | with it. 13 | So this function does not altere the original string 14 | except for quotation at both ends, and is limited just 15 | to guess if string must be quoted with '"' or "'" 16 | """ 17 | quote = '"' 18 | l = len(mystr) 19 | for i in range(l): 20 | if mystr[i] == "\\" and i + 1 < l and mystr[i+1] == "'": 21 | quote = "'"; break 22 | elif mystr[i] == "\\" and i + 1 < l and mystr[i+1] == '"': 23 | quote = '"'; break 24 | elif mystr[i] == "'": 25 | quote = '"'; break 26 | elif mystr[i] == '"': 27 | quote = "'"; break 28 | return quote + mystr + quote 29 | 30 | 31 | def serialize_tag(tag): 32 | """ 33 | Converts a tag into a string when a slice [tag.start:tag.end] 34 | over the source can't be used because tag has been modified 35 | """ 36 | out = "<" 37 | if tag.tag_type == HtmlTagType.CLOSE_TAG: 38 | out += "/" 39 | out += tag.tag 40 | 41 | attributes = [] 42 | for key, val in tag.attributes.iteritems(): 43 | aout = key 44 | if val is not None: 45 | aout += "=" + _quotify(val) 46 | attributes.append(aout) 47 | if attributes: 48 | out += " " + " ".join(attributes) 49 | 50 | if tag.tag_type == HtmlTagType.UNPAIRED_TAG: 51 | out += "/" 52 | return out + ">" 53 | -------------------------------------------------------------------------------- /slyd/media/css/breadcrumb.css: -------------------------------------------------------------------------------- 1 | /* Breadcrumb Styles 2 | ---------------------------------------------------------------------*/ 3 | 4 | .breadCrumb 5 | { 6 | margin: 0; 7 | float: left; 8 | display: block; 9 | height: 21px; 10 | overflow: hidden; 11 | width: 400px; 12 | padding:3px; 13 | padding-top:4px; 14 | background:#fff; 15 | } 16 | .breadCrumb ul 17 | { 18 | margin: 0; 19 | padding: 0; 20 | height: 21px; 21 | display: block; 22 | } 23 | .breadCrumb ul li 24 | { 25 | display: block; 26 | float: left; 27 | position: relative; 28 | height: 21px; 29 | overflow: hidden; 30 | line-height: 21px; 31 | margin: 0px 6px 0px 0; 32 | padding: 0px 10px 0px 0; 33 | font-size: 1em; 34 | background: url(../images/Chevron.gif) no-repeat 100% 0; 35 | } 36 | .breadCrumb ul li div.chevronOverlay 37 | { 38 | position: absolute; 39 | right: 0; 40 | top: 0; 41 | z-index: 2; 42 | } 43 | .breadCrumb ul li span 44 | { 45 | display: block; 46 | overflow: hidden; 47 | } 48 | .breadCrumb ul li a 49 | { 50 | display: block; 51 | position: relative; 52 | height: 21px; 53 | line-height: 21px; 54 | overflow: hidden; 55 | float: left; 56 | } 57 | .breadCrumb ul li.first a 58 | { 59 | height: 16px !important; 60 | text-indent:-1000em; 61 | width:16px; 62 | padding: 0; 63 | margin-top: 2px; 64 | overflow: hidden; 65 | background:url(../images/IconHome.gif) no-repeat 0 0; 66 | } 67 | .breadCrumb ul li.first a:hover 68 | { 69 | background-position: 0 -16px; 70 | } 71 | .breadCrumb ul li.last 72 | { 73 | background: none; 74 | margin-right: 0; 75 | padding-right: 0; 76 | } 77 | .chevronOverlay 78 | { 79 | display: none; 80 | background: url(../images/ChevronOverlay.png) no-repeat 100% 0; 81 | width: 13px; 82 | height: 20px; 83 | } 84 | -------------------------------------------------------------------------------- 
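As a quick, hypothetical sanity check of the _quotify helper in slyd/slyd/utils.py above (not part of the repo's test suite): it only decides which quote character can safely wrap an attribute value, it never rewrites the value itself.

    from slyd.utils import _quotify

    assert _quotify('plain') == '"plain"'            # no embedded quotes -> wrapped in double quotes
    assert _quotify('say "hi"') == '\'say "hi"\''    # embedded double quote -> wrapped in single quotes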
/slybot/slybot/spidermanager.py: -------------------------------------------------------------------------------- 1 | import tempfile, shutil, atexit 2 | from zipfile import ZipFile 3 | 4 | from zope.interface import implements 5 | from scrapy.interfaces import ISpiderManager 6 | from scrapy.utils.misc import load_object 7 | 8 | from slybot.spider import IblSpider 9 | from slybot.utils import open_project_from_dir 10 | 11 | class SlybotSpiderManager(object): 12 | 13 | implements(ISpiderManager) 14 | 15 | def __init__(self, datadir, spider_cls=None): 16 | self.spider_cls = load_object(spider_cls) if spider_cls else IblSpider 17 | self._specs = open_project_from_dir(datadir) 18 | 19 | @classmethod 20 | def from_crawler(cls, crawler): 21 | datadir = crawler.settings['PROJECT_DIR'] 22 | spider_cls = crawler.settings['SLYBOT_SPIDER_CLASS'] 23 | return cls(datadir, spider_cls) 24 | 25 | def create(self, name, **args): 26 | spec = self._specs["spiders"][name] 27 | items = self._specs["items"] 28 | extractors = self._specs["extractors"] 29 | return self.spider_cls(name, spec, items, extractors, **args) 30 | 31 | def list(self): 32 | return self._specs["spiders"].keys() 33 | 34 | class ZipfileSlybotSpiderManager(SlybotSpiderManager): 35 | 36 | def __init__(self, datadir, zipfile=None, spider_cls=None): 37 | if zipfile: 38 | datadir = tempfile.mkdtemp(prefix='slybot-') 39 | ZipFile(zipfile).extractall(datadir) 40 | atexit.register(shutil.rmtree, datadir) 41 | super(ZipfileSlybotSpiderManager, self).__init__(datadir, spider_cls) 42 | 43 | @classmethod 44 | def from_crawler(cls, crawler): 45 | s = crawler.settings 46 | sm = cls(s['PROJECT_DIR'], s['PROJECT_ZIPFILE'], s['SLYBOT_SPIDER_CLASS']) 47 | return sm 48 | -------------------------------------------------------------------------------- /slyd/media/js/app.js: -------------------------------------------------------------------------------- 1 | /*************************** Application **************************/ 2 | ASTool = Em.Application.create({ 3 | LOG_TRANSITIONS: true, 4 | ready: function(){ 5 | 6 | } 7 | }); 8 | 9 | // Leave 'null' for using window.location. Define it to override. 
10 | var SLYD_URL = null; 11 | 12 | Ember.Application.initializer({ 13 | name: 'slydApiInitializer', 14 | 15 | initialize: function(container, application) { 16 | container.register('api:slyd', ASTool.SlydApi); 17 | application.inject('route', 'slyd', 'api:slyd'); 18 | application.inject('adapter', 'slyd', 'api:slyd'); 19 | application.inject('controller', 'slyd', 'api:slyd'); 20 | } 21 | }); 22 | 23 | 24 | Ember.Application.initializer({ 25 | name: 'documentViewInitializer', 26 | 27 | initialize: function(container, application) { 28 | container.register('document:view', ASTool.DocumentView); 29 | application.inject('controller', 'documentView', 'document:view'); 30 | } 31 | }); 32 | 33 | 34 | Ember.Application.initializer({ 35 | name: 'annotationsStoreInitializer', 36 | 37 | initialize: function(container, application) { 38 | container.register('annotations:store', ASTool.AnnotationsStore); 39 | application.inject('route', 'annotationsStore', 'annotations:store'); 40 | application.inject('controller', 'annotationsStore', 'annotations:store'); 41 | } 42 | }); 43 | 44 | 45 | function s4() { 46 | return Math.floor((1 + Math.random()) * 0x10000) 47 | .toString(16) 48 | .substring(1); 49 | }; 50 | 51 | function guid() { 52 | return s4() + s4() + '-' + s4() + '-' + s4() + '-' + 53 | s4() + '-' + s4() + s4() + s4(); 54 | } 55 | 56 | function shortGuid() { 57 | return s4() + '.' + s4() + '.' + s4(); 58 | } 59 | 60 | ASTool.guid = guid; 61 | ASTool.shortGuid = shortGuid; -------------------------------------------------------------------------------- /slyd/bin/sh2sly: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | Downloads AS projects from scrapinghub in slybot format and saves them locally 4 | 5 | Used for testing. 
6 | 7 | This uses the python-scrapinghub library 8 | """ 9 | 10 | 11 | def sh2sly(apikey, projects, destination): 12 | from scrapinghub import Connection, Project 13 | import tempfile, zipfile, os 14 | conn = Connection(apikey) 15 | if not projects: 16 | projects = conn.project_ids() 17 | #projects = xrange(1300) 18 | out = tempfile.TemporaryFile() 19 | for projectid in projects: 20 | project = Project(conn, projectid) 21 | project.autoscraping_project_slybot(outputfile=out) 22 | try: 23 | zf = zipfile.ZipFile(out) 24 | scount = len([f for f in zf.namelist() if f.startswith('spiders/')]) 25 | if scount == 0: 26 | print "skipping %s - no spiders" % projectid 27 | continue 28 | pdest = os.path.join(destination, str(projectid)) 29 | zf.extractall(pdest) 30 | print "extracted project with %d spiders to %s" % (scount, pdest) 31 | except zipfile.BadZipfile: 32 | print "error with zipfile in project %s, skipping" % projectid 33 | out.truncate() 34 | 35 | 36 | def main(): 37 | import argparse 38 | parser = argparse.ArgumentParser( 39 | description='fetch slybot projects from scrapinghub') 40 | parser.add_argument('-p', '--project', type=int, action='append', 41 | help='scrapinghub project, may be repeated, default is all') 42 | parser.add_argument('-k', '--apikey', 43 | help='api key') 44 | parser.add_argument('destination') 45 | args = parser.parse_args() 46 | sh2sly(args.apikey, args.project, args.destination) 47 | 48 | 49 | if __name__ == '__main__': 50 | main() 51 | -------------------------------------------------------------------------------- /slyd/media/js/controllers/items-controller.js: -------------------------------------------------------------------------------- 1 | ASTool.ItemsController = Em.ArrayController.extend(ASTool.BaseControllerMixin, { 2 | 3 | needs: ['application', 'annotation'], 4 | 5 | documentView: null, 6 | 7 | addItem: function() { 8 | var newItem = ASTool.Item.create({ name: ASTool.shortGuid() }); 9 | this.addField(newItem); 10 | this.pushObject(newItem); 11 | }, 12 | 13 | addField: function(owner, name, type) { 14 | var newField = ASTool.ItemField.create({ name: name || 'new field', 15 | type: type || 'text', 16 | required: false, 17 | vary: false }); 18 | owner.set('fields', owner.fields || []); 19 | owner.fields.pushObject(newField); 20 | }, 21 | 22 | saveChanges: function() { 23 | this.get('slyd').saveItems(this.content.toArray()).then(function() { 24 | this.transitionToRoute('template'); 25 | }.bind(this)); 26 | }, 27 | 28 | actions: { 29 | 30 | addItem: function() { 31 | this.addItem(); 32 | }, 33 | 34 | addField: function(item) { 35 | this.addField(item); 36 | }, 37 | 38 | deleteItem: function(item) { 39 | this.removeObject(item); 40 | }, 41 | 42 | deleteField: function(item, field) { 43 | item.get('fields').removeObject(field); 44 | }, 45 | 46 | saveChanges: function() { 47 | this.saveChanges(); 48 | }, 49 | 50 | undoChanges: function() { 51 | this.get('slyd').loadItems().then(function(items) { 52 | this.set('content', items); 53 | this.transitionToRoute('template'); 54 | }.bind(this)); 55 | }, 56 | }, 57 | 58 | willEnter: function() { 59 | if (this.get('documentView.canvas')) { 60 | this.set('documentView.canvas.interactionsBlocked', true); 61 | } 62 | }, 63 | 64 | willLeave: function() { 65 | if (this.get('documentView.canvas')) { 66 | this.set('documentView.canvas.interactionsBlocked', false); 67 | } 68 | }, 69 | }); -------------------------------------------------------------------------------- /slybot/bin/portiacrawl: 
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """Allow to easily run slybot spiders on console. If spider is not given, print a list of available spiders inside the project""" 3 | import os 4 | import subprocess 5 | from optparse import OptionParser 6 | 7 | def main(): 8 | parser = OptionParser(description=__doc__, 9 | usage="%prog [spider] [options]") 10 | parser.add_option("--settings", help="Give specific settings module (must be on python path)", default='slybot.settings') 11 | parser.add_option("--logfile", help="Specify log file") 12 | parser.add_option("-a", help="Add spider arguments", dest="spargs", action="append", default=[], metavar="NAME=VALUE") 13 | 14 | opts, args = parser.parse_args() 15 | 16 | try: 17 | project_specs = args[0] 18 | if not os.path.exists(project_specs) or len(args) > 2: 19 | parser.print_help() 20 | return 21 | except IndexError: 22 | parser.print_help() 23 | return 24 | 25 | 26 | if opts.settings: 27 | os.environ["SCRAPY_SETTINGS_MODULE"] = opts.settings 28 | 29 | command_spec = ["scrapy", "crawl", args[1]] if len(args) == 2 else ["scrapy", "list"] 30 | if project_specs.endswith(".zip"): 31 | command_spec.extend([ 32 | "-s", "PROJECT_ZIPFILE=%s" % project_specs, 33 | "-s", "SPIDER_MANAGER_CLASS=slybot.spidermanager.ZipfileSlybotSpiderManager", 34 | ]) 35 | else: 36 | command_spec.extend([ 37 | "-s", "PROJECT_DIR=%s" % project_specs, 38 | "-s", "SPIDER_MANAGER_CLASS=slybot.spidermanager.SlybotSpiderManager", 39 | ]) 40 | 41 | if opts.logfile: 42 | command_spec.append("--logfile=%s" % opts.logfile) 43 | 44 | for sparg in opts.spargs: 45 | command_spec.append("-a") 46 | command_spec.append(sparg) 47 | 48 | subprocess.call(command_spec) 49 | 50 | main() 51 | -------------------------------------------------------------------------------- /slyd/media/js/templates/floating-annotation-widget.handlebars: -------------------------------------------------------------------------------- 1 | {{#unless view.creatingField}} 2 |
3 | Map attribute 4 | To field 5 |
6 | {{view ASTool.InlineHelp message="annotation_widget"}} 7 |
8 |
9 | {{view ASTool.ButtonView action="editAnnotation" argument=controller.floatingAnnotation icon="ui-icon-gear" class="clear-button"}} 10 |
11 |
12 | {{view ASTool.ButtonView action="deleteAnnotation" argument=controller.floatingAnnotation icon="ui-icon-trash" class="clear-button"}} 13 |
14 |
15 | {{#unless view.hasMultipleMappings}} 16 |
17 | {{view view.attributeSelect}} 18 | {{view view.fieldSelect}} 19 |
20 | {{view ASTool.ButtonView icon="ui-icon-circle-plus" class="clear-button" label="field"}} 21 |
22 |
23 | {{else}} 24 |
25 | {{#each view.mappings}} 26 |
27 |
{{this.name}}
28 |
{{this.mappedField}}
29 |
30 | {{/each}} 31 |
32 | {{/unless}} 33 | {{else}} 34 | 35 | 36 |
37 | {{view view.fieldTextField width=115 class="darktextfield"}} 38 | {{view view.typeSelect}} 39 | {{view ASTool.ButtonView icon="ui-icon-circle-check" class="clear-button" disabled=view.createFieldDisabled}} 40 |
41 | {{/unless}} 42 | -------------------------------------------------------------------------------- /slybot/slybot/fieldtypes/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Field Types 3 | 4 | Spiders extracts items of a given type. These item types are defined by a 5 | schema, which specifies the type of each field in the item. This module 6 | contains FieldProcessor implementations, which are the classes responsible for 7 | custom processing of these types. 8 | 9 | We keep the types of scrapers supported flexible and allow different methods 10 | for each. In the future, we expect many different types, for example one might 11 | be a mechanical turk scraper and the fields would have to do user validation 12 | and provide error messages. 13 | """ 14 | from .text import ( 15 | RawFieldTypeProcessor, 16 | TextFieldTypeProcessor, 17 | SafeHtmlFieldTypeProcessor 18 | ) 19 | from .images import ImagesFieldTypeProcessor 20 | from .url import UrlFieldTypeProcessor 21 | from .number import NumberTypeProcessor 22 | from .point import GeoPointFieldTypeProcessor 23 | from .price import PriceTypeProcessor 24 | 25 | class FieldTypeManager(object): 26 | _TYPEMAP = dict((c.name, c) for c in ( 27 | RawFieldTypeProcessor, TextFieldTypeProcessor, 28 | ImagesFieldTypeProcessor, NumberTypeProcessor, 29 | UrlFieldTypeProcessor, SafeHtmlFieldTypeProcessor, 30 | GeoPointFieldTypeProcessor, PriceTypeProcessor, 31 | )) 32 | _names = sorted(_TYPEMAP.keys()) 33 | 34 | def available_type_names(self): 35 | """Find the names of all field types available. """ 36 | return self._names 37 | 38 | def type_processor_class(self, name): 39 | """Retrieve the class for the given extractor 40 | 41 | This can be useful to introspect on the constructor arguments. If no 42 | suitable type is found, it will default to the RawFieldTypeProcessor 43 | (no processing of extracted data is done). 
44 | """ 45 | return self._TYPEMAP.get(name, RawFieldTypeProcessor) 46 | 47 | def all_processor_classes(self): 48 | """Retrieve all processor classes registered""" 49 | return self._TYPEMAP.values() 50 | 51 | -------------------------------------------------------------------------------- /slybot/slybot/validation/schema.py: -------------------------------------------------------------------------------- 1 | """Simple validation of specifications passed to slybot""" 2 | from os.path import dirname, join 3 | import json, re 4 | from urlparse import urlparse 5 | 6 | from jsonschema import Draft3Validator, ValidationError, RefResolver 7 | 8 | _PATH = dirname(__file__) 9 | 10 | def load_schemas(): 11 | filename = join(_PATH, "schemas.json") 12 | return dict((s["id"], s) for s in json.load(open(filename))) 13 | 14 | _SCHEMAS = load_schemas() 15 | 16 | class SlybotJsonSchemaValidator(Draft3Validator): 17 | DEFAULT_TYPES = Draft3Validator.DEFAULT_TYPES.copy() 18 | DEFAULT_TYPES.update({ 19 | "mapping": dict, 20 | }) 21 | def validate_format(self, fmt, instance, schema): 22 | if schema["type"] != "string": 23 | raise ValidationError("Invalid keyword 'format' for type '%s'" % schema["type"]) 24 | 25 | if fmt == "regex": 26 | try: 27 | re.compile(instance) 28 | except: 29 | raise ValidationError("Invalid regular expression: %s" % repr(instance)) 30 | elif fmt == "url": 31 | parsed = urlparse(instance) 32 | if not parsed.scheme or not parsed.netloc: 33 | raise ValidationError("Invalid url: '%s'" % repr(instance)) 34 | 35 | return None 36 | 37 | def get_schema_validator(schema): 38 | resolver = RefResolver("", schema, _SCHEMAS) 39 | return SlybotJsonSchemaValidator(_SCHEMAS[schema], resolver=resolver) 40 | 41 | def validate_project_schema(specs): 42 | 43 | project = specs["project"] 44 | get_schema_validator("project").validate(project) 45 | 46 | items = specs["items"] 47 | get_schema_validator("items").validate(items) 48 | 49 | extractors = specs["extractors"] 50 | get_schema_validator("extractors").validate(extractors) 51 | 52 | spider_schema_validator = get_schema_validator("spider") 53 | for spider in specs["spiders"].values(): 54 | spider_schema_validator.validate(spider) 55 | 56 | return True 57 | 58 | -------------------------------------------------------------------------------- /slyd/slyd/tap.py: -------------------------------------------------------------------------------- 1 | """ 2 | The module is used by the Twisted plugin system 3 | (twisted.plugins.slyd_plugin) to register twistd command to manage 4 | slyd server. The command can be used with 'twistd slyd'. 
5 | """ 6 | from os.path import join, dirname 7 | from twisted.python import usage 8 | from twisted.web.resource import Resource 9 | from twisted.application.internet import TCPServer 10 | from twisted.web.server import Site 11 | from twisted.web.static import File 12 | 13 | DEFAULT_PORT = 9001 14 | DEFAULT_DOCROOT = join(dirname(dirname(__file__)), 'media') 15 | 16 | 17 | class Options(usage.Options): 18 | optParameters = [ 19 | ['port', 'p', DEFAULT_PORT, 'Port number to listen on.', int], 20 | ['docroot', 'd', DEFAULT_DOCROOT, 'Default doc root for static media.'], 21 | ] 22 | 23 | 24 | def create_root(config): 25 | from scrapy import log 26 | from scrapy.settings import CrawlerSettings 27 | from slyd.crawlerspec import (CrawlerSpecManager, 28 | create_crawler_spec_resource) 29 | from slyd.bot import create_bot_resource 30 | import slyd.settings 31 | from slyd.projects import ProjectsResource 32 | 33 | root = Resource() 34 | root.putChild("static", File(config['docroot'])) 35 | 36 | crawler_settings = CrawlerSettings(settings_module=slyd.settings) 37 | spec_manager = CrawlerSpecManager(crawler_settings) 38 | 39 | # add project management at /projects 40 | projects = ProjectsResource(crawler_settings) 41 | root.putChild('projects', projects) 42 | 43 | # add crawler at /projects/PROJECT_ID/bot 44 | log.msg("Slybot specs loading from %s/[PROJECT]" % spec_manager.basedir, 45 | level=log.DEBUG) 46 | projects.putChild("bot", create_bot_resource(spec_manager)) 47 | 48 | # add spec at /projects/PROJECT_ID/spec 49 | spec = create_crawler_spec_resource(spec_manager) 50 | projects.putChild("spec", spec) 51 | return root 52 | 53 | 54 | def makeService(config): 55 | root = create_root(config) 56 | site = Site(root) 57 | return TCPServer(config['port'], site) 58 | -------------------------------------------------------------------------------- /slybot/slybot/closespider.py: -------------------------------------------------------------------------------- 1 | """ 2 | This extension closes spiders after they have been crawling inefficiently for a 3 | while 4 | Each SLYCLOSE_SPIDER_CHECK_PERIOD seconds, it checks that at least SLYCLOSE_SPIDER_PERIOD_ITEMS 5 | have been extracted along the last time interval of same length. 
6 | """ 7 | 8 | from twisted.internet import task 9 | 10 | from scrapy.xlib.pydispatch import dispatcher 11 | from scrapy import signals 12 | from scrapy.exceptions import NotConfigured 13 | 14 | DEFAULT_CHECK_PERIOD = 3600 15 | DEFAULT_PERIOD_MIN_ITEMS = 200 16 | 17 | class SlybotCloseSpider(object): 18 | 19 | def __init__(self, crawler): 20 | if not crawler.settings.getbool("SLYCLOSE_SPIDER_ENABLED"): 21 | raise NotConfigured 22 | 23 | self.crawler = crawler 24 | self.check_period = crawler.settings.getint("SLYCLOSE_SPIDER_CHECK_PERIOD", DEFAULT_CHECK_PERIOD) 25 | self.period_items = crawler.settings.getint("SLYCLOSE_SPIDER_PERIOD_ITEMS", DEFAULT_PERIOD_MIN_ITEMS) 26 | 27 | self.items_in_period = 0 28 | 29 | dispatcher.connect(self.spider_opened, signal=signals.spider_opened) 30 | dispatcher.connect(self.spider_closed, signal=signals.spider_closed) 31 | dispatcher.connect(self.item_scraped, signal=signals.item_scraped) 32 | 33 | def spider_opened(self, spider): 34 | self.task = task.LoopingCall(self._check_crawled_items, spider) 35 | self.task.start(self.check_period, now=False) 36 | 37 | def spider_closed(self, spider): 38 | if self.task.running: 39 | self.task.stop() 40 | 41 | def item_scraped(self, item, spider): 42 | self.items_in_period += 1 43 | 44 | def _check_crawled_items(self, spider): 45 | if self.items_in_period >= self.period_items: 46 | self.items_in_period = 0 47 | else: 48 | spider.log("Closing spider because of low item throughput. Items in last period: %d" % self.items_in_period) 49 | self.crawler.engine.close_spider(spider, 'slybot_fewitems_scraped') 50 | 51 | @classmethod 52 | def from_crawler(cls, crawler): 53 | return cls(crawler) 54 | -------------------------------------------------------------------------------- /slyd/media/tests/fixtures.js: -------------------------------------------------------------------------------- 1 | var spiderNamesJson = ['spider1']; 2 | 3 | var spider1Json = { 4 | "allowed_domains": [], 5 | "exclude_patterns": [], 6 | "follow_patterns": [], 7 | "init_requests": [], 8 | "links_to_follow": "none", 9 | "respect_nofollow": true, 10 | "start_urls": [ 11 | "http://site1.com", 12 | "http://site2.com", 13 | ], 14 | "templates": [ 15 | { 16 | "name": "t1", 17 | "annotated_body": "

Some content

More content

", 18 | "extractors": [], 19 | "original_body": "

Some content

More content

", 20 | "page_id": "", 21 | "page_type": "item", 22 | "scrapes": "default", 23 | "url": "http://site1.com", 24 | }, 25 | { 26 | "name": "t2", 27 | "annotated_body": "

Some content

More content

", 28 | "extractors": [], 29 | "original_body": "

Some content

More content

", 30 | "page_id": "", 31 | "page_type": "item", 32 | "scrapes": "default", 33 | "url": "http://site2.com" 34 | } 35 | ] 36 | }; 37 | 38 | var fetchedPageJson = { 39 | page: "
HELLO
", 40 | items: [], 41 | }; 42 | 43 | var itemsJson = { 44 | "default": { 45 | "fields": { 46 | "description": { 47 | "required": false, 48 | "type": "safe html", 49 | "vary": false 50 | }, 51 | "images": { 52 | "required": true, 53 | "type": "image", 54 | "vary": true 55 | }, 56 | } 57 | } 58 | }; -------------------------------------------------------------------------------- /slybot/slybot/baseurl.py: -------------------------------------------------------------------------------- 1 | """ 2 | html page utils 3 | """ 4 | import urlparse, re 5 | from scrapely.htmlpage import parse_html, HtmlTagType 6 | 7 | ABSURLRE = re.compile("^https?\:\/\/") 8 | DOCTYPERE = re.compile("", re.S | re.I) 9 | 10 | def _is_abs_url(url): 11 | return bool(ABSURLRE.match(url)) 12 | 13 | def insert_base_url(html, base): 14 | """ 15 | Inserts the given base url if does not exist in html source, 16 | or replace the existing if needed 17 | """ 18 | baseurl = baseelement = headelement = htmlelement = None 19 | for element in parse_html(html): 20 | if getattr(element, "tag", None) == "base": 21 | baseurl = element.attributes.get("href", None) 22 | baseelement = element 23 | elif getattr(element, "tag", None) == "head" and \ 24 | element.tag_type == HtmlTagType.OPEN_TAG: 25 | headelement = element 26 | elif getattr(element, "tag", None) == "html" and \ 27 | element.tag_type == HtmlTagType.OPEN_TAG: 28 | htmlelement = element 29 | 30 | if baseurl: 31 | if not _is_abs_url(baseurl): 32 | absurl = urlparse.urljoin(base, baseurl) 33 | # replace original base tag 34 | basetag = '' % absurl 35 | html = html[:baseelement.start] + basetag + html[baseelement.end:] 36 | 37 | else: 38 | # Generate new base element and include 39 | basetag = '' % base 40 | if headelement: 41 | insertpos = headelement.end 42 | else: 43 | if htmlelement: 44 | basetag = "\n%s\n" % basetag 45 | insertpos = htmlelement.end 46 | else: 47 | doctype_match = DOCTYPERE.search(html) 48 | if doctype_match: 49 | insertpos = doctype_match.end() 50 | else: 51 | insertpos = 0 52 | html = html[:insertpos] + basetag + html[insertpos:] 53 | 54 | return html 55 | 56 | def get_base_url(htmlpage): 57 | """Return the base url of the given HtmlPage""" 58 | for element in htmlpage.parsed_body: 59 | if getattr(element, "tag", None) == "base": 60 | return element.attributes.get("href") or htmlpage.url 61 | return htmlpage.url 62 | -------------------------------------------------------------------------------- /slyd/media/js/messages.js: -------------------------------------------------------------------------------- 1 | ASTool.Messages = Ember.Namespace.create({ 2 | // Inline help messages. 3 | overlay_blocked_links: 'Enable this options to highlight links not followed at crawl time in red and followed links in green.', 4 | follow_links: 'Links that match any of the regular expressions in this list will be followed (they should also be in the domain of one of the start pages).', 5 | exclude_links: 'Links that match any of the regular expressions in this list will be excluded.', 6 | perform_login: 'Select this option if the site you are crawling requires login credentials.', 7 | template_required: 'This setting is equivalent to marking the fields as required in the item definition, but limiting the scope to this template only.', 8 | extractors: 'With regular expression extractors, the extracted data is matched against the specified expression and replaced by the match group enclosed between parentheses. If there is no match, the field is not extracted.

Type extractors override the type specified in the item definition.', 9 | select_item: 'You can choose what item type is extracted by this template using the combobox. You can also create and modify items by clicking on the Edit Items button.', 10 | variant: 'By selecting a different variant than Base(0) in your annotation, the resulting extracted data will be assigned to the base item special field variants, which is a list of objects similar to an item.', 11 | ignored_subregions: 'Allows you to define subregions that should be excluded from the extraction process.', 12 | selected_region_ancestors: 'Refine your selection by navigating its ancestors.', 13 | selected_region_children: 'Refine your selection by navigating its children.', 14 | sticky_fields: 'Required attributes are not extracted, but they must be present for a page to match the template.', 15 | annotation_widget: 'Select the attribute you want to extract and an item field to map it.

Choose -just required- to indicate that the template must match a particular feature without generating any extracted data.

You can create new fields by clicking the + field button or by seleting the -create new- option from the field combobox.', 16 | 17 | // Other messages. 18 | confirm_change_selection: 'If you select a different region you will lose all current attribute mappings and ignored subregions, proceed anyway?', 19 | }); 20 | -------------------------------------------------------------------------------- /slyd/media/js/controllers/projects-controller.js: -------------------------------------------------------------------------------- 1 | ASTool.ProjectsIndexController = Em.ArrayController.extend(ASTool.BaseControllerMixin, { 2 | needs: ['application'], 3 | 4 | projectSite: null, 5 | 6 | createProjectDisabled: function() { 7 | return Em.isEmpty(this.get('projectSite')); 8 | }.property('projectSite'), 9 | 10 | actions: { 11 | 12 | openProject: function(projectName) { 13 | this.set('slyd.project', projectName); 14 | this.transitionToRoute('project', { id: projectName }); 15 | }, 16 | 17 | deleteProject: function(projectName) { 18 | if (confirm('Are you sure you want to delete this project? This operation cannot be undone.')) { 19 | this.get('slyd').deleteProject(projectName); 20 | this.removeObject(projectName); 21 | } 22 | }, 23 | 24 | createProject: function() { 25 | var newProjectName = this.getUnusedName('new_project', this.get('content')); 26 | this.get('slyd').createProject(newProjectName).then(function() { 27 | this.set('slyd.project', newProjectName); 28 | // Initialize items spec. 29 | this.get('slyd').saveItems([ 30 | ASTool.Item.create({ name: 'default', fields: [ ] 31 | }) 32 | ]); 33 | // Initialize extractors spec. 34 | this.get('slyd').saveExtractors([]); 35 | // Setup automatic creation of an initial spider. 36 | this.set('controllers.application.siteWizard', this.get('projectSite')); 37 | this.set('projectSite', null); 38 | this.transitionToRoute('project', { id: newProjectName }); 39 | }.bind(this)); 40 | } 41 | }, 42 | 43 | animateProjectSiteInput: function() { 44 | var animateBorderColor = function () { 45 | $('#projectSiteTextField') 46 | .animate({ 'border-color': 'rgba(88,150,220,0.4)', 'background-color': 'rgba(130,210,230,0.1)' }, 1000) 47 | .animate({ 'border-color': '#BBBBB', 'background-color': '#FFFFFF' }, 1000, animateBorderColor) 48 | }; 49 | Em.run.schedule('afterRender', this, function() { 50 | $('#projectSiteTextField') 51 | .hide() 52 | .show('fast', animateBorderColor) 53 | .click(function(e) { 54 | $('#projectSiteTextField').stop(true) 55 | .css({ 'border-color': '#BBBBB', 'background-color': '#FFFFFF' }); 56 | }); 57 | }); 58 | }, 59 | 60 | willEnter: function() { 61 | this.get('documentView').showSpider(); 62 | if (Em.isEmpty(this.get('content'))) { 63 | this.animateProjectSiteInput(); 64 | } 65 | } 66 | }); 67 | -------------------------------------------------------------------------------- /slybot/slybot/item.py: -------------------------------------------------------------------------------- 1 | import hashlib 2 | from collections import defaultdict 3 | 4 | from scrapy.item import DictItem, Field 5 | from scrapely.descriptor import ItemDescriptor, FieldDescriptor 6 | 7 | from slybot.fieldtypes import FieldTypeManager 8 | 9 | class SlybotItem(DictItem): 10 | # like DictItem.__setitem__ but doesn't check the field is declared 11 | def __setitem__(self, name, value): 12 | self._values[name] = value 13 | @classmethod 14 | def create_iblitem_class(cls, schema): 15 | class IblItem(cls): 16 | fields = defaultdict(dict) 17 | version_fields = [] 18 | for _name, _meta 
in schema['fields'].items(): 19 | fields[_name] = Field(_meta) 20 | if not _meta.get("vary", False): 21 | version_fields.append(_name) 22 | version_fields = sorted(version_fields) 23 | return IblItem 24 | 25 | def create_slybot_item_descriptor(schema): 26 | field_type_manager = FieldTypeManager() 27 | descriptors = [] 28 | for pname, pdict in schema['fields'].items(): 29 | required = pdict['required'] 30 | pclass = field_type_manager.type_processor_class(pdict['type']) 31 | processor = pclass() 32 | descriptor = SlybotFieldDescriptor(pname, pname, processor, required) 33 | descriptors.append(descriptor) 34 | return ItemDescriptor("", "", descriptors) 35 | 36 | class SlybotFieldDescriptor(FieldDescriptor): 37 | """Extends the scrapely field descriptor to use slybot fieldtypes and 38 | to be created from a slybot item schema 39 | """ 40 | 41 | def __init__(self, name, description, field_type_processor, required=False): 42 | """Create a new SlybotFieldDescriptor with the given name and description. 43 | The field_type_processor is used for extraction and is publicly available 44 | """ 45 | FieldDescriptor.__init__(self, name, description, 46 | field_type_processor.extract, required) 47 | # add an adapt method 48 | self.adapt = field_type_processor.adapt 49 | 50 | def create_item_version(item): 51 | """Item version based on hashlib.sha1 algorithm""" 52 | if not item.version_fields: 53 | return 54 | _hash = hashlib.sha1() 55 | for attrname in item.version_fields: 56 | _hash.update(repr(item.get(attrname))) 57 | return _hash.digest() 58 | 59 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | portia 2 | ====== 3 | Visual scraping for Scrapy. 4 | 5 | 6 | Overview 7 | ======== 8 | 9 | Portia is a tool for visually scraping web sites without any programming knowledge. Just annotate web pages with a point and click editor to indicate what data you want to extract, and portia will learn how to scrape similar pages 10 | from the site. 11 | 12 | Portia has a web based UI served by a [Twisted] server, so you can install it on almost any modern platform. 13 | 14 | Requirements 15 | ============ 16 | 17 | * Python 2.7 18 | * Works on Linux, Windows, Mac OSX, BSD 19 | * Supported browsers: Latest versions of Chrome (recommended) or Firefox 20 | 21 | 22 | Repository structure 23 | ==================== 24 | 25 | There are two main components in this repository, __slyd__ and __slybot__: 26 | 27 | ###slyd 28 | 29 | The visual editor used to create your scraping projects. 30 | 31 | ###slybot 32 | 33 | The Python web crawler that performs the actual site scraping. It's implemented on top of the [Scrapy] web crawling 34 | framework and the [Scrapely] extraction library. It uses projects created with __slyd__ as input. 35 | 36 | 37 | How to install portia 38 | ============================= 39 | 40 | The recommended way to install dependencies is to use __virtualenv__ and then do: 41 | 42 | cd slyd 43 | pip install -r requirements.txt 44 | 45 | As __slybot__ is a __slyd__ dependency, it will also get installed. 46 | 47 | Running portia 48 | ============== 49 | 50 | First, you need to start the ui and create a project. Run __slyd__ using: 51 | 52 | cd slyd 53 | twistd -n slyd 54 | 55 | and point your browser to: `http://localhost:9001/static/main.html` 56 | 57 | Choose the site you want to scrape and create a project. 
Every project is created with a default spider named after the domain of the site you are scraping. When you are ready, you can run your project with __slybot__ to do the actual crawling/extraction. 58 | 59 | Projects created with __slyd__ can be found at: 60 | 61 | slyd/data/projects 62 | 63 | To run one of those projects use: 64 | 65 | portiacrawl project_path spidername 66 | 67 | Where `spidername` should be one of the project spiders. If you don't remember the name of the spider, just use: 68 | 69 | portiacrawl project_path 70 | 71 | and you will get the list of spiders for that project. 72 | 73 | 74 | [Twisted]: https://twistedmatrix.com 75 | [Scrapely]: https://github.com/scrapy/scrapely 76 | [Scrapy]: http://scrapy.org 77 | -------------------------------------------------------------------------------- /slybot/slybot/tests/test_schema_validation.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | from unittest import TestCase 4 | from os.path import dirname, join 5 | 6 | from slybot.validation.schema import get_schema_validator, \ 7 | ValidationError, validate_project_schema 8 | from slybot.utils import open_project_from_dir 9 | 10 | _TEST_PROJECT_DIR = join(dirname(__file__), "data/SampleProject") 11 | 12 | class JsonSchemaTest(TestCase): 13 | 14 | def assertRaisesRegexp(self, eclass, pattern, func, *args): 15 | """assertRaisesRegexp is not provided in python versions below 2.7""" 16 | try: 17 | func(*args) 18 | except eclass, e: 19 | m = re.search(pattern, e.message) 20 | if not m: 21 | raise AssertionError('"%s" does not match "%s"' % (pattern, e.message)) 22 | else: 23 | raise AssertionError("%s not raised" % eclass.__name__) 24 | 25 | def test_regex_formatting_wrong(self): 26 | obj = { 27 | "0": { 28 | "regular_expression": "Item: (\d+" 29 | } 30 | } 31 | validator = get_schema_validator("extractors") 32 | self.assertRaisesRegexp(ValidationError, "Invalid regular expression", 33 | validator.validate, obj) 34 | 35 | def test_regex_formatting_ok(self): 36 | obj = { 37 | "0": { 38 | "regular_expression": "Item: (\d+)" 39 | } 40 | } 41 | validator = get_schema_validator("extractors") 42 | self.assertEqual(validator.validate(obj), None) 43 | 44 | def test_valid_url(self): 45 | obj = { 46 | "start_urls": ['http://www.example.com/'], 47 | "links_to_follow": "none", 48 | "respect_nofollow": True, 49 | "templates": [], 50 | } 51 | validator = get_schema_validator("spider") 52 | self.assertEqual(validator.validate(obj), None) 53 | 54 | def test_invalid_url(self): 55 | obj = { 56 | "start_urls": ['www.example.com'], 57 | "links_to_follow": "none", 58 | "respect_nofollow": True, 59 | "templates": [], 60 | } 61 | validator = get_schema_validator("spider") 62 | self.assertRaisesRegexp(ValidationError, "Invalid url:", validator.validate, obj) 63 | 64 | def test_test_project(self): 65 | specs = open_project_from_dir(_TEST_PROJECT_DIR) 66 | self.assertTrue(validate_project_schema(specs)) 67 | 68 | -------------------------------------------------------------------------------- /slyd/tests/test_bot.py: -------------------------------------------------------------------------------- 1 | import json 2 | from os.path import join 3 | from twisted.trial import unittest 4 | from twisted.internet.defer import inlineCallbacks 5 | from twisted.web.server import Site 6 | from twisted.web.static import File 7 | from twisted.internet import reactor 8 | from slyd.bot import create_bot_resource 9 | from .utils import TestSite, test_spec_manager 10 | from 
.settings import RESOURCE_DIR 11 | 12 | 13 | class BotTest(unittest.TestCase): 14 | def setUp(self): 15 | # configure bot resource 16 | sm = test_spec_manager() 17 | self.bot_resource = create_bot_resource(sm) 18 | self.botsite = TestSite(self.bot_resource) 19 | 20 | # configure fake website to crawl 21 | docroot = join(RESOURCE_DIR, 'docroot') 22 | factory = Site(File(docroot)) 23 | self.listen_port = reactor.listenTCP(8997, factory) 24 | 25 | 26 | def _fetch(self, url, **params): 27 | req = dict(params) 28 | req.setdefault('request', {})['url'] = url 29 | request_json = json.dumps(req) 30 | return self.botsite.post('fetch', data=request_json) 31 | 32 | @inlineCallbacks 33 | def test_fetch(self): 34 | # test status code 35 | result = yield self._fetch("http://localhost:8997/notexists") 36 | self.assertEqual(result.responseCode, 200) 37 | status = json.loads(result.value())['response']['status'] 38 | self.assertEqual(status, 404) 39 | 40 | # get an existing file 41 | test_url = "http://localhost:8997/test.html" 42 | result = yield self._fetch(test_url) 43 | self.assertEqual(result.responseCode, 200) 44 | value = json.loads(result.value()) 45 | # expect 200 response and base href added 46 | self.assertEqual(value['response']['status'], 200) 47 | self.assertIn('>> extractor = create_regex_extractor("(\d+).*(\.\d+)") 13 | >>> extractor(u"The price of this product is
45
.50 pounds") 14 | u'45.50' 15 | """ 16 | ereg = re.compile(pattern, re.S) 17 | def _extractor(txt): 18 | m = ereg.search(txt) 19 | if m: 20 | return htmlregion(u"".join(filter(None, m.groups() or m.group()))) 21 | 22 | _extractor.__name__ = "Regex: %s" % pattern.encode("utf-8") 23 | return _extractor 24 | 25 | class PipelineExtractor: 26 | def __init__(self, *extractors): 27 | self.extractors = extractors 28 | 29 | def __call__(self, value): 30 | for extractor in self.extractors: 31 | value = extractor(value) if value else value 32 | return value 33 | 34 | @property 35 | def __name__(self): 36 | return repr(self.extractors) 37 | 38 | 39 | def apply_extractors(descriptor, template_extractors, extractors): 40 | field_type_manager = FieldTypeManager() 41 | 42 | for field_name, field_extractors in template_extractors.items(): 43 | equeue = [] 44 | for eid in field_extractors: 45 | extractor_doc = extractors[eid] 46 | if "regular_expression" in extractor_doc: 47 | equeue.append(create_regex_extractor(extractor_doc["regular_expression"])) 48 | elif "type_extractor" in extractor_doc: # overrides default one 49 | descriptor.attribute_map[field_name] = SlybotFieldDescriptor(field_name, 50 | field_name, field_type_manager.type_processor_class(extractor_doc["type_extractor"])()) 51 | if not field_name in descriptor.attribute_map: 52 | # if not defined type extractor, use text type by default, as it is by far the most commonly used 53 | descriptor.attribute_map[field_name] = SlybotFieldDescriptor(field_name, 54 | field_name, field_type_manager.type_processor_class("text")()) 55 | 56 | if equeue: 57 | equeue.insert(0, descriptor.attribute_map[field_name].extractor) 58 | descriptor.attribute_map[field_name].extractor = PipelineExtractor(*equeue) 59 | 60 | -------------------------------------------------------------------------------- /slybot/slybot/fieldtypes/text.py: -------------------------------------------------------------------------------- 1 | """ 2 | Text types 3 | """ 4 | from scrapely.extractors import text as extract_text, safehtml 5 | 6 | class _BaseTextProcessor(object): 7 | """basic text processor, defines identity functions, some of which 8 | are overridden in subclasses 9 | """ 10 | def extract(self, text): 11 | """Matches and extracts any string, as it is""" 12 | return text 13 | 14 | def adapt(self, text, htmlpage): 15 | return text 16 | 17 | class RawFieldTypeProcessor(_BaseTextProcessor): 18 | """Extracts the raw data, without processing. Data is escaped for presentation 19 | 20 | >>> from scrapely.extractors import htmlregion 21 | >>> r = RawFieldTypeProcessor() 22 | >>> html = htmlregion(u'

test

') 23 | >>> r.extract(html) 24 | u'

test

' 25 | >>> r.adapt(html, None) 26 | u'

test

' 27 | """ 28 | name = 'raw html' 29 | description = 'raw html as it appears in the page' 30 | 31 | class TextFieldTypeProcessor(_BaseTextProcessor): 32 | """Extracts strings, removing all HTML markup 33 | 34 | >>> from scrapely.extractors import htmlregion 35 | >>> p = TextFieldTypeProcessor() 36 | >>> html = htmlregion(u'

test

!') 37 | >>> extracted = p.extract(html) 38 | >>> extracted 39 | u'test !' 40 | >>> p.adapt(extracted, None) 41 | u'test !' 42 | >>> html = htmlregion(u'

 \\n

') 43 | >>> p.extract(html) 44 | u'' 45 | """ 46 | name = 'text' 47 | description = 'extracts text from web pages, cleaning all markup' 48 | 49 | def extract(self, htmlregion): 50 | return extract_text(htmlregion.text_content) 51 | 52 | 53 | class SafeHtmlFieldTypeProcessor(_BaseTextProcessor): 54 | """Extracts strings, with only a safe subset of HTML remaining 55 | 56 | Extraction checks for presence of text content, and adapt transforms the HTML 57 | >>> from scrapely.extractors import htmlregion 58 | >>> p = SafeHtmlFieldTypeProcessor() 59 | >>> html = htmlregion(u'

test

foo') 60 | >>> p.extract(html) 61 | u'

test

foo' 62 | >>> p.adapt(html) 63 | u'

test

foo' 64 | 65 | html without text must not be extracted 66 | >>> html = htmlregion(u'
') 67 | 68 | """ 69 | name = 'safe html' 70 | description = 'removes all but a small subset of html tags' 71 | def extract(self, htmlregion): 72 | if extract_text(htmlregion.text_content): 73 | return htmlregion 74 | 75 | def adapt(self, text, htmlpage=None): 76 | """Remove html markup""" 77 | return safehtml(text) 78 | 79 | -------------------------------------------------------------------------------- /slyd/media/tests/integration_test_helper.js: -------------------------------------------------------------------------------- 1 | document.write('
'); 2 | 3 | ASTool.rootElement = '#ember-testing'; 4 | ASTool.setupForTesting(); 5 | 6 | Ember.Test.registerAsyncHelper('iframeClick', function(app, selector, context) { 7 | var $el = $('#scraped-doc-iframe').contents().find(selector); 8 | Ember.run(function() { 9 | $el.mouseup(); 10 | }); 11 | }); 12 | 13 | Ember.Test.registerAsyncHelper('focus', function(app, selector, context) { 14 | var $el = $(selector); 15 | Ember.run(function() { 16 | $el.focus(); 17 | }); 18 | }); 19 | 20 | Ember.Test.registerAsyncHelper('blur', function(app, selector, context) { 21 | var $el = $(selector); 22 | Ember.run(function() { 23 | $el.blur(); 24 | }); 25 | }); 26 | 27 | Ember.Test.registerAsyncHelper('iframeClick', function(app, selector, context) { 28 | var $el =$('#scraped-doc-iframe').contents().find(selector); 29 | Ember.run(function() { 30 | $el.mouseup(); 31 | }); 32 | }); 33 | 34 | Ember.Test.registerAsyncHelper('sleep', function(app, value) { 35 | return Ember.Test.promise(function(resolve) { 36 | Ember.Test.adapter.asyncStart(); 37 | setTimeout(function() { 38 | Ember.Test.adapter.asyncEnd(); 39 | Em.run(null, resolve, value); 40 | }, 250); 41 | }); 42 | }); 43 | 44 | 45 | ASTool.injectTestHelpers(); 46 | 47 | function exists(selector) { 48 | return !!find(selector).length; 49 | } 50 | 51 | function hasAnnotation(annotatedDoc, attribute, field) { 52 | var pattern = "annotations":{"" + attribute + "":"" + field + ""}"; 53 | 54 | return annotatedDoc.indexOf(pattern) != -1; 55 | } 56 | 57 | /* Define Function.prototype.bind in case the installed version of 58 | PhantomJS does not implement it. 59 | Source: https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/Function/bind#Compatibility 60 | */ 61 | if (!Function.prototype.bind) { 62 | Function.prototype.bind = function (oThis) { 63 | if (typeof this !== "function") { 64 | // closest thing possible to the ECMAScript 5 internal IsCallable function 65 | throw new TypeError("Function.prototype.bind - what is trying to be bound is not callable"); 66 | } 67 | 68 | var aArgs = Array.prototype.slice.call(arguments, 1), 69 | fToBind = this, 70 | fNOP = function () {}, 71 | fBound = function () { 72 | return fToBind.apply(this instanceof fNOP && oThis 73 | ? this 74 | : oThis, 75 | aArgs.concat(Array.prototype.slice.call(arguments))); 76 | }; 77 | 78 | fNOP.prototype = this.prototype; 79 | fBound.prototype = new fNOP(); 80 | 81 | return fBound; 82 | }; 83 | } 84 | 85 | -------------------------------------------------------------------------------- /slybot/docs/index.rst: -------------------------------------------------------------------------------- 1 | ============================== 2 | Slybot |version| documentation 3 | ============================== 4 | 5 | Slybot is a Python web crawler for doing web scraping. It's implemented on top of the 6 | `Scrapy`_ web crawling framework and the `Scrapely`_ extraction library. 7 | 8 | Requirements 9 | ============ 10 | 11 | * `Scrapy`_ 12 | * `Scrapely`_ 13 | * `loginform`_ 14 | * lxml 15 | 16 | Installation 17 | ============ 18 | 19 | To install the last development version:: 20 | 21 | pip install git+https://github.com/scrapy/slybot.git 22 | 23 | To install the last stable version:: 24 | 25 | pip install slybot 26 | 27 | Quick Usage 28 | =========== 29 | 30 | Create a directory called ``slybot-project``, place your slybot project (JSON 31 | files) there. 
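A minimal sketch of the expected layout (these are exactly the files read by ``open_project_from_dir`` in ``slybot/utils.py``; the spider file name is only an example)::

    slybot-project/
        project.json
        items.json
        extractors.json
        spiders/
            example.com.json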
32 | 33 | To get a list of all spiders:: 34 | 35 | slybot list 36 | 37 | To run a specific spider:: 38 | 39 | slybot crawl <spider> 40 | 41 | For those familiar with Scrapy, ``slybot`` is a thin wrapper for the ``scrapy`` 42 | command that just instructs Scrapy to use Slybot settings 43 | (``slybot.settings``). All commands and arguments supported by the ``scrapy`` 44 | command are also supported by the ``slybot`` command, although a few of them 45 | don't apply. 46 | 47 | Slybot projects 48 | =============== 49 | 50 | Slybot projects are configured through a collection of JSON files which are 51 | documented in :doc:`project`. 52 | 53 | Example project 54 | =============== 55 | 56 | There is a working slybot project example in `slybot/tests/data/SampleProject`_ 57 | that is used for tests and hence uses most of the available features. 58 | 59 | User interfaces 60 | =============== 61 | 62 | Here is a list of known UIs to create Slybot templates: 63 | 64 | * `Scrapely tool`_ (command line interface) 65 | * `Scrapinghub Autoscraping`_ (visual, web-based) 66 | 67 | Advanced configuration 68 | ====================== 69 | 70 | Slybot is a `Scrapy`_ project, so it can be tuned and configured using `Scrapy 71 | settings`_. For more information see: :doc:`config`. 72 | 73 | Spiderlets 74 | ========== 75 | 76 | The behaviour of Slybot spiders can also be tuned with small (Python) code 77 | snippets called spiderlets. For more information see :doc:`spiderlets`. 78 | 79 | Table of contents 80 | ================= 81 | 82 | .. toctree:: 83 | :maxdepth: 2 84 | 85 | project 86 | config 87 | spiderlets 88 | 89 | .. _Scrapy: https://github.com/scrapy/scrapy 90 | .. _Scrapely: https://github.com/scrapy/scrapely 91 | .. _loginform: https://github.com/scrapy/loginform 92 | .. _slybot/tests/data/SampleProject: https://github.com/scrapy/slybot/tree/master/slybot/tests/data/SampleProject 93 | .. _Scrapy settings: http://doc.scrapy.org/en/latest/topics/settings.html 94 | .. _Scrapely tool: https://github.com/scrapy/scrapely#usage-command-line-tool 95 | .. _Scrapinghub Autoscraping: http://scrapinghub.com/autoscraping.html 96 | -------------------------------------------------------------------------------- /slyd/media/js/jqplugins.js: -------------------------------------------------------------------------------- 1 | jQuery.fn.getUniquePath = function () { 2 | if (this.length != 1) { 3 | throw 'Requires one element.'; 4 | } 5 | var path, node = this; 6 | while (node.length) { 7 | var realNode = node[0], name = realNode.localName; 8 | if (!name) { 9 | break; 10 | } 11 | name = name.toLowerCase(); 12 | var parent = node.parent(); 13 | var siblings = parent.children(name); 14 | if (siblings.length > 1) { 15 | name += ':eq(' + siblings.index(realNode) + ')'; 16 | } 17 | path = name + (path ?
'>' + path : ''); 18 | node = parent; 19 | } 20 | return path; 21 | }; 22 | 23 | jQuery.fn.getPath = function() { 24 | var path = [this.prop('tagName')]; 25 | this.parents().not('html').each(function() { 26 | var entry = this.tagName.toLowerCase(); 27 | path.push(entry); 28 | }); 29 | return path.reverse().join(' > '); 30 | }; 31 | 32 | jQuery.fn.getAttributeList = function() { 33 | var attributeList = []; 34 | if (this.text()) { 35 | attributeList.push(ASTool.Attribute.create({ 36 | name: 'content', 37 | value: this.text()})); 38 | } 39 | var element = this.get(0); 40 | $(element.attributes).each(function() { 41 | if ($.inArray(this.nodeName, jQuery.fn.getAttributeList.ignoredAttributes) == -1 && 42 | this.nodeValue) { 43 | attributeList.push(ASTool.Attribute.create({ 44 | name: this.nodeName, 45 | value: this.nodeValue})); 46 | } 47 | }) 48 | return attributeList; 49 | }; 50 | 51 | jQuery.fn.getAttributeList.ignoredAttributes = ['id', 'class', 52 | 'width', 'style', 'height', 'cellpadding', 53 | 'cellspacing', 'border', 'bgcolor', 'color', 'colspan', 54 | 'data-scrapy-annotate']; 55 | 56 | jQuery.fn.boundingBox = function() { 57 | var rect = {}; 58 | rect.left = this.offset().left; 59 | rect.top = this.offset().top; 60 | rect.width = this.outerWidth(); 61 | rect.height = this.outerHeight(); 62 | return rect; 63 | }; 64 | 65 | jQuery.fn.isDescendant = function(parent) { 66 | return $(parent).find(this).length > 0; 67 | }; 68 | 69 | jQuery.fn.findAnnotatedElements = function() { 70 | return this.find('[data-scrapy-annotate]'); 71 | }; 72 | 73 | jQuery.fn.findAnnotatedElement = function(annotationId) { 74 | var selector = '[data-scrapy-annotate*="' + annotationId + '"]'; 75 | return this.find(selector); 76 | }; 77 | 78 | jQuery.fn.findIgnoredElements = function(annotationId) { 79 | var selector; 80 | if (annotationId) { 81 | selector = '[data-scrapy-ignore*="' + annotationId + '"], [data-scrapy-ignore-beneath*="' + annotationId + '"]'; 82 | } else { 83 | selector = '[data-scrapy-ignore], [data-scrapy-ignore-beneath]'; 84 | } 85 | return this.find(selector); 86 | }; 87 | 88 | jQuery.fn.removePartialAnnotation = function() { 89 | // FIXME: this may leave empty text node children. 
90 | var element = this.get(0); 91 | var textNode = element.childNodes[0]; 92 | var parentNode = element.parentNode; 93 | $(textNode).unwrap(); 94 | parentNode.normalize(); 95 | } -------------------------------------------------------------------------------- /slyd/tests/utils.py: -------------------------------------------------------------------------------- 1 | from cStringIO import StringIO 2 | from twisted.internet.defer import succeed 3 | from twisted.web import server 4 | from twisted.web.test.test_web import DummyRequest 5 | from scrapy.settings import CrawlerSettings 6 | from slyd.crawlerspec import CrawlerSpecManager 7 | from slyd.projects import ProjectsResource 8 | import tests.settings as test_settings 9 | 10 | 11 | def test_spec_manager(): 12 | """Create a CrawlerSpecManager configured to use test settings""" 13 | crawler_settings = CrawlerSettings(settings_module=test_settings) 14 | return CrawlerSpecManager(crawler_settings) 15 | 16 | def test_projects_resource(temp_projects_dir): 17 | """Create a ProjectsResource configured to use test settings""" 18 | crawler_settings = CrawlerSettings(settings_module=test_settings) 19 | projects = ProjectsResource(crawler_settings) 20 | projects.projectsdir = temp_projects_dir 21 | return projects 22 | 23 | 24 | class _SlydDummyRequest(DummyRequest): 25 | def __init__(self, method, url, project='test', data=None, args=None, headers=None): 26 | DummyRequest.__init__(self, url.split('/')) 27 | if data is not None: 28 | self.content = StringIO(data) 29 | if project is not None: 30 | self.project = project 31 | self.method = method 32 | self.headers.update(headers or {}) 33 | # set args 34 | args = args or {} 35 | for k, v in args.items(): 36 | self.addArg(k, v) 37 | 38 | def value(self): 39 | return "".join(self.written) 40 | 41 | 42 | class TestSite(server.Site): 43 | """A Site used for test_settings 44 | 45 | Adds some convenience methods for GET and POST and result 46 | capture 47 | """ 48 | 49 | def __init__(self, resource, project='test'): 50 | server.Site.__init__(self, resource) 51 | self.project = project 52 | 53 | def get(self, url, args=None, headers=None): 54 | return self._request("GET", url, args, headers, None) 55 | 56 | def post(self, url, data, args=None, headers=None): 57 | return self._request("POST", url, args, headers, data) 58 | 59 | def _request(self, method, url, args, headers, data): 60 | request = _SlydDummyRequest(method, url, self.project, 61 | data, args, headers) 62 | resource = self.getResourceFor(request) 63 | result = resource.render(request) 64 | return self._resolveResult(request, result) 65 | 66 | def _resolveResult(self, request, result): 67 | if isinstance(result, str): 68 | request.write(result) 69 | request.finish() 70 | return succeed(request) 71 | elif result is server.NOT_DONE_YET: 72 | if request.finished: 73 | return succeed(request) 74 | else: 75 | return request.notifyFinish().addCallback(lambda _: request) 76 | else: 77 | raise ValueError("Unexpected return value: %r" % (result,)) 78 | -------------------------------------------------------------------------------- /slybot/slybot/generic_form.py: -------------------------------------------------------------------------------- 1 | import re 2 | import itertools 3 | from lxml import html 4 | 5 | from scrapy.http.request.form import _get_inputs 6 | 7 | class GenericForm: 8 | 9 | def __init__(self, **kwargs): 10 | self.kwargs = kwargs 11 | 12 | def _pick_node(self, doc, selector): 13 | nodes = doc.xpath(selector['xpath']) 14 | if nodes: 15 | 
return nodes[0] 16 | 17 | def _filter_by_regex(self, lines, regex): 18 | search_regex = re.compile(regex).search 19 | return [l for l in lines if search_regex(l)] 20 | 21 | def _get_field_values(self, form, field_descriptor): 22 | if 'name' in field_descriptor: 23 | field_name = field_descriptor['name'] 24 | else: 25 | select_field = self._pick_node(form, field_descriptor) 26 | field_name = select_field.name 27 | 28 | field_type = field_descriptor['type'] 29 | if field_type == 'constants': 30 | return [[field_name, option] for option in self.get_value(field_descriptor)] 31 | elif field_type == 'iterate': 32 | select_field = self._pick_node(form, field_descriptor) 33 | values = self._filter_by_regex(select_field.value_options, 34 | self.get_value(field_descriptor)) 35 | return [[select_field.name, option] for option in values] 36 | elif field_type == 'inurl': 37 | return [[field_name, option] for option in field_descriptor['file_values']] 38 | 39 | def get_value(self, field_descriptor): 40 | values = field_descriptor.get('value', '') 41 | if isinstance(values, list): 42 | return [val.format(**self.kwargs) for val in values] 43 | else: 44 | return values.format(**self.kwargs) 45 | 46 | def set_values_url_field(self, field_descriptor, body): 47 | field_descriptor['file_values'] = body.split('\n') 48 | 49 | def get_url_field(self, form_descriptor): 50 | for i, field_descriptor in enumerate(form_descriptor['fields']): 51 | if (field_descriptor['type'] == 'inurl' 52 | and (not 'file_values' in field_descriptor or 53 | not field_descriptor['file_values'])): 54 | yield i, field_descriptor 55 | 56 | def fill_generic_form(self, url, body, form_descriptor): 57 | 58 | doc = html.document_fromstring(body, base_url=url) 59 | form = self._pick_node(doc, form_descriptor) 60 | if form is None: 61 | raise Exception('Generic form not found') 62 | 63 | # Get all the possible inputs for each field 64 | values = [self._get_field_values(form, field) 65 | for field in form_descriptor['fields']] 66 | 67 | for params in itertools.product(*values): 68 | form_values = dict(_get_inputs(form, None, False, None, None)) 69 | for name, option in params: 70 | form_values[name] = option 71 | yield form_values.items(), form.action or form.base_url, form.method 72 | -------------------------------------------------------------------------------- /slyd/tests/test_projects.py: -------------------------------------------------------------------------------- 1 | import json 2 | import unittest 3 | from tempfile import mkdtemp 4 | from os.path import join, exists 5 | from shutil import rmtree 6 | from scrapy.tests.mockserver import Status 7 | from twisted.internet.defer import inlineCallbacks 8 | from twisted.web.resource import NoResource, Resource 9 | from .utils import TestSite, test_projects_resource 10 | from .settings import DATA_DIR 11 | 12 | 13 | class ProjectsTest(unittest.TestCase): 14 | 15 | def setUp(self): 16 | self.temp_projects_dir = mkdtemp(dir=DATA_DIR, 17 | prefix='test-run-') 18 | root = Resource() 19 | projects = test_projects_resource(self.temp_projects_dir) 20 | root.putChild('projects', projects) 21 | projects.putChild('status', Status()) 22 | self.projectssite = TestSite(root, None) 23 | 24 | def check_project_exists(self, project_name): 25 | self.assertTrue(exists(join(self.temp_projects_dir, project_name))) 26 | self.assertTrue( 27 | exists(join(self.temp_projects_dir, project_name, 'spiders'))) 28 | 29 | def check_project_not_exists(self, project_name): 30 | 
self.assertFalse(exists(join(self.temp_projects_dir, project_name))) 31 | 32 | @inlineCallbacks 33 | def test_childaccess(self): 34 | with self.assertRaises(NoResource): 35 | yield self.projectssite.get("projects/noresource") 36 | with self.assertRaises(NoResource): 37 | yield self.projectssite.get("projects/project/noresource") 38 | yield self.projectssite.get("projects/project/status") 39 | 40 | @inlineCallbacks 41 | def post_command(self, cmd, *args, **kwargs): 42 | obj = {'cmd': cmd, 'args': args} 43 | result = yield self.projectssite.post('projects', data=json.dumps(obj)) 44 | self.assertEqual(result.responseCode, kwargs.get('expect', 200)) 45 | 46 | @inlineCallbacks 47 | def test_list_projects(self): 48 | result = yield self.projectssite.get('projects') 49 | self.assertEqual(json.loads(result.value()), []) 50 | self.post_command('create', 'project1') 51 | self.post_command('create', 'project2') 52 | result = yield self.projectssite.get('projects') 53 | self.assertEqual(json.loads(result.value()), ['project1', 'project2']) 54 | 55 | def test_commands(self): 56 | self.post_command('rm', 'doesnotexist', expect=404) 57 | self.post_command('create', 'project1') 58 | self.check_project_exists('project1') 59 | self.post_command('mv', 'project1', 'project2') 60 | self.check_project_exists('project2') 61 | self.post_command('rm', 'project2') 62 | self.check_project_not_exists('project2') 63 | # Don't allow overwrites when creating or renaming projects 64 | self.post_command('create', 'project1') 65 | self.post_command('create', 'project1', expect=400) 66 | self.post_command('create', 'project2') 67 | self.post_command('mv', 'project1', 'project2', expect=400) 68 | 69 | def tearDown(self): 70 | rmtree(self.temp_projects_dir) 71 | -------------------------------------------------------------------------------- /slybot/slybot/spiderlets.py: -------------------------------------------------------------------------------- 1 | """ 2 | Spider middleware for AS for completing the work made by AS with a "spiderlet" code 3 | 4 | """ 5 | import pkgutil, inspect 6 | 7 | from scrapy.xlib.pydispatch import dispatcher 8 | from scrapy import signals 9 | from scrapy.exceptions import NotConfigured 10 | from scrapy.http import Request 11 | 12 | class DefaultSpiderlet(object): 13 | name = None 14 | def __init__(self, spider): 15 | self.spider = spider 16 | def process_request(self, request, response): 17 | return request 18 | def process_item(self, item, response): 19 | return item 20 | def process_start_request(self, request): 21 | return request 22 | 23 | def parse_login_page(self, response): 24 | return self.spider.parse_login_page(response) 25 | 26 | def list_spiderlets(spiderlets_module_path): 27 | spiderlets_module = __import__(spiderlets_module_path, {}, {}, ['']) 28 | seen_classes = set() 29 | for _, mname, _ in pkgutil.iter_modules(spiderlets_module.__path__): 30 | module = __import__(".".join([spiderlets_module_path, mname]), {}, {}, ['']) 31 | for cls in [c for c in vars(module).itervalues() if inspect.isclass(c)]: 32 | if cls in seen_classes: 33 | continue 34 | seen_classes.add(cls) 35 | name = getattr(cls, 'name', None) 36 | if name: 37 | yield cls 38 | 39 | def _load_spiderlet(spiderlets_module_path, spider): 40 | for cls in list_spiderlets(spiderlets_module_path): 41 | if cls.name == spider.name: 42 | class _spiderlet_cls(cls, DefaultSpiderlet): 43 | pass 44 | spider.log("SpiderletMiddleware: loaded %s" % _spiderlet_cls.name) 45 | return _spiderlet_cls(spider) 46 | return DefaultSpiderlet(spider) 
47 | 48 | class SpiderletsMiddleware(object): 49 | @classmethod 50 | def from_crawler(cls, crawler): 51 | return cls(crawler.settings) 52 | 53 | def __init__(self, settings): 54 | self.annotating = "annotating" in settings.getlist('SHUB_JOB_TAGS') 55 | self.spiderlets_module_path = settings["SPIDERLETS_MODULE"] 56 | if not self.spiderlets_module_path: 57 | raise NotConfigured 58 | dispatcher.connect(self.spider_opened, signals.spider_opened) 59 | 60 | def spider_opened(self, spider): 61 | self.spiderlet = _load_spiderlet(self.spiderlets_module_path, spider) 62 | 63 | def process_spider_output(self, response, result, spider): 64 | for item_or_request in result: 65 | if isinstance(item_or_request, Request): 66 | yield self.spiderlet.process_request(item_or_request, response) 67 | else: 68 | yield self.spiderlet.process_item(item_or_request, response) 69 | 70 | def process_start_requests(self, start_requests, spider): 71 | for request in start_requests: 72 | if request.callback == spider.parse_login_page: 73 | request.callback = self.spiderlet.parse_login_page 74 | yield self.spiderlet.process_start_request(request) 75 | 76 | -------------------------------------------------------------------------------- /slyd/media/main.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 72 | 73 | 74 | -------------------------------------------------------------------------------- /slybot/slybot/tests/test_baseurl.py: -------------------------------------------------------------------------------- 1 | """ 2 | Tests for apply_annotations 3 | """ 4 | 5 | from unittest import TestCase 6 | from slybot.baseurl import insert_base_url, get_base_url 7 | from scrapely.htmlpage import HtmlPage 8 | 9 | class TestApplyAnnotations(TestCase): 10 | def test_insert_base_relative(self): 11 | """Replace relative base href""" 12 | html_in = '' 13 | html_target = '\ 14 | ' 15 | html_out = insert_base_url(html_in, "http://localhost:8000/") 16 | self.assertEqual(html_out, html_target) 17 | 18 | def test_insert_base_noreplace(self): 19 | """base tag dont need to be replaced""" 20 | html_in = html_target = '' 21 | html_out = insert_base_url(html_in, "http://localhost:8000/users/blog.html") 22 | self.assertEqual(html_out, html_target) 23 | 24 | def test_insert_base_addbase(self): 25 | """add base tag when not present""" 26 | html_in = '\ 27 | ' 28 | html_target = '\ 29 | \ 30 | ' 31 | html_out = insert_base_url(html_in, "http://localhost:8000/") 32 | self.assertEqual(html_out, html_target) 33 | 34 | def test_insert_base_commented(self): 35 | """Test weird case when base tag is commented in origin""" 36 | html_in = '\ 37 | Body' 38 | html_target = '\ 39 | Body' 40 | html_out = insert_base_url(html_in, "http://example.com/") 41 | self.assertEqual(html_out, html_target) 42 | 43 | def test_insert_base_nohead(self): 44 | """Test base insert when no head element is present""" 45 | html_in = 'Body' 46 | html_target = '\n\ 47 | \n\ 48 | Body' 49 | html_out = insert_base_url(html_in, "http://localhost:8000/") 50 | self.assertEqual(html_out, html_target) 51 | 52 | def test_get_base_url(self): 53 | """Basic get_base_url test""" 54 | html = u'\ 55 | ' 56 | page = HtmlPage("http://example.com/products/p19.html", body=html) 57 | self.assertEqual(get_base_url(page), "http://example.com/products/") 58 | 59 | def 
test_get_base_url_nobase(self): 60 | """Base tag does not exists""" 61 | html = u'' 62 | page = HtmlPage("http://example.com/products/p19.html", body=html) 63 | self.assertEqual(get_base_url(page), "http://example.com/products/p19.html") 64 | 65 | def test_get_base_url_empty_basehref(self): 66 | """Base tag exists but href is empty""" 67 | html = u'\ 68 | ' 69 | url = "http://example.com/products/p19.html" 70 | page = HtmlPage(url, body=html) 71 | self.assertEqual(get_base_url(page), url) 72 | 73 | 74 | -------------------------------------------------------------------------------- /slybot/slybot/validation/schemas.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "id": "project", 4 | "type": "object", 5 | "properties": { 6 | "name": {"type": "string", "required": true}, 7 | "version": {"type": "string", "required": true}, 8 | "comment": {"type": "string"} 9 | } 10 | }, 11 | { 12 | "id": "items", 13 | "additionalProperties": { 14 | "$ref": "item" 15 | } 16 | }, 17 | { 18 | "id": "item", 19 | "type": "object", 20 | "properties": { 21 | "fields": {"additionalProperties": {"$ref": "field"}, "required": true} 22 | } 23 | }, 24 | { 25 | "id": "field", 26 | "type": "object", 27 | "properties": { 28 | "type": {"type": "string", "required": true}, 29 | "required": {"type": "boolean", "required": true}, 30 | "vary": {"type": "boolean", "required": true} 31 | } 32 | }, 33 | { 34 | "id": "extractors", 35 | "additionalProperties": {"$ref": "extractor"} 36 | }, 37 | { 38 | "id": "extractor", 39 | "type": "object", 40 | "properties": { 41 | "type_extractor": {"type": "string"}, 42 | "regular_expression": {"type": "string", "format": "regex"} 43 | } 44 | }, 45 | { 46 | "id": "spider", 47 | "type": "object", 48 | "properties": { 49 | "start_urls": {"type": "array", "items": {"type": "string", "format": "url"}, "required": true}, 50 | "links_to_follow": {"type": "string", "enum": ["none", "patterns"], "required": true}, 51 | "follow_patterns": {"type": "array", "items": {"type": "string", "format": "regex"}}, 52 | "exclude_patterns": {"type": "array", "items": {"type": "string", "format": "regex"}}, 53 | "respect_nofollow": {"type": "boolean", "required": true}, 54 | "allowed_domains": {"type": "array", "items": {"type": "string"}}, 55 | "templates": {"type": "array", "items": {"$ref": "template"}, "required": true}, 56 | "init_requests": {"type": "array", "items": {"$ref": "request"}} 57 | } 58 | }, 59 | { 60 | "id": "template", 61 | "type": "object", 62 | "properties": { 63 | "page_id": {"type": "string", "required": true}, 64 | "page_type": {"type": "string", "enum": ["links", "item"], "required": true}, 65 | "scrapes": {"type": "string", "required": true}, 66 | "url": {"type": "string", "format": "url", "required": true}, 67 | "extractors": {"additionalProperties": {"type": "array", "items": {"type": "string"}}, "required": true}, 68 | "annotated_body": {"type": "string", "required": true}, 69 | "original_body": {"type": "string", "required": true} 70 | } 71 | }, 72 | { 73 | "id": "link_extractor", 74 | "type": "object", 75 | "properties": { 76 | "type": {"type": "string", "required": true}, 77 | "value": {"type": "any", "required": true}, 78 | "additionalProperties": {"type": "any"} 79 | } 80 | }, 81 | { 82 | "id": "request", 83 | "type": "object", 84 | "properties": { 85 | "type": {"type": "string", "required": true}, 86 | "link_extractor": {"$ref": "link_extractor"}, 87 | "additionalProperties": {"type": "any"} 88 | } 89 | } 90 | ] 91 | 
-------------------------------------------------------------------------------- /slyd/media/js/templates/topbar-browse.handlebars: -------------------------------------------------------------------------------- 1 |
2 | 5 | 6 |
7 | 8 | {{view ASTool.ButtonView action="browseBack" icon="ui-icon-arrowthick-1-w" disabled=browseBackDisabled}} 9 | 10 | 11 | {{view ASTool.ButtonView action="reload" icon="ui-icon-arrowrefresh-1-e" disabled=reloadDisabled}} 12 | 13 | 14 | 15 | {{#view ASTool.LabelWithTooltip title=controller.currentUrl}} 16 | {{controller.currentUrl}} 17 | {{/view}} 18 | 19 | 20 | {{#unless addTemplateDisabled}} 21 | 22 | {{view ASTool.ButtonView action="addTemplate" label="Annotate this page" class="red-button"}} 23 | 24 | {{/unless}} 25 | 26 | {{#unless showItemsDisabled}} 27 | 28 | {{view ASTool.ButtonView action="toggleShowItems" label=itemsButtonLabel class="black-button"}} 29 | 30 | {{/unless}} 31 | 32 | {{#if showNoItemsExtracted}} 33 | 34 | No items extracted 35 | 36 | {{/if}} 37 | 38 | {{#if saving}} 39 | 40 | Saving spider... 41 | 42 | {{/if}} 43 | 44 | 45 | {{#if showItems}} 46 | {{#if controller.extractedItems.length}} 47 |
48 | 49 | {{view ASTool.ButtonView action="toggleShowItems" class="clear-button" icon="ui-icon-closethick"}} 50 | 51 |

Displaying {{controller.extractedItems.length}} extracted items

52 |
53 | {{#each controller.extractedItems}} 54 |
55 | {{#view ASTool.ExtractedItemView extractedItem=this}} 56 | {{#each view.textFields}} 57 |
58 | {{this.name}}: 59 | {{this.value}} 60 |
61 | {{/each}} 62 | {{#each view.imageFields}} 63 |
64 | {{this.name}}: 65 | {{#each this.value}} 66 | {{this}} 67 |
{{view ASTool.ImageView src=this width="200px"}}
68 | {{/each}} 69 |
70 | {{/each}} 71 | {{#if view.variants}} 72 |

Item variants

73 | {{#each view.variants}} 74 |
75 | {{#each fields}} 76 |
77 | {{this.name}}: 78 | {{this.value}} 79 |
80 | {{/each}} 81 |
82 | {{/each}} 83 | {{/if}} 84 | {{/view}} 85 |
86 | {{/each}} 87 |
88 |
89 | {{/if}} 90 | {{/if}} 91 |
92 |
93 | -------------------------------------------------------------------------------- /slyd/tests/resources/data/projects/test/items.json: -------------------------------------------------------------------------------- 1 | { 2 | "accomodation": { 3 | "fields": { 4 | "address": { 5 | "required": false, 6 | "type": "text", 7 | "vary": true 8 | }, 9 | "id": { 10 | "required": true, 11 | "type": "text", 12 | "vary": false 13 | }, 14 | "name": { 15 | "required": true, 16 | "type": "text", 17 | "vary": true 18 | }, 19 | "price": { 20 | "required": true, 21 | "type": "text", 22 | "vary": true 23 | }, 24 | "services": { 25 | "required": false, 26 | "type": "text", 27 | "vary": true 28 | }, 29 | "url": { 30 | "required": false, 31 | "type": "url", 32 | "vary": true 33 | } 34 | } 35 | }, 36 | "default": { 37 | "fields": { 38 | "description": { 39 | "required": false, 40 | "type": "safe html", 41 | "vary": false 42 | }, 43 | "image_url": { 44 | "required": false, 45 | "type": "image", 46 | "vary": false 47 | }, 48 | "images": { 49 | "required": false, 50 | "type": "image", 51 | "vary": false 52 | }, 53 | "name": { 54 | "required": false, 55 | "type": "text", 56 | "vary": false 57 | }, 58 | "price": { 59 | "required": false, 60 | "type": "number", 61 | "vary": false 62 | }, 63 | "url": { 64 | "required": false, 65 | "type": "url", 66 | "vary": false 67 | } 68 | } 69 | }, 70 | "pin": { 71 | "fields": { 72 | "image": { 73 | "required": true, 74 | "type": "image", 75 | "vary": true 76 | }, 77 | "name": { 78 | "required": true, 79 | "type": "text", 80 | "vary": false 81 | }, 82 | "url": { 83 | "required": false, 84 | "type": "url", 85 | "vary": true 86 | } 87 | } 88 | }, 89 | "recipe": { 90 | "fields": { 91 | "directions": { 92 | "required": true, 93 | "type": "text", 94 | "vary": true 95 | }, 96 | "id": { 97 | "required": true, 98 | "type": "text", 99 | "vary": false 100 | }, 101 | "image": { 102 | "required": true, 103 | "type": "text", 104 | "vary": true 105 | }, 106 | "ingredients": { 107 | "required": true, 108 | "type": "text", 109 | "vary": true 110 | }, 111 | "name": { 112 | "required": true, 113 | "type": "text", 114 | "vary": true 115 | }, 116 | "url": { 117 | "required": false, 118 | "type": "url", 119 | "vary": true 120 | } 121 | } 122 | } 123 | } -------------------------------------------------------------------------------- /slybot/slybot/linkextractor/base.py: -------------------------------------------------------------------------------- 1 | """ 2 | Link extraction for auto scraping 3 | """ 4 | import re, os, posixpath 5 | from urlparse import urlparse 6 | from scrapy.linkextractor import IGNORED_EXTENSIONS 7 | 8 | _ONCLICK_LINK_RE = re.compile("(?P('|\"))(?P.+?)(?P=sep)") 9 | 10 | _ignored_exts = frozenset(['.' + e for e in IGNORED_EXTENSIONS]) 11 | 12 | # allowed protocols 13 | ALLOWED_SCHEMES = frozenset(['http', 'https', None, '']) 14 | 15 | class BaseLinkExtractor(object): 16 | 17 | def __init__(self, max_url_len=2083, ignore_extensions=_ignored_exts, 18 | allowed_schemes=ALLOWED_SCHEMES): 19 | """Creates a new LinkExtractor 20 | 21 | The defaults are a good guess for the first time crawl. After that, we 22 | expect that they can be learned. 
23 | """ 24 | self.max_url_len = max_url_len 25 | self.ignore_extensions = ignore_extensions 26 | self.allowed_schemes = allowed_schemes 27 | 28 | def _extract_links(self, source): 29 | raise NotImplementedError 30 | 31 | def links_to_follow(self, source): 32 | """Returns normalized extracted links""" 33 | for link in self._extract_links(source): 34 | link = self.normalize_link(link) 35 | if link is not None: 36 | yield link 37 | 38 | def normalize_link(self, link): 39 | """Normalize a link 40 | 41 | >>> from scrapy.link import Link 42 | >>> le = BaseLinkExtractor() 43 | >>> l = Link('http://scrapinghub.com/some/path/../dir') 44 | >>> le.normalize_link(l).url 45 | 'http://scrapinghub.com/some/dir' 46 | >>> l = Link('http://scrapinghub.com/some//./path/') 47 | >>> le.normalize_link(l).url 48 | 'http://scrapinghub.com/some/path/' 49 | 50 | Files with disallowed extentions or protocols are not returned 51 | >>> le.normalize_link(Link('myimage.jpg')) is None 52 | True 53 | >>> le.normalize_link(Link('file:///tmp/mydoc.htm')) is None 54 | True 55 | >>> le.normalize_link(Link('http://scrapinghub.com')).url 56 | 'http://scrapinghub.com/' 57 | 58 | Fragments are removed 59 | >>> le.normalize_link(Link('http://example.com/#something')).url 60 | 'http://example.com/' 61 | >>> le.normalize_link(Link('http://example.com/#something')).fragment 62 | 'something' 63 | >>> le.normalize_link(Link('http://scrapinghub.com#some fragment')).url 64 | 'http://scrapinghub.com/' 65 | 66 | Ajax crawling 67 | >>> le.normalize_link(Link('http://example.com/#!something')).url 68 | 'http://example.com/?_escaped_fragment_=something' 69 | >>> le.normalize_link(Link('http://example.com/page.html?arg=1#!something')).url 70 | 'http://example.com/page.html?arg=1&_escaped_fragment_=something' 71 | """ 72 | if len(link.url) > self.max_url_len: 73 | return 74 | parsed = urlparse(link.url) 75 | extention = os.path.splitext(parsed.path)[1].lower() 76 | if parsed.scheme not in self.allowed_schemes or \ 77 | extention in self.ignore_extensions: 78 | return 79 | # path normalization 80 | path = parsed.path or '/' 81 | path = path if path[0] != '.' else '/' + path 82 | path = posixpath.normpath(path) 83 | if parsed.path.endswith('/') and not path.endswith('/'): 84 | path += '/' 85 | if parsed.fragment.startswith('!'): 86 | query = '_escaped_fragment_=%s' % parsed.fragment[1:] 87 | query = parsed.query + '&' + query if parsed.query else query 88 | parsed = parsed._replace(query=query) 89 | link.fragment = parsed.fragment 90 | if path != parsed.path or parsed.fragment: 91 | link.url = parsed._replace(path=path, fragment='').geturl() 92 | return link 93 | 94 | 95 | -------------------------------------------------------------------------------- /slyd/slyd/html.py: -------------------------------------------------------------------------------- 1 | """ 2 | Removes JavaScript from HTML 3 | 4 | This module removes all existing JavaScript in an HTML document. 
5 | 6 | """ 7 | import re 8 | from scrapely.htmlpage import HtmlTag, HtmlTagType, parse_html 9 | from slybot.utils import htmlpage_from_response 10 | from slybot.baseurl import insert_base_url 11 | from .annotations import add_tagids 12 | from .utils import serialize_tag 13 | 14 | ### Known weaknesses 15 | # Doesn't deal with JS hidden in CSS 16 | # Doesn't deal with meta redirect javascript URIs 17 | 18 | INTRINSIC_EVENT_ATTRIBUTES = ("onload", "onunload", "onclick", "ondblclick", 19 | "onmousedown", "onmouseup", "onmouseover", 20 | "onmousemove", "onmouseout", "onfocus", 21 | "onblur", "onkeypress", "onkeydown", 22 | "onkeyup", "onsubmit", "onreset", "onselect", 23 | "onchange", "onerror", "onbeforeunload") 24 | 25 | URI_ATTRIBUTES = ("action", "background", "cite", "classid", "codebase", 26 | "data", "href", "longdesc", "profile", "src", "usemap") 27 | 28 | AS_SCRIPT_REGION_BEGIN = "" 29 | AS_SCRIPT_REGION_END = "" 30 | 31 | _AS_COMMENT_BEGIN = "" 33 | _ENTITY_RE = re.compile("&#(\d+);") 34 | 35 | 36 | def _deentitize_unicode(mystr): 37 | """replaces all entities in the form &#\d+; by its 38 | unicode equivalent. 39 | """ 40 | return _ENTITY_RE.sub(lambda m: unichr(int(m.groups()[0])), mystr) 41 | 42 | 43 | def html4annotation(htmlpage, baseurl=None): 44 | """Convert the given html document for the annotation UI 45 | 46 | This adds tags, removes scripts and optionally adds a base url 47 | """ 48 | htmlpage = add_tagids(htmlpage) 49 | cleaned_html = descriptify(htmlpage) 50 | if baseurl: 51 | cleaned_html = insert_base_url(cleaned_html, baseurl) 52 | return cleaned_html 53 | 54 | 55 | def extract_html(response): 56 | """Extracts an html page from the response. 57 | """ 58 | return htmlpage_from_response(response).body 59 | 60 | 61 | def descriptify(doc): 62 | """Clean JavaScript in a html source string. 
63 | """ 64 | parsed = parse_html(doc) 65 | newdoc = [] 66 | inserted_comment = False 67 | for element in parsed: 68 | if isinstance(element, HtmlTag): 69 | if not inserted_comment and element.tag == "script" and element.tag_type == HtmlTagType.OPEN_TAG: 70 | newdoc.append(_AS_COMMENT_BEGIN + doc[element.start:element.end] + _AS_COMMENT_END) 71 | inserted_comment = True 72 | elif element.tag == "script" and element.tag_type == HtmlTagType.CLOSE_TAG: 73 | if inserted_comment: 74 | inserted_comment = False 75 | newdoc.append(_AS_COMMENT_BEGIN + doc[element.start:element.end] + _AS_COMMENT_END) 76 | elif element.tag == "noscript": 77 | newdoc.append(_AS_COMMENT_BEGIN + doc[element.start:element.end] + _AS_COMMENT_END) 78 | else: 79 | for key, val in element.attributes.copy().items(): 80 | # Empty intrinsic events 81 | if key in INTRINSIC_EVENT_ATTRIBUTES: 82 | element.attributes[key] = "" 83 | # Rewrite javascript URIs 84 | elif key in URI_ATTRIBUTES and val is not None and "javascript:" in _deentitize_unicode(val): 85 | element.attributes[key] = "about:blank" 86 | else: 87 | continue 88 | newdoc.append(serialize_tag(element)) 89 | else: 90 | text = doc[element.start:element.end] 91 | if inserted_comment and text.strip() and not (text.startswith("")): 92 | newdoc.append(_AS_COMMENT_BEGIN + text + _AS_COMMENT_END) 93 | else: 94 | newdoc.append(text) 95 | 96 | return ''.join(newdoc) 97 | -------------------------------------------------------------------------------- /slyd/slyd/projects.py: -------------------------------------------------------------------------------- 1 | """ 2 | Projects Resource 3 | 4 | Manages listing/creation/deletion/renaming of slybot projects on 5 | the local filesystem. Routes to the appropiate resource for fetching 6 | pages and project spec manipulation. 7 | """ 8 | 9 | import json, re, shutil, errno, os 10 | from os.path import join 11 | from twisted.web.resource import NoResource 12 | from .resource import SlydJsonResource 13 | 14 | 15 | # stick to alphanum . and _. Do not allow only .'s (so safe for FS path) 16 | _INVALID_PROJECT_RE = re.compile('[^A-Za-z0-9._]|^\.*$') 17 | 18 | 19 | def allowed_project_name(name): 20 | return not _INVALID_PROJECT_RE.search(name) 21 | 22 | 23 | class ProjectsResource(SlydJsonResource): 24 | 25 | def __init__(self, settings): 26 | SlydJsonResource.__init__(self) 27 | self.projectsdir = settings['SPEC_DATA_DIR'] 28 | 29 | def getChildWithDefault(self, project_path_element, request): 30 | # TODO: check exists, user has access, etc. 
31 | # rely on the CrawlerSpec for this as storage and auth 32 | # can be customized 33 | request.project = project_path_element 34 | try: 35 | next_path_element = request.postpath.pop(0) 36 | except IndexError: 37 | next_path_element = None 38 | if next_path_element not in self.children: 39 | raise NoResource("No such child resource.") 40 | request.prepath.append(project_path_element) 41 | return self.children[next_path_element] 42 | 43 | def list_projects(self): 44 | try: 45 | for fname in os.listdir(self.projectsdir): 46 | if os.path.isdir(os.path.join(self.projectsdir, fname)): 47 | yield fname 48 | except OSError as ex: 49 | if ex.errno != errno.ENOENT: 50 | raise 51 | 52 | def create_project(self, project_name): 53 | project_filename = self.project_filename(project_name) 54 | os.makedirs(project_filename) 55 | with open(join(project_filename, 'project.json'), 'wb') as outf: 56 | outf.write('{}') 57 | os.makedirs(join(project_filename, 'spiders')) 58 | 59 | def rename_project(self, from_name, to_name): 60 | os.rename(self.project_filename(from_name), 61 | self.project_filename(to_name)) 62 | 63 | def remove_project(self, name): 64 | shutil.rmtree(self.project_filename(name)) 65 | 66 | def project_filename(self, project_name): 67 | return join(self.projectsdir, project_name) 68 | 69 | def handle_project_command(self, command_spec): 70 | command = command_spec.get('cmd') 71 | dispatch_func = self.project_commands.get(command) 72 | if dispatch_func is None: 73 | self.bad_request( 74 | "unrecognised cmd arg %s, available commands: %s" % 75 | (command, ', '.join(self.project_commands.keys()))) 76 | args = command_spec.get('args', []) 77 | for project in args: 78 | if not allowed_project_name(project): 79 | self.bad_request('invalid project name %s' % project) 80 | try: 81 | retval = dispatch_func(self, *args) 82 | except TypeError: 83 | self.bad_request("incorrect args for %s" % command) 84 | except OSError as ex: 85 | if ex.errno == errno.ENOENT: 86 | self.error(404, "Not Found", "No such resource") 87 | elif ex.errno == errno.EEXIST or ex.errno == errno.ENOTEMPTY: 88 | self.bad_request("A project with that name already exists") 89 | raise 90 | return retval or '' 91 | 92 | def render_GET(self, request): 93 | request.write(json.dumps(sorted(self.list_projects()))) 94 | return '\n' 95 | 96 | def render_POST(self, request): 97 | obj = self.read_json(request) 98 | return self.handle_project_command(obj) 99 | 100 | project_commands = { 101 | 'create': create_project, 102 | 'mv': rename_project, 103 | 'rm': remove_project 104 | } 105 | -------------------------------------------------------------------------------- /slyd/tests/test_spec.py: -------------------------------------------------------------------------------- 1 | import json 2 | from tempfile import mkdtemp 3 | from os.path import join, basename 4 | from shutil import rmtree 5 | from distutils.dir_util import copy_tree 6 | from twisted.trial import unittest 7 | from twisted.internet.defer import inlineCallbacks 8 | from slyd.crawlerspec import create_crawler_spec_resource 9 | from slyd.crawlerspec import convert_spider_templates 10 | from .utils import TestSite, test_spec_manager 11 | from .settings import SPEC_DATA_DIR 12 | 13 | 14 | class CrawlerSpecTest(unittest.TestCase): 15 | spider = """ 16 | { 17 | "exclude_patterns": [], 18 | "follow_patterns": [ 19 | ".+MobileHomePark.php?key=d+" 20 | ], 21 | "links_to_follow": "patterns", 22 | "respect_nofollow": true, 23 | "start_urls": [ 24 | "http://www.mhvillage.com/" 25 | ], 26 | 
"templates": [] 27 | } 28 | """ 29 | 30 | def setUp(self): 31 | sm = test_spec_manager() 32 | spec_resource = create_crawler_spec_resource(sm) 33 | self.temp_project_dir = mkdtemp(dir=SPEC_DATA_DIR, 34 | prefix='test-run-') 35 | self.project = basename(self.temp_project_dir) 36 | self.specsite = TestSite(spec_resource, project=self.project) 37 | test_project_dir = join(SPEC_DATA_DIR, 'test') 38 | copy_tree(test_project_dir, self.temp_project_dir) 39 | 40 | @inlineCallbacks 41 | def _get_check_resource(self, resource, converter=None): 42 | result = yield self.specsite.get(resource) 43 | ffile = join(self.temp_project_dir, resource + ".json") 44 | fdata = json.load(open(ffile)) 45 | if converter: 46 | converter(fdata) 47 | rdata = json.loads(result.value()) 48 | self.assertEqual(fdata, rdata) 49 | 50 | def test_get_resource(self): 51 | self._get_check_resource("project") 52 | self._get_check_resource("spiders/pinterest.com", 53 | convert_spider_templates) 54 | 55 | @inlineCallbacks 56 | def post_command(self, spider, cmd, *args, **kwargs): 57 | obj = {'cmd': cmd, 'args': args} 58 | result = yield self.specsite.post(spider, data=json.dumps(obj)) 59 | self.assertEqual(result.responseCode, kwargs.get('expect', 200)) 60 | 61 | @inlineCallbacks 62 | def test_updating(self): 63 | result = yield self.specsite.post('spiders/testpost', data=self.spider) 64 | self.assertEqual(result.responseCode, 200) 65 | result = yield self.specsite.get('spiders/testpost') 66 | self.assertEqual(json.loads(result.value()), json.loads(self.spider)) 67 | 68 | # should fail - missing required fields 69 | result = yield self.specsite.post('spiders/testpost', data='{}') 70 | self.assertEqual(result.responseCode, 400) 71 | 72 | @inlineCallbacks 73 | def test_commands(self): 74 | self.post_command('spiders', 'unknown', expect=400) 75 | self.post_command('spiders', 'mv', expect=400) 76 | self.post_command('spiders', 'mv', '../notallowed', 'whatever', expect=400) 77 | self.post_command('spiders', 'mv', 'notallowedexists', 'whatever', expect=404) 78 | self.post_command('spiders', 'rm', 'notexists', expect=404) 79 | # TODO: mv to existing spider - 400 80 | yield self.specsite.post('spiders/c', data=self.spider) 81 | self._get_check_resource('spiders/c') 82 | self.post_command('spiders', 'mv', 'c', 'c2') 83 | result = yield self.specsite.get('spiders/c') 84 | self.assertEqual(result.value(), '{}\n') 85 | self._get_check_resource('spiders/c2') 86 | yield self.specsite.post('spiders/c3', data=self.spider) 87 | # overwrites 88 | self.post_command('spiders', 'mv', 'c2', 'c3') 89 | result = yield self.specsite.get('spiders/c2') 90 | self.assertEqual(result.value(), '{}\n') 91 | self.post_command('spiders', 'rm', 'c3') 92 | result = yield self.specsite.get('spiders/c3') 93 | self.assertEqual(result.value(), '{}\n') 94 | 95 | def tearDown(self): 96 | rmtree(self.temp_project_dir) 97 | -------------------------------------------------------------------------------- /slyd/media/js/emberui.js: -------------------------------------------------------------------------------- 1 | /* Ember - JQuery UI integration */ 2 | 3 | // Put jQuery UI inside its own namespace 4 | JQ = Ember.Namespace.create(); 5 | 6 | // Create a new mixin for jQuery UI widgets using the Ember 7 | // mixin syntax. 8 | JQ.Widget = Em.Mixin.create({ 9 | // When Ember creates the view's DOM element, it will call this 10 | // method. 
11 | didInsertElement: function() { 12 | // Make jQuery UI options available as Ember properties 13 | var options = this._gatherOptions(); 14 | 15 | // Make sure that jQuery UI events trigger methods on this view. 16 | this._gatherEvents(options); 17 | 18 | // Create a new instance of the jQuery UI widget based on its `uiType` 19 | // and the current element. 20 | var ui; 21 | if (jQuery.ui[this.get('uiType')]) { 22 | ui = jQuery.ui[this.get('uiType')](options, this.get('element')); 23 | } else { 24 | ui = jQuery.custom[this.get('uiType')](options, this.get('element')); 25 | } 26 | 27 | // Save off the instance of the jQuery UI widget as the `ui` property 28 | // on this Ember view. 29 | this.set('ui', ui); 30 | }, 31 | 32 | // When Ember tears down the view's DOM element, it will call 33 | // this method. 34 | willDestroyElement: function() { 35 | var ui = this.get('ui'); 36 | 37 | if (ui) { 38 | // Tear down any observers that were created to make jQuery UI 39 | // options available as Ember properties. 40 | var observers = this._observers; 41 | for (var prop in observers) { 42 | if (observers.hasOwnProperty(prop)) { 43 | this.removeObserver(prop, observers[prop]); 44 | } 45 | } 46 | ui._destroy(); 47 | } 48 | }, 49 | 50 | // Each jQuery UI widget has a series of options that can be configured. 51 | // For instance, to disable a button, you call 52 | // `button.options('disabled', true)` in jQuery UI. To make this compatible 53 | // with Ember bindings, any time the Ember property for a 54 | // given jQuery UI option changes, we update the jQuery UI widget. 55 | _gatherOptions: function() { 56 | var uiOptions = this.get('uiOptions'), options = {}; 57 | // The view can specify a list of jQuery UI options that should be treated 58 | // as Ember properties. 59 | uiOptions.forEach(function(key) { 60 | options[key] = this.get(key); 61 | 62 | // Set up an observer on the Ember property. When it changes, 63 | // call jQuery UI's `option` method to reflect the property onto 64 | // the jQuery UI widget. 65 | var observer = function() { 66 | var value = this.get(key); 67 | this.get('ui').option(key, value); 68 | }; 69 | 70 | this.addObserver(key, observer); 71 | 72 | // Insert the observer in a Hash so we can remove it later. 73 | this._observers = this._observers || {}; 74 | this._observers[key] = observer; 75 | }, this); 76 | return options; 77 | }, 78 | 79 | // Each jQuery UI widget has a number of custom events that they can 80 | // trigger. For instance, the progressbar widget triggers a `complete` 81 | // event when the progress bar finishes. Make these events behave like 82 | // normal Ember events. For instance, a subclass of JQ.ProgressBarView 83 | // could implement the `complete` method to be notified when the jQuery 84 | // UI widget triggered the event. 85 | _gatherEvents: function(options) { 86 | var uiEvents = this.get('uiEvents') || [], self = this; 87 | 88 | uiEvents.forEach(function(event) { 89 | var callback = self[event]; 90 | 91 | if (callback) { 92 | // You can register a handler for a jQuery UI event by passing 93 | // it in along with the creation options. Update the options hash 94 | // to include any event callbacks. 95 | options[event] = function(event, ui) { callback.call(self, event, ui); }; 96 | } 97 | }); 98 | } 99 | }); -------------------------------------------------------------------------------- /slybot/docs/spiderlets.rst: -------------------------------------------------------------------------------- 1 | ========== 2 | Spiderlets 3 | ========== 4 | 5 | .. 
warning:: 6 | 7 | This functionality is experimental and its API may undergo changes from time 8 | to time. Use at your own risk and please contribute feedback to make it 9 | stable. 10 | 11 | 12 | The slybot spider alone is not able to solve all the crawling and extraction difficulties that may arise in every possible case: data 13 | presented in a way that is partially or not at all suitable for the similarity algorithm, arbitrary post data and ajax requests, complex url 14 | normalization not handled by an addon, etc. Spiderlets are a method to extend any AS spider so that anything that can be done 15 | with a normal scrapy spider can also be done with a spiderlet. 16 | 17 | Spiderlets are handled by a spider middleware. In order to be enabled, the setting ``SPIDERLETS_MODULE`` must be present, with the 18 | value being the name of the module that contains the spiderlet submodules. For example, if your spiderlets are in the module 19 | ``mylib.spiderlets``, then you set the value of ``SPIDERLETS_MODULE`` to ``mylib.spiderlets``. 20 | 21 | What is a spiderlet 22 | ___________________ 23 | 24 | A spiderlet is an instance of a Python class which implements at least one of the predefined methods described below. In order to 25 | attach a spiderlet to a given spider, you use the class attribute ``name``. The value of this attribute must match the name of the 26 | spider: 27 | 28 | :: 29 | 30 | class MySpiderlet: 31 | name = "myspider" 32 | 33 | def process_request(self, request, response): 34 | ... 35 | return request 36 | 37 | def process_item(self, item, response): 38 | ... 39 | return item 40 | 41 | def process_start_request(self, request): 42 | ... 43 | return request 44 | 45 | def parse_login_page(self, response): 46 | ... 47 | return request 48 | 49 | 50 | Three of the methods, ``process_request``, ``process_item`` and ``process_start_request``, are attached to the output of the 51 | autoscraping spider. The autoscraping spider generates two kinds of objects: requests and items. Depending on the kind 52 | of object generated and its source, the spiderlet addon passes it to one or another method of your spiderlet. So, each item 53 | issued by the spider is passed to ``process_item``, and each request is passed to ``process_start_request`` or ``process_request``, 54 | depending on its source: a start url, or a request generated by the spider as a consequence of a link extraction from a received 55 | response. Start requests usually need to be processed in a different way, and they don't have an associated response from which 56 | they were generated. 57 | 58 | ``process_request`` and ``process_start_request`` are commonly used to normalize the request url, filter it, or override the request 59 | callback (possibly defined as a new method of the spiderlet). The default callback for every request generated by the spider is 60 | the ``parse`` method of the autoscraping spider. Whenever you need to generate a FormRequest for sending post 61 | data or simulating an AJAX call, you will need to create your own callbacks in the spiderlet and point the request callbacks to them. 62 | 63 | ``process_item`` is mostly used for item post-processing. An important detail is that the values of the item fields returned by the 64 | autoscraping spider are always lists, even if the field is single valued. You have to take this into account when you are accessing the item 65 | fields inside the ``process_item`` method.
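As an illustration, here is a minimal spiderlet sketch (the spider name ``myspider`` and the ``name`` field are hypothetical placeholders) that strips the query string from followed links and flattens a single-valued field inside ``process_item``::

    from urlparse import urlparse, urlunparse

    class MySpiderlet:
        name = "myspider"

        def process_request(self, request, response):
            # Drop the query string from extracted links (this assumes the
            # query carries only tracking parameters for this particular site).
            parsed = urlparse(request.url)
            return request.replace(url=urlunparse(parsed._replace(query="")))

        def process_item(self, item, response):
            # Field values arrive as lists, even when single valued.
            if item.get("name"):
                item["name"] = item["name"][0].strip()
            return item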
There is no restriction, however, on the types of data contained in the items returned by the 66 | spiderlet. 67 | 68 | The fourth method, ``parse_login_page``, is of a different kind. Instead of processing an output from the spider, it processes an 69 | incoming response, and it is applied only to those responses whose callback is the ``parse_login_page`` method of the autoscraping 70 | spider. If you define a ``parse_login_page`` method in your spiderlet, the request callback will be overridden by this new one. This 71 | feature allows you to write your own login handler when the slybot default one (based on the generic solution implemented in the 72 | `loginform library <https://github.com/scrapy/loginform>`_) does not fit a given case. 73 | 74 | Another very practical feature of a spiderlet is that you can access the autoscraping spider's methods and attributes through the spiderlet 75 | attribute ``self.spider``. ``self.spider.log`` and ``self.spider.parse`` are among the methods most commonly accessed from 76 | the spiderlet. 77 | 78 | -------------------------------------------------------------------------------- /slyd/media/js/vendor/ic-ajax.js: -------------------------------------------------------------------------------- 1 | /*! 2 | * ic-ajax 3 | * 4 | * - (c) 2013 Instructure, Inc 5 | * - please see license at https://github.com/instructure/ic-ajax/blob/master/LICENSE 6 | * - inspired by discourse ajax: https://github.com/discourse/discourse/blob/master/app/assets/javascripts/discourse/mixins/ajax.js#L19 7 | */ 8 | 9 | ;(function(root, factory) { 10 | if (typeof define === 'function' && define.amd) { 11 | define(['ember'], function(Ember) { return factory(Ember); }); 12 | } else if (typeof exports === 'object') { 13 | module.exports = factory(require('ember')); 14 | } else { 15 | root.ic = root.ic || {}; 16 | root.ic.ajax = factory(Ember); 17 | } 18 | }(this, function(Ember) { 19 | 20 | /* 21 | * jQuery.ajax wrapper, supports the same signature except providing 22 | * `success` and `error` handlers will throw an error (use promises instead) 23 | * and it resolves only the response (no access to jqXHR or textStatus). 24 | */ 25 | 26 | var ajax = function() { 27 | return ajax.raw.apply(null, arguments).then(function(result) { 28 | return result.response; 29 | }); 30 | }; 31 | 32 | /* 33 | * Same as `ajax` except it resolves an object with `{response, textStatus, 34 | * jqXHR}`, useful if you need access to the jqXHR object for headers, etc. 35 | */ 36 | 37 | ajax.raw = function() { 38 | return makePromise(parseArgs.apply(null, arguments)); 39 | }; 40 | 41 | /* 42 | * Defines a fixture that will be used instead of an actual ajax 43 | * request to a given url. This is useful for testing, allowing you to 44 | * stub out responses your application will send without requiring 45 | * libraries like sinon or mockjax, etc. 46 | * 47 | * For example: 48 | * 49 | * ajax.defineFixture('/self', { 50 | * response: { firstName: 'Ryan', lastName: 'Florence' }, 51 | * textStatus: 'success' 52 | * jqXHR: {} 53 | * }); 54 | * 55 | * @param {String} url 56 | * @param {String} method 57 | * @param {Object} fixture 58 | */ 59 | ajax.defineFixture = function(url, method, fixture) { 60 | ajax.FIXTURES = ajax.FIXTURES || {}; 61 | ajax.FIXTURES[method + '_' + url] = {data: fixture, callCount: 0}; 62 | }; 63 | 64 | /* 65 | * Looks up a fixture by url and HTTP method.
66 | * 67 | * @param {String} url 68 | * @param {String} method 69 | */ 70 | ajax.lookupFixture = function(url, method) { 71 | var fixture = ajax.FIXTURES && ajax.FIXTURES[method + '_' + url]; 72 | if (fixture) { 73 | fixture.callCount += 1; 74 | return fixture.data; 75 | }; 76 | }; 77 | 78 | /* 79 | * Looks up how many times a fixture has been called by url and HTTP method. 80 | * 81 | * @param {String} url 82 | * @param {String} method 83 | */ 84 | ajax.callCount = function(url, method) { 85 | var fixture = ajax.FIXTURES && ajax.FIXTURES[method + '_' + url]; 86 | if (fixture) { 87 | return fixture.callCount; 88 | }; 89 | return -1; 90 | } 91 | 92 | function makePromise(settings) { 93 | return new Ember.RSVP.Promise(function(resolve, reject) { 94 | var fixture = ajax.lookupFixture(settings.url, settings.type); 95 | if (fixture) { 96 | return resolve(fixture); 97 | } 98 | settings.success = makeSuccess(resolve, reject); 99 | settings.error = makeError(resolve, reject); 100 | Ember.$.ajax(settings); 101 | }); 102 | }; 103 | 104 | function parseArgs() { 105 | var settings = {}; 106 | if (arguments.length === 1) { 107 | if (typeof arguments[0] === "string") { 108 | settings.url = arguments[0]; 109 | } else { 110 | settings = arguments[0]; 111 | } 112 | } else if (arguments.length === 2) { 113 | settings = arguments[1]; 114 | settings.url = arguments[0]; 115 | } 116 | if (settings.success || settings.error) { 117 | throw new Error("ajax should use promises, received 'success' or 'error' callback"); 118 | } 119 | return settings; 120 | } 121 | 122 | function makeSuccess(resolve, reject) { 123 | return function(response, textStatus, jqXHR) { 124 | Ember.run(null, resolve, { 125 | response: response, 126 | textStatus: textStatus, 127 | jqXHR: jqXHR 128 | }); 129 | } 130 | } 131 | 132 | function makeError(resolve, reject) { 133 | return function(jqXHR, textStatus, errorThrown) { 134 | Ember.run(null, reject, { 135 | jqXHR: jqXHR, 136 | textStatus: textStatus, 137 | errorThrown: errorThrown 138 | }); 139 | }; 140 | } 141 | 142 | return ajax; 143 | 144 | })); 145 | -------------------------------------------------------------------------------- /slyd/media/js/templates/toolbox-template.handlebars: -------------------------------------------------------------------------------- 1 |
2 | {{#view ASTool.RenameTextField value=name}} 3 |
Template {{name}}
4 | {{/view}} 5 |
6 | 7 |
8 |

Annotations

9 |
10 | {{#if annotations}} 11 |
12 | Map attribute 13 | To field 14 |
15 |
16 | {{#each annotations}} 17 | {{view ASTool.AnnotationWidget annotation=this templateName="annotation-widget"}} 18 | {{/each}} 19 |
20 | {{else}} 21 |
No annotations have been created yet.
22 | {{/if}} 23 |
24 | 25 |

Extracted item

26 |
27 | {{view ASTool.ItemSelect value=scrapes}} 28 | {{view ASTool.InlineHelp message="select_item"}} 29 |
30 | {{view ASTool.ButtonView action="editItems" argument=this label="Edit items" name="editItems" class="blue-button"}} 31 |
32 |
33 | 34 |

Extractors

35 |
36 |
37 | {{#each mappedFieldsData}} 38 | {{#view ASTool.ExtractorDropTarget fieldName=fieldName dragging=controller.draggingExtractor}} 39 |
40 | {{fieldName}}[+Drop here] 41 | {{#if extractors}} 42 |
43 | {{/if}} 44 | {{#each extractors}} 45 |
46 | {{#view ASTool.ExtractorView extractor=this}} 47 | 48 | {{view.extractorTypeLabel}} {{view.extractorDefinition}} 49 | 50 | {{/view}} 51 |
52 | {{/each}} 53 |
54 | {{/view}} 55 | {{else}} 56 |
No field mappings have been defined yet.
57 | {{/each}} 58 |
59 |
60 |

Drag extractors to the fields above

61 | {{view ASTool.InlineHelp message="extractors"}} 62 |
63 | {{#each extractors}} 64 |
65 | {{#view ASTool.ExtractorView extractor=this}} 66 | 67 | {{view.extractorTypeLabel}} {{view.extractorDefinition}} 68 | 69 | {{/view}} 70 | {{view ASTool.ButtonView action="deleteExtractor" argument=this icon="ui-icon-trash" class="red-button"}} 71 |
72 | {{else}} 73 |
No extractors have been created yet.
74 | {{/each}} 75 |
76 |
77 |
78 | {{view ASTool.TextField value=newReExtractor width='160px' name='reExtractorField' placeholder='Enter a RegEx' action="createExtractor"}} 79 |
- or choose a type -
80 |
{{view ASTool.TypeSelect value=newTypeExtractor name='typeExtractorCombo' width='140px'}}
81 |
82 |
83 | {{view ASTool.ButtonView action="createExtractor" icon="ui-icon-circle-plus" label="New extractor" disabled=createExtractorDisabled class="blue-button"}} 84 |
85 |
86 | 87 |
88 |

Required fields

89 |
90 | {{view ASTool.InlineHelp message="template_required"}} 91 |

Check the fields you want to make required for this template:

92 |
93 | {{#each mappedFieldsData}} 94 |
95 | {{view ASTool.RequiredFieldCheckbox checked=this.required fieldName=this.fieldName}} 96 | {{this.fieldName}} 97 |
98 | {{else}} 99 |
No field mappings have been defined yet.
100 | {{/each}} 101 |
102 |
103 |
104 | 105 | {{#if controller.showFloatingAnnotationWidgetAt}} 106 | {{view ASTool.AnnotationWidget annotation=controller.floatingAnnotation inDoc=true pos=controller.showFloatingAnnotationWidgetAt templateName="floating-annotation-widget" id="annotationWidget"}} 107 | {{/if}} -------------------------------------------------------------------------------- /slyd/README.md: -------------------------------------------------------------------------------- 1 | How to try it: 2 | -------------- 3 | 4 | The recommended way to install dependencies is to use virtualenv and 5 | then do: 6 | 7 | pip install -r requirements.txt 8 | 9 | Run the server using: 10 | 11 | twistd -n slyd 12 | 13 | and point your browser to: 14 | http://localhost:9001/static/main.html 15 | 16 | Chrome and Firefox are supported, but it works better with Chrome. 17 | 18 | Slyd API Notes 19 | -------------- 20 | 21 | This will be moved to separate docs - it's currently some notes for developers 22 | 23 | All resources are either under /static/ or /projects/. 24 | 25 | 26 | project listing/creation/deletion/renaming 27 | 28 | To list all existing projects, just GET http://localhost:9001/projects: 29 | 30 | $ curl http://localhost:9001/projects -> ["project1", "project2"] 31 | 32 | New projects can be created by posting to /projects, for example: 33 | 34 | $ curl -d '{"cmd": "create", "args": ["project_X"]}' http://localhost:9001/projects 35 | 36 | To delete a project: 37 | 38 | $ curl -d '{"cmd": "rm", "args": ["project_X"]}' http://localhost:9001/projects 39 | 40 | To rename a project: 41 | 42 | $ curl -d '{"cmd": "mv", "args": ["oldname", "newname"]}' http://localhost:9001/projects 43 | 44 | Please note that projects will not be overwritten when renaming or creating new ones (if a project 45 | with the given name already exists, an error from the 400 family will be returned). 46 | 47 | spec 48 | 49 | The project specification is available under /projects/PROJECT_ID/spec. The path format 50 | mirrors the slybot format documented here: 51 | http://slybot.readthedocs.org/en/latest/project.html 52 | 53 | Currently, this is read only, but it will soon support PUT/POST. 54 | 55 | The entire spec is returned for a GET request to the root: 56 | 57 | $ curl http://localhost:9001/projects/78/spec 58 | {"project": { 59 | "version": "1308771278", 60 | "name": "demo" 61 | .. 62 | } 63 | 64 | A list of available spiders can be retrieved: 65 | 66 | $ curl http://localhost:9001/projects/78/spec/spiders 67 | ["accommodationforstudents.com", "food.com", "pinterest.com", "pin", "mhvillage"] 68 | 69 | and specific resources can be requested: 70 | 71 | $ curl http://localhost:9001/projects/78/spec/spiders/accommodationforstudents.com 72 | { 73 | "templates": 74 | ... 75 | "respect_nofollow": true 76 | } 77 | 78 | The spec can be updated by POSTing: 79 | 80 | $ curl --data @newlinkedin.js http://localhost:9001/projects/78/spec/spiders/linkedin 81 | 82 | An HTTP 400 will be returned if the uploaded spec does not validate. 83 | 84 | Basic commands are available for manipulating spider files. For example: 85 | 86 | $ curl -d '{"cmd": "rm", "args": ["spidername"]}' http://localhost:9001/projects/78/spec/spiders 87 | 88 | Available commands are: 89 | * mv - move spider from first arg to second. If the second exists it is overwritten. 90 | * rm - delete spider 91 | 92 | 93 | bot/fetch 94 | 95 | Accepts a JSON object with the following fields: 96 | * request - same as a Scrapy request object.
At a minimum it needs a url 97 | * spider - spider name within the project 98 | * page_id - unique ID for this page, must match the id used in templates (not yet implemented) 99 | * parent_fp - fingerprint of parent request. This is used for managing referrer url, cookies, etc. 100 | 101 | Returns a JSON object containing (so far): 102 | * page - page content, not yet annotated but will be 103 | * response - object containing the response data: http code and headers 104 | * items - array of items extracted 105 | * fp - request fingerprint 106 | * error - error message, present if there was an error 107 | * links - array of links followed 108 | 109 | Coming soon in the response: 110 | * template_id - id of template that matched 111 | * trace - textual trace of the matching process - for debugging 112 | 113 | 114 | If you want to work on an existing project, put it in data/projects/PROJECTID; these can be downloaded from Dash or by running: 115 | 116 | $ bin/sh2sly data/projects -p 78 -k YOURAPIKEY 117 | 118 | Then you can extract data: 119 | 120 | $ curl -d '{"request": {"url": "http://www.pinterest.com/pin/339740365610932893/"}, "spider": "pinterest.com"}' http://localhost:9001/projects/78/bot/fetch 121 | { 122 | "fp": "0f2686acdc6a71eeddc49045b7cea0b6f81e6b61", 123 | "items": [ 124 | { 125 | "url": "http://www.pinterest.com/pin/339740365610932893/", 126 | "_template": "527387aa4d6c7133c6551481", 127 | "image": [ 128 | "http://media-cache-ak0.pinimg.com/736x/6c/c5/35/6cc5352046df0f8d8852cbdfb31542bb.jpg" 129 | ], 130 | "_type": "pin", 131 | "name": [ 132 | "Career Driven" 133 | ] 134 | } 135 | ], 136 | "page": "\n ...." 137 | } 138 | 139 | Testing 140 | ------- 141 | 142 | A Karma test environment is available. To run the UI tests: 143 | 144 | npm install 145 | export PATH="./node_modules/.bin:$PATH" 146 | karma start 147 | 148 | You can download npm from https://npmjs.org 149 | 150 | Look at karma.conf.js to configure test options. 151 | 152 | The tests are located in: 153 | media/tests 154 | 155 | slyd can be tested using Twisted: 156 | 157 | trial tests 158 | -------------------------------------------------------------------------------- /slyd/media/js/templates/toolbox-annotation.handlebars: -------------------------------------------------------------------------------- 1 | 2 |
3 |

Annotation options

4 |
5 |
6 |

Selected region

7 |
8 |

Ancestors

{{view ASTool.InlineHelp message="selected_region_ancestors"}} 9 |
10 |
    11 | {{#each ancestorPaths}} 12 |
  • 13 | {{view ASTool.CSSPathWidget action="selectElement" argument=this.element label=this.label minWidth="73px" class="light-button"}} 14 |
  • 15 | {{/each}} 16 |
17 |
18 |
19 |

Children

{{view ASTool.InlineHelp message="selected_region_children"}} 20 |
21 |
    22 | {{#each childPaths}} 23 |
  • 24 | {{view ASTool.CSSPathWidget action="selectElement" argument=this.element label=this.label minWidth="73px" class="light-button"}} 25 |
  • 26 | {{/each}} 27 |
28 |
29 |
30 | 31 |

Attribute mappings

32 |
33 | {{#unless mappingAttribute}} 34 |

Not mapped

35 |
36 | {{#each unmappedAttributes}} 37 |
38 | {{#view ASTool.ElemAttributeView attribute=this}} 39 |
40 |
{{view.name}}
41 | 42 | {{view ASTool.ButtonView action="mapAttribute" argument=this icon="ui-icon-transferthick-e-w" class="blue-button"}} 43 | {{view ASTool.ButtonView action="makeSticky" argument=this icon="ui-icon-pin-s" class="blue-button"}} 44 | 45 |
46 |
{{view.value}}
47 | {{/view}} 48 |
49 | {{else}} 50 |
No unmapped attributes.
51 | {{/each}} 52 |
53 | 54 |
55 | 56 |

Mapped

57 |
58 | {{#each mappedAttributes}} 59 |
60 | {{#view ASTool.ElemAttributeView attribute=this}} 61 |
{{view.name}} -> {{view.field}}
62 | 63 | {{view ASTool.ButtonView action="unmapAttribute" argument=this icon="ui-icon-trash" class="red-button"}} 64 | 65 | {{/view}} 66 |
67 | {{else}} 68 |
No mapped attributes.
69 | {{/each}} 70 |
71 | 72 |
73 | 74 |

Required attributes

{{view ASTool.InlineHelp message="sticky_fields"}} 75 |
76 | {{#each stickyAttributes}} 77 |
78 | {{#view ASTool.ElemAttributeView attribute=this}} 79 |
{{view.name}}
80 | 81 | {{view ASTool.ButtonView action="unmapAttribute" argument=this icon="ui-icon-trash" class="red-button"}} 82 | 83 | {{/view}} 84 |
85 | {{else}} 86 |
No required attributes.
87 | {{/each}} 88 |
89 | {{else}} 90 | {{render "item" controller.scrapedItem}} 91 | {{/unless}} 92 |
93 | 94 |

Ignored subregions

95 |
96 |
97 | {{#each model.ignores}} 98 |
99 | {{#view ASTool.IgnoreWidget ignore=this}} 100 |
101 | 102 | {{view ASTool.CheckBox checked=this.ignoreBeneath name="ignoreBeneath"}} 103 |
104 | {{view ASTool.ButtonView action="deleteIgnore" argument=this icon="ui-icon-trash" class="red-button"}} 105 | {{/view}} 106 | 107 |
108 | {{else}} 109 |
No ignored subregions defined.
110 | {{/each}} 111 |
112 |
113 | {{view ASTool.ToggleButton checked=selectingIgnore icon="ui-icon-cancel" id="addIgnore" name="addIgnore"}} 114 | {{view ASTool.InlineHelp message="ignored_subregions"}} 115 |
116 |
117 | 118 |

Variant

119 |
120 |
121 | 122 | {{view ASTool.VariantSelect value=currentVariant}} 123 | {{view ASTool.InlineHelp message="variant"}} 124 |
125 |
126 |
127 |
128 |
129 | {{view ASTool.ButtonView action="doneEditing" icon="ui-icon-circle-check" label="Save changes" class="blue-button"}} 130 | {{view ASTool.ButtonView action="cancelEdit" icon="ui-icon-circle-close" label="Discard changes" class="red-button"}} 131 |
-------------------------------------------------------------------------------- /slyd/media/js/templates/toolbox-spider.handlebars: -------------------------------------------------------------------------------- 1 | 2 |
3 | {{#view ASTool.RenameTextField value=name}} 4 | Spider {{name}} 5 | {{/view}} 6 |
7 | 8 |
9 |

Initialize

10 |
11 |

Start Pages

12 |
13 | {{#each start_urls}} 14 |
15 | {{view ASTool.ButtonView action="fetchPage" argument=this label=this minWidth='86%' title=this class="light-button"}} 16 | {{view ASTool.ButtonView action="deleteStartUrl" argument=this icon="ui-icon-trash" class="red-button"}} 17 |
18 | {{else}} 19 |
No start pages for this spider.
20 | {{/each}} 21 |
22 | 23 |
24 | {{view ASTool.TextField value=newStartUrl width='83%' name='startUrlTextField' placeholder='Enter page url' action="addStartUrl"}} 25 | {{view ASTool.ButtonView action="addStartUrl" icon="ui-icon-circle-plus" disabled=hasStartUrl 26 | class="blue-button fix_plus_icon"}} 27 |
28 | 29 |
30 | 31 | {{view ASTool.CheckBox checked=performLogin name="performLoginCheck"}} 32 | {{view ASTool.InlineHelp message="perform_login"}} 33 |
34 | 35 | {{#if performLogin}} 36 |
37 |
38 | {{view ASTool.TextField value=loginUrl width='94%' name='loginUrlField' placeholder='Login URL'}} 39 |
40 | {{view ASTool.TextField value=loginUser width='94%' name='loginUserField' placeholder='Login user'}} 41 |
42 | {{view ASTool.TextField value=loginPassword width='94%' name='loginPasswordField' placeholder='Login password'}} 43 |
44 | {{/if}} 45 |
46 | 47 |

Crawling

48 |
49 | {{view ASTool.FollowSelect value=controller.links_to_follow}} 50 | 51 | {{#if displayNofollow}} 52 |
53 | {{view ASTool.CheckBox checked=respect_nofollow name="respectNoFollow"}} 54 | Respect nofollow 55 |
56 | {{/if}} 57 | 58 | {{#if displayEditPatterns}} 59 |

Follow links that match these patterns

60 | {{view ASTool.InlineHelp message="follow_links"}} 61 |
62 | {{#each follow_patterns}} 63 |
64 | {{#view ASTool.PatternTextField pattern=this action="editFollowPattern"}} 65 | {{view ASTool.ButtonView label=this minWidth='86%' title=this class="light-button"}} 66 | {{/view}} 67 | {{view ASTool.ButtonView action="deleteFollowPattern" argument=this icon="ui-icon-trash" class="red-button"}} 68 |
69 | {{else}} 70 |
No follow patterns defined yet.
71 | {{/each}} 72 |
73 |
74 | {{view ASTool.TextField value=newFollowPattern width='83%' name='followPatternTextField' placeholder='New follow pattern' action="addFollowPattern"}} 75 | {{view ASTool.ButtonView action="addFollowPattern" icon="ui-icon-circle-plus" disabled=hasFollowPattern class="blue-button fix_plus_icon"}} 76 |
77 | 78 |
79 | 80 |

Exclude links that match these patterns

81 | {{view ASTool.InlineHelp message="exclude_links"}} 82 |
83 | {{#each exclude_patterns}} 84 |
85 | {{#view ASTool.PatternTextField pattern=this action="editExcludePattern"}} 86 | {{view ASTool.ButtonView label=this minWidth='86%' title=this class="light-button"}} 87 | {{/view}} 88 | {{view ASTool.ButtonView action="deleteExcludePattern" argument=this icon="ui-icon-trash" class="red-button"}} 89 |
90 | {{else}} 91 |
No exclude patterns defined yet.
92 | {{/each}} 93 |
94 |
95 | {{view ASTool.TextField value=newExcludePattern width='83%' name='excludePatternTextField' placeholder='New exclude pattern' action="addExcludePattern"}} 96 | {{view ASTool.ButtonView action="addExcludePattern" icon="ui-icon-circle-plus" disabled=hasExcludePattern class="blue-button fix_plus_icon"}} 97 |
98 | {{/if}} 99 | 100 |
101 | {{view ASTool.CheckBox checked=showLinks name="showLinks"}} 102 | Overlay blocked links 103 | {{view ASTool.InlineHelp message="overlay_blocked_links"}} 104 |
105 |
106 | 107 |

Extraction

108 | 109 |
110 |

Templates

111 |
112 | {{#each templates}} 113 |
114 | {{view ASTool.ButtonView action="editTemplate" argument=this label=name minWidth='86%' title=url class="light-button"}} 115 | {{view ASTool.ButtonView action="deleteTemplate" argument=this icon="ui-icon-trash" class="red-button"}} 116 |
117 | {{else}} 118 |
No templates exist for this spider yet.
119 | {{/each}} 120 |
121 |
122 |
123 | 124 |
125 | {{view ASTool.ButtonView action="saveSpider" icon="ui-icon-circle-arrow-n" label="Save spider" class="blue-button big"}} 126 |
127 | -------------------------------------------------------------------------------- /slyd/media/js/vendor/loading.js: -------------------------------------------------------------------------------- 1 | (function(w){var k=function(b,c){typeof c=="undefined"&&(c={});this.init(b,c)},a=k.prototype,o,p=["canvas","vml"],f=["oval","spiral","square","rect","roundRect"],x=/^\#([a-fA-F0-9]{6}|[a-fA-F0-9]{3})$/,v=navigator.appVersion.indexOf("MSIE")!==-1&&parseFloat(navigator.appVersion.split("MSIE")[1])===8?true:false,y=!!document.createElement("canvas").getContext,q=true,n=function(b,c,a){var b=document.createElement(b),d;for(d in a)b[d]=a[d];typeof c!=="undefined"&&c.appendChild(b);return b},m=function(b, 2 | c){for(var a in c)b.style[a]=c[a];return b},t=function(b,c){for(var a in c)b.setAttribute(a,c[a]);return b},u=function(b,c,a,d){b.save();b.translate(c,a);b.rotate(d);b.translate(-c,-a);b.beginPath()};a.init=function(b,c){if(typeof c.safeVML==="boolean")q=c.safeVML;try{this.mum=document.getElementById(b)!==void 0?document.getElementById(b):document.body}catch(a){this.mum=document.body}c.id=typeof c.id!=="undefined"?c.id:"canvasLoader";this.cont=n("div",this.mum,{id:c.id});if(y)o=p[0],this.can=n("canvas", 3 | this.cont),this.con=this.can.getContext("2d"),this.cCan=m(n("canvas",this.cont),{display:"none"}),this.cCon=this.cCan.getContext("2d");else{o=p[1];if(typeof k.vmlSheet==="undefined"){document.getElementsByTagName("head")[0].appendChild(n("style"));k.vmlSheet=document.styleSheets[document.styleSheets.length-1];var d=["group","oval","roundrect","fill"],e;for(e in d)k.vmlSheet.addRule(d[e],"behavior:url(#default#VML); position:absolute;")}this.vml=n("group",this.cont)}this.setColor(this.color);this.draw(); 4 | m(this.cont,{display:"none"})};a.cont={};a.can={};a.con={};a.cCan={};a.cCon={};a.timer={};a.activeId=0;a.diameter=40;a.setDiameter=function(b){this.diameter=Math.round(Math.abs(b));this.redraw()};a.getDiameter=function(){return this.diameter};a.cRGB={};a.color="#000000";a.setColor=function(b){this.color=x.test(b)?b:"#000000";this.cRGB=this.getRGB(this.color);this.redraw()};a.getColor=function(){return this.color};a.shape=f[0];a.setShape=function(b){for(var c in f)if(b===f[c]){this.shape=b;this.redraw(); 5 | break}};a.getShape=function(){return this.shape};a.density=40;a.setDensity=function(b){this.density=q&&o===p[1]?Math.round(Math.abs(b))<=40?Math.round(Math.abs(b)):40:Math.round(Math.abs(b));if(this.density>360)this.density=360;this.activeId=0;this.redraw()};a.getDensity=function(){return this.density};a.range=1.3;a.setRange=function(b){this.range=Math.abs(b);this.redraw()};a.getRange=function(){return this.range};a.speed=2;a.setSpeed=function(b){this.speed=Math.round(Math.abs(b))};a.getSpeed=function(){return this.speed}; 6 | a.fps=24;a.setFPS=function(b){this.fps=Math.round(Math.abs(b));this.reset()};a.getFPS=function(){return this.fps};a.getRGB=function(b){b=b.charAt(0)==="#"?b.substring(1,7):b;return{r:parseInt(b.substring(0,2),16),g:parseInt(b.substring(2,4),16),b:parseInt(b.substring(4,6),16)}};a.draw=function(){var b=0,c,a,d,e,h,k,j,r=this.density,s=Math.round(r*this.range),l,i,q=0;i=this.cCon;var g=this.diameter;if(o===p[0]){i.clearRect(0,0,1E3,1E3);t(this.can,{width:g,height:g});for(t(this.cCan,{width:g,height:g});b< 7 | r;){l=b<=s?1-1/s*b:l=0;k=270-360/r*b;j=k/180*Math.PI;i.fillStyle="rgba("+this.cRGB.r+","+this.cRGB.g+","+this.cRGB.b+","+l.toString()+")";switch(this.shape){case f[0]:case 
f[1]:c=g*0.07;e=g*0.47+Math.cos(j)*(g*0.47-c)-g*0.47;h=g*0.47+Math.sin(j)*(g*0.47-c)-g*0.47;i.beginPath();this.shape===f[1]?i.arc(g*0.5+e,g*0.5+h,c*l,0,Math.PI*2,false):i.arc(g*0.5+e,g*0.5+h,c,0,Math.PI*2,false);break;case f[2]:c=g*0.12;e=Math.cos(j)*(g*0.47-c)+g*0.5;h=Math.sin(j)*(g*0.47-c)+g*0.5;u(i,e,h,j);i.fillRect(e,h-c*0.5, 8 | c,c);break;case f[3]:case f[4]:a=g*0.3,d=a*0.27,e=Math.cos(j)*(d+(g-d)*0.13)+g*0.5,h=Math.sin(j)*(d+(g-d)*0.13)+g*0.5,u(i,e,h,j),this.shape===f[3]?i.fillRect(e,h-d*0.5,a,d):(c=d*0.55,i.moveTo(e+c,h-d*0.5),i.lineTo(e+a-c,h-d*0.5),i.quadraticCurveTo(e+a,h-d*0.5,e+a,h-d*0.5+c),i.lineTo(e+a,h-d*0.5+d-c),i.quadraticCurveTo(e+a,h-d*0.5+d,e+a-c,h-d*0.5+d),i.lineTo(e+c,h-d*0.5+d),i.quadraticCurveTo(e,h-d*0.5+d,e,h-d*0.5+d-c),i.lineTo(e,h-d*0.5+c),i.quadraticCurveTo(e,h-d*0.5,e+c,h-d*0.5))}i.closePath();i.fill(); 9 | i.restore();++b}}else{m(this.cont,{width:g,height:g});m(this.vml,{width:g,height:g});switch(this.shape){case f[0]:case f[1]:j="oval";c=140;break;case f[2]:j="roundrect";c=120;break;case f[3]:case f[4]:j="roundrect",c=300}a=d=c;e=500-d;for(h=-d*0.5;b=1;)b.removeChild(b.firstChild)}};a.redraw=function(){this.clean();this.draw()};a.reset=function(){typeof this.timer=== 11 | "number"&&(this.hide(),this.show())};a.tick=function(b){var a=this.con,f=this.diameter;b||(this.activeId+=360/this.density*this.speed);o===p[0]?(a.clearRect(0,0,f,f),u(a,f*0.5,f*0.5,this.activeId/180*Math.PI),a.drawImage(this.cCan,0,0,f,f),a.restore()):(this.activeId>=360&&(this.activeId-=360),m(this.vml,{rotation:this.activeId}))};a.show=function(){if(typeof this.timer!=="number"){var a=this;this.timer=self.setInterval(function(){a.tick()},Math.round(1E3/this.fps));m(this.cont,{display:"block"})}}; 12 | a.hide=function(){typeof this.timer==="number"&&(clearInterval(this.timer),delete this.timer,m(this.cont,{display:"none"}))};a.kill=function(){var a=this.cont;typeof this.timer==="number"&&this.hide();o===p[0]?(a.removeChild(this.can),a.removeChild(this.cCan)):a.removeChild(this.vml);for(var c in this)delete this[c]};w.CanvasLoader=k})(window); -------------------------------------------------------------------------------- /slybot/docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | REM Command file for Sphinx documentation 4 | 5 | if "%SPHINXBUILD%" == "" ( 6 | set SPHINXBUILD=sphinx-build 7 | ) 8 | set BUILDDIR=_build 9 | set ALLSPHINXOPTS=-d %BUILDDIR%/doctrees %SPHINXOPTS% . 10 | set I18NSPHINXOPTS=%SPHINXOPTS% . 11 | if NOT "%PAPER%" == "" ( 12 | set ALLSPHINXOPTS=-D latex_paper_size=%PAPER% %ALLSPHINXOPTS% 13 | set I18NSPHINXOPTS=-D latex_paper_size=%PAPER% %I18NSPHINXOPTS% 14 | ) 15 | 16 | if "%1" == "" goto help 17 | 18 | if "%1" == "help" ( 19 | :help 20 | echo.Please use `make ^` where ^ is one of 21 | echo. html to make standalone HTML files 22 | echo. dirhtml to make HTML files named index.html in directories 23 | echo. singlehtml to make a single large HTML file 24 | echo. pickle to make pickle files 25 | echo. json to make JSON files 26 | echo. htmlhelp to make HTML files and a HTML help project 27 | echo. qthelp to make HTML files and a qthelp project 28 | echo. devhelp to make HTML files and a Devhelp project 29 | echo. epub to make an epub 30 | echo. latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter 31 | echo. text to make text files 32 | echo. man to make manual pages 33 | echo. texinfo to make Texinfo files 34 | echo. gettext to make PO message catalogs 35 | echo. 
changes to make an overview over all changed/added/deprecated items 36 | echo. linkcheck to check all external links for integrity 37 | echo. doctest to run all doctests embedded in the documentation if enabled 38 | goto end 39 | ) 40 | 41 | if "%1" == "clean" ( 42 | for /d %%i in (%BUILDDIR%\*) do rmdir /q /s %%i 43 | del /q /s %BUILDDIR%\* 44 | goto end 45 | ) 46 | 47 | if "%1" == "html" ( 48 | %SPHINXBUILD% -b html %ALLSPHINXOPTS% %BUILDDIR%/html 49 | if errorlevel 1 exit /b 1 50 | echo. 51 | echo.Build finished. The HTML pages are in %BUILDDIR%/html. 52 | goto end 53 | ) 54 | 55 | if "%1" == "dirhtml" ( 56 | %SPHINXBUILD% -b dirhtml %ALLSPHINXOPTS% %BUILDDIR%/dirhtml 57 | if errorlevel 1 exit /b 1 58 | echo. 59 | echo.Build finished. The HTML pages are in %BUILDDIR%/dirhtml. 60 | goto end 61 | ) 62 | 63 | if "%1" == "singlehtml" ( 64 | %SPHINXBUILD% -b singlehtml %ALLSPHINXOPTS% %BUILDDIR%/singlehtml 65 | if errorlevel 1 exit /b 1 66 | echo. 67 | echo.Build finished. The HTML pages are in %BUILDDIR%/singlehtml. 68 | goto end 69 | ) 70 | 71 | if "%1" == "pickle" ( 72 | %SPHINXBUILD% -b pickle %ALLSPHINXOPTS% %BUILDDIR%/pickle 73 | if errorlevel 1 exit /b 1 74 | echo. 75 | echo.Build finished; now you can process the pickle files. 76 | goto end 77 | ) 78 | 79 | if "%1" == "json" ( 80 | %SPHINXBUILD% -b json %ALLSPHINXOPTS% %BUILDDIR%/json 81 | if errorlevel 1 exit /b 1 82 | echo. 83 | echo.Build finished; now you can process the JSON files. 84 | goto end 85 | ) 86 | 87 | if "%1" == "htmlhelp" ( 88 | %SPHINXBUILD% -b htmlhelp %ALLSPHINXOPTS% %BUILDDIR%/htmlhelp 89 | if errorlevel 1 exit /b 1 90 | echo. 91 | echo.Build finished; now you can run HTML Help Workshop with the ^ 92 | .hhp project file in %BUILDDIR%/htmlhelp. 93 | goto end 94 | ) 95 | 96 | if "%1" == "qthelp" ( 97 | %SPHINXBUILD% -b qthelp %ALLSPHINXOPTS% %BUILDDIR%/qthelp 98 | if errorlevel 1 exit /b 1 99 | echo. 100 | echo.Build finished; now you can run "qcollectiongenerator" with the ^ 101 | .qhcp project file in %BUILDDIR%/qthelp, like this: 102 | echo.^> qcollectiongenerator %BUILDDIR%\qthelp\Slybot.qhcp 103 | echo.To view the help file: 104 | echo.^> assistant -collectionFile %BUILDDIR%\qthelp\Slybot.ghc 105 | goto end 106 | ) 107 | 108 | if "%1" == "devhelp" ( 109 | %SPHINXBUILD% -b devhelp %ALLSPHINXOPTS% %BUILDDIR%/devhelp 110 | if errorlevel 1 exit /b 1 111 | echo. 112 | echo.Build finished. 113 | goto end 114 | ) 115 | 116 | if "%1" == "epub" ( 117 | %SPHINXBUILD% -b epub %ALLSPHINXOPTS% %BUILDDIR%/epub 118 | if errorlevel 1 exit /b 1 119 | echo. 120 | echo.Build finished. The epub file is in %BUILDDIR%/epub. 121 | goto end 122 | ) 123 | 124 | if "%1" == "latex" ( 125 | %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex 126 | if errorlevel 1 exit /b 1 127 | echo. 128 | echo.Build finished; the LaTeX files are in %BUILDDIR%/latex. 129 | goto end 130 | ) 131 | 132 | if "%1" == "text" ( 133 | %SPHINXBUILD% -b text %ALLSPHINXOPTS% %BUILDDIR%/text 134 | if errorlevel 1 exit /b 1 135 | echo. 136 | echo.Build finished. The text files are in %BUILDDIR%/text. 137 | goto end 138 | ) 139 | 140 | if "%1" == "man" ( 141 | %SPHINXBUILD% -b man %ALLSPHINXOPTS% %BUILDDIR%/man 142 | if errorlevel 1 exit /b 1 143 | echo. 144 | echo.Build finished. The manual pages are in %BUILDDIR%/man. 145 | goto end 146 | ) 147 | 148 | if "%1" == "texinfo" ( 149 | %SPHINXBUILD% -b texinfo %ALLSPHINXOPTS% %BUILDDIR%/texinfo 150 | if errorlevel 1 exit /b 1 151 | echo. 152 | echo.Build finished. The Texinfo files are in %BUILDDIR%/texinfo. 
153 | goto end 154 | ) 155 | 156 | if "%1" == "gettext" ( 157 | %SPHINXBUILD% -b gettext %I18NSPHINXOPTS% %BUILDDIR%/locale 158 | if errorlevel 1 exit /b 1 159 | echo. 160 | echo.Build finished. The message catalogs are in %BUILDDIR%/locale. 161 | goto end 162 | ) 163 | 164 | if "%1" == "changes" ( 165 | %SPHINXBUILD% -b changes %ALLSPHINXOPTS% %BUILDDIR%/changes 166 | if errorlevel 1 exit /b 1 167 | echo. 168 | echo.The overview file is in %BUILDDIR%/changes. 169 | goto end 170 | ) 171 | 172 | if "%1" == "linkcheck" ( 173 | %SPHINXBUILD% -b linkcheck %ALLSPHINXOPTS% %BUILDDIR%/linkcheck 174 | if errorlevel 1 exit /b 1 175 | echo. 176 | echo.Link check complete; look for any errors in the above output ^ 177 | or in %BUILDDIR%/linkcheck/output.txt. 178 | goto end 179 | ) 180 | 181 | if "%1" == "doctest" ( 182 | %SPHINXBUILD% -b doctest %ALLSPHINXOPTS% %BUILDDIR%/doctest 183 | if errorlevel 1 exit /b 1 184 | echo. 185 | echo.Testing of doctests in the sources finished, look at the ^ 186 | results in %BUILDDIR%/doctest/output.txt. 187 | goto end 188 | ) 189 | 190 | :end 191 | -------------------------------------------------------------------------------- /slyd/media/js/controllers/annotation-controller.js: -------------------------------------------------------------------------------- 1 | ASTool.AnnotationController = Em.ObjectController.extend(ASTool.BaseControllerMixin, 2 | ASTool.DocumentViewDataSource, ASTool.DocumentViewListener, { 3 | 4 | needs: ['application', 'template_index'], 5 | 6 | navigationLabelBinding: 'content.name', 7 | 8 | mappingAttribute: null, 9 | 10 | documentView: null, 11 | 12 | currentlySelectedElement: null, 13 | 14 | _selectingIgnore: false, 15 | 16 | highlightedElement: null, 17 | 18 | scrapedItemBinding: 'controllers.template_index.scrapedItem', 19 | 20 | urlBinding: 'controllers.template_index.url', 21 | 22 | currentVariantBinding: Em.Binding.oneWay("content.variant"), 23 | 24 | selectingIgnore: function(key, selectingIgnore) { 25 | if (arguments.length > 1) { 26 | this.set('_selectingIgnore', selectingIgnore); 27 | if (selectingIgnore) { 28 | this.set('documentView.restrictToDescendants', this.get('content.element')); 29 | this.set('documentView.partialSelectionEnabled', false); 30 | } else { 31 | this.set('documentView.restrictToDescendants', null); 32 | this.set('documentView.partialSelectionEnabled', true); 33 | } 34 | } 35 | return this.get('_selectingIgnore'); 36 | }.property('_selectingIgnore'), 37 | 38 | sprites: function() { 39 | var sprites = []; 40 | if (this.get('currentlySelectedElement')) { 41 | sprites.pushObject(ASTool.AnnotationSprite.create( 42 | { 'annotation': this.content, 43 | 'highlighted': 'true' })); 44 | } 45 | if (this.highlightedElement) { 46 | sprites.pushObject(ASTool.ElementSprite.create({ 47 | element: this.highlightedElement, 48 | fillColor: 'rgba(255,149,0,0.2)', 49 | strokeColor: 'rgba(255,149,0,0.6)', 50 | zPosition: 1000, 51 | })); 52 | } 53 | 54 | var annotationSprites = this.get('controllers.template_index.sprites').filter(function(sprite) { 55 | return sprite.get('annotation.id') != this.content.get('id'); 56 | }.bind(this)); 57 | 58 | var ignoredElements = this.get('content.ignores').map(function(ignore) { 59 | return ASTool.IgnoreSprite.create({ ignore: ignore }); 60 | }); 61 | 62 | return sprites.concat(annotationSprites).concat(ignoredElements); 63 | }.property('currentlySelectedElement', 64 | 'controllers.template_index.sprites', 65 | 'content.ignores.@each.highlighted', 66 | 'content.ignores.@each.ignoreBeneath', 67 
| 'highlightedElement'), 68 | 69 | clearGeneratedIns: function(insElement) { 70 | $(insElement).removePartialAnnotation(); 71 | }, 72 | 73 | cancelEdit: function() { 74 | this.set('content.selectedElement', null); 75 | if (this.get('content.generated') && 76 | this.get('content.element') != this.get('currentlySelectedElement')) { 77 | this.clearGeneratedIns(this.get('currentlySelectedElement')); 78 | } 79 | this.transitionToRoute('template'); 80 | }, 81 | 82 | saveEdit: function() { 83 | this.get('controllers.template_index').saveAnnotations(); 84 | this.transitionToRoute('template'); 85 | }, 86 | 87 | actions: { 88 | 89 | doneEditing: function(annotation) { 90 | this.saveEdit(annotation); 91 | }, 92 | 93 | cancelEdit: function(annotation) { 94 | this.cancelEdit(annotation); 95 | }, 96 | 97 | mapAttribute: function(attribute) { 98 | this.set('mappingAttribute', attribute); 99 | }, 100 | 101 | fieldSelected: function(field) { 102 | this.get('content').addMapping(this.get('mappingAttribute.name'), field); 103 | this.set('mappingAttribute', null); 104 | }, 105 | 106 | variantSelected: function(variant) { 107 | this.set('content.variant', variant); 108 | }, 109 | 110 | makeSticky: function(attribute) { 111 | this.get('controllers.template_index').makeSticky(this.get('content'), 112 | attribute.get('name')); 113 | }, 114 | 115 | unmapAttribute: function(attribute) { 116 | this.get('content').removeMapping(attribute.name); 117 | }, 118 | 119 | deleteIgnore: function(ignore) { 120 | this.get('content.ignores').removeObject(ignore); 121 | }, 122 | 123 | highlightElement: function(element) { 124 | this.set('highlightedElement', element); 125 | if (element) { 126 | this.documentView.scrollToElement(element); 127 | } 128 | }, 129 | 130 | selectElement: function(element) { 131 | this.documentActions['elementSelected'].call(this, element); 132 | } 133 | }, 134 | 135 | confirmChangeSelection: function() { 136 | return confirm(ASTool.Messages.get('confirm_change_selection')); 137 | }, 138 | 139 | documentActions: { 140 | 141 | elementSelected: function(element, partialSelection) { 142 | if (this.get('selectingIgnore')) { 143 | if (element) { 144 | this.get('content').addIgnore(element); 145 | } 146 | this.set('selectingIgnore', false); 147 | } else { 148 | var needsConfirmation = this.get('content.ignores').length || 149 | this.get('content.mappedAttributes').length; 150 | if (!needsConfirmation || this.confirmChangeSelection()) { 151 | if (this.get('content.generated')) { 152 | this.clearGeneratedIns(this.get('content.element')); 153 | } 154 | this.set('highlightedElement', null); 155 | this.set('content.selectedElement', element); 156 | this.set('content.generated', !!partialSelection); 157 | this.get('content').removeIgnores(); 158 | this.get('content').removeMappings(); 159 | this.set('currentlySelectedElement', element); 160 | } 161 | } 162 | }, 163 | 164 | partialSelection: function(selection) { 165 | var element = $('').get(0); 166 | selection.getRangeAt(0).surroundContents(element); 167 | this.documentActions['elementSelected'].call(this, element, true); 168 | selection.collapse(); 169 | }, 170 | }, 171 | 172 | willEnter: function() { 173 | this.get('documentView').config({ mode: 'select', 174 | listener: this, 175 | dataSource: this, 176 | partialSelects: true }); 177 | this.set('currentlySelectedElement', this.get('content.element')); 178 | }, 179 | 180 | willLeave: function() { 181 | this.set('selectingIgnore', false); 182 | this.set('currentlySelectedElement', null); 183 | }, 184 | }); 185 | 
--------------------------------------------------------------------------------
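
Usage sketch (illustrative): the slyd HTTP API documented in slyd/README.md above can be driven from Python as well as curl. The snippet below is a minimal sketch, not part of the repository; it assumes a slyd instance running on localhost:9001, the third-party requests library, and it reuses the example project id 78 and the pinterest.com spider from the README, which may not exist in your setup.

    # Minimal slyd API client sketch; endpoints follow slyd/README.md.
    # Assumptions: slyd on localhost:9001, `requests` installed, and a
    # project "78" that contains a "pinterest.com" spider.
    import requests

    BASE = "http://localhost:9001"

    # List existing projects (GET /projects).
    print(requests.get(BASE + "/projects").json())

    # Create a new project (POST a JSON command object to /projects).
    requests.post(BASE + "/projects",
                  json={"cmd": "create", "args": ["project_X"]})

    # Fetch a page and run extraction with an existing spider
    # (POST /projects/PROJECT_ID/bot/fetch).
    payload = {
        "request": {"url": "http://www.pinterest.com/pin/339740365610932893/"},
        "spider": "pinterest.com",
    }
    result = requests.post(BASE + "/projects/78/bot/fetch", json=payload).json()
    print(result.get("items"))

If the response contains an "error" key the fetch failed; otherwise "items", "links" and "fp" carry the extraction results described in the README.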