",
40 | items: [],
41 | };
42 |
43 | var itemsJson = {
44 | "default": {
45 | "fields": {
46 | "description": {
47 | "required": false,
48 | "type": "safe html",
49 | "vary": false
50 | },
51 | "images": {
52 | "required": true,
53 | "type": "image",
54 | "vary": true
55 | },
56 | }
57 | }
58 | };
--------------------------------------------------------------------------------
/slybot/slybot/baseurl.py:
--------------------------------------------------------------------------------
1 | """
2 | html page utils
3 | """
4 | import urlparse, re
5 | from scrapely.htmlpage import parse_html, HtmlTagType
6 |
7 | ABSURLRE = re.compile("^https?\:\/\/")
8 | DOCTYPERE = re.compile("<!DOCTYPE.*?>", re.S | re.I)
9 |
10 | def _is_abs_url(url):
11 | return bool(ABSURLRE.match(url))
12 |
13 | def insert_base_url(html, base):
14 | """
15 | Inserts the given base url if it does not exist in the html source,
16 | or replaces the existing one if needed
17 | """
18 | baseurl = baseelement = headelement = htmlelement = None
19 | for element in parse_html(html):
20 | if getattr(element, "tag", None) == "base":
21 | baseurl = element.attributes.get("href", None)
22 | baseelement = element
23 | elif getattr(element, "tag", None) == "head" and \
24 | element.tag_type == HtmlTagType.OPEN_TAG:
25 | headelement = element
26 | elif getattr(element, "tag", None) == "html" and \
27 | element.tag_type == HtmlTagType.OPEN_TAG:
28 | htmlelement = element
29 |
30 | if baseurl:
31 | if not _is_abs_url(baseurl):
32 | absurl = urlparse.urljoin(base, baseurl)
33 | # replace original base tag
34 | basetag = '<base href="%s"/>' % absurl
35 | html = html[:baseelement.start] + basetag + html[baseelement.end:]
36 |
37 | else:
38 | # Generate new base element and include
39 | basetag = '<base href="%s"/>' % base
40 | if headelement:
41 | insertpos = headelement.end
42 | else:
43 | if htmlelement:
44 | basetag = "<head>\n%s\n</head>" % basetag
45 | insertpos = htmlelement.end
46 | else:
47 | doctype_match = DOCTYPERE.search(html)
48 | if doctype_match:
49 | insertpos = doctype_match.end()
50 | else:
51 | insertpos = 0
52 | html = html[:insertpos] + basetag + html[insertpos:]
53 |
54 | return html
55 |
56 | def get_base_url(htmlpage):
57 | """Return the base url of the given HtmlPage"""
58 | for element in htmlpage.parsed_body:
59 | if getattr(element, "tag", None) == "base":
60 | return element.attributes.get("href") or htmlpage.url
61 | return htmlpage.url
62 |
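A minimal usage sketch of the two helpers above (not part of the repository; the sample markup and URLs are invented):

    from scrapely.htmlpage import HtmlPage
    from slybot.baseurl import insert_base_url, get_base_url

    raw = u"<html><head><title>t</title></head><body>hi</body></html>"
    fixed = insert_base_url(raw, "http://example.com/section/")
    # 'fixed' now carries '<base href="http://example.com/section/"/>' just
    # after the opening <head> tag

    page = HtmlPage(url="http://example.com/section/", body=fixed)
    print get_base_url(page)   # -> http://example.com/section/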
--------------------------------------------------------------------------------
/slyd/media/js/messages.js:
--------------------------------------------------------------------------------
1 | ASTool.Messages = Ember.Namespace.create({
2 | // Inline help messages.
3 | overlay_blocked_links: 'Enable this option to highlight links not followed at crawl time in red and followed links in green.',
4 | follow_links: 'Links that match any of the regular expressions in this list will be followed (they should also be in the domain of one of the start pages).',
5 | exclude_links: 'Links that match any of the regular expressions in this list will be excluded.',
6 | perform_login: 'Select this option if the site you are crawling requires login credentials.',
7 | template_required: 'This setting is equivalent to marking the fields as required in the item definition, but limiting the scope to this template only.',
8 | extractors: 'With regular expression extractors, the extracted data is matched against the specified expression and replaced by the match group enclosed between parentheses. If there is no match, the field is not extracted. Type extractors override the type specified in the item definition.',
9 | select_item: 'You can choose what item type is extracted by this template using the combobox. You can also create and modify items by clicking on the Edit Items button.',
10 | variant: 'By selecting a variant other than Base(0) in your annotation, the extracted data will be assigned to the special field variants of the base item, which is a list of objects similar to an item.',
11 | ignored_subregions: 'Allows you to define subregions that should be excluded from the extraction process.',
12 | selected_region_ancestors: 'Refine your selection by navigating its ancestors.',
13 | selected_region_children: 'Refine your selection by navigating its children.',
14 | sticky_fields: 'Required attributes are not extracted, but they must be present for a page to match the template.',
15 | annotation_widget: 'Select the attribute you want to extract and an item field to map it to. Choose -just required- to indicate that the template must match a particular feature without generating any extracted data. You can create new fields by clicking the + field button or by selecting the -create new- option from the field combobox.',
16 |
17 | // Other messages.
18 | confirm_change_selection: 'If you select a different region you will lose all current attribute mappings and ignored subregions, proceed anyway?',
19 | });
20 |
--------------------------------------------------------------------------------
/slyd/media/js/controllers/projects-controller.js:
--------------------------------------------------------------------------------
1 | ASTool.ProjectsIndexController = Em.ArrayController.extend(ASTool.BaseControllerMixin, {
2 | needs: ['application'],
3 |
4 | projectSite: null,
5 |
6 | createProjectDisabled: function() {
7 | return Em.isEmpty(this.get('projectSite'));
8 | }.property('projectSite'),
9 |
10 | actions: {
11 |
12 | openProject: function(projectName) {
13 | this.set('slyd.project', projectName);
14 | this.transitionToRoute('project', { id: projectName });
15 | },
16 |
17 | deleteProject: function(projectName) {
18 | if (confirm('Are you sure you want to delete this project? This operation cannot be undone.')) {
19 | this.get('slyd').deleteProject(projectName);
20 | this.removeObject(projectName);
21 | }
22 | },
23 |
24 | createProject: function() {
25 | var newProjectName = this.getUnusedName('new_project', this.get('content'));
26 | this.get('slyd').createProject(newProjectName).then(function() {
27 | this.set('slyd.project', newProjectName);
28 | // Initialize items spec.
29 | this.get('slyd').saveItems([
30 | ASTool.Item.create({ name: 'default', fields: [ ]
31 | })
32 | ]);
33 | // Initialize extractors spec.
34 | this.get('slyd').saveExtractors([]);
35 | // Setup automatic creation of an initial spider.
36 | this.set('controllers.application.siteWizard', this.get('projectSite'));
37 | this.set('projectSite', null);
38 | this.transitionToRoute('project', { id: newProjectName });
39 | }.bind(this));
40 | }
41 | },
42 |
43 | animateProjectSiteInput: function() {
44 | var animateBorderColor = function () {
45 | $('#projectSiteTextField')
46 | .animate({ 'border-color': 'rgba(88,150,220,0.4)', 'background-color': 'rgba(130,210,230,0.1)' }, 1000)
47 | .animate({ 'border-color': '#BBBBBB', 'background-color': '#FFFFFF' }, 1000, animateBorderColor)
48 | };
49 | Em.run.schedule('afterRender', this, function() {
50 | $('#projectSiteTextField')
51 | .hide()
52 | .show('fast', animateBorderColor)
53 | .click(function(e) {
54 | $('#projectSiteTextField').stop(true)
55 | .css({ 'border-color': '#BBBBBB', 'background-color': '#FFFFFF' });
56 | });
57 | });
58 | },
59 |
60 | willEnter: function() {
61 | this.get('documentView').showSpider();
62 | if (Em.isEmpty(this.get('content'))) {
63 | this.animateProjectSiteInput();
64 | }
65 | }
66 | });
67 |
--------------------------------------------------------------------------------
/slybot/slybot/item.py:
--------------------------------------------------------------------------------
1 | import hashlib
2 | from collections import defaultdict
3 |
4 | from scrapy.item import DictItem, Field
5 | from scrapely.descriptor import ItemDescriptor, FieldDescriptor
6 |
7 | from slybot.fieldtypes import FieldTypeManager
8 |
9 | class SlybotItem(DictItem):
10 | # like DictItem.__setitem__ but doesn't check the field is declared
11 | def __setitem__(self, name, value):
12 | self._values[name] = value
13 | @classmethod
14 | def create_iblitem_class(cls, schema):
15 | class IblItem(cls):
16 | fields = defaultdict(dict)
17 | version_fields = []
18 | for _name, _meta in schema['fields'].items():
19 | fields[_name] = Field(_meta)
20 | if not _meta.get("vary", False):
21 | version_fields.append(_name)
22 | version_fields = sorted(version_fields)
23 | return IblItem
24 |
25 | def create_slybot_item_descriptor(schema):
26 | field_type_manager = FieldTypeManager()
27 | descriptors = []
28 | for pname, pdict in schema['fields'].items():
29 | required = pdict['required']
30 | pclass = field_type_manager.type_processor_class(pdict['type'])
31 | processor = pclass()
32 | descriptor = SlybotFieldDescriptor(pname, pname, processor, required)
33 | descriptors.append(descriptor)
34 | return ItemDescriptor("", "", descriptors)
35 |
36 | class SlybotFieldDescriptor(FieldDescriptor):
37 | """Extends the scrapely field descriptor to use slybot fieldtypes and
38 | to be created from a slybot item schema
39 | """
40 |
41 | def __init__(self, name, description, field_type_processor, required=False):
42 | """Create a new SlybotFieldDescriptor with the given name and description.
43 | The field_type_processor is used for extraction and is publicly available
44 | """
45 | FieldDescriptor.__init__(self, name, description,
46 | field_type_processor.extract, required)
47 | # add an adapt method
48 | self.adapt = field_type_processor.adapt
49 |
50 | def create_item_version(item):
51 | """Item version based on hashlib.sha1 algorithm"""
52 | if not item.version_fields:
53 | return
54 | _hash = hashlib.sha1()
55 | for attrname in item.version_fields:
56 | _hash.update(repr(item.get(attrname)))
57 | return _hash.digest()
58 |
59 |
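A hedged sketch of how a schema shaped like the "default" entry of the items JSON near the top of this dump could be fed to the helpers above (field names and values are invented; it assumes the "safe html" and "image" types are registered in FieldTypeManager, as the items spec suggests):

    from slybot.item import SlybotItem, create_slybot_item_descriptor, \
        create_item_version

    schema = {
        "fields": {
            "description": {"required": False, "type": "safe html", "vary": False},
            "images": {"required": True, "type": "image", "vary": True},
        }
    }

    IblItem = SlybotItem.create_iblitem_class(schema)
    descriptor = create_slybot_item_descriptor(schema)

    item = IblItem()
    item["description"] = u"A sample description"
    item["images"] = [u"http://example.com/a.jpg"]
    # only non-varying fields ("description" here) feed the version hash
    print repr(create_item_version(item))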
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | portia
2 | ======
3 | Visual scraping for Scrapy.
4 |
5 |
6 | Overview
7 | ========
8 |
9 | Portia is a tool for visually scraping web sites without any programming knowledge required. Just annotate web pages with a point-and-click editor to indicate what data you want to extract, and Portia will learn how to scrape similar pages
10 | from the site.
11 |
12 | Portia has a web-based UI served by a [Twisted] server, so you can install it on almost any modern platform.
13 |
14 | Requirements
15 | ============
16 |
17 | * Python 2.7
18 | * Works on Linux, Windows, Mac OS X and BSD
19 | * Supported browsers: Latest versions of Chrome (recommended) or Firefox
20 |
21 |
22 | Repository structure
23 | ====================
24 |
25 | There are two main components in this repository, __slyd__ and __slybot__:
26 |
27 | ### slyd
28 |
29 | The visual editor used to create your scraping projects.
30 |
31 | ### slybot
32 |
33 | The Python web crawler that performs the actual site scraping. It's implemented on top of the [Scrapy] web crawling
34 | framework and the [Scrapely] extraction library. It uses projects created with __slyd__ as input.
35 |
36 |
37 | How to install portia
38 | =============================
39 |
40 | The recommended way to install dependencies is to use __virtualenv__ and then do:
41 |
42 | cd slyd
43 | pip install -r requirements.txt
44 |
45 | As __slybot__ is a __slyd__ dependency, it will also get installed.
46 |
47 | Running portia
48 | ==============
49 |
50 | First, you need to start the UI and create a project. Run __slyd__ using:
51 |
52 | cd slyd
53 | twistd -n slyd
54 |
55 | and point your browser to: `http://localhost:9001/static/main.html`
56 |
57 | Choose the site you want to scrape and create a project. Every project is created with a default spider named after the domain of the site you are scraping. When you are ready, you can run your project with __slybot__ to do the actual crawling/extraction.
58 |
59 | Projects created with __slyd__ can be found at:
60 |
61 | slyd/data/projects
62 |
63 | To run one of those projects use:
64 |
65 | portiacrawl project_path spidername
66 |
67 | Where `spidername` should be one of the project spiders. If you don't remember the name of the spider, just use:
68 |
69 | portiacrawl project_path
70 |
71 | and you will get the list of spiders for that project.
72 |
73 |
74 | [Twisted]: https://twistedmatrix.com
75 | [Scrapely]: https://github.com/scrapy/scrapely
76 | [Scrapy]: http://scrapy.org
77 |
--------------------------------------------------------------------------------
/slybot/slybot/tests/test_schema_validation.py:
--------------------------------------------------------------------------------
1 | import re
2 |
3 | from unittest import TestCase
4 | from os.path import dirname, join
5 |
6 | from slybot.validation.schema import get_schema_validator, \
7 | ValidationError, validate_project_schema
8 | from slybot.utils import open_project_from_dir
9 |
10 | _TEST_PROJECT_DIR = join(dirname(__file__), "data/SampleProject")
11 |
12 | class JsonSchemaTest(TestCase):
13 |
14 | def assertRaisesRegexp(self, eclass, pattern, func, *args):
15 | """assertRaisesRegexp is not provided in python versions below 2.7"""
16 | try:
17 | func(*args)
18 | except eclass, e:
19 | m = re.search(pattern, e.message)
20 | if not m:
21 | raise AssertionError('"%s" does not match "%s"' % (pattern, e.message))
22 | else:
23 | raise AssertionError("%s not raised" % eclass.__name__)
24 |
25 | def test_regex_formatting_wrong(self):
26 | obj = {
27 | "0": {
28 | "regular_expression": "Item: (\d+"
29 | }
30 | }
31 | validator = get_schema_validator("extractors")
32 | self.assertRaisesRegexp(ValidationError, "Invalid regular expression",
33 | validator.validate, obj)
34 |
35 | def test_regex_formatting_ok(self):
36 | obj = {
37 | "0": {
38 | "regular_expression": "Item: (\d+)"
39 | }
40 | }
41 | validator = get_schema_validator("extractors")
42 | self.assertEqual(validator.validate(obj), None)
43 |
44 | def test_valid_url(self):
45 | obj = {
46 | "start_urls": ['http://www.example.com/'],
47 | "links_to_follow": "none",
48 | "respect_nofollow": True,
49 | "templates": [],
50 | }
51 | validator = get_schema_validator("spider")
52 | self.assertEqual(validator.validate(obj), None)
53 |
54 | def test_invalid_url(self):
55 | obj = {
56 | "start_urls": ['www.example.com'],
57 | "links_to_follow": "none",
58 | "respect_nofollow": True,
59 | "templates": [],
60 | }
61 | validator = get_schema_validator("spider")
62 | self.assertRaisesRegexp(ValidationError, "Invalid url:", validator.validate, obj)
63 |
64 | def test_test_project(self):
65 | specs = open_project_from_dir(_TEST_PROJECT_DIR)
66 | self.assertTrue(validate_project_schema(specs))
67 |
68 |
--------------------------------------------------------------------------------
/slyd/tests/test_bot.py:
--------------------------------------------------------------------------------
1 | import json
2 | from os.path import join
3 | from twisted.trial import unittest
4 | from twisted.internet.defer import inlineCallbacks
5 | from twisted.web.server import Site
6 | from twisted.web.static import File
7 | from twisted.internet import reactor
8 | from slyd.bot import create_bot_resource
9 | from .utils import TestSite, test_spec_manager
10 | from .settings import RESOURCE_DIR
11 |
12 |
13 | class BotTest(unittest.TestCase):
14 | def setUp(self):
15 | # configure bot resource
16 | sm = test_spec_manager()
17 | self.bot_resource = create_bot_resource(sm)
18 | self.botsite = TestSite(self.bot_resource)
19 |
20 | # configure fake website to crawl
21 | docroot = join(RESOURCE_DIR, 'docroot')
22 | factory = Site(File(docroot))
23 | self.listen_port = reactor.listenTCP(8997, factory)
24 |
25 |
26 | def _fetch(self, url, **params):
27 | req = dict(params)
28 | req.setdefault('request', {})['url'] = url
29 | request_json = json.dumps(req)
30 | return self.botsite.post('fetch', data=request_json)
31 |
32 | @inlineCallbacks
33 | def test_fetch(self):
34 | # test status code
35 | result = yield self._fetch("http://localhost:8997/notexists")
36 | self.assertEqual(result.responseCode, 200)
37 | status = json.loads(result.value())['response']['status']
38 | self.assertEqual(status, 404)
39 |
40 | # get an existing file
41 | test_url = "http://localhost:8997/test.html"
42 | result = yield self._fetch(test_url)
43 | self.assertEqual(result.responseCode, 200)
44 | value = json.loads(result.value())
45 | # expect 200 response and base href added
46 | self.assertEqual(value['response']['status'], 200)
47 | self.assertIn('<base href=', value['page'])
--------------------------------------------------------------------------------
/slybot/slybot/extractors.py:
--------------------------------------------------------------------------------
12 | >>> extractor = create_regex_extractor("(\d+).*(\.\d+)")
13 | >>> extractor(u"The price of this product is <div>45</div><div>.50</div> pounds")
14 | u'45.50'
15 | """
16 | ereg = re.compile(pattern, re.S)
17 | def _extractor(txt):
18 | m = ereg.search(txt)
19 | if m:
20 | return htmlregion(u"".join(filter(None, m.groups() or m.group())))
21 |
22 | _extractor.__name__ = "Regex: %s" % pattern.encode("utf-8")
23 | return _extractor
24 |
25 | class PipelineExtractor:
26 | def __init__(self, *extractors):
27 | self.extractors = extractors
28 |
29 | def __call__(self, value):
30 | for extractor in self.extractors:
31 | value = extractor(value) if value else value
32 | return value
33 |
34 | @property
35 | def __name__(self):
36 | return repr(self.extractors)
37 |
38 |
39 | def apply_extractors(descriptor, template_extractors, extractors):
40 | field_type_manager = FieldTypeManager()
41 |
42 | for field_name, field_extractors in template_extractors.items():
43 | equeue = []
44 | for eid in field_extractors:
45 | extractor_doc = extractors[eid]
46 | if "regular_expression" in extractor_doc:
47 | equeue.append(create_regex_extractor(extractor_doc["regular_expression"]))
48 | elif "type_extractor" in extractor_doc: # overrides default one
49 | descriptor.attribute_map[field_name] = SlybotFieldDescriptor(field_name,
50 | field_name, field_type_manager.type_processor_class(extractor_doc["type_extractor"])())
51 | if not field_name in descriptor.attribute_map:
52 | # if no type extractor is defined, use the text type by default, as it is by far the most commonly used
53 | descriptor.attribute_map[field_name] = SlybotFieldDescriptor(field_name,
54 | field_name, field_type_manager.type_processor_class("text")())
55 |
56 | if equeue:
57 | equeue.insert(0, descriptor.attribute_map[field_name].extractor)
58 | descriptor.attribute_map[field_name].extractor = PipelineExtractor(*equeue)
59 |
60 |
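A hedged sketch of the kind of pipeline apply_extractors assembles, wired up by hand (the pattern and markup are invented, and the import path follows the file header above):

    from scrapely.extractors import htmlregion
    from slybot.fieldtypes import FieldTypeManager
    from slybot.extractors import create_regex_extractor, PipelineExtractor

    # default extraction for the field: the plain "text" type processor
    text_extract = FieldTypeManager().type_processor_class("text")().extract
    # a regex extractor appended afterwards, as apply_extractors does via equeue
    price_regex = create_regex_extractor(u"(\d+\.\d+)")

    pipeline = PipelineExtractor(text_extract, price_regex)
    region = htmlregion(u"Price: <b>45.50</b> pounds")
    print repr(pipeline(region))   # -> u'45.50'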
--------------------------------------------------------------------------------
/slybot/slybot/fieldtypes/text.py:
--------------------------------------------------------------------------------
1 | """
2 | Text types
3 | """
4 | from scrapely.extractors import text as extract_text, safehtml
5 |
6 | class _BaseTextProcessor(object):
7 | """basic text processor, defines identity functions, some of which
8 | are overridden in subclasses
9 | """
10 | def extract(self, text):
11 | """Matches and extracts any string, as it is"""
12 | return text
13 |
14 | def adapt(self, text, htmlpage):
15 | return text
16 |
17 | class RawFieldTypeProcessor(_BaseTextProcessor):
18 | """Extracts the raw data, without processing. Data is escaped for presentation
19 |
20 | >>> from scrapely.extractors import htmlregion
21 | >>> r = RawFieldTypeProcessor()
22 | >>> html = htmlregion(u'<p>test</p>')
23 | >>> r.extract(html)
24 | u'<p>test</p>'
25 | >>> r.adapt(html, None)
26 | u'<p>test</p>'
27 | """
28 | name = 'raw html'
29 | description = 'raw html as it appears in the page'
30 |
31 | class TextFieldTypeProcessor(_BaseTextProcessor):
32 | """Extracts strings, removing all HTML markup
33 |
34 | >>> from scrapely.extractors import htmlregion
35 | >>> p = TextFieldTypeProcessor()
36 | >>> html = htmlregion(u'<br/>')
43 | >>> p.extract(html)
44 | u''
45 | """
46 | name = 'text'
47 | description = 'extracts text from web pages, cleaning all markup'
48 |
49 | def extract(self, htmlregion):
50 | return extract_text(htmlregion.text_content)
51 |
52 |
53 | class SafeHtmlFieldTypeProcessor(_BaseTextProcessor):
54 | """Extracts strings, with only a safe subset of HTML remaining
55 |
56 | Extraction checks for presence of text content, and adapt transforms the HTML
57 | >>> from scrapely.extractors import htmlregion
58 | >>> p = SafeHtmlFieldTypeProcessor()
59 | >>> html = htmlregion(u'