├── scrapy_eagle
│   ├── __init__.py
│   ├── worker
│   │   ├── __init__.py
│   │   ├── picklecompat.py
│   │   ├── connection.py
│   │   ├── dupefilter.py
│   │   ├── queue.py
│   │   ├── spiders.py
│   │   └── scheduler.py
│   └── dashboard
│       ├── __init__.py
│       ├── views
│       │   ├── __init__.py
│       │   ├── root.py
│       │   ├── react_app.py
│       │   ├── servers.py
│       │   ├── processes.py
│       │   └── jobs.py
│       ├── .babelrc
│       ├── templates
│       │   ├── static
│       │   │   ├── css
│       │   │   │   ├── bundle.css.map
│       │   │   │   ├── bundle.css
│       │   │   │   └── main.css
│       │   │   ├── img
│       │   │   │   └── system-logo.jpg
│       │   │   └── js
│       │   │       └── vendor
│       │   │           └── jquery.navgoco.min.js
│       │   └── index.html
│       ├── react-src
│       │   ├── components
│       │   │   ├── Home.jsx
│       │   │   ├── jobs
│       │   │   │   ├── Root.jsx
│       │   │   │   ├── JobsConfig.scss
│       │   │   │   ├── JobsConfig.jsx
│       │   │   │   └── JobsItem.jsx
│       │   │   ├── servers
│       │   │   │   ├── Root.jsx
│       │   │   │   ├── ServerSubProcess.jsx
│       │   │   │   ├── ServerSet.jsx
│       │   │   │   └── ServerNode.jsx
│       │   │   ├── ListItem.jsx
│       │   │   ├── App.scss
│       │   │   ├── List.jsx
│       │   │   └── App.jsx
│       │   ├── services
│       │   │   └── httpservice.js
│       │   ├── reducers
│       │   │   ├── servers.jsx
│       │   │   └── jobs.jsx
│       │   └── main.jsx
│       ├── utils
│       │   ├── __init__.py
│       │   ├── spiderskit.py
│       │   ├── commandskit.py
│       │   ├── ip.py
│       │   └── processkit.py
│       ├── green_threads
│       │   ├── __init__.py
│       │   ├── heartbeat.py
│       │   ├── executor.py
│       │   └── stats.py
│       ├── webpack.config.dev.js
│       ├── webpack.config.prod.js
│       ├── package.json
│       ├── settings.py
│       ├── memory.py
│       └── main.py
├── docs
│   └── images
│       └── logo_readme.jpg
├── .travis.yml
├── requirements.txt
├── MANIFEST.in
├── pytest.ini
├── tox.ini
├── generator.py
├── setup.py
├── tests
│   └── test_queue.py
├── .gitignore
└── README.rst
/scrapy_eagle/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/scrapy_eagle/worker/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/scrapy_eagle/dashboard/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/scrapy_eagle/dashboard/views/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/scrapy_eagle/dashboard/.babelrc:
--------------------------------------------------------------------------------
1 | {
2 | "presets": ["react", "es2015"]
3 | }
4 |
--------------------------------------------------------------------------------
/docs/images/logo_readme.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rafaelcapucho/scrapy-eagle/HEAD/docs/images/logo_readme.jpg
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | language: python
2 | python:
3 | - "3.5"
4 | install: "pip install -r requirements.txt"
5 | script: nosetests
6 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | flask
2 | pymongo
3 | requests
4 | redis
5 | scrapy>=1.1.0
6 | flask-socketio
7 | flask-cors
8 | gevent
9 | psutil
--------------------------------------------------------------------------------
/scrapy_eagle/dashboard/templates/static/css/bundle.css.map:
--------------------------------------------------------------------------------
1 | {"version":3,"sources":[],"names":[],"mappings":"","file":"../css/bundle.css","sourceRoot":""}
--------------------------------------------------------------------------------
/scrapy_eagle/dashboard/templates/static/img/system-logo.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rafaelcapucho/scrapy-eagle/HEAD/scrapy_eagle/dashboard/templates/static/img/system-logo.jpg
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | graft docs
2 |
3 | include *.in
4 | include *.ini
5 | include *.rst
6 | include *.txt
7 |
8 | recursive-include scrapy_eagle/dashboard/templates *
9 |
10 | global-exclude __pycache__ *.py[cod]
11 |
--------------------------------------------------------------------------------
/scrapy_eagle/dashboard/views/root.py:
--------------------------------------------------------------------------------
1 | import json
2 | import flask
3 |
4 |
5 | root = flask.Blueprint('root', __name__)
6 |
7 |
8 | @root.route('/')
9 | def index():
10 |
11 | return flask.redirect('/app')
12 |
--------------------------------------------------------------------------------
/pytest.ini:
--------------------------------------------------------------------------------
1 | [pytest]
2 | norecursedirs =
3 | .*
4 | dist
5 | build
6 | python_files =
7 | test_*.py
8 | *_test.py
9 | tests.py
10 | ignore =
11 | setup.py
12 | addopts =
13 | -rxEfsw -v
14 |
--------------------------------------------------------------------------------
/scrapy_eagle/dashboard/react-src/components/Home.jsx:
--------------------------------------------------------------------------------
1 | var React = require('react');
2 |
3 | var Home = React.createClass({
4 | render: function() {
5 | return <div>App Home</div>;
6 | }
7 | });
8 |
9 | module.exports = Home;
10 |
--------------------------------------------------------------------------------
/tox.ini:
--------------------------------------------------------------------------------
1 | [tox]
2 | envlist = py{35}-scrapy{11}
3 |
4 | [testenv]
5 | basepython =
6 | py35: python3.5
7 | deps =
8 | -rrequirements.txt
9 | commands =
10 | scrapy11: pip install scrapy>=1.1,<1.2
11 | {posargs:py.test}
12 |
--------------------------------------------------------------------------------
/scrapy_eagle/dashboard/react-src/components/jobs/Root.jsx:
--------------------------------------------------------------------------------
1 | import React from 'react'
2 |
3 | export default class SpiderRoot extends React.Component {
4 | constructor(props){
5 | super(props);
6 | }
7 |
8 | render(){
9 | return this.props.children;
10 | }
11 | }
12 |
--------------------------------------------------------------------------------
/scrapy_eagle/dashboard/react-src/components/servers/Root.jsx:
--------------------------------------------------------------------------------
1 | import React from 'react'
2 |
3 | export default class ServerRoot extends React.Component {
4 | constructor(props){
5 | super(props);
6 | }
7 |
8 | render(){
9 | return this.props.children;
10 | }
11 | }
12 |
--------------------------------------------------------------------------------
/scrapy_eagle/dashboard/views/react_app.py:
--------------------------------------------------------------------------------
1 | import flask
2 |
3 |
4 | react_app = flask.Blueprint('app', __name__)
5 |
6 |
7 | @react_app.route('/', defaults={'path': ''})
8 | @react_app.route('/<path:path>')
9 | def app(path):
10 | return flask.render_template('index.html')
11 |
--------------------------------------------------------------------------------
/scrapy_eagle/worker/picklecompat.py:
--------------------------------------------------------------------------------
1 | """A pickle wrapper module with protocol=-1 by default."""
2 |
3 | try:
4 | import cPickle as pickle # PY2
5 | except ImportError:
6 | import pickle
7 |
8 |
9 | def loads(s):
10 | return pickle.loads(s)
11 |
12 |
13 | def dumps(obj):
14 | return pickle.dumps(obj, protocol=-1)
15 |
--------------------------------------------------------------------------------
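A quick round trip with the wrapper above; a minimal sketch, where the payload dict is only an illustration:

    from scrapy_eagle.worker import picklecompat

    payload = {'url': 'http://example.com', 'meta': {'foo': 'bar'}}
    raw = picklecompat.dumps(payload)      # bytes, highest available pickle protocol
    assert picklecompat.loads(raw) == payload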
/scrapy_eagle/dashboard/utils/__init__.py:
--------------------------------------------------------------------------------
1 | from datetime import datetime
2 | from calendar import timegm
3 |
4 |
5 | def iso_to_timestamp(iso):
6 | epoch = timegm(datetime.strptime(iso, "%Y-%m-%dT%H:%M:%S.%f").timetuple())
7 | assert isinstance(epoch, int)
8 | return epoch
9 |
10 |
11 | def timestamp_to_utc(ts):
12 | return datetime.utcfromtimestamp(ts)
13 |
--------------------------------------------------------------------------------
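A small illustration of the two helpers above, assuming an ISO-8601 string with microseconds such as the ones produced by datetime.utcnow().isoformat() elsewhere in the project:

    from scrapy_eagle.dashboard.utils import iso_to_timestamp, timestamp_to_utc

    iso = '2016-08-31T04:17:51.200187'
    ts = iso_to_timestamp(iso)   # integer seconds since the epoch (microseconds dropped)
    dt = timestamp_to_utc(ts)    # back to a naive UTC datetime
    assert dt.isoformat() == '2016-08-31T04:17:51'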
/scrapy_eagle/dashboard/react-src/services/httpservice.js:
--------------------------------------------------------------------------------
1 | var Fetch = require('whatwg-fetch');
2 | var baseUrl = 'http://localhost:6060';
3 |
4 | var service = {
5 | get: function(url) {
6 | return fetch(baseUrl + url)
7 | .then(function(response) {
8 | return response.json();
9 | });
10 | }
11 | };
12 |
13 | module.exports = service;
14 |
--------------------------------------------------------------------------------
/scrapy_eagle/dashboard/react-src/components/ListItem.jsx:
--------------------------------------------------------------------------------
1 | var React = require('react');
2 |
3 | var ListItem = React.createClass({
4 |
5 | render: function() {
6 | return (
7 | <li>
8 | {this.props.memory_used_mb} - {this.props.memory_available_mb}
9 | </li>
10 | );
11 | }
12 |
13 | });
14 |
15 | module.exports = ListItem;
16 |
--------------------------------------------------------------------------------
/generator.py:
--------------------------------------------------------------------------------
1 | import sys
2 |
3 | from time import sleep
4 |
5 | # When the dashboard receives a KeyboardInterrupt,
6 | # the subprocesses also receive a KeyboardInterrupt,
7 | # which you may or may not catch.
8 |
9 | try:
10 | n = 1
11 | while True:
12 |
13 | print(n)
14 |
15 | n += 1
16 |
17 | #sys.stdout.flush()
18 |
19 | sleep(1)
20 |
21 | if n % 20 == 0: break
22 |
23 | print(' ')
24 |
25 | except (KeyboardInterrupt, SystemExit):
26 | print('closed')
--------------------------------------------------------------------------------
/scrapy_eagle/dashboard/react-src/components/jobs/JobsConfig.scss:
--------------------------------------------------------------------------------
1 | div.scheduler {
2 |
3 | h1 {
4 | //margin: 30px 0 35px 0;
5 | }
6 |
7 | label.col-form-label {
8 | font-size: 80%;
9 | }
10 |
11 | div.odd {
12 | background-color: #3b3e42;
13 | }
14 |
15 | div.even {
16 | background-color: #2a2d2f;
17 | }
18 |
19 | div.jobTitle {
20 | margin: 10px 0 16px 0;
21 | font-size: 85%;
22 | color: #00b280;
23 | font-weight: bold;
24 | }
25 |
26 | div.box-legends {
27 | margin-top: 35px;
28 |
29 | li {
30 | font-size: 12px;
31 | }
32 |
33 | }
34 | }
--------------------------------------------------------------------------------
/scrapy_eagle/dashboard/react-src/reducers/servers.jsx:
--------------------------------------------------------------------------------
1 | const initialState = {
2 | servers_qty: 0,
3 | };
4 |
5 | export const INCREASE_SERVER = 'INCREASE_SERVER';
6 | export const SET_SERVER_QTY = 'SET_SERVER_QTY';
7 |
8 | export default function stats(state = initialState, action) {
9 |
10 | switch (action.type) {
11 |
12 | case INCREASE_SERVER:
13 |
14 | return Object.assign({}, state, {
15 | servers_qty: state.servers_qty + 1
16 | });
17 |
18 | case SET_SERVER_QTY:
19 |
20 | return Object.assign({}, state, {
21 | servers_qty: action.qty
22 | });
23 |
24 | default:
25 | return state;
26 | }
27 | }
--------------------------------------------------------------------------------
/scrapy_eagle/dashboard/utils/spiderskit.py:
--------------------------------------------------------------------------------
1 | import subprocess
2 |
3 | from scrapy_eagle.dashboard import settings
4 |
5 |
6 | def find_spiders():
7 |
8 | _config = settings.get_config_file()
9 |
10 | base_dir = _config.get('scrapy', 'base_dir')
11 | binary = _config.get('scrapy', 'binary')
12 |
13 | spiders = []
14 |
15 | with subprocess.Popen(
16 | [binary, 'list'],
17 | cwd=base_dir,
18 | stdout=subprocess.PIPE,
19 | bufsize=1,
20 | universal_newlines=True
21 | ) as p:
22 | for line in p.stdout:
23 | spiders.append(line.strip())
24 |
25 | return spiders
26 |
--------------------------------------------------------------------------------
/scrapy_eagle/dashboard/templates/static/css/bundle.css:
--------------------------------------------------------------------------------
1 | body{background-color:#323539;color:#f5f5f5;font-size:100%;margin:0;padding:0;position:relative;text-rendering:optimizelegibility}a:active,a:hover,a:link,a:visited{color:#fff;outline:medium none;text-decoration:none}h1,h2,h3,h4,h5,h6{color:#f5f5f5;font-family:Montserrat,sans-serif;margin:20px 0 25px}h1{font-size:1.375em}h2{font-size:1.188em}h3{font-size:1.063em}h4{font-size:.938em}h5{font-size:.813em}h6{font-size:.75em}div.scheduler label.col-form-label{font-size:80%}div.scheduler div.odd{background-color:#3b3e42}div.scheduler div.even{background-color:#2a2d2f}div.scheduler div.jobTitle{margin:10px 0 16px;font-size:85%;color:#00b280;font-weight:700}div.scheduler div.box-legends{margin-top:35px}div.scheduler div.box-legends li{font-size:12px}
--------------------------------------------------------------------------------
/scrapy_eagle/dashboard/green_threads/__init__.py:
--------------------------------------------------------------------------------
1 | import gevent
2 |
3 | from scrapy_eagle.dashboard import settings
4 | from scrapy_eagle.dashboard.utils import spiderskit, commandskit
5 |
6 |
7 | def find_new_spiders():
8 |
9 | while True:
10 |
11 | # Open the process and execute Scrapy's list command
12 | _spiders = spiderskit.find_spiders()
13 |
14 | # Install the list of spiders names
15 | settings._spiders = _spiders
16 |
17 | gevent.sleep(10)
18 |
19 |
20 | def find_new_commands():
21 |
22 | while True:
23 |
24 | # Monitoring the command folder
25 | _commands = commandskit.find_commands()
26 |
27 | # Install the list of commands names
28 | settings._commands = _commands
29 |
30 | gevent.sleep(5)
--------------------------------------------------------------------------------
/scrapy_eagle/dashboard/react-src/components/App.scss:
--------------------------------------------------------------------------------
1 | body {
2 | /*font-size: 12px;*/
3 | /*font-family: Arial, Verdana, sans-serif;*/
4 | background-color: #323539;
5 | color: whitesmoke;
6 | font-size: 100%;
7 | margin: 0;
8 | padding: 0;
9 | position: relative;
10 | text-rendering: optimizelegibility;
11 | }
12 |
13 | a:link, a:visited {
14 | color: white;
15 | outline: medium none;
16 | text-decoration: none;
17 | }
18 | a:hover, a:active {
19 | color: white;
20 | outline: medium none;
21 | text-decoration: none;
22 | }
23 |
24 |
25 | h1, h2, h3, h4, h5, h6 {
26 | color: whitesmoke;
27 | font-family: "Montserrat", sans-serif;
28 | margin: 20px 0 25px 0;
29 |
30 | }
31 |
32 | h1 {font-size: 1.375em;}
33 | h2 {font-size: 1.188em;}
34 | h3 {font-size: 1.063em;}
35 | h4 {font-size: 0.938em;}
36 | h5 {font-size: 0.813em;}
37 | h6 {font-size: 0.75em;}
--------------------------------------------------------------------------------
/scrapy_eagle/dashboard/utils/commandskit.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | from scrapy_eagle.dashboard import settings
4 |
5 |
6 | def load_commands_name(dir):
7 |
8 | if os.path.exists(dir):
9 |
10 | module_names = []
11 |
12 | for d in os.listdir(dir):
13 | if d.find("__init__") == -1 and d.endswith('.py'):
14 |
15 | # Remove possible spaces
16 | d = d.replace(" ", "")
17 |
18 | # Remove the Extension
19 | d = ".".join(d.split(".")[:-1])
20 |
21 | module_names.append(d)
22 |
23 | module_names.sort()
24 |
25 | return module_names
26 |
27 | else:
28 | return []
29 |
30 |
31 | def find_commands():
32 |
33 | _config = settings.get_config_file()
34 |
35 | base_dir = _config.get('commands', 'base_dir')
36 |
37 | return load_commands_name(dir=base_dir)
38 |
--------------------------------------------------------------------------------
/scrapy_eagle/dashboard/webpack.config.dev.js:
--------------------------------------------------------------------------------
1 | var webpack = require('webpack');
2 | var path = require('path');
3 |
4 | var ExtractTextPlugin = require('extract-text-webpack-plugin');
5 |
6 | var BUILD_JS_DIR = path.resolve(__dirname, 'templates/static/js');
7 | var APP_DIR = path.resolve(__dirname, 'react-src');
8 |
9 | var config = {
10 | entry: APP_DIR + '/main.jsx',
11 | output: {
12 | path: BUILD_JS_DIR,
13 | filename: 'bundle.js'
14 | },
15 | module : {
16 | loaders : [
17 | {
18 | test : /\.jsx?/,
19 | include : APP_DIR,
20 | loader : 'babel'
21 | },
22 | {
23 | test: /\.scss$/,
24 | //loaders: ['style', 'css', 'sass']
25 | loader: ExtractTextPlugin.extract('css!sass')
26 | }
27 | ]
28 | },
29 | plugins: [
30 | new ExtractTextPlugin('../css/bundle.css', {
31 | allChunks: true
32 | })
33 | ]
34 | };
35 |
36 | module.exports = config;
--------------------------------------------------------------------------------
/scrapy_eagle/dashboard/react-src/components/List.jsx:
--------------------------------------------------------------------------------
1 | var React = require('react');
2 | var ListItem = require('./ListItem.jsx');
3 | var HTTP = require('../services/httpservice');
4 |
5 | var List = React.createClass({
6 | getInitialState: function() {
7 | return {resources: []};
8 | },
9 | componentWillMount: function() {
10 |
11 | this.socket = io.connect('http://127.0.0.1:5000/resources');
12 | this.socket.on('resources_info', function (msg) {
13 | this.setState({resources: msg.data.sub});
14 | }.bind(this));
15 |
16 | },
17 | render: function() {
18 | /*var listItems = this.state.resources.map(function(item) {
19 | return ;
23 | });
24 |
25 | return ();*/
26 | }
27 | });
28 |
29 | module.exports = List;
30 |
--------------------------------------------------------------------------------
/scrapy_eagle/dashboard/views/servers.py:
--------------------------------------------------------------------------------
1 | from datetime import datetime
2 |
3 | import json
4 | import flask
5 |
6 | from scrapy_eagle.dashboard.memory import get_connection
7 |
8 |
9 | servers = flask.Blueprint('servers', __name__)
10 |
11 |
12 | @servers.route('/list')
13 | def listing():
14 |
15 | now = datetime.now()
16 |
17 | redis_conn = get_connection()
18 |
19 | _servers = redis_conn.zrangebyscore('eagle_servers', now.timestamp(), max='+inf')
20 |
21 | results = []
22 |
23 | for entry in _servers:
24 | parts = entry.decode('utf-8').split("-")
25 | ip, hostname = parts[0], "-".join(parts[1:])
26 | results.append({'public_ip': ip, 'hostname': hostname})
27 |
28 | # Sets in Redis usually return in random order, so sort by hostname
29 | results = sorted(results, key=lambda x: x['hostname'])
30 |
31 | return flask.Response(
32 | response=json.dumps(results, sort_keys=True),
33 | status=200,
34 | mimetype="application/json"
35 | )
36 |
--------------------------------------------------------------------------------
/scrapy_eagle/dashboard/webpack.config.prod.js:
--------------------------------------------------------------------------------
1 | var webpack = require('webpack');
2 | var path = require('path');
3 |
4 | var ExtractTextPlugin = require('extract-text-webpack-plugin');
5 |
6 | var BUILD_JS_DIR = path.resolve(__dirname, 'templates/static/js');
7 | var APP_DIR = path.resolve(__dirname, 'react-src');
8 |
9 | var config = {
10 | entry: APP_DIR + '/main.jsx',
11 | output: {
12 | path: BUILD_JS_DIR,
13 | filename: 'bundle.js'
14 | },
15 | plugins: [
16 | new webpack.optimize.OccurrenceOrderPlugin(),
17 | new webpack.DefinePlugin({
18 | 'process.env': {
19 | 'NODE_ENV': JSON.stringify('production')
20 | }
21 | }),
22 | new webpack.optimize.UglifyJsPlugin({
23 | compressor: {
24 | warnings: false
25 | }
26 | }),
27 | new ExtractTextPlugin('../css/bundle.css', {
28 | allChunks: true
29 | })
30 | ],
31 | module : {
32 | loaders : [
33 | {
34 | test : /\.jsx?/,
35 | include : APP_DIR,
36 | loader : 'babel'
37 | },
38 | {
39 | test: /\.scss$/,
40 | //loaders: ['style', 'css', 'sass']
41 | loader: ExtractTextPlugin.extract('css!sass')
42 | }
43 | ]
44 | }
45 | };
46 |
47 | module.exports = config;
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | import os
4 | import io
5 | from setuptools import setup, find_packages
6 |
7 |
8 | LONG_DESC = open(os.path.join(os.path.dirname(__file__), 'README.rst')).read()
9 |
10 |
11 | def read_file(filename):
12 | with io.open(filename) as fp:
13 | return fp.read().strip()
14 |
15 |
16 | def read_requirements(filename):
17 | return [line.strip() for line in read_file(filename).splitlines()
18 | if not line.startswith('#')]
19 |
20 |
21 | setup(name='scrapy-eagle',
22 | version='0.0.37',
23 | description='Run Scrapy Distributed',
24 | long_description=LONG_DESC,
25 | author='Rafael Alfredo Capucho',
26 | author_email='rafael.capucho@gmail.com',
27 | url='http://github.com/rafaelcapucho/scrapy-eagle',
28 | packages=find_packages(),
29 | license='BSD',
30 | install_requires=read_requirements('requirements.txt'),
31 | include_package_data=True,
32 | entry_points={
33 | 'console_scripts': ['eagle_server=scrapy_eagle.dashboard.main:entry_point'],
34 | },
35 | classifiers=[
36 | 'Development Status :: 3 - Alpha',
37 | 'Framework :: Scrapy',
38 | 'Programming Language :: Python',
39 | 'Programming Language :: Python :: 3.5',
40 | 'Intended Audience :: Developers',
41 | ],
42 | )
43 |
--------------------------------------------------------------------------------
/tests/test_queue.py:
--------------------------------------------------------------------------------
1 | import mock
2 |
3 | from scrapy import Spider
4 | from scrapy.http import Request
5 |
6 | from scrapy_eagle.worker.queue import Base
7 |
8 |
9 | class TestBaseQueue(object):
10 |
11 | def setup(self):
12 | self.server = mock.Mock()
13 | self.spider = Spider(name='foo')
14 | self.spider.parse_method = lambda x: x
15 | self.key = 'key'
16 | self.q = Base(self.server, self.spider, self.key)
17 |
18 | def test_encode_decode_requests(self, q=None):
19 | if q is None:
20 | q = self.q
21 | req = Request('http://example.com',
22 | callback=self.spider.parse,
23 | meta={'foo': 'bar'})
24 | out = q._decode_request(q._encode_request(req))
25 | assert req.url == out.url
26 | assert req.meta == out.meta
27 | assert req.callback == out.callback
28 |
29 | def test_custom_serializer(self):
30 | serializer = mock.Mock()
31 | serializer.dumps = mock.Mock(side_effect=lambda x: x)
32 | serializer.loads = mock.Mock(side_effect=lambda x: x)
33 | q = Base(self.server, self.spider, self.key, serializer=serializer)
34 | self.test_encode_decode_requests(q)
35 | assert serializer.dumps.call_count == 1
36 | assert serializer.loads.call_count == 1
37 |
38 |
--------------------------------------------------------------------------------
/scrapy_eagle/dashboard/react-src/main.jsx:
--------------------------------------------------------------------------------
1 | import React from 'react'
2 | import { render } from 'react-dom'
3 | import { Router, Route, IndexRoute, browserHistory } from 'react-router'
4 |
5 | import { createStore, combineReducers } from 'redux'
6 | import { Provider } from 'react-redux'
7 |
8 | import App from './components/App.jsx'
9 | import Home from './components/Home.jsx'
10 | import ServerSet from './components/servers/ServerSet.jsx'
11 | import ServerRoot from './components/servers/Root.jsx'
12 |
13 | import JobsConfig from './components/jobs/JobsConfig.jsx'
14 | import JobsRoot from './components/jobs/Root.jsx'
15 |
16 | import servers from './reducers/servers.jsx'
17 | import jobs from './reducers/jobs.jsx'
18 |
19 | var reducers = combineReducers({
20 | servers: servers,
21 | jobs: jobs
22 | });
23 |
24 | const store = createStore(reducers);
25 |
26 | render((
27 |
28 |
29 |
30 |
31 |
32 |
33 |
34 |
35 |
36 |
37 |
38 |
39 |
40 |
41 |
42 |
43 |
44 |
45 |
46 | ), document.getElementById('app'));
47 |
--------------------------------------------------------------------------------
/scrapy_eagle/dashboard/green_threads/heartbeat.py:
--------------------------------------------------------------------------------
1 | import os
2 | import signal
3 | from datetime import datetime, timedelta
4 |
5 | import gevent
6 |
7 |
8 | def heartbeat_servers(redis_conn, ip, hostname):
9 |
10 | while True:
11 |
12 | future = datetime.now() + timedelta(seconds=6)
13 |
14 | redis_conn.zadd(
15 | 'eagle_servers',
16 | '{ip}-{hostname}'.format(ip=ip, hostname=hostname),
17 | int(future.timestamp())
18 | )
19 |
20 | # now = datetime.now()
21 | # servers = redis_conn.zrangebyscore('servers', now.timestamp(), max='+inf')
22 |
23 | gevent.sleep(3)
24 |
25 |
26 | def heartbeat_subprocess(pid, spider, max_seconds_idle, max_size_limit, queue_info_global):
27 |
28 | last_processed = None
29 |
30 | max_size = 0
31 |
32 | while True:
33 |
34 | size = None
35 | for entry in queue_info_global:
36 | if entry['name'] == spider:
37 | size = entry['size']
38 |
39 | if size > 0:
40 | last_processed = datetime.now()
41 |
42 | if size > max_size:
43 | max_size = size
44 |
45 | if last_processed:
46 | diff = datetime.now() - last_processed
47 |
48 | # print('\nlast_processed_secs: ', diff.seconds, ' maxsize: ', max_size, ' size: ', size, '\n\n')
49 |
50 | if diff.seconds > max_seconds_idle and max_size > max_size_limit:
51 |
52 | os.kill(pid, signal.SIGHUP)
53 |
54 | break
55 |
56 | gevent.sleep(2)
57 |
--------------------------------------------------------------------------------
/scrapy_eagle/dashboard/package.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "react-scrapy-eagle",
3 | "version": "1.0.0",
4 | "description": "",
5 | "main": "index.js",
6 | "scripts": {
7 | "start:babel": "watchify react-src/main.jsx -v -t [ babelify --presets [ es2015 react ] ] -o templates/static/js/bundle.js",
8 | "build:dev": "./node_modules/webpack/bin/webpack.js -d --progress --colors --config webpack.config.dev.js",
9 | "build:prod": "NODE_ENV=production ./node_modules/webpack/bin/webpack.js -p --progress --colors --config webpack.config.prod.js",
10 | "start": "npm run build:dev -- --watch",
11 | "build": "npm run build:prod",
12 | "test": "echo \"Error: no test specified\" && exit 1"
13 | },
14 | "author": "Rafael Capucho",
15 | "license": "ISC",
16 | "dependencies": {
17 | "babel-loader": "^6.2.4",
18 | "babel-preset-es2015": "^6.9.0",
19 | "babel-preset-react": "^6.11.1",
20 | "babelify": "^7.3.0",
21 | "classnames": "^2.2.5",
22 | "css-loader": "^0.23.1",
23 | "extract-text-webpack-plugin": "^1.0.1",
24 | "immutable": "^3.8.1",
25 | "moment": "^2.14.1",
26 | "node-sass": "^3.8.0",
27 | "react": "^15.3.1",
28 | "react-addons-pure-render-mixin": "^15.3.1",
29 | "react-breadcrumbs": "^1.3.16",
30 | "react-dom": "^15.3.1",
31 | "react-redux": "^4.4.5",
32 | "react-router": "^2.6.1",
33 | "react-switchery": "^1.0.0",
34 | "redux": "^3.5.2",
35 | "sass-loader": "^4.0.0",
36 | "style-loader": "^0.13.1",
37 | "watchify": "^3.7.0",
38 | "webpack": "^1.13.1",
39 | "whatwg-fetch": "^1.0.0"
40 | }
41 | }
42 |
--------------------------------------------------------------------------------
/scrapy_eagle/dashboard/react-src/reducers/jobs.jsx:
--------------------------------------------------------------------------------
1 | import { Record, OrderedMap, List } from 'immutable';
2 |
3 | const JobRecord = Record({
4 | active: undefined, // true or false
5 | frequency_minutes: undefined,
6 | last_started_at: undefined,
7 | max_concurrency: undefined,
8 | min_concurrency: undefined,
9 | max_memory_mb: undefined,
10 | priority: 0,
11 | job_type: undefined, // 'spider' or 'command'
12 | start_urls: new List()
13 | });
14 |
15 | class JobInfo extends JobRecord {
16 | getPriority(){
17 | return this.priority;
18 | }
19 | }
20 |
21 | const SpidersMap = OrderedMap({});
22 |
23 | export default (state = SpidersMap, action) => {
24 |
25 | switch (action.type) {
26 |
27 | case 'UPDATE_SPIDER_INFO':
28 |
29 | // Check if there's already one Record from this Spider
30 | if(!state.has(action.spider_id)){
31 | state = state.set(action.spider_id, new JobInfo());
32 | }
33 |
34 | return state.update(action.spider_id,
35 | (spider_record) =>
36 | spider_record.merge({
37 | 'priority': action.priority,
38 | 'frequency_minutes': action.frequency_minutes,
39 | 'last_started_at': action.last_started_at,
40 | 'max_concurrency': action.max_concurrency,
41 | 'min_concurrency': action.min_concurrency,
42 | 'max_memory_mb': action.max_memory_mb,
43 | 'job_type': action.job_type,
44 | 'start_urls': action.start_urls,
45 | 'active': action.active
46 | })
47 | );
48 |
49 | default:
50 | return state;
51 | }
52 | }
--------------------------------------------------------------------------------
/scrapy_eagle/dashboard/utils/ip.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 |
3 | import os
4 | import re
5 | import requests
6 | import random
7 |
8 | def get_hostname():
9 |
10 | return os.uname()[1]
11 |
12 | def get_external_ip():
13 |
14 | source_list = [
15 | 'http://ip.dnsexit.com',
16 | 'http://ifconfig.me/ip',
17 | 'http://ipecho.net/plain',
18 | 'http://ipogre.com/linux.php',
19 | 'http://myexternalip.com/raw',
20 | 'http://icanhazip.com/',
21 | 'http://httpbin.org/ip'
22 | ]
23 |
24 | headers = {'User-agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:24.0) Gecko/20100101 Firefox/24.0'}
25 |
26 | for i in range(len(source_list)):
27 |
28 | target = random.choice(source_list)
29 |
30 | try:
31 |
32 | content = requests.get(target, headers=headers, timeout=6, verify=False)
33 |
34 | m = re.search(
35 | '(?:[\d]{1,3})\.(?:[\d]{1,3})\.(?:[\d]{1,3})\.(?:[\d]{1,3})',
36 | content.text
37 | )
38 |
39 | ip = m.group(0)
40 |
41 | if len(ip) > 0:
42 | return ip
43 |
44 | # Without Internet
45 | except requests.exceptions.ConnectionError as e:
46 |
47 | # Only interested in this kind of error
48 | if str(e).find("Temporary failure in name resolution") > -1:
49 | return None
50 |
51 | # Timeout
52 | except requests.exceptions.RequestException:
53 | # Try the next source, dropping the one that failed
54 | source_list.remove(target)
55 |
56 | except Exception:
57 | continue
58 |
59 |
60 | return None
61 |
--------------------------------------------------------------------------------
/scrapy_eagle/dashboard/react-src/components/servers/ServerSubProcess.jsx:
--------------------------------------------------------------------------------
1 | var React = require('react');
2 | var moment = require('moment');
3 |
4 | var ServerSubProcess = React.createClass({
5 |
6 | getInitialState: function() {
7 | return {link_open_buffer: ""};
8 | },
9 | onClickKill: function(){
10 |
11 | $.get(window.location.protocol+"//"+this.props.public_ip+":"+location.port+"/processes/kill_subprocess/"+this.props.pid, function(data) {
12 |
13 | });
14 |
15 | },
16 | componentDidMount: function(){
17 | this.setState({'link_open_buffer': window.location.protocol+"//"+this.props.public_ip+":"+location.port+"/processes/read_buffer/"+this.props.pid});
18 | },
19 | render: function(){
20 |
21 | var created_at = moment.utc(this.props.created_at);
22 | var fromNow = created_at.fromNow();
23 |
24 | return (
25 |
26 |
27 | - Command: {this.props.command}
28 | - PID: {this.props.pid}
29 | - CPU: {this.props.cpu_percent}%
30 | - Memory Used: {this.props.memory_used_mb}mb
31 | - Spider: {this.props.spider}
32 | - Base Dir: {this.props.base_dir}
33 | - Created At: {fromNow}
34 | -
35 |
36 |
37 |
38 |
39 |
40 | );
41 | }
42 |
43 | });
44 |
45 | /*var Link = React.createClass({
46 |
47 | render: function(){
48 | return (
49 |
50 |
51 |
52 | );
53 | }
54 |
55 | });*/
56 |
57 | module.exports = ServerSubProcess;
58 |
--------------------------------------------------------------------------------
/scrapy_eagle/dashboard/settings.py:
--------------------------------------------------------------------------------
1 | import argparse
2 |
3 | try:
4 | import configparser
5 | except ImportError:
6 | import ConfigParser as configparser
7 |
8 | from scrapy_eagle.dashboard.utils import ip
9 |
10 | buffers = {}
11 |
12 | queue_info_global = []
13 |
14 | subprocess_pids = set()
15 |
16 | # Never import these directly
17 | # Use get_config_file and get_args instead
18 | _args = None
19 | _config = None
20 | _public_ip = None
21 | _hostname = None
22 | _spiders = None
23 | _commands = None
24 |
25 |
26 | def setup_configuration(config_file=None):
27 |
28 | global _config
29 |
30 | _config = configparser.RawConfigParser()
31 | _config.read(config_file)
32 |
33 | globals()['_config'] = _config
34 |
35 | return _config
36 |
37 |
38 | def setup(config_file=None, output=True):
39 |
40 | global _args, _config, _public_ip, _hostname
41 |
42 | parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
43 | parser.add_argument('-c', '--config-file', help='Config file path.')
44 |
45 | _args = parser.parse_args()
46 |
47 | if not _args.config_file and not config_file:
48 | print('You should specify a config file using the --config-file parameter.')
49 | exit(0)
50 |
51 | _config = setup_configuration(config_file=_args.config_file or config_file)
52 |
53 | if output:
54 | print('discovering your external entrypoint address... ', end='', flush=True)
55 |
56 | _public_ip = ip.get_external_ip()
57 |
58 | if output:
59 | print(_public_ip)
60 |
61 | _hostname = ip.get_hostname()
62 |
63 | return _args, _config
64 |
65 |
66 | def get_public_ip():
67 | return _public_ip
68 |
69 |
70 | def get_hostname():
71 | return _hostname
72 |
73 |
74 | def get_config_file():
75 | return _config
76 |
77 |
78 | def get_args():
79 | return _args
80 |
81 |
82 | def get_spiders():
83 | return _spiders
84 |
85 |
86 | def get_commands():
87 | return _commands
--------------------------------------------------------------------------------
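For reference, a minimal sketch of the INI file that setup_configuration expects; the section and option names are taken from the _config.get(...) calls used across the dashboard (scrapy, commands, redis), while the paths and values below are placeholders:

    [scrapy]
    base_dir = /path/to/your/scrapy/project
    binary = /usr/local/bin/scrapy

    [commands]
    base_dir = /path/to/your/commands
    binary = /usr/bin/python3

    [redis]
    host = 127.0.0.1
    port = 6379
    db = 0
    password =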
/scrapy_eagle/dashboard/memory.py:
--------------------------------------------------------------------------------
1 | import json
2 |
3 | import redis
4 |
5 | from scrapy_eagle.dashboard.settings import get_config_file
6 |
7 | redis_pool = None
8 |
9 |
10 | def init_memory():
11 |
12 | global redis_pool
13 |
14 | config = get_config_file()
15 |
16 | redis_pool = redis.ConnectionPool(
17 | host=config['redis']['host'],
18 | port=config['redis']['port'],
19 | db=config['redis']['db'],
20 | password=config.get('redis', 'password', fallback='')
21 | )
22 |
23 |
24 | def get_redis_pool():
25 | return redis_pool
26 |
27 |
28 | def get_connection():
29 |
30 | if not redis_pool:
31 | init_memory()
32 |
33 | return redis.Redis(connection_pool=redis_pool)
34 |
35 |
36 | def get_job_object(key):
37 |
38 | redis_conn = get_connection()
39 |
40 | json_obj = redis_conn.get('eagle_jobs:{key}'.format(key=key))
41 |
42 | if json_obj:
43 | return json.loads(json_obj.decode('utf-8'))
44 | else:
45 | return None
46 |
47 | def update_job_object(key, fields):
48 |
49 | redis_conn = get_connection()
50 |
51 | serialized = json.dumps(fields, sort_keys=True)
52 |
53 | redis_conn.set('eagle_jobs:{key}'.format(key=key), serialized)
54 |
55 | if __name__ == "__main__":
56 |
57 | from scrapy_eagle.dashboard.settings import setup_configuration
58 |
59 | _config = setup_configuration(config_file='/etc/scrapy-eagle.ini')
60 |
61 | init_memory()
62 |
63 | o = get_job_object(key='epocacosmeticos.com.br')
64 |
65 | print(o)
66 |
67 | d = {
68 | "active": True,
69 | "max_memory_mb": 220,
70 | "job_type": "spider",
71 | "last_started_at": "2016-08-31T04:17:51.200187",
72 | "priority": 6,
73 | "start_urls": [
74 | "http://epocacosmeticos.com.br/",
75 | "http://www.epocacosmeticos.com.br/perfumes"
76 | ],
77 | "max_concurrency": 4,
78 | "min_concurrency": 1,
79 | "frequency_minutes": 1440
80 | }
81 |
82 | update_job_object(key='epocacosmeticos.com.br', fields=d)
83 |
84 | print(get_job_object(key='epocacosmeticos.com.br'))
--------------------------------------------------------------------------------
/scrapy_eagle/dashboard/green_threads/executor.py:
--------------------------------------------------------------------------------
1 | import gevent
2 | from datetime import datetime, timedelta
3 |
4 | from scrapy_eagle.dashboard import settings
5 | from scrapy_eagle.dashboard.memory import get_job_object, update_job_object
6 | from scrapy_eagle.dashboard.utils import iso_to_timestamp, timestamp_to_utc, processkit
7 |
8 |
9 | def evaluation_loop():
10 |
11 | while True:
12 |
13 | _spiders = settings.get_spiders()
14 | _commands = settings.get_commands()
15 |
16 | # When the system is starting up, spiders/commands may still be empty because
17 | # they are populated asynchronously by `green_threads.find_new_spiders`.
18 | if _spiders and _commands:
19 |
20 | for key in _spiders + _commands:
21 | obj = get_job_object(key=key)
22 |
23 | if obj and obj.get('next_execution_at'):
24 |
25 | next_execution_at = timestamp_to_utc(iso_to_timestamp(obj['next_execution_at']))
26 |
27 | now = datetime.utcnow()
28 |
29 | if next_execution_at < now:
30 |
31 | dispatch(key=key, register=obj)
32 |
33 | gevent.sleep(3)
34 |
35 |
36 | def dispatch(key, register):
37 |
38 | _config = settings.get_config_file()
39 |
40 | register['last_started_at'] = datetime.utcnow().isoformat()
41 | register['next_execution_at'] = (datetime.utcnow() + timedelta(minutes=register['frequency_minutes'])).isoformat()
42 |
43 | if register['job_type'] == "spider":
44 | command = [_config.get('scrapy', 'binary'), 'crawl', key]
45 | base_dir = _config.get('scrapy', 'base_dir')
46 | spider = True
47 |
48 | elif register['job_type'] == "command":
49 | command = [_config.get('commands', 'binary'), '-u', key + '.py']
50 | base_dir = _config.get('commands', 'base_dir')
51 | spider = False
52 |
53 | gevent.spawn(
54 | processkit.new_subprocess,
55 | base_dir=base_dir,
56 | command=command,
57 | spider=spider,
58 | subprocess_pids=settings.subprocess_pids,
59 | queue_info_global=settings.queue_info_global,
60 | buffers=settings.buffers
61 | )
62 |
63 | update_job_object(key=key, fields=register)
64 |
--------------------------------------------------------------------------------
/scrapy_eagle/dashboard/react-src/components/servers/ServerSet.jsx:
--------------------------------------------------------------------------------
1 | import React from 'react'
2 | import { connect } from 'react-redux'
3 |
4 | var ServerNode = require('./ServerNode.jsx');
5 |
6 | var SetIntervalMixin = {
7 | componentWillMount: function() {
8 | this.intervals = [];
9 | },
10 | setInterval: function() {
11 | this.intervals.push(setInterval.apply(null, arguments));
12 | },
13 | componentWillUnmount: function() {
14 | this.intervals.forEach(clearInterval);
15 | }
16 | };
17 |
18 | var ServerSet = React.createClass({
19 |
20 | mixins: [SetIntervalMixin],
21 |
22 | getInitialState: function() {
23 | return {server_set: new Array()};
24 | },
25 |
26 | componentDidMount:function(){
27 | this.setInterval(this.updateServers, 3000);
28 | },
29 |
30 | updateServers: function() {
31 |
32 | var that = this;
33 |
34 | var server_set_new = new Array();
35 |
36 | this.serversRequest = $.ajax({
37 | url: window.location.protocol + "//" + document.domain + ":"+ location.port +"/servers/list",
38 | type: 'GET',
39 | dataType: 'json',
40 | cache: false
41 | }).done(function(data) {
42 |
43 | data.forEach(function(elem, index){
44 | server_set_new.push({public_ip: elem.public_ip, hostname: elem.hostname});
45 | })
46 |
47 | }).always(function () {
48 | that.setState({'server_set': server_set_new});
49 | that.props.set_server_qty(server_set_new.length);
50 | });
51 |
52 | },
53 |
54 | componentWillUnmount: function() {
55 | // Ref: https://facebook.github.io/react/tips/initial-ajax.html
56 | this.serversRequest.abort();
57 | },
58 | render: function(){
59 | var listServers = this.state.server_set.map(function(item) {
60 | return ;
64 | });
65 |
66 | return (
67 |
71 | );
72 | }
73 | });
74 |
75 |
76 | var mapDispatchToProps = function(dispatch){
77 | return {
78 | dispatch,
79 | set_server_qty: (qty) => { dispatch({type: 'SET_SERVER_QTY', qty: qty}); }
80 | }
81 | };
82 |
83 | export default connect(
84 | (state) => { return {} },
85 | mapDispatchToProps
86 | )(ServerSet)
87 |
--------------------------------------------------------------------------------
/scrapy_eagle/dashboard/templates/index.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 | Scrapy Eagle
8 |
9 |
10 |
11 |
12 |
15 |
16 |
17 |
18 |
19 |
26 |
27 |
28 |
29 |
30 |
31 |
32 |
33 |
34 |
35 |
36 |
37 |
38 |
68 |
69 |
70 |
71 |
--------------------------------------------------------------------------------
/scrapy_eagle/worker/connection.py:
--------------------------------------------------------------------------------
1 | import redis
2 | import six
3 |
4 | from scrapy.utils.misc import load_object
5 |
6 |
7 | DEFAULT_REDIS_CLS = redis.StrictRedis
8 |
9 |
10 | # Sane connection defaults.
11 | DEFAULT_PARAMS = {
12 | 'socket_timeout': 30,
13 | 'socket_connect_timeout': 30,
14 | 'retry_on_timeout': True,
15 | }
16 |
17 | # Shortcut maps 'setting name' -> 'parameter name'.
18 | SETTINGS_PARAMS_MAP = {
19 | 'REDIS_URL': 'url',
20 | 'REDIS_HOST': 'host',
21 | 'REDIS_PORT': 'port',
22 | }
23 |
24 |
25 | def get_redis_from_settings(settings):
26 | """Returns a redis client instance from given Scrapy settings object.
27 |
28 | This function uses ``get_client`` to instantiate the client and uses
29 | ``DEFAULT_PARAMS`` global as defaults values for the parameters. You can
30 | override them using the ``REDIS_PARAMS`` setting.
31 |
32 | Parameters
33 | ----------
34 | settings : Settings
35 | A scrapy settings object. See the supported settings below.
36 |
37 | Returns
38 | -------
39 | server
40 | Redis client instance.
41 |
42 | Other Parameters
43 | ----------------
44 | REDIS_URL : str, optional
45 | Server connection URL.
46 | REDIS_HOST : str, optional
47 | Server host.
48 | REDIS_PORT : str, optional
49 | Server port.
50 | REDIS_PARAMS : dict, optional
51 | Additional client parameters.
52 |
53 | """
54 | params = DEFAULT_PARAMS.copy()
55 | params.update(settings.getdict('REDIS_PARAMS'))
56 | # XXX: Deprecate REDIS_* settings.
57 | for source, dest in SETTINGS_PARAMS_MAP.items():
58 | val = settings.get(source)
59 | if val:
60 | params[dest] = val
61 |
62 | # Allow ``redis_cls`` to be a path to a class.
63 | if isinstance(params.get('redis_cls'), six.string_types):
64 | params['redis_cls'] = load_object(params['redis_cls'])
65 |
66 | return get_redis(**params)
67 |
68 |
69 | # Backwards compatible alias.
70 | from_settings = get_redis_from_settings
71 |
72 |
73 | def get_redis(**kwargs):
74 | """Returns a redis client instance.
75 |
76 | Parameters
77 | ----------
78 | redis_cls : class, optional
79 | Defaults to ``redis.StrictRedis``.
80 | url : str, optional
81 | If given, ``redis_cls.from_url`` is used to instantiate the class.
82 | **kwargs
83 | Extra parameters to be passed to the ``redis_cls`` class.
84 |
85 | Returns
86 | -------
87 | server
88 | Redis client instance.
89 |
90 | """
91 | redis_cls = kwargs.pop('redis_cls', DEFAULT_REDIS_CLS)
92 | url = kwargs.pop('url', None)
93 | if url:
94 | return redis_cls.from_url(url, **kwargs)
95 | else:
96 | return redis_cls(**kwargs)
97 |
98 |
--------------------------------------------------------------------------------
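A minimal sketch of how the factory above can be used, assuming a Scrapy Settings object populated with the supported keys; the values are placeholders:

    from scrapy.settings import Settings
    from scrapy_eagle.worker.connection import get_redis_from_settings

    settings = Settings({
        'REDIS_HOST': '127.0.0.1',
        'REDIS_PORT': 6379,
        'REDIS_PARAMS': {'socket_timeout': 10},  # merged over DEFAULT_PARAMS
    })

    server = get_redis_from_settings(settings)  # redis.StrictRedis by default
    server.ping()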
/scrapy_eagle/dashboard/green_threads/stats.py:
--------------------------------------------------------------------------------
1 | from gevent import monkey
2 | monkey.patch_all()
3 |
4 | import gevent
5 | import gevent.pool
6 |
7 | from scrapy_eagle.dashboard import settings
8 | from scrapy_eagle.dashboard.utils.processkit import get_resources_info_from_pid, get_resources_info_from_server
9 |
10 | def send_redis_queue_info(socketio, redis_conn, spiders, queue_info_global):
11 |
12 | while True:
13 |
14 | queues = []
15 |
16 | for spider in spiders:
17 | queues.append(
18 | {
19 | 'name': spider,
20 | 'size': int(redis_conn.llen('{spider}:requests'.format(spider=spider)))
21 | }
22 | )
23 |
24 | # Don't assign directly, to keep the reference to the global object
25 | queue_info_global.clear()
26 | queue_info_global.extend(queues)
27 |
28 | socketio.emit('redis_queue_info', {'data': queues}, namespace="/queues", broadcast=True)
29 |
30 | gevent.sleep(1)
31 |
32 | def send_resources_info(socketio, subprocess_pids, public_ip):
33 |
34 | while True:
35 |
36 | dict_info_pid_greenlet = gevent.spawn(get_resources_info_from_pid)
37 | dict_info_host_greenlet = gevent.spawn(get_resources_info_from_server)
38 |
39 | subprocess_info_greenlets = []
40 |
41 | for pid, spider, command, base_dir, created_at in subprocess_pids:
42 |
43 | # We pass along all the parameters that we'd like to keep instead
44 | # of simply using .update() here, because the returned instance
45 | # is a Greenlet instead of a dict.
46 |
47 | info_greenlet = gevent.spawn(
48 | get_resources_info_from_pid,
49 | pid=pid,
50 | spider=spider,
51 | command=command,
52 | base_dir=base_dir,
53 | created_at=created_at,
54 | )
55 |
56 | subprocess_info_greenlets.append(info_greenlet)
57 |
58 | dict_info_pid_greenlet.join()
59 | dict_info = dict_info_pid_greenlet.get()
60 | dict_info['public_ip'] = public_ip
61 |
62 | dict_info_host_greenlet.join()
63 | dict_info_host = dict_info_host_greenlet.get()
64 | dict_info.update(dict_info_host)
65 |
66 | gevent.joinall(subprocess_info_greenlets)
67 | dict_info['sub'] = [greenlet.get() for greenlet in subprocess_info_greenlets]
68 |
69 | # When get_resources_info tries to access a PID that no longer exists it
70 | # returns None, so we remove those results here. It happens because it takes
71 | # some time for subprocess_pids to drop PIDs of processes that have finished.
72 | dict_info['sub'] = [x for x in dict_info['sub'] if x]
73 |
74 | _spiders = settings.get_spiders()
75 | _commands = settings.get_commands()
76 |
77 | dict_info['spiders'] = _spiders or []
78 | dict_info['commands'] = _commands or []
79 |
80 | print('\n\ndict_info: ', dict_info, '\n\n')
81 |
82 | socketio.emit('resources_info', {'data': dict_info}, namespace="/resources", broadcast=True)
83 |
84 | gevent.sleep(1)
--------------------------------------------------------------------------------
/scrapy_eagle/dashboard/utils/processkit.py:
--------------------------------------------------------------------------------
1 | #! /usr/bin/env python3
2 | # -*- coding:utf-8 -*-
3 |
4 | import os
5 | import subprocess
6 | from datetime import datetime
7 |
8 | import psutil
9 | import gevent
10 |
11 | from scrapy_eagle.dashboard.green_threads import heartbeat
12 |
13 |
14 | def new_subprocess(base_dir, subprocess_pids, queue_info_global, command=None, spider=None, buffers={}):
15 |
16 | if not command:
17 | command = ['python', '-u', 'generator.py']
18 | # command = ['galculator']
19 | # command = ['/usr/bin/scrapy-py35', 'crawl', '{spider}'.format(spider)]
20 |
21 | with subprocess.Popen(
22 | command,
23 | cwd=base_dir,
24 | stdout=subprocess.PIPE,
25 | bufsize=1,
26 | universal_newlines=True
27 | ) as p:
28 |
29 | # Turn it JSON serializable
30 | created_at = datetime.utcnow().isoformat()
31 |
32 | identifier = (p.pid, spider, " ".join(command), base_dir, created_at)
33 |
34 | subprocess_pids.add(identifier)
35 |
36 | buffers[p.pid] = {'finished': False, 'lines': []}
37 |
38 | if spider:
39 | gevent.spawn(
40 | heartbeat.heartbeat_subprocess,
41 | p.pid,
42 | spider,
43 | max_seconds_idle=20,
44 | max_size_limit=15,
45 | queue_info_global=queue_info_global
46 | )
47 |
48 | for line in p.stdout:
49 |
50 | # TODO: remove empty lines
51 |
52 | if len(line.strip()) > 0:
53 |
54 | buffers[p.pid]['lines'].append(line)
55 |
56 | # print(line, end='', flush=True)
57 |
58 | buffers[p.pid]['finished'] = True
59 |
60 | subprocess_pids.remove(identifier)
61 |
62 |
63 | def _get_info_from_pid(pid=None):
64 |
65 | if not pid:
66 | pid = os.getpid()
67 |
68 | process = psutil.Process(pid)
69 |
70 | mem = process.memory_info()[0] / float(2 ** 20)
71 | mem = float('{0:.2f}'.format(mem))
72 |
73 | cpu = process.cpu_percent(interval=0.5)
74 |
75 | return pid, mem, cpu
76 |
77 |
78 | def get_resources_info_from_server():
79 |
80 | cpus = psutil.cpu_percent(interval=0.5, percpu=True)
81 |
82 | # Mem results return in bytes
83 | vmem = psutil.virtual_memory()
84 |
85 | total = vmem.total
86 | total = (total / 1024.0) / 1024.0
87 |
88 | available = vmem.available
89 | available = (available / 1024.0) / 1024.0
90 |
91 | used = total - available
92 |
93 | return {
94 | 'cpus': cpus,
95 | 'memory_total_mb': float('{0:.2f}'.format(total)),
96 | 'memory_available_mb': float('{0:.2f}'.format(available)),
97 | 'memory_used_server_mb': float('{0:.2f}'.format(used))
98 | }
99 |
100 |
101 | def get_resources_info_from_pid(pid=None, *args, **kwargs):
102 |
103 | try:
104 |
105 | pid, memory_used_mb, cpu_percent = _get_info_from_pid(pid=pid)
106 |
107 | result = {
108 | 'pid': pid,
109 | 'memory_used_mb': memory_used_mb,
110 | 'cpu_percent': cpu_percent,
111 | }
112 |
113 | result.update(kwargs)
114 |
115 | return result
116 |
117 | except psutil.NoSuchProcess:
118 | print('TODO: an error here')
119 |
--------------------------------------------------------------------------------
/scrapy_eagle/dashboard/views/processes.py:
--------------------------------------------------------------------------------
1 | import os
2 | import json
3 | import signal
4 |
5 | import flask
6 | import gevent
7 |
8 | from scrapy_eagle.dashboard.utils import processkit
9 | from scrapy_eagle.dashboard import settings
10 |
11 |
12 | processes = flask.Blueprint('processes', __name__)
13 |
14 |
15 | @processes.route('/exec_command')
16 | def exec_command():
17 |
18 | gevent.spawn(
19 | processkit.new_subprocess,
20 | base_dir='.',
21 | subprocess_pids=settings.subprocess_pids,
22 | queue_info_global=settings.queue_info_global,
23 | buffers=settings.buffers
24 | )
25 |
26 | result = {
27 | 'status': True
28 | }
29 |
30 | return flask.Response(
31 | response=json.dumps(result, sort_keys=True),
32 | status=200,
33 | mimetype="application/json"
34 | )
35 |
36 |
37 | @processes.route('/read_buffer/<int:pid>')
38 | def read_buffer(pid):
39 |
40 | if not settings.buffers.get(pid):
41 | return flask.Response(
42 | response=json.dumps(
43 | {'status': False, 'msg': 'PID Not Found'},
44 | sort_keys=True
45 | ),
46 | status=200,
47 | mimetype="application/json"
48 | )
49 |
50 | def generate():
51 |
52 | sent = 0
53 |
54 | while not settings.buffers[pid]['finished']:
55 |
56 | for i, row in enumerate(settings.buffers[pid]['lines'][sent:]):
57 |
58 | sent += 1
59 |
60 | yield row + '<br>'
61 |
62 | gevent.sleep(0.5)
63 |
64 | return flask.Response(
65 | response=generate(),
66 | status=200,
67 | mimetype="text/html"
68 | )
69 |
70 |
71 | @processes.route('/kill_subprocess/<int:pid>')
72 | def kill_subprocess(pid):
73 |
74 | safe = False
75 |
76 | for _pid, _, _, _, _ in settings.subprocess_pids:
77 |
78 | if pid == _pid:
79 | safe = True
80 | break
81 |
82 | if safe:
83 | os.kill(pid, signal.SIGHUP)
84 |
85 | result = {
86 | 'status': True,
87 | 'msg': 'SIGHUP signal sent to PID {0}'.format(pid)
88 | }
89 |
90 | else:
91 | result = {
92 | 'status': False,
93 | 'msg': 'PID Not Found'
94 | }
95 |
96 | return flask.Response(
97 | response=json.dumps(result, sort_keys=True),
98 | status=200,
99 | mimetype="application/json"
100 | )
101 |
102 |
103 | @processes.route('/start_spider/<spider>')
104 | def start_spider(spider):
105 |
106 | _config = settings.get_config_file()
107 |
108 | command = [_config.get('scrapy', 'binary'), 'crawl', spider]
109 |
110 | # TODO: Verify if base_dir is set before use it
111 |
112 | gevent.spawn(
113 | processkit.new_subprocess,
114 | base_dir=_config.get('scrapy', 'base_dir'),
115 | command=command,
116 | spider=spider,
117 | subprocess_pids=settings.subprocess_pids,
118 | queue_info_global=settings.queue_info_global,
119 | buffers=settings.buffers
120 | )
121 |
122 | result = {
123 | 'status': True
124 | }
125 |
126 | return flask.Response(
127 | response=json.dumps(result, sort_keys=True),
128 | status=200,
129 | mimetype="application/json"
130 | )
131 |
--------------------------------------------------------------------------------
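The React components call these endpoints under the /processes prefix (e.g. /processes/kill_subprocess/<pid>), so they can also be exercised with plain HTTP; a hedged sketch where the dashboard address, spider name and PID are placeholders:

    import requests

    base = 'http://127.0.0.1:5000'  # adjust to your dashboard host/port

    # Launch a spider subprocess on this node
    requests.get(base + '/processes/start_spider/myspider').json()

    # Read the subprocess stdout buffer (text/html, one line per <br>;
    # the response streams until the subprocess finishes)
    print(requests.get(base + '/processes/read_buffer/12345').text)

    # Ask the dashboard to send SIGHUP to a managed subprocess
    requests.get(base + '/processes/kill_subprocess/12345').json()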
/scrapy_eagle/dashboard/templates/static/js/vendor/jquery.navgoco.min.js:
--------------------------------------------------------------------------------
1 | /*
2 | * jQuery Navgoco Menus Plugin v0.2.1 (2014-04-11)
3 | * https://github.com/tefra/navgoco
4 | *
5 | * Copyright (c) 2014 Chris T (@tefra)
6 | * BSD - https://github.com/tefra/navgoco/blob/master/LICENSE-BSD
7 | */
8 | !function(a){"use strict";var b=function(b,c,d){return this.el=b,this.$el=a(b),this.options=c,this.uuid=this.$el.attr("id")?this.$el.attr("id"):d,this.state={},this.init(),this};b.prototype={init:function(){var b=this;b._load(),b.$el.find("ul").each(function(c){var d=a(this);d.attr("data-index",c),b.options.save&&b.state.hasOwnProperty(c)?(d.parent().addClass(b.options.openClass),d.show()):d.parent().hasClass(b.options.openClass)?(d.show(),b.state[c]=1):d.hide()});var c=a("").prepend(b.options.caretHtml),d=b.$el.find("li > a");b._trigger(c,!1),b._trigger(d,!0),b.$el.find("li:has(ul) > a").prepend(c)},_trigger:function(b,c){var d=this;b.on("click",function(b){b.stopPropagation();var e=c?a(this).next():a(this).parent().next(),f=!1;if(c){var g=a(this).attr("href");f=void 0===g||""===g||"#"===g}if(e=e.length>0?e:!1,d.options.onClickBefore.call(this,b,e),!c||e&&f)b.preventDefault(),d._toggle(e,e.is(":hidden")),d._save();else if(d.options.accordion){var h=d.state=d._parents(a(this));d.$el.find("ul").filter(":visible").each(function(){var b=a(this),c=b.attr("data-index");h.hasOwnProperty(c)||d._toggle(b,!1)}),d._save()}d.options.onClickAfter.call(this,b,e)})},_toggle:function(b,c){var d=this,e=b.attr("data-index"),f=b.parent();if(d.options.onToggleBefore.call(this,b,c),c){if(f.addClass(d.options.openClass),b.slideDown(d.options.slide),d.state[e]=1,d.options.accordion){var g=d.state=d._parents(b);g[e]=d.state[e]=1,d.$el.find("ul").filter(":visible").each(function(){var b=a(this),c=b.attr("data-index");g.hasOwnProperty(c)||d._toggle(b,!1)})}}else f.removeClass(d.options.openClass),b.slideUp(d.options.slide),d.state[e]=0;d.options.onToggleAfter.call(this,b,c)},_parents:function(b,c){var d={},e=b.parent(),f=e.parents("ul");return f.each(function(){var b=a(this),e=b.attr("data-index");return e?void(d[e]=c?b:1):!1}),d},_save:function(){if(this.options.save){var b={};for(var d in this.state)1===this.state[d]&&(b[d]=1);c[this.uuid]=this.state=b,a.cookie(this.options.cookie.name,JSON.stringify(c),this.options.cookie)}},_load:function(){if(this.options.save){if(null===c){var b=a.cookie(this.options.cookie.name);c=b?JSON.parse(b):{}}this.state=c.hasOwnProperty(this.uuid)?c[this.uuid]:{}}},toggle:function(b){var c=this,d=arguments.length;if(1>=d)c.$el.find("ul").each(function(){var d=a(this);c._toggle(d,b)});else{var e,f={},g=Array.prototype.slice.call(arguments,1);d--;for(var h=0;d>h;h++){e=g[h];var i=c.$el.find('ul[data-index="'+e+'"]').first();if(i&&(f[e]=i,b)){var j=c._parents(i,!0);for(var k in j)f.hasOwnProperty(k)||(f[k]=j[k])}}for(e in f)c._toggle(f[e],b)}c._save()},destroy:function(){a.removeData(this.$el),this.$el.find("li:has(ul) > a").unbind("click"),this.$el.find("li:has(ul) > a > span").unbind("click")}},a.fn.navgoco=function(c){if("string"==typeof c&&"_"!==c.charAt(0)&&"init"!==c)var d=!0,e=Array.prototype.slice.call(arguments,1);else c=a.extend({},a.fn.navgoco.defaults,c||{}),a.cookie||(c.save=!1);return this.each(function(f){var g=a(this),h=g.data("navgoco");h||(h=new b(this,d?a.fn.navgoco.defaults:c,f),g.data("navgoco",h)),d&&h[c].apply(h,e)})};var c=null;a.fn.navgoco.defaults={caretHtml:"",accordion:!1,openClass:"open",save:!0,cookie:{name:"navgoco",expires:!1,path:"/"},slide:{duration:400,easing:"swing"},onClickBefore:a.noop,onClickAfter:a.noop,onToggleBefore:a.noop,onToggleAfter:a.noop}}(jQuery);
--------------------------------------------------------------------------------
/scrapy_eagle/dashboard/react-src/components/jobs/JobsConfig.jsx:
--------------------------------------------------------------------------------
1 | import React from 'react'
2 |
3 | import { connect } from 'react-redux'
4 | //import PureRenderMixin from 'react-addons-pure-render-mixin'
5 |
6 | import JobsItem from './JobsItem.jsx'
7 |
8 | require('./JobsConfig.scss');
9 |
10 | class JobsConfig extends React.Component {
11 |
12 | constructor(props){
13 | super(props);
14 | //this.shouldComponentUpdate = PureRenderMixin.shouldComponentUpdate.bind(this);
15 | this.state = {};
16 | }
17 |
18 | componentDidMount(){
19 | this.updateSpiders();
20 | }
21 |
22 | updateSpiders(){
23 |
24 | }
25 |
26 | componentWillReceiveProps(nextProps) {
27 | // console.log('entro componentWillReceiveProps');
28 | }
29 |
30 | shouldComponentUpdate(nextProps, nextState) {
31 | return true;
32 | //return nextProps.id !== this.props.id;
33 | }
34 |
35 | render() {
36 | const {jobs} = this.props;
37 |
38 | // console.log('render!');
39 |
40 | var toggle_class = 'odd';
41 |
42 | // https://github.com/facebook/immutable-js/issues/667#issuecomment-220223640
43 | var list_spiders = jobs.entrySeq().map(([key, value]) => {
44 |
45 | if (value.job_type == 'spider') {
46 |
47 | toggle_class = (toggle_class == 'odd') ? 'even' : 'odd';
48 |
49 | return <JobsItem
50 | key={key}
51 | id={key}
52 | value={value}
53 | toggle_class={toggle_class}
54 | />;
55 | }
56 |
57 | });
58 |
59 | var list_commands = jobs.entrySeq().map(([key, value]) => {
60 |
61 | if (value.job_type == 'command') {
62 |
63 | toggle_class = (toggle_class == 'odd') ? 'even' : 'odd';
64 |
65 | return <JobsItem
66 | key={key}
67 | id={key}
68 | value={value}
69 | toggle_class={toggle_class}
70 | />;
71 | }
72 |
73 | });
74 |
75 | return (
76 | <div>
77 | <h4>Jobs Configuration</h4>
78 | <ul>
79 | {list_spiders}
80 | </ul>
81 |
82 |
83 | <h4>Commands Configuration</h4>
84 | <ul>
85 | {list_commands}
86 | </ul>
87 |
88 |
89 |
90 | <h4>Legends</h4>
91 | <ul>
92 | <li><strong>Frequency:</strong> Amount of time, in minutes, defining how often to trigger this action. E.g. 60 means every hour.</li>
93 | <li><strong>Max Concurrency:</strong> Maximum number of servers that may run this action at the same time.</li>
94 | <li><strong>Min Concurrency:</strong> Only dispatch this job when a minimum of resources is available.</li>
95 | <li><strong>Priority:</strong> The highest number is selected when the system needs to choose between equal opportunities.</li>
96 | <li><strong>Max Memory:</strong> Processes are killed when they reach this threshold (in megabytes) and may be reallocated to another server or to the same one.</li>
97 | <li><strong>Start URLs:</strong> A list of URLs to use as starting points, one per line.</li>
98 | <li><strong>Last started at:</strong> Last time this job was triggered.</li>
99 | </ul>
100 |
101 |
102 |
103 | </div>
104 | );
105 | }
106 |
107 | }
108 |
109 | var mapDispatchToProps = function(dispatch){
110 | return {
111 | dispatch
112 | }
113 | };
114 |
115 | export default connect(
116 | (state) => {
117 | return {
118 | jobs: state.jobs
119 | }
120 | },
121 | mapDispatchToProps
122 | )(JobsConfig)
--------------------------------------------------------------------------------
/scrapy_eagle/dashboard/main.py:
--------------------------------------------------------------------------------
1 | from gevent import monkey
2 | monkey.patch_all()
3 |
4 | import os
5 | import sys
6 | import signal
7 | import threading
8 |
9 | import flask
10 | import gevent
11 |
12 | from flask_cors import CORS
13 | from flask_socketio import SocketIO
14 |
15 | try:
16 | import configparser
17 | except ImportError:
18 | import ConfigParser as configparser
19 |
20 | from scrapy_eagle.dashboard import settings
21 | from scrapy_eagle.dashboard import memory
22 | from scrapy_eagle.dashboard.green_threads import heartbeat, stats, find_new_spiders, find_new_commands, executor
23 | from scrapy_eagle.dashboard.utils import processkit
24 |
25 |
26 | app = flask.Flask(__name__, static_folder='templates/static')
27 |
28 |
29 | def main():
30 |
31 | # Load the command-line arguments and config file into the settings module
32 | _, _ = settings.setup()
33 |
34 |
35 | def shutdown():
36 |
37 | # Send a signal to all opened subprocess, closing them.
38 | for pid, _, _, _, _ in settings.subprocess_pids:
39 |
40 | print('killing subprocess: {pid}'.format(pid=pid))
41 |
42 | os.kill(pid, signal.SIGHUP)
43 |
44 | print('\nshutting down {0}...'.format(threading.currentThread().getName()))
45 |
46 | sys.exit(0)
47 |
48 |
49 | def start_periodics(socketio):
50 |
51 | redis_conn = memory.get_connection()
52 | public_ip = settings.get_public_ip()
53 | hostname = settings.get_hostname()
54 |
55 | for i in range(3):
56 | gevent.spawn(
57 | processkit.new_subprocess,
58 | base_dir='.',
59 | subprocess_pids=settings.subprocess_pids,
60 | queue_info_global=settings.queue_info_global,
61 | buffers=settings.buffers
62 | )
63 |
64 | gevent.spawn(heartbeat.heartbeat_servers, redis_conn, public_ip, hostname)
65 | gevent.spawn(stats.send_resources_info, socketio, settings.subprocess_pids, public_ip)
66 | gevent.spawn(executor.evaluation_loop)
67 | gevent.spawn(find_new_spiders)
68 | gevent.spawn(find_new_commands)
69 |
70 |
71 | def entry_point():
72 |
73 | # Graceful shutdown when a kill signal (SIGTERM) is received
74 | signal.signal(signal.SIGTERM, lambda sig, frame: shutdown())
75 |
76 | # Graceful shutdown when the terminal session is closed (SIGHUP)
77 | signal.signal(signal.SIGHUP, lambda sig, frame: shutdown())
78 |
79 | main()
80 |
81 | try:
82 |
83 | _config = settings.get_config_file()
84 |
85 | app.config['SECRET_KEY'] = _config.get('server', 'cookie_secret_key')
86 | app.config['DEBUG'] = _config.getboolean('server', 'debug', fallback=True)
87 |
88 | from scrapy_eagle.dashboard.views import servers, processes, root, jobs, react_app
89 |
90 | app.register_blueprint(root.root, url_prefix='/')
91 | app.register_blueprint(react_app.react_app, url_prefix='/app')
92 | app.register_blueprint(servers.servers, url_prefix='/servers')
93 | app.register_blueprint(processes.processes, url_prefix='/processes')
94 | app.register_blueprint(jobs.jobs, url_prefix='/jobs')
95 |
96 | CORS(app)
97 |
98 | socketio = SocketIO(app, async_mode='gevent')
99 |
100 | start_periodics(socketio)
101 |
102 | # use_reloader=False: prevent Flask from starting the application twice
103 | socketio.run(
104 | app=app,
105 | host=_config.get('server', 'host', fallback='0.0.0.0'),
106 | port=_config.getint('server', 'port', fallback=5000),
107 | use_reloader=False
108 | )
109 |
110 | except (KeyboardInterrupt, SystemExit):
111 |
112 | shutdown()
113 |
114 |
115 | if __name__ == "__main__":
116 |
117 | entry_point()
118 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | node_modules/
2 | **/static/js/bundle.js.map
3 |
4 | # Created by https://www.gitignore.io/api/pycharm,python,sublimetext,komodoedit,vim,linux
5 |
6 | ### PyCharm ###
7 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and Webstorm
8 | # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839
9 |
10 | # User-specific stuff:
11 | .idea
12 | .idea/workspace.xml
13 | .idea/tasks.xml
14 | .idea/dictionaries
15 | .idea/vcs.xml
16 | .idea/jsLibraryMappings.xml
17 |
18 | # Sensitive or high-churn files:
19 | .idea/dataSources.ids
20 | .idea/dataSources.xml
21 | .idea/dataSources.local.xml
22 | .idea/sqlDataSources.xml
23 | .idea/dynamic.xml
24 | .idea/uiDesigner.xml
25 |
26 | # Gradle:
27 | .idea/gradle.xml
28 | .idea/libraries
29 |
30 | # Mongo Explorer plugin:
31 | .idea/mongoSettings.xml
32 |
33 | ## File-based project format:
34 | *.iws
35 |
36 | ## Plugin-specific files:
37 |
38 | # IntelliJ
39 | /out/
40 |
41 | # mpeltonen/sbt-idea plugin
42 | .idea_modules/
43 |
44 | # JIRA plugin
45 | atlassian-ide-plugin.xml
46 |
47 | # Crashlytics plugin (for Android Studio and IntelliJ)
48 | com_crashlytics_export_strings.xml
49 | crashlytics.properties
50 | crashlytics-build.properties
51 | fabric.properties
52 |
53 |
54 | ### Python ###
55 | # Byte-compiled / optimized / DLL files
56 | __pycache__/
57 | *.py[cod]
58 | *$py.class
59 |
60 | # C extensions
61 | *.so
62 |
63 | # Distribution / packaging
64 | .Python
65 | env/
66 | build/
67 | develop-eggs/
68 | dist/
69 | downloads/
70 | eggs/
71 | .eggs/
72 | lib/
73 | lib64/
74 | parts/
75 | sdist/
76 | var/
77 | *.egg-info/
78 | .installed.cfg
79 | *.egg
80 |
81 | # PyInstaller
82 | # Usually these files are written by a python script from a template
83 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
84 | *.manifest
85 | *.spec
86 |
87 | # Installer logs
88 | pip-log.txt
89 | pip-delete-this-directory.txt
90 |
91 | # Unit test / coverage reports
92 | htmlcov/
93 | .tox/
94 | .coverage
95 | .coverage.*
96 | .cache
97 | nosetests.xml
98 | coverage.xml
99 | *,cover
100 | .hypothesis/
101 |
102 | # Translations
103 | *.mo
104 | *.pot
105 |
106 | # Django stuff:
107 | *.log
108 | local_settings.py
109 |
110 | # Flask instance folder
111 | instance/
112 |
113 | # Scrapy stuff:
114 | .scrapy
115 |
116 | # Sphinx documentation
117 | docs/_build/
118 |
119 | # PyBuilder
120 | target/
121 |
122 | # IPython Notebook
123 | .ipynb_checkpoints
124 |
125 | # pyenv
126 | .python-version
127 |
128 | # celery beat schedule file
129 | celerybeat-schedule
130 |
131 | # dotenv
132 | .env
133 |
134 | # virtualenv
135 | venv/
136 | ENV/
137 |
138 | # Spyder project settings
139 | .spyderproject
140 |
141 | # Rope project settings
142 | .ropeproject
143 |
144 |
145 | ### SublimeText ###
146 | # cache files for sublime text
147 | *.tmlanguage.cache
148 | *.tmPreferences.cache
149 | *.stTheme.cache
150 |
151 | # workspace files are user-specific
152 | *.sublime-workspace
153 |
154 | # project files should be checked into the repository, unless a significant
155 | # proportion of contributors will probably not be using SublimeText
156 | # *.sublime-project
157 |
158 | # sftp configuration file
159 | sftp-config.json
160 |
161 |
162 | ### KomodoEdit ###
163 | *.komodoproject
164 | .komodotools
165 |
166 |
167 | ### Vim ###
168 | # swap
169 | [._]*.s[a-w][a-z]
170 | [._]s[a-w][a-z]
171 | # session
172 | Session.vim
173 | # temporary
174 | .netrwhist
175 | *~
176 | # auto-generated tag files
177 | tags
178 |
179 |
180 | ### Linux ###
181 | *~
182 |
183 | # temporary files which can be created if a process still has a handle open of a deleted file
184 | .fuse_hidden*
185 |
186 | # KDE directory preferences
187 | .directory
188 |
189 | # Linux trash folder which might appear on any partition or disk
190 | .Trash-*
191 |
192 |
193 |
--------------------------------------------------------------------------------
/scrapy_eagle/dashboard/react-src/components/servers/ServerNode.jsx:
--------------------------------------------------------------------------------
1 | var React = require('react');
2 | var ServerSubProcess = require('./ServerSubProcess.jsx');
3 |
4 | var ServerNode = React.createClass({
5 | getInitialState: function() {
6 | return {
7 | pid: "",
8 | public_ip: "",
9 | cpu_percent: "",
10 | memory_available_mb: "",
11 | memory_total_mb: "",
12 | memory_used_mb: "",
13 | memory_used_server_mb: "",
14 | cpus: [],
15 | subprocesses: [],
16 | spiders: []
17 | };
18 | },
19 | componentWillMount: function() {
20 |
21 | this.socket = io.connect(window.location.protocol + "//" + this.props.public_ip + ":" + location.port + "/resources");
22 | this.socket.on('resources_info', function (msg) {
23 |
24 | var buff = "[ ";
25 | for(var i = 0; i < msg.data.cpus.length; i++){
26 | if(i+1 == msg.data.cpus.length){
27 | buff += msg.data.cpus[i] + " ";
28 |
29 | } else {
30 | buff += msg.data.cpus[i] + " / ";
31 | }
32 | }
33 | buff += "]";
34 |
35 | this.setState({
36 | pid: msg.data.pid,
37 | public_ip: msg.data.public_ip,
38 | cpu_percent: msg.data.cpu_percent,
39 | memory_available_mb: msg.data.memory_available_mb,
40 | memory_total_mb: msg.data.memory_total_mb,
41 | memory_used_mb: msg.data.memory_used_mb,
42 | memory_used_server_mb: msg.data.memory_used_server_mb,
43 | cpus: buff,
44 | subprocesses: msg.data.sub,
45 | spiders: msg.data.spiders
46 | });
47 |
48 | // console.log(msg.data.cpus);
49 |
50 | }.bind(this));
51 |
52 | },
53 | componentWillUnmount: function(){
54 |
55 | this.socket.disconnect();
56 |
57 | },
58 | onClickExecCommand: function(e){
59 |
60 | $.get(window.location.protocol + "//" + this.state.public_ip + ":" + location.port + "/processes/exec_command", function(data) {
61 |
62 | });
63 |
64 | },
65 | onClickStartWorker: function(e){
66 |
67 | $.get(window.location.protocol + "//" + this.state.public_ip + ":" + location.port + "/processes/start_spider/" + this.state.selected_spider, function(data) {
68 |
69 | });
70 |
71 | },
72 | onChangeDataProvider: function(e){
73 |
74 | this.setState({'selected_spider': e.target.value});
75 |
76 | },
77 | render: function(){
78 |
79 | var listSubProcesses = this.state.subprocesses.map(function (item, i) {
80 | return <ServerSubProcess key={i} {...item} />;
90 | }.bind(this));
91 |
92 | var listSpiders = this.state.spiders.map(function (item, i) {
93 | return (
94 | <option key={i} value={item}>{item}</option>
95 | );
96 | }.bind(this));
97 |
98 | return (
99 | <li id="server-node">
100 | <ul>
101 | <li>IP: {this.props.public_ip} ({this.props.hostname})</li>
102 | <li>PID: {this.state.pid}</li>
103 | <li>CPU Server: {this.state.cpus}%</li>
104 | <li>Memory Used Server: {this.state.memory_used_server_mb}mb</li>
105 | <li>CPU Process: {this.state.cpu_percent}%</li>
106 | <li>Memory Used Process: {this.state.memory_used_mb}mb</li>
107 | <li>Memory Available: {this.state.memory_available_mb}mb</li>
108 | <li>Memory Total: {this.state.memory_total_mb}mb</li>
109 |
110 | <li>
111 | <select onChange={this.onChangeDataProvider}>{listSpiders}</select>
112 | <button onClick={this.onClickStartWorker}>Start Spider</button>
113 | <button onClick={this.onClickExecCommand}>Exec Command</button>
114 | </li>
115 | </ul>
116 |
117 | <ul>{listSubProcesses}</ul>
118 | </li>
119 | );
120 |
121 | }
122 | });
123 |
124 | module.exports = ServerNode;
125 |
--------------------------------------------------------------------------------
/scrapy_eagle/worker/dupefilter.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import time
3 |
4 | from scrapy.dupefilters import BaseDupeFilter
5 | from scrapy.utils.request import request_fingerprint
6 |
7 | from .connection import get_redis_from_settings
8 |
9 |
10 | DEFAULT_DUPEFILTER_KEY = "dupefilter:%(timestamp)s"
11 |
12 | logger = logging.getLogger(__name__)
13 |
14 |
15 | class RFPDupeFilter(BaseDupeFilter):
16 | """Redis-based request duplicates filter.
17 |
18 | This class can also be used with default Scrapy's scheduler.
19 |
20 | """
21 |
22 | logger = logger
23 |
24 | def __init__(self, server, key, debug=False):
25 | """Initialize the duplicates filter.
26 |
27 | Parameters
28 | ----------
29 | server : redis.StrictRedis
30 | The redis server instance.
31 | key : str
32 | Redis key where to store fingerprints.
33 | debug : bool, optional
34 | Whether to log filtered requests.
35 |
36 | """
37 | self.server = server
38 | self.key = key
39 | self.debug = debug
40 | self.logdupes = True
41 |
42 | @classmethod
43 | def from_settings(cls, settings):
44 | """Returns an instance from given settings.
45 |
46 | This uses by default the key ``dupefilter:<timestamp>``. When using the
47 | ``scrapy_eagle.worker.scheduler.DistributedScheduler`` class, this method is not used as
48 | it needs to pass the spider name in the key.
49 |
50 | Parameters
51 | ----------
52 | settings : scrapy.settings.Settings
53 |
54 | Returns
55 | -------
56 | RFPDupeFilter
57 | A RFPDupeFilter instance.
58 |
59 |
60 | """
61 | server = get_redis_from_settings(settings)
62 | # XXX: This creates one-time key. needed to support to use this
63 | # class as standalone dupefilter with scrapy's default scheduler
64 | # if scrapy passes spider on open() method this wouldn't be needed
65 | # TODO: Use SCRAPY_JOB env as default and fallback to timestamp.
66 | key = DEFAULT_DUPEFILTER_KEY % {'timestamp': int(time.time())}
67 | debug = settings.getbool('DUPEFILTER_DEBUG')
68 | return cls(server, key=key, debug=debug)
69 |
70 | @classmethod
71 | def from_crawler(cls, crawler):
72 | """Returns instance from crawler.
73 |
74 | Parameters
75 | ----------
76 | crawler : scrapy.crawler.Crawler
77 |
78 | Returns
79 | -------
80 | RFPDupeFilter
81 | Instance of RFPDupeFilter.
82 |
83 | """
84 | return cls.from_settings(crawler.settings)
85 |
86 | def request_seen(self, request):
87 | """Returns True if request was already seen.
88 |
89 | Parameters
90 | ----------
91 | request : scrapy.http.Request
92 |
93 | Returns
94 | -------
95 | bool
96 |
97 | """
98 | fp = self.request_fingerprint(request)
99 | # This returns the number of values added, zero if already exists.
100 | added = self.server.sadd(self.key, fp)
101 | return added == 0
102 |
103 | def request_fingerprint(self, request):
104 | """Returns a fingerprint for a given request.
105 |
106 | Parameters
107 | ----------
108 | request : scrapy.http.Request
109 |
110 | Returns
111 | -------
112 | str
113 |
114 | """
115 | return request_fingerprint(request)
116 |
117 | def close(self, reason=''):
118 | """Delete data on close. Called by Scrapy's scheduler.
119 |
120 | Parameters
121 | ----------
122 | reason : str, optional
123 |
124 | """
125 | self.clear()
126 |
127 | def clear(self):
128 | """Clears fingerprints data."""
129 | self.server.delete(self.key)
130 |
131 | def log(self, request, spider):
132 | """Logs given request.
133 |
134 | Parameters
135 | ----------
136 | request : scrapy.http.Request
137 | spider : scrapy.spiders.Spider
138 |
139 | """
140 | if self.debug:
141 | msg = "Filtered duplicate request: %(request)s"
142 | self.logger.debug(msg, {'request': request}, extra={'spider': spider})
143 | elif self.logdupes:
144 | msg = ("Filtered duplicate request %(request)s"
145 | " - no more duplicates will be shown"
146 | " (see DUPEFILTER_DEBUG to show all"
147 | " duplicates)")
148 | self.logger.debug(msg, {'request': request}, extra={'spider': spider})
149 | self.logdupes = False
150 |
151 |
--------------------------------------------------------------------------------
/scrapy_eagle/worker/queue.py:
--------------------------------------------------------------------------------
1 | from scrapy.utils.reqser import request_to_dict, request_from_dict
2 |
3 | from . import picklecompat
4 |
5 |
6 | class Base(object):
7 | """Per-spider queue/stack base class"""
8 |
9 | def __init__(self, server, spider, key, serializer=None):
10 | """Initialize per-spider redis queue.
11 |
12 | Parameters:
13 | server -- redis connection
14 | spider -- spider instance
15 | key -- key for this queue (e.g. "%(spider)s:queue")
16 |
17 | """
18 | if serializer is None:
19 | # Backward compatibility.
20 | # TODO: deprecate pickle.
21 | serializer = picklecompat
22 | if not hasattr(serializer, 'loads'):
23 | raise TypeError("serializer does not implement 'loads' function: %r"
24 | % serializer)
25 | if not hasattr(serializer, 'dumps'):
26 | raise TypeError("serializer does not implement 'dumps' function: %r"
27 | % serializer)
28 |
29 | self.server = server
30 | self.spider = spider
31 | self.key = key % {'spider': spider.name}
32 | self.serializer = serializer
33 |
34 | def _encode_request(self, request):
35 | """Encode a request object"""
36 | obj = request_to_dict(request, self.spider)
37 | return self.serializer.dumps(obj)
38 |
39 | def _decode_request(self, encoded_request):
40 | """Decode a request previously encoded"""
41 | obj = self.serializer.loads(encoded_request)
42 | return request_from_dict(obj, self.spider)
43 |
44 | def __len__(self):
45 | """Return the length of the queue"""
46 | raise NotImplementedError
47 |
48 | def push(self, request):
49 | """Push a request"""
50 | raise NotImplementedError
51 |
52 | def pop(self, timeout=0):
53 | """Pop a request"""
54 | raise NotImplementedError
55 |
56 | def clear(self):
57 | """Clear queue/stack"""
58 | self.server.delete(self.key)
59 |
60 |
61 | class SpiderQueue(Base):
62 | """Per-spider FIFO queue"""
63 |
64 | def __len__(self):
65 | """Return the length of the queue"""
66 | return self.server.llen(self.key)
67 |
68 | def push(self, request):
69 | """Push a request"""
70 | self.server.lpush(self.key, self._encode_request(request))
71 |
72 | def pop(self, timeout=0):
73 | """Pop a request"""
74 | if timeout > 0:
75 | data = self.server.brpop(self.key, timeout)
76 | if isinstance(data, tuple):
77 | data = data[1]
78 | else:
79 | data = self.server.rpop(self.key)
80 | if data:
81 | return self._decode_request(data)
82 |
83 |
84 | class SpiderPriorityQueue(Base):
85 | """Per-spider priority queue abstraction using redis' sorted set"""
86 |
87 | def __len__(self):
88 | """Return the length of the queue"""
89 | return self.server.zcard(self.key)
90 |
91 | def push(self, request):
92 | """Push a request"""
93 | data = self._encode_request(request)
94 | score = -request.priority
95 | # We don't use zadd method as the order of arguments change depending on
96 | # whether the class is Redis or StrictRedis, and the option of using
97 | # kwargs only accepts strings, not bytes.
98 | self.server.execute_command('ZADD', self.key, score, data)
99 |
100 | def pop(self, timeout=0):
101 | """
102 | Pop a request
103 | timeout is not supported by this queue class
104 | """
105 | # use atomic range/remove using multi/exec
106 | pipe = self.server.pipeline()
107 | pipe.multi()
108 | pipe.zrange(self.key, 0, 0).zremrangebyrank(self.key, 0, 0)
109 | results, count = pipe.execute()
110 | if results:
111 | return self._decode_request(results[0])
112 |
113 |
114 | class SpiderStack(Base):
115 | """Per-spider stack"""
116 |
117 | def __len__(self):
118 | """Return the length of the stack"""
119 | return self.server.llen(self.key)
120 |
121 | def push(self, request):
122 | """Push a request"""
123 | self.server.lpush(self.key, self._encode_request(request))
124 |
125 | def pop(self, timeout=0):
126 | """Pop a request"""
127 | if timeout > 0:
128 | data = self.server.blpop(self.key, timeout)
129 | if isinstance(data, tuple):
130 | data = data[1]
131 | else:
132 | data = self.server.lpop(self.key)
133 |
134 | if data:
135 | return self._decode_request(data)
136 |
137 |
138 | __all__ = ['SpiderQueue', 'SpiderPriorityQueue', 'SpiderStack']
139 |
140 |
--------------------------------------------------------------------------------
/scrapy_eagle/dashboard/templates/static/css/main.css:
--------------------------------------------------------------------------------
1 | div#server_set li#server-node {
2 | margin-bottom: 20px;
3 | }
4 |
5 | .active { color: red; }
6 |
7 | header {
8 | background-color: #222426;
9 | height: 60px;
10 | margin-right: 0;
11 | position: absolute;
12 | width: 100%;
13 | z-index: 200;
14 | }
15 |
16 | header div.brand {
17 | padding: 6px 0 0 0;
18 | }
19 |
20 | .flexbox {
21 | display: flex;
22 | overflow: hidden;
23 | flex-direction: row;
24 | min-height: 100vh;
25 | }
26 |
27 | div.subheader {
28 | position: absolute;
29 | margin-top: 60px;
30 | background-color: #2A2D2F;
31 | width: 100%;
32 | height: 46px;
33 | color: #00B280;
34 | z-index: 200;
35 | padding-top: 11px;
36 | }
37 |
38 | aside.sidebar {
39 | color: #001f3f;
40 | min-height: 100%;
41 | padding: 114px 0 0 10px;
42 | background-color: #DDFFDD;
43 | flex: 0 0 280px;
44 | }
45 |
46 | section.main-content-wrapper {
47 | padding: 114px 10px 10px 10px;
48 | /*border: 1px solid red;*/
49 | min-height: 100%;
50 | flex: 1;
51 | }
52 |
53 | .sidebar-header {
54 | color: #6f737e;
55 | font-weight: 600;
56 | line-height: 20px;
57 | margin: 0;
58 | padding: 10px 10px 5px;
59 | text-transform: uppercase;
60 | }
61 |
62 | .sidebar .nav a {
63 | font-weight: 600;
64 | text-decoration: none;
65 | }
66 | .sidebar .nav i {
67 | font-size: 1em;
68 | margin-right: 5px;
69 | }
70 | .sidebar .nav .nav-sub {
71 | display: none;
72 | list-style: outside none none;
73 | padding: 0;
74 | }
75 | .sidebar .nav .nav-sub li > a {
76 | display: block;
77 | font-size: 0.813em;
78 | padding: 8px 0 8px 10px;
79 | }
80 | .sidebar .nav > li > .nav-sub > li > a {
81 | padding-left: 22px;
82 | }
83 | .sidebar .nav > li > .nav-sub > li > .nav-sub > li > a {
84 | padding-left: 55px;
85 | }
86 | .sidebar .nav > li > .nav-sub > li > .nav-sub > li > .nav-sub > li > a {
87 | padding-left: 65px;
88 | }
89 | .sidebar .nav > li > .nav-sub > li > .nav-sub > li > .nav-sub > li > .nav-sub > li > a {
90 | padding-left: 70px;
91 | }
92 | .sidebar .nav > li > .nav-sub > li > .nav-sub > li > .nav-sub > li > .nav-sub > li > .nav-sub > li > a {
93 | padding-left: 75px;
94 | }
95 | .sidebar-mini .sidebar .nav > li > .nav-sub > li > a {
96 | padding-left: 25px;
97 | }
98 | .sidebar-mini .sidebar .nav > li > .nav-sub > li > .nav-sub > li > a {
99 | padding-left: 35px;
100 | }
101 | .sidebar-mini .sidebar .nav > li > .nav-sub > li > .nav-sub > li > .nav-sub > li > a {
102 | padding-left: 45px;
103 | }
104 | .sidebar-mini .sidebar .nav > li > .nav-sub > li > .nav-sub > li > .nav-sub > li > .nav-sub > li > a {
105 | padding-left: 55px;
106 | }
107 | .sidebar .nav .nav-sub .nav-dropdown > a {
108 | padding-right: 30px;
109 | }
110 | .sidebar .nav .nav-sub > .open > a, .sidebar .nav .nav-sub > .open > a:focus, .sidebar .nav .nav-sub > .open > a:hover {
111 | background-color: transparent;
112 | border-color: transparent;
113 | }
114 | .sidebar .nav-pills {
115 | margin-left: 5px;
116 | margin-right: 12px;
117 | }
118 | .sidebar .nav-pills > li > a {
119 | font-size: 0.875em;
120 | padding: 9px 10px;
121 | }
122 |
123 | .sidebar-left .nav > li.open > a,
124 | .sidebar-left .nav > li > a:hover {
125 | background-color: #ffffff;
126 | color: #1d2939;
127 | }
128 |
129 | .sidebar-mini .sidebar-left .nav > li.nav-dropdown-open > a,
130 | .sidebar-mini .sidebar-left .nav > li:hover > a {
131 | background-color: #fff;
132 | color: #1d2939;
133 | }
134 |
135 | .nav-pills .nav-item.open .nav-link,
136 | .nav-pills .nav-item.open .nav-link:focus,
137 | .nav-pills .nav-item.open .nav-link:hover {
138 | background-color: #29d1ca;
139 | color: #fff;
140 | cursor: pointer;
141 | }
142 |
143 | .nav-pills .nav-link.active,
144 | .nav-pills .nav-link.active:focus,
145 | .nav-pills .nav-link.active:hover {
146 | background-color: #27b6af;
147 | color: #fff;
148 | cursor: pointer;
149 | }
150 |
151 | .sidebar-left a {
152 | color: #1f7e9a;
153 | }
154 |
155 | .sidebar-left a:focus,
156 | .sidebar-left a:hover {
157 | background-color: transparent;
158 | color: #001f3f;
159 | }
160 |
161 | .sidebar-left .active > a,
162 | .sidebar-left .active > a:focus,
163 | .sidebar-left .active > a:hover {
164 | /* Color of the active item inside the category */
165 | color: #1d2939;
166 | }
167 |
168 | .sidebar-mini .sidebar-left .nav > li.open > a {
169 | background-color: transparent;
170 | color: pink;
171 | }
172 | .sidebar-left .nav > li > a:focus {
173 | /* Text color after clicking on the category (focus) */
174 | background-color: #29d1ca;
175 | color: #fff;
176 | }
177 |
178 | .sidebar .nav-pills > li > a > .badge {
179 | margin: 3px 0;
180 | }
181 |
182 | .pull-right {
183 | float: right !important;
184 | }
185 |
186 | .nav-pills > li > a > .tag {
187 | margin-top: 2px;
188 | font-size: 80%;
189 | padding: 0.25em 0.4em 0.28em;
190 | }
191 |
192 | div.breadcrumbs span a,
193 | div.breadcrumbs {
194 | color: #d4d4d4;
195 | font-size: 14px;
196 | }
197 |
198 | div.breadcrumbs span:first-child a {
199 | color: #00B280;
200 | }
--------------------------------------------------------------------------------
/scrapy_eagle/worker/spiders.py:
--------------------------------------------------------------------------------
1 | from scrapy import signals
2 | from scrapy.exceptions import DontCloseSpider
3 | from scrapy.spiders import Spider, CrawlSpider
4 |
5 | from . import connection
6 |
7 |
8 | # Default batch size matches default concurrent requests setting.
9 | DEFAULT_START_URLS_BATCH_SIZE = 16
10 | DEFAULT_START_URLS_KEY = '%(name)s:start_urls'
11 |
12 |
13 | class DistributedMixin(object):
14 | """Mixin class to implement reading urls from a redis queue."""
15 | # Per spider redis key, default to DEFAULT_KEY.
16 | redis_key = None
17 | # Fetch this amount of start urls when idle. Default to DEFAULT_BATCH_SIZE.
18 | redis_batch_size = None
19 | # Redis client instance.
20 | server = None
21 |
22 | def start_requests(self):
23 | """Returns a batch of start requests from redis."""
24 | return self.next_requests()
25 |
26 | def setup_redis(self, crawler=None):
27 | """Setup redis connection and idle signal.
28 |
29 | This should be called after the spider has set its crawler object.
30 | """
31 | if self.server is not None:
32 | return
33 |
34 | if crawler is None:
35 | # We allow optional crawler argument to keep backwards
36 | # compatibility.
37 | # XXX: Raise a deprecation warning.
38 | crawler = getattr(self, 'crawler', None)
39 |
40 | if crawler is None:
41 | raise ValueError("crawler is required")
42 |
43 | settings = crawler.settings
44 |
45 | if self.redis_key is None:
46 | self.redis_key = settings.get(
47 | 'REDIS_START_URLS_KEY', DEFAULT_START_URLS_KEY,
48 | )
49 |
50 | self.redis_key = self.redis_key % {'name': self.name}
51 |
52 | if not self.redis_key.strip():
53 | raise ValueError("redis_key must not be empty")
54 |
55 | if self.redis_batch_size is None:
56 | self.redis_batch_size = settings.getint(
57 | 'REDIS_START_URLS_BATCH_SIZE', DEFAULT_START_URLS_BATCH_SIZE,
58 | )
59 |
60 | try:
61 | self.redis_batch_size = int(self.redis_batch_size)
62 | except (TypeError, ValueError):
63 | raise ValueError("redis_batch_size must be an integer")
64 |
65 | self.logger.info("Reading start URLs from redis key '%(redis_key)s' "
66 | "(batch size: %(redis_batch_size)s)", self.__dict__)
67 |
68 | self.server = connection.from_settings(crawler.settings)
69 | # The idle signal is called when the spider has no requests left,
70 | # that's when we will schedule new requests from redis queue
71 | crawler.signals.connect(self.spider_idle, signal=signals.spider_idle)
72 |
73 | def next_requests(self):
74 | """Yields requests to be scheduled, reading start URLs from redis."""
75 | use_set = self.settings.getbool('REDIS_START_URLS_AS_SET')
76 | fetch_one = self.server.spop if use_set else self.server.lpop
77 | # XXX: Do we need to use a timeout here?
78 | found = 0
79 | while found < self.redis_batch_size:
80 | data = fetch_one(self.redis_key)
81 | if data:
82 | data = data.decode('utf-8')
83 | else:
84 | # Queue empty.
85 | break
86 | req = self.make_request_from_data(data)
87 | if req:
88 | yield req
89 | found += 1
90 | else:
91 | self.logger.debug("Request not made from data: %r", data)
92 |
93 | if found:
94 | self.logger.debug("Read %s requests from '%s'", found, self.redis_key)
95 |
96 | def make_request_from_data(self, data):
97 | # By default, data is a URL.
98 | if '://' in data:
99 | return self.make_requests_from_url(data)
100 | else:
101 | self.logger.error("Unexpected URL from '%s': %r", self.redis_key, data)
102 |
103 | def schedule_next_requests(self):
104 | """Schedules a request if available"""
105 | for req in self.next_requests():
106 | self.crawler.engine.crawl(req, spider=self)
107 |
108 | def spider_idle(self):
109 | """Schedules a request if available, otherwise waits."""
110 | # XXX: Handle a sentinel to close the spider.
111 | self.schedule_next_requests()
112 | raise DontCloseSpider
113 |
114 |
115 | class DistributedSpider(DistributedMixin, Spider):
116 | """Spider that reads urls from redis queue when idle."""
117 |
118 | @classmethod
119 | def from_crawler(cls, crawler, *args, **kwargs):
120 | obj = super(DistributedSpider, cls).from_crawler(crawler, *args, **kwargs)
121 | obj.setup_redis(crawler)
122 | return obj
123 |
124 |
125 | class DistributedCrawlSpider(DistributedMixin, CrawlSpider):
126 | """Spider that reads urls from redis queue when idle."""
127 |
128 | @classmethod
129 | def from_crawler(cls, crawler, *args, **kwargs):
130 | obj = super(DistributedCrawlSpider, cls).from_crawler(crawler, *args, **kwargs)
131 | obj.setup_redis(crawler)
132 | return obj
133 |
134 |
--------------------------------------------------------------------------------
/scrapy_eagle/dashboard/react-src/components/App.jsx:
--------------------------------------------------------------------------------
1 | import React from 'react'
2 | import { Link, IndexLink } from 'react-router'
3 | import { connect } from 'react-redux'
4 | import Breadcrumbs from 'react-breadcrumbs'
5 |
6 | require('./App.scss');
7 |
8 | class App extends React.Component {
9 | constructor(props){
10 | super(props);
11 | }
12 |
13 | componentWillMount(){
14 | this.intervals = [];
15 | }
16 |
17 | setInterval() {
18 | this.intervals.push(setInterval.apply(null, arguments));
19 | }
20 |
21 | componentWillUnmount(){
22 | this.intervals.forEach(clearInterval);
23 |
24 | // Ref: https://facebook.github.io/react/tips/initial-ajax.html
25 | this.clientsRequest.abort();
26 | }
27 |
28 | ajax_get_jobs_info(){
29 |
30 | var that = this;
31 |
32 | this.clientsRequest = $.ajax({
33 | url: window.location.protocol + "//" + document.domain + ":" + location.port + "/jobs/list",
34 | type: 'GET',
35 | dataType: 'json',
36 | cache: false
37 | }).done((data) => {
38 |
39 | $.each(data, (key, value) => {
40 | // console.log(key, value);
41 |
42 | that.props.dispatch(
43 | {
44 | type: 'UPDATE_SPIDER_INFO',
45 | spider_id: key,
46 | frequency_minutes: value.frequency_minutes,
47 | last_started_at: value.last_started_at,
48 | max_concurrency: value.max_concurrency,
49 | min_concurrency: value.min_concurrency,
50 | max_memory_mb: value.max_memory_mb,
51 | priority: value.priority,
52 | job_type: value.job_type,
53 | active: value.active,
54 | start_urls: value.start_urls
55 | }
56 | );
57 |
58 | })
59 |
60 | }).always(() => {
61 | // that.setState({'server_set': server_set_new});
62 | });
63 |
64 | }
65 |
66 | componentDidMount(){
67 | this.ajax_get_jobs_info();
68 | this.setInterval(this.ajax_get_jobs_info.bind(this), 5000);
69 | }
70 |
71 | render(){
72 | const { servers_qty } = this.props;
73 | return (
74 |
75 |
76 |
77 |
81 |
82 |
83 |
84 |
85 |
86 |
87 | {/*Distributed Scrapy
88 |
89 | {this.props.SET_SERVER_QTY(7)}}>{servers_qty}
90 | =>
91 |
92 |
93 | - /
94 | - /servers/monitoring
95 | - /spiders/config
96 |
97 | */}
98 |
99 | {this.props.children}
100 |
101 |
102 |
103 |
149 |
150 |
151 |
152 |
153 | );
154 | }
155 | }
156 |
157 | var mapDispatchToProps = function(dispatch){
158 | return {
159 | dispatch
160 | }
161 | };
162 |
163 | export default connect(
164 | (state) => {
165 | return {
166 | servers_qty: state.servers.servers_qty
167 | }
168 | },
169 | mapDispatchToProps
170 | )(App)
--------------------------------------------------------------------------------
/scrapy_eagle/worker/scheduler.py:
--------------------------------------------------------------------------------
1 | import importlib
2 | import six
3 |
4 | from scrapy.utils.misc import load_object
5 |
6 | from . import connection
7 |
8 |
9 | class DistributedScheduler(object):
10 | """Redis-based scheduler"""
11 |
12 | def __init__(self, server,
13 | persist=False,
14 | flush_on_start=False,
15 | queue_key='%(spider)s:requests',
16 | queue_cls='scrapy_eagle.worker.queue.SpiderPriorityQueue',
17 | dupefilter_key='%(spider)s:dupefilter',
18 | dupefilter_cls='scrapy_eagle.worker.dupefilter.RFPDupeFilter',
19 | idle_before_close=0,
20 | serializer=None):
21 | """Initialize scheduler.
22 |
23 | Parameters
24 | ----------
25 | server : Redis
26 | The redis server instance.
27 | persist : bool
28 | Whether to flush requests when closing. Default is False.
29 | flush_on_start : bool
30 | Whether to flush requests on start. Default is False.
31 | queue_key : str
32 | Requests queue key.
33 | queue_cls : str
34 | Importable path to the queue class.
35 | dupefilter_key : str
36 | Duplicates filter key.
37 | dupefilter_cls : str
38 | Importable path to the dupefilter class.
39 | idle_before_close : int
40 | Timeout before giving up.
41 |
42 | """
43 | if idle_before_close < 0:
44 | raise TypeError("idle_before_close cannot be negative")
45 |
46 | self.server = server
47 | self.persist = persist
48 | self.flush_on_start = flush_on_start
49 | self.queue_key = queue_key
50 | self.queue_cls = queue_cls
51 | self.dupefilter_cls = dupefilter_cls
52 | self.dupefilter_key = dupefilter_key
53 | self.idle_before_close = idle_before_close
54 | self.serializer = serializer
55 | self.stats = None
56 |
57 | def __len__(self):
58 | return len(self.queue)
59 |
60 | @classmethod
61 | def from_settings(cls, settings):
62 | kwargs = {
63 | 'persist': settings.getbool('SCHEDULER_PERSIST'),
64 | 'flush_on_start': settings.getbool('SCHEDULER_FLUSH_ON_START'),
65 | 'idle_before_close': settings.getint('SCHEDULER_IDLE_BEFORE_CLOSE'),
66 | }
67 |
68 | # If these values are missing, it means we want to use the defaults.
69 | optional = {
70 | # TODO: Use custom prefixes for this settings to note that are
71 | # specific to scrapy-redis.
72 | 'queue_key': 'SCHEDULER_QUEUE_KEY',
73 | 'queue_cls': 'SCHEDULER_QUEUE_CLASS',
74 | 'dupefilter_key': 'SCHEDULER_DUPEFILTER_KEY',
75 | # We use the default setting name to keep compatibility.
76 | 'dupefilter_cls': 'DUPEFILTER_CLASS',
77 | 'serializer': 'SCHEDULER_SERIALIZER',
78 | }
79 | for name, setting_name in optional.items():
80 | val = settings.get(setting_name)
81 | if val:
82 | kwargs[name] = val
83 |
84 | # Support serializer as a path to a module.
85 | if isinstance(kwargs.get('serializer'), six.string_types):
86 | kwargs['serializer'] = importlib.import_module(kwargs['serializer'])
87 |
88 | server = connection.from_settings(settings)
89 | # Ensure the connection is working.
90 | server.ping()
91 |
92 | return cls(server=server, **kwargs)
93 |
94 | @classmethod
95 | def from_crawler(cls, crawler):
96 | instance = cls.from_settings(crawler.settings)
97 | # FIXME: for now, stats are only supported from this constructor
98 | instance.stats = crawler.stats
99 | return instance
100 |
101 | def open(self, spider):
102 | self.spider = spider
103 |
104 | try:
105 | self.queue = load_object(self.queue_cls)(
106 | server=self.server,
107 | spider=spider,
108 | key=self.queue_key % {'spider': spider.name},
109 | serializer=self.serializer,
110 | )
111 | except TypeError as e:
112 | raise ValueError("Failed to instantiate queue class '%s': %s"
113 | % (self.queue_cls, e))
114 |
115 | try:
116 | self.df = load_object(self.dupefilter_cls)(
117 | server=self.server,
118 | key=self.dupefilter_key % {'spider': spider.name},
119 | debug=spider.settings.getbool('DUPEFILTER_DEBUG'),
120 | )
121 | except TypeError as e:
122 | raise ValueError("Failed to instantiate dupefilter class '%s': %s"
123 | % (self.dupefilter_cls, e))
124 |
125 | if self.flush_on_start:
126 | self.flush()
127 | # notice if there are requests already in the queue to resume the crawl
128 | if len(self.queue):
129 | spider.log("Resuming crawl (%d requests scheduled)" % len(self.queue))
130 |
131 | def close(self, reason):
132 | if not self.persist:
133 | self.flush()
134 |
135 | def flush(self):
136 | self.df.clear()
137 | self.queue.clear()
138 |
139 | def enqueue_request(self, request):
140 | if not request.dont_filter and self.df.request_seen(request):
141 | self.df.log(request, self.spider)
142 | return False
143 | if self.stats:
144 | self.stats.inc_value('scheduler/enqueued/redis', spider=self.spider)
145 | self.queue.push(request)
146 | return True
147 |
148 | def next_request(self):
149 | block_pop_timeout = self.idle_before_close
150 | request = self.queue.pop(block_pop_timeout)
151 | if request and self.stats:
152 | self.stats.inc_value('scheduler/dequeued/redis', spider=self.spider)
153 | return request
154 |
155 | def has_pending_requests(self):
156 | return len(self) > 0
157 |
158 |
--------------------------------------------------------------------------------
/scrapy_eagle/dashboard/views/jobs.py:
--------------------------------------------------------------------------------
1 | import json
2 | from collections import OrderedDict
3 | from datetime import datetime, timedelta
4 |
5 | import flask
6 |
7 | from scrapy_eagle.dashboard import settings
8 | from scrapy_eagle.dashboard.memory import get_job_object, update_job_object
9 |
10 |
11 | jobs = flask.Blueprint('jobs', __name__)
12 |
13 |
14 | @jobs.route('/update', methods=['POST'])
15 | def update():
16 |
17 | #TODO: Ensure that the incoming request comes from the same IP (Security)
18 |
19 | result = {}
20 | error = False
21 |
22 | key, job_type, active, frequency_minutes, max_concurrency = (None, None, None, None, None)
23 | min_concurrency, priority, max_memory_mb, start_urls = (None, None, None, None)
24 |
25 | try:
26 |
27 | key = flask.request.form.get('key', None)
28 | job_type = flask.request.form.get('job_type', None)
29 | frequency_minutes = int(flask.request.form.get('frequency_minutes', None))
30 | max_concurrency = int(flask.request.form.get('max_concurrency', None))
31 | min_concurrency = int(flask.request.form.get('min_concurrency', None))
32 | priority = int(flask.request.form.get('priority', None))
33 | max_memory_mb = int(flask.request.form.get('max_memory_mb', None))
34 | start_urls = flask.request.form.get('start_urls', None)
35 |
36 | if flask.request.form.get('active', None) == 'false':
37 | active = False
38 | elif flask.request.form.get('active', None) == 'true':
39 | active = True
40 | else:
41 | active = False
42 |
43 | # Never trust the types coming from user input
44 | except ValueError:
45 | error = True
46 | result.update({
47 | 'status': 'error',
48 | 'msg': 'You sent wrong datatypes, like a letter when it should be numeric.'
49 | })
50 |
51 | if not error:
52 |
53 | if not all([key, job_type, frequency_minutes, max_concurrency, min_concurrency, priority, max_memory_mb]):
54 | error = True
55 | result.update({
56 | 'status': 'error',
57 | 'msg': 'You are missing some information, please check your form.'
58 | })
59 |
60 | elif not start_urls and job_type == 'spider':
61 | error = True
62 | result.update({
63 | 'status': 'error',
64 | 'msg': 'You should provide the Start URLs information for spiders.'
65 | })
66 |
67 | else:
68 |
69 | actual_obj = get_job_object(key=key)
70 |
71 | # A brand new job has no stored configuration yet
72 | if not actual_obj:
73 | actual_obj, current_frequency = {}, None
74 | else:
75 | current_frequency = actual_obj['frequency_minutes']
76 |
77 | actual_obj.update({
78 | 'active': active,
79 | 'job_type': job_type,
80 | 'frequency_minutes': frequency_minutes,
81 | 'max_concurrency': max_concurrency,
82 | 'min_concurrency': min_concurrency,
83 | 'priority': priority,
84 | 'max_memory_mb': max_memory_mb
85 | })
86 |
87 | # If the frequency changed, recalculate the next execution
88 | if current_frequency != frequency_minutes:
89 | actual_obj['next_execution_at'] = (datetime.utcnow() + timedelta(minutes=frequency_minutes)).isoformat()
90 |
91 | if job_type == 'spider':
92 | actual_obj.update({'start_urls': [x for x in start_urls.split("\n") if x]})
93 |
94 | update_job_object(key=key, fields=actual_obj)
95 |
96 | if not error:
97 | result.update({
98 | 'status': 'ok'
99 | })
100 |
101 | return flask.Response(
102 | response=json.dumps(result, sort_keys=True),
103 | status=200,
104 | mimetype="application/json"
105 | )
106 |
107 |
108 | @jobs.route('/list', methods=['GET'])
109 | def listing():
110 |
111 | _spiders = settings.get_spiders()
112 | _commands = settings.get_commands()
113 |
114 | # When the system is starting up, spiders may return empty because
115 | # we're using async execution `green_threads.find_new_spiders`.
116 | if not _spiders:
117 | return flask.Response(
118 | response=json.dumps({}, sort_keys=True),
119 | status=200,
120 | mimetype="application/json"
121 | )
122 |
123 | _spiders.sort()
124 |
125 | d = OrderedDict()
126 |
127 | for s in _spiders:
128 |
129 | obj = get_job_object(key=s)
130 |
131 | if obj:
132 | d[s] = obj
133 | else:
134 | # Jobs without previous information, using default config
135 | d[s] = {}
136 | d[s]['active'] = False
137 | d[s]['job_type'] = 'spider'
138 | d[s]['min_concurrency'] = 1
139 | d[s]['max_concurrency'] = 5
140 | d[s]['max_memory_mb'] = 200
141 | d[s]['priority'] = 1
142 | d[s]['frequency_minutes'] = 60
143 | d[s]['start_urls'] = []
144 | d[s]['last_started_at'] = datetime.utcnow().isoformat()
145 | d[s]['next_execution_at'] = (datetime.utcnow() + timedelta(minutes=d[s]['frequency_minutes'])).isoformat()
146 |
147 | for file_name in _commands:
148 |
149 | obj = get_job_object(key=file_name)
150 |
151 | if obj:
152 | d[file_name] = obj
153 |
154 | else:
155 | d[file_name] = {}
156 | d[file_name]['active'] = False
157 | d[file_name]['job_type'] = 'command'
158 | d[file_name]['min_concurrency'] = 1
159 | d[file_name]['max_concurrency'] = 1
160 | d[file_name]['max_memory_mb'] = 50
161 | d[file_name]['priority'] = 1
162 | d[file_name]['frequency_minutes'] = 60
163 | d[file_name]['last_started_at'] = None
164 | d[file_name]['next_execution_at'] = None
165 |
166 | return flask.Response(
167 | response=json.dumps(d, sort_keys=True),
168 | status=200,
169 | mimetype="application/json"
170 | )
171 |
--------------------------------------------------------------------------------
/README.rst:
--------------------------------------------------------------------------------
1 | .. image:: docs/images/logo_readme.jpg
2 | ======================================
3 |
4 | .. image:: https://travis-ci.org/rafaelcapucho/scrapy-eagle.svg?branch=master
5 | :target: https://travis-ci.org/rafaelcapucho/scrapy-eagle
6 |
7 | .. image:: https://img.shields.io/pypi/v/scrapy-eagle.svg
8 | :target: https://pypi.python.org/pypi/scrapy-eagle
9 | :alt: PyPI Version
10 |
11 | .. image:: https://img.shields.io/pypi/pyversions/scrapy-eagle.svg
12 | :target: https://pypi.python.org/pypi/scrapy-eagle
13 |
14 | .. image:: https://landscape.io/github/rafaelcapucho/scrapy-eagle/master/landscape.svg?style=flat
15 | :target: https://landscape.io/github/rafaelcapucho/scrapy-eagle/master
16 | :alt: Code Quality Status
17 |
18 | .. image:: https://requires.io/github/rafaelcapucho/scrapy-eagle/requirements.svg?branch=master
19 | :target: https://requires.io/github/rafaelcapucho/scrapy-eagle/requirements/?branch=master
20 | :alt: Requirements Status
21 |
22 | Scrapy Eagle is a tool that allows us to run any Scrapy_ based project in a distributed fashion and monitor how it is going and how many resources it is consuming on each server.
23 |
24 | .. _Scrapy: http://scrapy.org
25 |
26 | **This project is Under Development, don't use it yet**
27 |
28 | .. image:: https://badge.waffle.io/rafaelcapucho/scrapy-eagle.svg?label=ready&title=Ready
29 | :target: https://waffle.io/rafaelcapucho/scrapy-eagle
30 | :alt: 'Stories in Ready'
31 |
32 | Requirements
33 | ------------
34 |
35 | Scrapy Eagle uses Redis_ as its distributed queue, so you will need a Redis instance running.
36 |
37 | .. _Redis: http://redis.io
38 |
39 | Installation
40 | ------------
41 |
42 | It can easily be done by running the commands below,
43 |
44 | .. code-block:: console
45 |
46 | $ virtualenv eagle_venv; cd eagle_venv; source bin/activate
47 | $ pip install scrapy-eagle
48 |
49 | You should create a ``configparser`` configuration file (e.g. in /etc/scrapy-eagle.ini) containing:
50 |
51 | .. code-block:: console
52 |
53 | [redis]
54 | host = 127.0.0.1
55 | port = 6379
56 | db = 0
57 | ;password = someverysecretpass
58 |
59 | [server]
60 | debug = True
61 | cookie_secret_key = ha74h3hdh42a
62 | host = 0.0.0.0
63 | port = 5000
64 |
65 | [scrapy]
66 | binary = /project_venv/bin/scrapy
67 | base_dir = /project_venv/project_scrapy/project
68 |
69 | [commands]
70 | binary = /project_venv/bin/python3
71 | base_dir = /project_venv/project_scrapy/project/commands
72 |
73 | Then you will be able to execute the ``eagle_server`` command like,
74 |
75 | .. code-block:: console
76 |
77 | eagle_server --config-file=/etc/scrapy-eagle.ini
78 |
79 | Changes into your Scrapy project
80 | --------------------------------
81 |
82 | Enable the components in your `settings.py` of your Scrapy project:
83 |
84 | .. code-block:: python
85 |
86 | # Enables scheduling storing requests queue in redis.
87 | SCHEDULER = "scrapy_eagle.worker.scheduler.DistributedScheduler"
88 |
89 | # Ensure all spiders share same duplicates filter through redis.
90 | DUPEFILTER_CLASS = "scrapy_eagle.worker.dupefilter.RFPDupeFilter"
91 |
92 | # Schedule requests using a priority queue. (default)
93 | SCHEDULER_QUEUE_CLASS = "scrapy_eagle.worker.queue.SpiderPriorityQueue"
94 |
95 | # Schedule requests using a queue (FIFO).
96 | SCHEDULER_QUEUE_CLASS = "scrapy_eagle.worker.queue.SpiderQueue"
97 |
98 | # Schedule requests using a stack (LIFO).
99 | SCHEDULER_QUEUE_CLASS = "scrapy_eagle.worker.queue.SpiderStack"
100 |
101 | # Max idle time to prevent the spider from being closed when distributed crawling.
102 | # This only works if queue class is SpiderQueue or SpiderStack,
103 | # and may also block the same time when your spider start at the first time (because the queue is empty).
104 | SCHEDULER_IDLE_BEFORE_CLOSE = 0
105 |
106 | # Specify the host and port to use when connecting to Redis (optional).
107 | REDIS_HOST = 'localhost'
108 | REDIS_PORT = 6379
109 |
110 | # Specify the full Redis URL for connecting (optional).
111 | # If set, this takes precedence over the REDIS_HOST and REDIS_PORT settings.
112 | REDIS_URL = "redis://user:pass@hostname:6379"
113 |
114 | Once the configuration is finished, you should adapt each spider to use our Mixin:
115 |
116 | .. code-block:: python
117 |
118 | from scrapy.spiders import CrawlSpider, Rule
119 | from scrapy_eagle.worker.spiders import DistributedMixin
120 |
121 | class YourSpider(DistributedMixin, CrawlSpider):
122 |
123 | name = "domain.com"
124 |
125 | # start_urls = ['http://www.domain.com/']
126 | redis_key = 'domain.com:start_urls'
127 |
128 | rules = (
129 | Rule(...),
130 | Rule(...),
131 | )
132 |
133 | def _set_crawler(self, crawler):
134 | CrawlSpider._set_crawler(self, crawler)
135 | DistributedMixin.setup_redis(self)
136 |
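If your spider does not need ``CrawlSpider`` rules, you can subclass
``scrapy_eagle.worker.spiders.DistributedSpider`` instead, which already wires
``setup_redis`` up through ``from_crawler``. A minimal sketch (the spider name,
key and parsing logic below are illustrative placeholders):

.. code-block:: python

    from scrapy_eagle.worker.spiders import DistributedSpider

    class SimpleSpider(DistributedSpider):

        name = "domain.com"
        redis_key = 'domain.com:start_urls'

        def parse(self, response):
            # Requests yielded here go through the distributed
            # scheduler like any other request.
            yield {'url': response.url}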
137 | Feeding a Spider from Redis
138 | ---------------------------
139 |
140 | The class ``scrapy_eagle.worker.spiders.DistributedMixin`` enables a spider to read the
141 | URLs from redis. The URLs in the redis queue will be processed one
142 | after another.
143 |
144 | Then, push URLs to redis::
145 |
146 | redis-cli lpush domain.com:start_urls http://domain.com/
147 |
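The same can be done from Python with the ``redis`` client (already listed in
``requirements.txt``); a small sketch, assuming the defaults from the ``[redis]``
section above and that the key matches the spider's ``redis_key``:

.. code-block:: python

    import redis

    # Connect to the same instance configured in scrapy-eagle.ini
    r = redis.StrictRedis(host='127.0.0.1', port=6379, db=0)
    r.lpush('domain.com:start_urls', 'http://domain.com/')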
148 | Dashboard Development
149 | ---------------------
150 |
151 | If you would like to change the client side, you'll need to have NPM_ installed because we use ReactJS_ to build our interface. Install all dependencies locally:
152 |
153 | .. _ReactJS: https://facebook.github.io/react/
154 | .. _NPM: https://www.npmjs.com/
155 |
156 | .. code-block:: console
157 |
158 | cd scrapy-eagle/dashboard
159 | npm install
160 |
161 | Then you can run ``npm start`` to compile the assets, watch for changes, and recompile automatically.
162 |
163 | To generate the production version, run ``npm run build``.
164 |
165 | To make it easier to test the Dashboard, you can use a simple HTTP server instead of running the ``eagle_server``, like:
166 |
167 | .. code-block:: console
168 |
169 | sudo npm install -g http-server
170 | cd scrapy-eagle/dashboard
171 | http-server templates/
172 |
173 | It will then be available at http://127.0.0.1:8080
174 |
175 | **Note**: For now, Scrapy Eagle is mostly based on https://github.com/rolando/scrapy-redis.
176 |
--------------------------------------------------------------------------------
/scrapy_eagle/dashboard/react-src/components/jobs/JobsItem.jsx:
--------------------------------------------------------------------------------
1 | import React from 'react'
2 | import { connect } from 'react-redux'
3 |
4 | import cx from 'classnames'
5 | import Switch from 'react-switchery'
6 |
7 | class BaseComponent extends React.Component {
8 | _bind(...methods) {
9 | methods.forEach( (method) => this[method] = this[method].bind(this) );
10 | }
11 | }
12 |
13 | class JobsItem extends React.Component {
14 |
15 | constructor(props){
16 | super(props);
17 | // this._bind('_handleClick', '_handleFoo');
18 | this.handleSave = this.handleSave.bind(this);
19 | this.onBlurFrequency = this.onBlurFrequency.bind(this);
20 | this.onBlurMaxConcurrency = this.onBlurMaxConcurrency.bind(this);
21 | this.onBlurMinConcurrency = this.onBlurMinConcurrency.bind(this);
22 | this.onChangePriority = this.onChangePriority.bind(this);
23 | this.onBlurMaxMemory = this.onBlurMaxMemory.bind(this);
24 | this.onBlurStartURLs = this.onBlurStartURLs.bind(this);
25 | this.handleSave = this.handleSave.bind(this);
26 | this.state = {
27 | 'key': this.props.id,
28 | 'active': this.props.value.active,
29 | 'job_type': this.props.value.job_type,
30 | 'frequency_minutes': this.props.value.frequency_minutes,
31 | 'max_concurrency': this.props.value.max_concurrency,
32 | 'min_concurrency': this.props.value.min_concurrency,
33 | 'priority': this.props.value.priority,
34 | 'max_memory_mb': this.props.value.max_memory_mb,
35 | };
36 |
37 | if(this.props.value.start_urls){
38 | this.state['start_urls'] = this.format_start_urls(this.props.value.start_urls);
39 | }
40 |
41 | }
42 |
43 | format_start_urls(mylist){
44 | let buff = "";
45 | mylist.forEach(elem => {
46 | buff += elem + "\n";
47 | })
48 | return buff;
49 | }
50 |
51 | onBlurFrequency(e){ this.setState({'frequency_minutes': $.trim(e.target.value)}) }
52 | onBlurMaxConcurrency(e){ this.setState({'max_concurrency': $.trim(e.target.value)}) }
53 | onBlurMinConcurrency(e){ this.setState({'min_concurrency': $.trim(e.target.value)}) }
54 | onChangePriority(e){ this.setState({'priority': e.target.value}) }
55 | onBlurMaxMemory(e){ this.setState({'max_memory_mb': $.trim(e.target.value)}) }
56 | onBlurStartURLs(e){ this.setState({'start_urls': $.trim(e.target.value)}) }
57 |
58 | handleSave(){
59 |
60 | $.ajax({
61 | url: window.location.protocol + "//" + document.domain + ":" + location.port + "/jobs/update",
62 | type: 'POST',
63 | dataType: 'json',
64 | data: this.state,
65 | }).done((data) => {
66 |
67 | if(data.status == 'error'){
68 | alert(data.msg);
69 | } else if(data.status == 'ok'){
70 |
71 | }
72 |
73 | }).fail(() => {
74 | alert('The request failed, please try again.');
75 | }).always(() => {
76 | // that.setState({});
77 | });
78 |
79 | }
80 |
81 | SwitchonChange(value) {
82 | console.log(value);
83 | }
84 |
85 | render(){
86 |
87 | var show_start_urls = () => {
88 |
89 | if(this.state.job_type == 'spider') {
90 | return (
91 | <div>
92 | <label>Start URLs</label>
93 | <textarea
94 | defaultValue={this.state.start_urls}
95 | onBlur={this.onBlurStartURLs}
96 | />
97 | </div>
98 | )
99 | }
100 |
101 | };
102 |
103 | return (
104 |
105 |
{this.state.key}
106 |
183 |
184 | );
185 | }
186 |
187 | }
188 |
189 | var mapDispatchToProps = function(dispatch){
190 | return {
191 | dispatch
192 | }
193 | };
194 |
195 | export default connect(
196 | (state) => {
197 | return {
198 | //jobs: state.jobs
199 | }
200 | },
201 | mapDispatchToProps
202 | )(JobsItem)
--------------------------------------------------------------------------------