├── scrapy_eagle ├── __init__.py ├── worker │ ├── __init__.py │ ├── picklecompat.py │ ├── connection.py │ ├── dupefilter.py │ ├── queue.py │ ├── spiders.py │ └── scheduler.py └── dashboard │ ├── __init__.py │ ├── views │ ├── __init__.py │ ├── root.py │ ├── react_app.py │ ├── servers.py │ ├── processes.py │ └── jobs.py │ ├── .babelrc │ ├── templates │ ├── static │ │ ├── css │ │ │ ├── bundle.css.map │ │ │ ├── bundle.css │ │ │ └── main.css │ │ ├── img │ │ │ └── system-logo.jpg │ │ └── js │ │ │ └── vendor │ │ │ └── jquery.navgoco.min.js │ └── index.html │ ├── react-src │ ├── components │ │ ├── Home.jsx │ │ ├── jobs │ │ │ ├── Root.jsx │ │ │ ├── JobsConfig.scss │ │ │ ├── JobsConfig.jsx │ │ │ └── JobsItem.jsx │ │ ├── servers │ │ │ ├── Root.jsx │ │ │ ├── ServerSubProcess.jsx │ │ │ ├── ServerSet.jsx │ │ │ └── ServerNode.jsx │ │ ├── ListItem.jsx │ │ ├── App.scss │ │ ├── List.jsx │ │ └── App.jsx │ ├── services │ │ └── httpservice.js │ ├── reducers │ │ ├── servers.jsx │ │ └── jobs.jsx │ └── main.jsx │ ├── utils │ ├── __init__.py │ ├── spiderskit.py │ ├── commandskit.py │ ├── ip.py │ └── processkit.py │ ├── green_threads │ ├── __init__.py │ ├── heartbeat.py │ ├── executor.py │ └── stats.py │ ├── webpack.config.dev.js │ ├── webpack.config.prod.js │ ├── package.json │ ├── settings.py │ ├── memory.py │ └── main.py ├── docs └── images │ └── logo_readme.jpg ├── .travis.yml ├── requirements.txt ├── MANIFEST.in ├── pytest.ini ├── tox.ini ├── generator.py ├── setup.py ├── tests └── test_queue.py ├── .gitignore └── README.rst /scrapy_eagle/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /scrapy_eagle/worker/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /scrapy_eagle/dashboard/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /scrapy_eagle/dashboard/views/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /scrapy_eagle/dashboard/.babelrc: -------------------------------------------------------------------------------- 1 | { 2 | "presets": ["react", "es2015"] 3 | } 4 | -------------------------------------------------------------------------------- /docs/images/logo_readme.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rafaelcapucho/scrapy-eagle/HEAD/docs/images/logo_readme.jpg -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | python: 3 | - "3.5" 4 | install: "pip install -r requirements.txt" 5 | script: nosetests 6 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | flask 2 | pymongo 3 | requests 4 | redis 5 | scrapy>=1.1.0 6 | flask-socketio 7 | flask-cors 8 | gevent 9 | psutil -------------------------------------------------------------------------------- /scrapy_eagle/dashboard/templates/static/css/bundle.css.map: 
-------------------------------------------------------------------------------- 1 | {"version":3,"sources":[],"names":[],"mappings":"","file":"../css/bundle.css","sourceRoot":""} -------------------------------------------------------------------------------- /scrapy_eagle/dashboard/templates/static/img/system-logo.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rafaelcapucho/scrapy-eagle/HEAD/scrapy_eagle/dashboard/templates/static/img/system-logo.jpg -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | graft docs 2 | 3 | include *.in 4 | include *.ini 5 | include *.rst 6 | include *.txt 7 | 8 | recursive-include scrapy_eagle/dashboard/templates * 9 | 10 | global-exclude __pycache__ *.py[cod] 11 | -------------------------------------------------------------------------------- /scrapy_eagle/dashboard/views/root.py: -------------------------------------------------------------------------------- 1 | import json 2 | import flask 3 | 4 | 5 | root = flask.Blueprint('root', __name__) 6 | 7 | 8 | @root.route('/') 9 | def index(): 10 | 11 | return flask.redirect('/app') 12 | -------------------------------------------------------------------------------- /pytest.ini: -------------------------------------------------------------------------------- 1 | [pytest] 2 | norecursedirs = 3 | .* 4 | dist 5 | build 6 | python_files = 7 | test_*.py 8 | *_test.py 9 | tests.py 10 | ignore = 11 | setup.py 12 | addopts = 13 | -rxEfsw -v 14 | -------------------------------------------------------------------------------- /scrapy_eagle/dashboard/react-src/components/Home.jsx: -------------------------------------------------------------------------------- 1 | var React = require('react'); 2 | 3 | var Home = React.createClass({ 4 | render: function() { 5 | return
<div>App Home</div>
6 | } 7 | }); 8 | 9 | module.exports = Home; 10 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | [tox] 2 | envlist = py{35}-scrapy{11} 3 | 4 | [testenv] 5 | basepython = 6 | py35: python3.5 7 | deps = 8 | -rrequirements.txt 9 | commands = 10 | scrapy11: pip install scrapy>=1.1,<1.2 11 | {posargs:py.test} 12 | -------------------------------------------------------------------------------- /scrapy_eagle/dashboard/react-src/components/jobs/Root.jsx: -------------------------------------------------------------------------------- 1 | import React from 'react' 2 | 3 | export default class SpiderRoot extends React.Component { 4 | constructor(props){ 5 | super(props); 6 | } 7 | 8 | render(){ 9 | return this.props.children; 10 | } 11 | } 12 | -------------------------------------------------------------------------------- /scrapy_eagle/dashboard/react-src/components/servers/Root.jsx: -------------------------------------------------------------------------------- 1 | import React from 'react' 2 | 3 | export default class ServerRoot extends React.Component { 4 | constructor(props){ 5 | super(props); 6 | } 7 | 8 | render(){ 9 | return this.props.children; 10 | } 11 | } 12 | -------------------------------------------------------------------------------- /scrapy_eagle/dashboard/views/react_app.py: -------------------------------------------------------------------------------- 1 | import flask 2 | 3 | 4 | react_app = flask.Blueprint('app', __name__) 5 | 6 | 7 | @react_app.route('/', defaults={'path': ''}) 8 | @react_app.route('/') 9 | def app(path): 10 | return flask.render_template('index.html') 11 | -------------------------------------------------------------------------------- /scrapy_eagle/worker/picklecompat.py: -------------------------------------------------------------------------------- 1 | """A pickle wrapper module with protocol=-1 by default.""" 2 | 3 | try: 4 | import cPickle as pickle # PY2 5 | except ImportError: 6 | import pickle 7 | 8 | 9 | def loads(s): 10 | return pickle.loads(s) 11 | 12 | 13 | def dumps(obj): 14 | return pickle.dumps(obj, protocol=-1) 15 | -------------------------------------------------------------------------------- /scrapy_eagle/dashboard/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | from calendar import timegm 3 | 4 | 5 | def iso_to_timestamp(iso): 6 | epoch = timegm(datetime.strptime(iso, "%Y-%m-%dT%H:%M:%S.%f").timetuple()) 7 | assert isinstance(epoch, int) 8 | return epoch 9 | 10 | 11 | def timestamp_to_utc(ts): 12 | return datetime.utcfromtimestamp(ts) 13 | -------------------------------------------------------------------------------- /scrapy_eagle/dashboard/react-src/services/httpservice.js: -------------------------------------------------------------------------------- 1 | var Fetch = require('whatwg-fetch'); 2 | var baseUrl = 'http://localhost:6060'; 3 | 4 | var service = { 5 | get: function(url) { 6 | return fetch(baseUrl + url) 7 | .then(function(response) { 8 | return response.json(); 9 | }); 10 | } 11 | }; 12 | 13 | module.exports = service; 14 | -------------------------------------------------------------------------------- /scrapy_eagle/dashboard/react-src/components/ListItem.jsx: -------------------------------------------------------------------------------- 1 | var React = require('react'); 2 | 3 | var ListItem = 
React.createClass({ 4 | 5 | render: function() { 6 | return ( 7 |
<li> 8 |
        {this.props.memory_used_mb} - {this.props.memory_available_mb} 9 |
      </li>
  • 10 | ); 11 | } 12 | 13 | }); 14 | 15 | module.exports = ListItem; 16 | -------------------------------------------------------------------------------- /generator.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | from time import sleep 4 | 5 | # When the dashboard receives a KeyboardInterrupt 6 | # the subprocess also receive a KeyboardInterrupt 7 | # you could catch or not. 8 | 9 | try: 10 | n = 1 11 | while True: 12 | 13 | print(n) 14 | 15 | n += 1 16 | 17 | #sys.stdout.flush() 18 | 19 | sleep(1) 20 | 21 | if n % 20 == 0: break 22 | 23 | print(' ') 24 | 25 | except (KeyboardInterrupt, SystemExit): 26 | print('fechou') -------------------------------------------------------------------------------- /scrapy_eagle/dashboard/react-src/components/jobs/JobsConfig.scss: -------------------------------------------------------------------------------- 1 | div.scheduler { 2 | 3 | h1 { 4 | //margin: 30px 0 35px 0; 5 | } 6 | 7 | label.col-form-label { 8 | font-size: 80%; 9 | } 10 | 11 | div.odd { 12 | background-color: #3b3e42; 13 | } 14 | 15 | div.even { 16 | background-color: #2a2d2f; 17 | } 18 | 19 | div.jobTitle { 20 | margin: 10px 0 16px 0; 21 | font-size: 85%; 22 | color: #00b280; 23 | font-weight: bold; 24 | } 25 | 26 | div.box-legends { 27 | margin-top: 35px; 28 | 29 | li { 30 | font-size: 12px; 31 | } 32 | 33 | } 34 | } -------------------------------------------------------------------------------- /scrapy_eagle/dashboard/react-src/reducers/servers.jsx: -------------------------------------------------------------------------------- 1 | const initialState = { 2 | servers_qty: 0, 3 | }; 4 | 5 | export const INCREASE_SERVER = 'INCREASE_SERVER'; 6 | export const SET_SERVER_QTY = 'SET_SERVER_QTY'; 7 | 8 | export default function stats(state = initialState, action) { 9 | 10 | switch (action.type) { 11 | 12 | case INCREASE_SERVER: 13 | 14 | return Object.assign({}, state, { 15 | servers_qty: state.servers_qty + 1 16 | }); 17 | 18 | case SET_SERVER_QTY: 19 | 20 | return Object.assign({}, state, { 21 | servers_qty: action.qty 22 | }); 23 | 24 | default: 25 | return state; 26 | } 27 | } -------------------------------------------------------------------------------- /scrapy_eagle/dashboard/utils/spiderskit.py: -------------------------------------------------------------------------------- 1 | import subprocess 2 | 3 | from scrapy_eagle.dashboard import settings 4 | 5 | 6 | def find_spiders(): 7 | 8 | _config = settings.get_config_file() 9 | 10 | base_dir = _config.get('scrapy', 'base_dir') 11 | binary = _config.get('scrapy', 'binary') 12 | 13 | spiders = [] 14 | 15 | with subprocess.Popen( 16 | [binary, 'list'], 17 | cwd=base_dir, 18 | stdout=subprocess.PIPE, 19 | bufsize=1, 20 | universal_newlines=True 21 | ) as p: 22 | for line in p.stdout: 23 | spiders.append(line.strip()) 24 | 25 | return spiders 26 | -------------------------------------------------------------------------------- /scrapy_eagle/dashboard/templates/static/css/bundle.css: -------------------------------------------------------------------------------- 1 | body{background-color:#323539;color:#f5f5f5;font-size:100%;margin:0;padding:0;position:relative;text-rendering:optimizelegibility}a:active,a:hover,a:link,a:visited{color:#fff;outline:medium none;text-decoration:none}h1,h2,h3,h4,h5,h6{color:#f5f5f5;font-family:Montserrat,sans-serif;margin:20px 0 
25px}h1{font-size:1.375em}h2{font-size:1.188em}h3{font-size:1.063em}h4{font-size:.938em}h5{font-size:.813em}h6{font-size:.75em}div.scheduler label.col-form-label{font-size:80%}div.scheduler div.odd{background-color:#3b3e42}div.scheduler div.even{background-color:#2a2d2f}div.scheduler div.jobTitle{margin:10px 0 16px;font-size:85%;color:#00b280;font-weight:700}div.scheduler div.box-legends{margin-top:35px}div.scheduler div.box-legends li{font-size:12px} -------------------------------------------------------------------------------- /scrapy_eagle/dashboard/green_threads/__init__.py: -------------------------------------------------------------------------------- 1 | import gevent 2 | 3 | from scrapy_eagle.dashboard import settings 4 | from scrapy_eagle.dashboard.utils import spiderskit, commandskit 5 | 6 | 7 | def find_new_spiders(): 8 | 9 | while True: 10 | 11 | # Open the process and execute Scrapy's list command 12 | _spiders = spiderskit.find_spiders() 13 | 14 | # Install the list of spiders names 15 | settings._spiders = _spiders 16 | 17 | gevent.sleep(10) 18 | 19 | 20 | def find_new_commands(): 21 | 22 | while True: 23 | 24 | # Monitoring the command folder 25 | _commands = commandskit.find_commands() 26 | 27 | # Install the list of commands names 28 | settings._commands = _commands 29 | 30 | gevent.sleep(5) -------------------------------------------------------------------------------- /scrapy_eagle/dashboard/react-src/components/App.scss: -------------------------------------------------------------------------------- 1 | body { 2 | /*font-size: 12px;*/ 3 | /*font-family: Arial, Verdana, sans-serif;*/ 4 | background-color: #323539; 5 | color: whitesmoke; 6 | font-size: 100%; 7 | margin: 0; 8 | padding: 0; 9 | position: relative; 10 | text-rendering: optimizelegibility; 11 | } 12 | 13 | a:link, a:visited { 14 | color: white; 15 | outline: medium none; 16 | text-decoration: none; 17 | } 18 | a:hover, a:active { 19 | color: white; 20 | outline: medium none; 21 | text-decoration: none; 22 | } 23 | 24 | 25 | h1, h2, h3, h4, h5, h6 { 26 | color: whitesmoke; 27 | font-family: "Montserrat", sans-serif; 28 | margin: 20px 0 25px 0; 29 | 30 | } 31 | 32 | h1 {font-size: 1.375em;} 33 | h2 {font-size: 1.188em;} 34 | h3 {font-size: 1.063em;} 35 | h4 {font-size: 0.938em;} 36 | h5 {font-size: 0.813em;} 37 | h6 {font-size: 0.75em;} -------------------------------------------------------------------------------- /scrapy_eagle/dashboard/utils/commandskit.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from scrapy_eagle.dashboard import settings 4 | 5 | 6 | def load_commands_name(dir): 7 | 8 | if os.path.exists(dir): 9 | 10 | module_names = [] 11 | 12 | for d in os.listdir(dir): 13 | if d.find("__init__") == -1 and d.endswith('.py'): 14 | 15 | # Remove possible spaces 16 | d = d.replace(" ", "") 17 | 18 | # Remove the Extension 19 | d = ".".join(d.split(".")[:-1]) 20 | 21 | module_names.append(d) 22 | 23 | module_names.sort() 24 | 25 | return module_names 26 | 27 | else: 28 | return [] 29 | 30 | 31 | def find_commands(): 32 | 33 | _config = settings.get_config_file() 34 | 35 | base_dir = _config.get('commands', 'base_dir') 36 | 37 | return load_commands_name(dir=base_dir) 38 | -------------------------------------------------------------------------------- /scrapy_eagle/dashboard/webpack.config.dev.js: -------------------------------------------------------------------------------- 1 | var webpack = require('webpack'); 2 | var path = 
require('path'); 3 | 4 | var ExtractTextPlugin = require('extract-text-webpack-plugin'); 5 | 6 | var BUILD_JS_DIR = path.resolve(__dirname, 'templates/static/js'); 7 | var APP_DIR = path.resolve(__dirname, 'react-src'); 8 | 9 | var config = { 10 | entry: APP_DIR + '/main.jsx', 11 | output: { 12 | path: BUILD_JS_DIR, 13 | filename: 'bundle.js' 14 | }, 15 | module : { 16 | loaders : [ 17 | { 18 | test : /\.jsx?/, 19 | include : APP_DIR, 20 | loader : 'babel' 21 | }, 22 | { 23 | test: /\.scss$/, 24 | //loaders: ['style', 'css', 'sass'] 25 | loader: ExtractTextPlugin.extract('css!sass') 26 | } 27 | ] 28 | }, 29 | plugins: [ 30 | new ExtractTextPlugin('../css/bundle.css', { 31 | allChunks: true 32 | }) 33 | ] 34 | }; 35 | 36 | module.exports = config; -------------------------------------------------------------------------------- /scrapy_eagle/dashboard/react-src/components/List.jsx: -------------------------------------------------------------------------------- 1 | var React = require('react'); 2 | var ListItem = require('./ListItem.jsx'); 3 | var HTTP = require('../services/httpservice'); 4 | 5 | var List = React.createClass({ 6 | getInitialState: function() { 7 | return {resources: []}; 8 | }, 9 | componentWillMount: function() { 10 | 11 | this.socket = io.connect('http://127.0.0.1:5000/resources'); 12 | this.socket.on('resources_info', function (msg) { 13 | this.setState({resources: msg.data.sub}); 14 | }.bind(this)); 15 | 16 | }, 17 | render: function() { 18 | /*var listItems = this.state.resources.map(function(item) { 19 | return ; 23 | }); 24 | 25 | return ();*/ 26 | } 27 | }); 28 | 29 | module.exports = List; 30 | -------------------------------------------------------------------------------- /scrapy_eagle/dashboard/views/servers.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | 3 | import json 4 | import flask 5 | 6 | from scrapy_eagle.dashboard.memory import get_connection 7 | 8 | 9 | servers = flask.Blueprint('servers', __name__) 10 | 11 | 12 | @servers.route('/list') 13 | def listing(): 14 | 15 | now = datetime.now() 16 | 17 | redis_conn = get_connection() 18 | 19 | _servers = redis_conn.zrangebyscore('eagle_servers', now.timestamp(), max='+inf') 20 | 21 | results = [] 22 | 23 | for entry in _servers: 24 | parts = entry.decode('utf-8').split("-") 25 | ip, hostname = parts[0], "-".join(parts[1:]) 26 | results.append({'public_ip': ip, 'hostname': hostname}) 27 | 28 | # Sets in Redis usually returns in random order, sort by hostname 29 | results = sorted(results, key=lambda x: x['hostname']) 30 | 31 | return flask.Response( 32 | response=json.dumps(results, sort_keys=True), 33 | status=200, 34 | mimetype="application/json" 35 | ) 36 | -------------------------------------------------------------------------------- /scrapy_eagle/dashboard/webpack.config.prod.js: -------------------------------------------------------------------------------- 1 | var webpack = require('webpack'); 2 | var path = require('path'); 3 | 4 | var ExtractTextPlugin = require('extract-text-webpack-plugin'); 5 | 6 | var BUILD_JS_DIR = path.resolve(__dirname, 'templates/static/js'); 7 | var APP_DIR = path.resolve(__dirname, 'react-src'); 8 | 9 | var config = { 10 | entry: APP_DIR + '/main.jsx', 11 | output: { 12 | path: BUILD_JS_DIR, 13 | filename: 'bundle.js' 14 | }, 15 | plugins: [ 16 | new webpack.optimize.OccurrenceOrderPlugin(), 17 | new webpack.DefinePlugin({ 18 | 'process.env': { 19 | 'NODE_ENV': JSON.stringify('production') 20 | } 
21 | }), 22 | new webpack.optimize.UglifyJsPlugin({ 23 | compressor: { 24 | warnings: false 25 | } 26 | }), 27 | new ExtractTextPlugin('../css/bundle.css', { 28 | allChunks: true 29 | }) 30 | ], 31 | module : { 32 | loaders : [ 33 | { 34 | test : /\.jsx?/, 35 | include : APP_DIR, 36 | loader : 'babel' 37 | }, 38 | { 39 | test: /\.scss$/, 40 | //loaders: ['style', 'css', 'sass'] 41 | loader: ExtractTextPlugin.extract('css!sass') 42 | } 43 | ] 44 | } 45 | }; 46 | 47 | module.exports = config; -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import os 4 | import io 5 | from setuptools import setup, find_packages 6 | 7 | 8 | LONG_DESC = open(os.path.join(os.path.dirname(__file__), 'README.rst')).read() 9 | 10 | 11 | def read_file(filename): 12 | with io.open(filename) as fp: 13 | return fp.read().strip() 14 | 15 | 16 | def read_requirements(filename): 17 | return [line.strip() for line in read_file(filename).splitlines() 18 | if not line.startswith('#')] 19 | 20 | 21 | setup(name='scrapy-eagle', 22 | version='0.0.37', 23 | description='Run Scrapy Distributed', 24 | long_description=LONG_DESC, 25 | author='Rafael Alfredo Capucho', 26 | author_email='rafael.capucho@gmail.com', 27 | url='http://github.com/rafaelcapucho/scrapy-eagle', 28 | packages=find_packages(), 29 | license='BSD', 30 | install_requires=read_requirements('requirements.txt'), 31 | include_package_data=True, 32 | entry_points={ 33 | 'console_scripts': ['eagle_server=scrapy_eagle.dashboard.main:entry_point'], 34 | }, 35 | classifiers=[ 36 | 'Development Status :: 3 - Alpha', 37 | 'Framework :: Scrapy', 38 | 'Programming Language :: Python', 39 | 'Programming Language :: Python :: 3.5', 40 | 'Intended Audience :: Developers', 41 | ], 42 | ) 43 | -------------------------------------------------------------------------------- /tests/test_queue.py: -------------------------------------------------------------------------------- 1 | import mock 2 | 3 | from scrapy import Spider 4 | from scrapy.http import Request 5 | 6 | from scrapy_eagle.worker.queue import Base 7 | 8 | 9 | class TestBaseQueue(object): 10 | 11 | def setup(self): 12 | self.server = mock.Mock() 13 | self.spider = Spider(name='foo') 14 | self.spider.parse_method = lambda x: x 15 | self.key = 'key' 16 | self.q = Base(self.server, self.spider, self.key) 17 | 18 | def test_encode_decode_requests(self, q=None): 19 | if q is None: 20 | q = self.q 21 | req = Request('http://example.com', 22 | callback=self.spider.parse, 23 | meta={'foo': 'bar'}) 24 | out = q._decode_request(q._encode_request(req)) 25 | assert req.url == out.url 26 | assert req.meta == out.meta 27 | assert req.callback == out.callback 28 | 29 | def test_custom_serializer(self): 30 | serializer = mock.Mock() 31 | serializer.dumps = mock.Mock(side_effect=lambda x: x) 32 | serializer.loads = mock.Mock(side_effect=lambda x: x) 33 | q = Base(self.server, self.spider, self.key, serializer=serializer) 34 | self.test_encode_decode_requests(q) 35 | assert serializer.dumps.call_count == 1 36 | assert serializer.loads.call_count == 1 37 | 38 | -------------------------------------------------------------------------------- /scrapy_eagle/dashboard/react-src/main.jsx: -------------------------------------------------------------------------------- 1 | import React from 'react' 2 | import { render } from 'react-dom' 3 | import { Router, Route, IndexRoute, 
browserHistory } from 'react-router' 4 | 5 | import { createStore, combineReducers } from 'redux' 6 | import { Provider } from 'react-redux' 7 | 8 | import App from './components/App.jsx' 9 | import Home from './components/Home.jsx' 10 | import ServerSet from './components/servers/ServerSet.jsx' 11 | import ServerRoot from './components/servers/Root.jsx' 12 | 13 | import JobsConfig from './components/jobs/JobsConfig.jsx' 14 | import JobsRoot from './components/jobs/Root.jsx' 15 | 16 | import servers from './reducers/servers.jsx' 17 | import jobs from './reducers/jobs.jsx' 18 | 19 | var reducers = combineReducers({ 20 | servers: servers, 21 | jobs: jobs 22 | }); 23 | 24 | const store = createStore(reducers); 25 | 26 | render(( 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | ), document.getElementById('app')); 47 | -------------------------------------------------------------------------------- /scrapy_eagle/dashboard/green_threads/heartbeat.py: -------------------------------------------------------------------------------- 1 | import os 2 | import signal 3 | from datetime import datetime, timedelta 4 | 5 | import gevent 6 | 7 | 8 | def heartbeat_servers(redis_conn, ip, hostname): 9 | 10 | while True: 11 | 12 | future = datetime.now() + timedelta(seconds=6) 13 | 14 | redis_conn.zadd( 15 | 'eagle_servers', 16 | '{ip}-{hostname}'.format(ip=ip, hostname=hostname), 17 | int(future.timestamp()) 18 | ) 19 | 20 | # now = datetime.now() 21 | # servers = redis_conn.zrangebyscore('servers', now.timestamp(), max='+inf') 22 | 23 | gevent.sleep(3) 24 | 25 | 26 | def heartbeat_subprocess(pid, spider, max_seconds_idle, max_size_limit, queue_info_global): 27 | 28 | last_processed = None 29 | 30 | max_size = 0 31 | 32 | while True: 33 | 34 | size = None 35 | for entry in queue_info_global: 36 | if entry['name'] == spider: 37 | size = entry['size'] 38 | 39 | if size > 0: 40 | last_processed = datetime.now() 41 | 42 | if size > max_size: 43 | max_size = size 44 | 45 | if last_processed: 46 | diff = datetime.now() - last_processed 47 | 48 | # print('\nlast_processed_secs: ', diff.seconds, ' maxsize: ', max_size, ' size: ', size, '\n\n') 49 | 50 | if diff.seconds > max_seconds_idle and max_size > max_size_limit: 51 | 52 | os.kill(pid, signal.SIGHUP) 53 | 54 | break 55 | 56 | gevent.sleep(2) 57 | -------------------------------------------------------------------------------- /scrapy_eagle/dashboard/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "react-scrapy-eagle", 3 | "version": "1.0.0", 4 | "description": "", 5 | "main": "index.js", 6 | "scripts": { 7 | "start:babel": "watchify react-src/main.jsx -v -t [ babelify --presets [ es2015 react ] ] -o templates/static/js/bundle.js", 8 | "build:dev": "./node_modules/webpack/bin/webpack.js -d --progress --colors --config webpack.config.dev.js", 9 | "build:prod": "NODE_ENV=production ./node_modules/webpack/bin/webpack.js -p --progress --colors --config webpack.config.prod.js", 10 | "start": "npm run build:dev -- --watch", 11 | "build": "npm run build:prod", 12 | "test": "echo \"Error: no test specified\" && exit 1" 13 | }, 14 | "author": "Rafael Capucho", 15 | "license": "ISC", 16 | "dependencies": { 17 | "babel-loader": "^6.2.4", 18 | "babel-preset-es2015": "^6.9.0", 19 | "babel-preset-react": "^6.11.1", 20 | "babelify": "^7.3.0", 21 | "classnames": "^2.2.5", 22 | "css-loader": "^0.23.1", 23 | "extract-text-webpack-plugin": "^1.0.1", 24 | "immutable": 
"^3.8.1", 25 | "moment": "^2.14.1", 26 | "node-sass": "^3.8.0", 27 | "react": "^15.3.1", 28 | "react-addons-pure-render-mixin": "^15.3.1", 29 | "react-breadcrumbs": "^1.3.16", 30 | "react-dom": "^15.3.1", 31 | "react-redux": "^4.4.5", 32 | "react-router": "^2.6.1", 33 | "react-switchery": "^1.0.0", 34 | "redux": "^3.5.2", 35 | "sass-loader": "^4.0.0", 36 | "style-loader": "^0.13.1", 37 | "watchify": "^3.7.0", 38 | "webpack": "^1.13.1", 39 | "whatwg-fetch": "^1.0.0" 40 | } 41 | } 42 | -------------------------------------------------------------------------------- /scrapy_eagle/dashboard/react-src/reducers/jobs.jsx: -------------------------------------------------------------------------------- 1 | import { Record, OrderedMap, List } from 'immutable'; 2 | 3 | const JobRecord = Record({ 4 | active: undefined, // true or false 5 | frequency_minutes: undefined, 6 | last_started_at: undefined, 7 | max_concurrency: undefined, 8 | min_concurrency: undefined, 9 | max_memory_mb: undefined, 10 | priority: 0, 11 | job_type: undefined, // 'spider' or 'command' 12 | start_urls: new List() 13 | }); 14 | 15 | class JobInfo extends JobRecord { 16 | getPriority(){ 17 | return this.priority; 18 | } 19 | } 20 | 21 | const SpidersMap = OrderedMap({}); 22 | 23 | export default (state = SpidersMap, action) => { 24 | 25 | switch (action.type) { 26 | 27 | case 'UPDATE_SPIDER_INFO': 28 | 29 | // Check if there's already one Record from this Spider 30 | if(!state.has(action.spider_id)){ 31 | state = state.set(action.spider_id, new JobInfo()); 32 | } 33 | 34 | return state.update(action.spider_id, 35 | (spider_record) => 36 | spider_record.merge({ 37 | 'priority': action.priority, 38 | 'frequency_minutes': action.frequency_minutes, 39 | 'last_started_at': action.last_started_at, 40 | 'max_concurrency': action.max_concurrency, 41 | 'min_concurrency': action.min_concurrency, 42 | 'max_memory_mb': action.max_memory_mb, 43 | 'job_type': action.job_type, 44 | 'start_urls': action.start_urls, 45 | 'active': action.active 46 | }) 47 | ); 48 | 49 | default: 50 | return state; 51 | } 52 | } -------------------------------------------------------------------------------- /scrapy_eagle/dashboard/utils/ip.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | 3 | import os 4 | import re 5 | import requests 6 | import random 7 | 8 | def get_hostname(): 9 | 10 | return os.uname()[1] 11 | 12 | def get_external_ip(): 13 | 14 | source_list = [ 15 | 'http://ip.dnsexit.com', 16 | 'http://ifconfig.me/ip', 17 | 'http://ipecho.net/plain', 18 | 'http://ipogre.com/linux.php', 19 | 'http://myexternalip.com/raw', 20 | 'http://icanhazip.com/', 21 | 'http://httpbin.org/ip' 22 | ] 23 | 24 | headers = {'User-agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:24.0) Gecko/20100101 Firefox/24.0'} 25 | 26 | for i in range(len(source_list)): 27 | 28 | target = random.choice(source_list) 29 | 30 | try: 31 | 32 | content = requests.get(target, headers=headers, timeout=6, verify=False) 33 | 34 | m = re.search( 35 | '(?:[\d]{1,3})\.(?:[\d]{1,3})\.(?:[\d]{1,3})\.(?:[\d]{1,3})', 36 | content.text 37 | ) 38 | 39 | ip = m.group(0) 40 | 41 | if len(ip) > 0: 42 | return ip 43 | 44 | # Without Internet 45 | except requests.exceptions.ConnectionError as e: 46 | 47 | # Only interested in there kind of error 48 | if str(e).find("Temporary failure in name resolution") > -1: 49 | return None 50 | 51 | # Timeout 52 | except requests.exceptions.RequestException: 53 | # Try next 54 | source_list.pop(i) 55 | 56 | 
except Exception: 57 | continue 58 | 59 | 60 | return None 61 | -------------------------------------------------------------------------------- /scrapy_eagle/dashboard/react-src/components/servers/ServerSubProcess.jsx: -------------------------------------------------------------------------------- 1 | var React = require('react'); 2 | var moment = require('moment'); 3 | 4 | var ServerSubProcess = React.createClass({ 5 | 6 | getInitialState: function() { 7 | return {link_open_buffer: ""}; 8 | }, 9 | onClickKill: function(){ 10 | 11 | $.get(window.location.protocol+"//"+this.props.public_ip+":"+location.port+"/processes/kill_subprocess/"+this.props.pid, function(data) { 12 | 13 | }); 14 | 15 | }, 16 | componentDidMount: function(){ 17 | this.setState({'link_open_buffer': window.location.protocol+"//"+this.props.public_ip+":"+location.port+"/processes/read_buffer/"+this.props.pid}); 18 | }, 19 | render: function(){ 20 | 21 | var created_at = moment.utc(this.props.created_at); 22 | var fromNow = created_at.fromNow(); 23 | 24 | return ( 25 |
<li> 26 |
      <ul> 27 |
        <li>Command: {this.props.command}</li> 28 |
        <li>PID: {this.props.pid}</li> 29 |
        <li>CPU: {this.props.cpu_percent}%</li> 30 |
        <li>Memory Used: {this.props.memory_used_mb}mb</li> 31 |
        <li>Spider: {this.props.spider}</li> 32 |
        <li>Base Dir: {this.props.base_dir}</li> 33 |
        <li>Created At: {fromNow}</li> 34 | 35 | 36 | 37 | 38 |
      </ul> 39 |
    </li>
  • 40 | ); 41 | } 42 | 43 | }); 44 | 45 | /*var Link = React.createClass({ 46 | 47 | render: function(){ 48 | return ( 49 | 50 | 51 | 52 | ); 53 | } 54 | 55 | });*/ 56 | 57 | module.exports = ServerSubProcess; 58 | -------------------------------------------------------------------------------- /scrapy_eagle/dashboard/settings.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | try: 4 | import configparser 5 | except ImportError: 6 | import ConfigParser as configparser 7 | 8 | from scrapy_eagle.dashboard.utils import ip 9 | 10 | buffers = {} 11 | 12 | queue_info_global = [] 13 | 14 | subprocess_pids = set() 15 | 16 | # Never import these directly 17 | # Use get_config_file and get_args instead 18 | _args = None 19 | _config = None 20 | _public_ip = None 21 | _hostname = None 22 | _spiders = None 23 | _commands = None 24 | 25 | 26 | def setup_configuration(config_file=None): 27 | 28 | global _config 29 | 30 | _config = configparser.RawConfigParser() 31 | _config.read(config_file) 32 | 33 | globals()['_config'] = _config 34 | 35 | return _config 36 | 37 | 38 | def setup(config_file=None, output=True): 39 | 40 | global _args, _config, _public_ip, _hostname 41 | 42 | parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) 43 | parser.add_argument('-c', '--config-file', help='Config file path.') 44 | 45 | _args = parser.parse_args() 46 | 47 | if not _args.config_file and not config_file: 48 | print('You should specify a config file using --config-file parameter.') 49 | exit(0) 50 | 51 | _config = setup_configuration(config_file=_args.config_file or config_file) 52 | 53 | if output: 54 | print('discovering your external entrypoint address... ', end='', flush=True) 55 | 56 | _public_ip = ip.get_external_ip() 57 | 58 | if output: 59 | print(_public_ip) 60 | 61 | _hostname = ip.get_hostname() 62 | 63 | return _args, _config 64 | 65 | 66 | def get_public_ip(): 67 | return _public_ip 68 | 69 | 70 | def get_hostname(): 71 | return _hostname 72 | 73 | 74 | def get_config_file(): 75 | return _config 76 | 77 | 78 | def get_args(): 79 | return _args 80 | 81 | 82 | def get_spiders(): 83 | return _spiders 84 | 85 | 86 | def get_commands(): 87 | return _commands -------------------------------------------------------------------------------- /scrapy_eagle/dashboard/memory.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | import redis 4 | 5 | from scrapy_eagle.dashboard.settings import get_config_file 6 | 7 | redis_pool = None 8 | 9 | 10 | def init_memory(): 11 | 12 | global redis_pool 13 | 14 | config = get_config_file() 15 | 16 | redis_pool = redis.ConnectionPool( 17 | host=config['redis']['host'], 18 | port=config['redis']['port'], 19 | db=config['redis']['db'], 20 | password=config.get('redis', 'password', fallback='') 21 | ) 22 | 23 | 24 | def get_redis_pool(): 25 | return redis_pool 26 | 27 | 28 | def get_connection(): 29 | 30 | if not redis_pool: 31 | init_memory() 32 | 33 | return redis.Redis(connection_pool=redis_pool) 34 | 35 | 36 | def get_job_object(key): 37 | 38 | redis_conn = get_connection() 39 | 40 | json_obj = redis_conn.get('eagle_jobs:{key}'.format(key=key)) 41 | 42 | if json_obj: 43 | return json.loads(json_obj.decode('utf-8')) 44 | else: 45 | return None 46 | 47 | def update_job_object(key, fields): 48 | 49 | redis_conn = get_connection() 50 | 51 | serialized = json.dumps(fields, sort_keys=True) 52 | 53 | 
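    # Persist the job as a JSON blob under the Redis key 'eagle_jobs:<key>', the same key get_job_object reads.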
redis_conn.set('eagle_jobs:{key}'.format(key=key), serialized) 54 | 55 | if __name__ == "__main__": 56 | 57 | from scrapy_eagle.dashboard.settings import setup_configuration 58 | 59 | _config = setup_configuration(config_file='/etc/scrapy-eagle.ini') 60 | 61 | init_memory() 62 | 63 | o = get_job_object(key='epocacosmeticos.com.br') 64 | 65 | print(o) 66 | 67 | d = { 68 | "active": True, 69 | "max_memory_mb": 220, 70 | "job_type": "spider", 71 | "last_started_at": "2016-08-31T04:17:51.200187", 72 | "priority": 6, 73 | "start_urls": [ 74 | "http://epocacosmeticos.com.br/", 75 | "http://www.epocacosmeticos.com.br/perfumes" 76 | ], 77 | "max_concurrency": 4, 78 | "min_concurrency": 1, 79 | "frequency_minutes": 1440 80 | } 81 | 82 | update_job_object(key='epocacosmeticos.com.br', fields=d) 83 | 84 | print(get_job_object(key='epocacosmeticos.com.br')) -------------------------------------------------------------------------------- /scrapy_eagle/dashboard/green_threads/executor.py: -------------------------------------------------------------------------------- 1 | import gevent 2 | from datetime import datetime, timedelta 3 | 4 | from scrapy_eagle.dashboard import settings 5 | from scrapy_eagle.dashboard.memory import get_job_object, update_job_object 6 | from scrapy_eagle.dashboard.utils import iso_to_timestamp, timestamp_to_utc, processkit 7 | 8 | 9 | def evaluation_loop(): 10 | 11 | while True: 12 | 13 | _spiders = settings.get_spiders() 14 | _commands = settings.get_commands() 15 | 16 | # When the system is starting up, spiders/commands may return empty because 17 | # we're using async execution `green_threads.find_new_spiders`. 18 | if _spiders and _commands: 19 | 20 | for key in _spiders + _commands: 21 | obj = get_job_object(key=key) 22 | 23 | if obj and obj.get('next_execution_at'): 24 | 25 | next_execution_at = timestamp_to_utc(iso_to_timestamp(obj['next_execution_at'])) 26 | 27 | now = datetime.utcnow() 28 | 29 | if next_execution_at < now: 30 | 31 | dispatch(key=key, register=obj) 32 | 33 | gevent.sleep(3) 34 | 35 | 36 | def dispatch(key, register): 37 | 38 | _config = settings.get_config_file() 39 | 40 | register['last_started_at'] = datetime.utcnow().isoformat() 41 | register['next_execution_at'] = (datetime.utcnow() + timedelta(minutes=register['frequency_minutes'])).isoformat() 42 | 43 | if register['job_type'] == "spider": 44 | command = [_config.get('scrapy', 'binary'), 'crawl', key] 45 | base_dir = _config.get('scrapy', 'base_dir') 46 | spider = True 47 | 48 | elif register['job_type'] == "command": 49 | command = [_config.get('commands', 'binary'), '-u', key + '.py'] 50 | base_dir = _config.get('commands', 'base_dir') 51 | spider = False 52 | 53 | gevent.spawn( 54 | processkit.new_subprocess, 55 | base_dir=base_dir, 56 | command=command, 57 | spider=spider, 58 | subprocess_pids=settings.subprocess_pids, 59 | queue_info_global=settings.queue_info_global, 60 | buffers=settings.buffers 61 | ) 62 | 63 | update_job_object(key=key, fields=register) 64 | -------------------------------------------------------------------------------- /scrapy_eagle/dashboard/react-src/components/servers/ServerSet.jsx: -------------------------------------------------------------------------------- 1 | import React from 'react' 2 | import { connect } from 'react-redux' 3 | 4 | var ServerNode = require('./ServerNode.jsx'); 5 | 6 | var SetIntervalMixin = { 7 | componentWillMount: function() { 8 | this.intervals = []; 9 | }, 10 | setInterval: function() { 11 | this.intervals.push(setInterval.apply(null, 
arguments)); 12 | }, 13 | componentWillUnmount: function() { 14 | this.intervals.forEach(clearInterval); 15 | } 16 | }; 17 | 18 | var ServerSet = React.createClass({ 19 | 20 | mixins: [SetIntervalMixin], 21 | 22 | getInitialState: function() { 23 | return {server_set: new Array()}; 24 | }, 25 | 26 | componentDidMount:function(){ 27 | this.setInterval(this.updateServers, 3000); 28 | }, 29 | 30 | updateServers: function() { 31 | 32 | var that = this; 33 | 34 | var server_set_new = new Array(); 35 | 36 | this.serversRequest = $.ajax({ 37 | url: window.location.protocol + "//" + document.domain + ":"+ location.port +"/servers/list", 38 | type: 'GET', 39 | dataType: 'json', 40 | cache: false 41 | }).done(function(data) { 42 | 43 | data.forEach(function(elem, index){ 44 | server_set_new.push({public_ip: elem.public_ip, hostname: elem.hostname}); 45 | }) 46 | 47 | }).always(function () { 48 | that.setState({'server_set': server_set_new}); 49 | that.props.set_server_qty(server_set_new.length); 50 | }); 51 | 52 | }, 53 | 54 | componentWillUnmount: function() { 55 | // Ref: https://facebook.github.io/react/tips/initial-ajax.html 56 | this.serversRequest.abort(); 57 | }, 58 | render: function(){ 59 | var listServers = this.state.server_set.map(function(item) { 60 | return ; 64 | }); 65 | 66 | return ( 67 |
68 |
      <div>
        <h1>ServerSet</h1> 69 |
        <ul>{listServers}</ul> 70 |
      </div>
    71 | ); 72 | } 73 | }); 74 | 75 | 76 | var mapDispatchToProps = function(dispatch){ 77 | return { 78 | dispatch, 79 | set_server_qty: (qty) => { dispatch({type: 'SET_SERVER_QTY', qty: qty}); } 80 | } 81 | }; 82 | 83 | export default connect( 84 | (state) => { return {} }, 85 | mapDispatchToProps 86 | )(ServerSet) 87 | -------------------------------------------------------------------------------- /scrapy_eagle/dashboard/templates/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | Scrapy Eagle 8 | 9 | 10 | 11 | 12 | 15 | 16 | 17 | 18 | 19 |
    20 |
    21 | 22 | Scrapy-Eagle Home 23 | 24 |
    25 |
    26 | 27 |
    28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 68 | 69 | 70 | 71 | -------------------------------------------------------------------------------- /scrapy_eagle/worker/connection.py: -------------------------------------------------------------------------------- 1 | import redis 2 | import six 3 | 4 | from scrapy.utils.misc import load_object 5 | 6 | 7 | DEFAULT_REDIS_CLS = redis.StrictRedis 8 | 9 | 10 | # Sane connection defaults. 11 | DEFAULT_PARAMS = { 12 | 'socket_timeout': 30, 13 | 'socket_connect_timeout': 30, 14 | 'retry_on_timeout': True, 15 | } 16 | 17 | # Shortcut maps 'setting name' -> 'parmater name'. 18 | SETTINGS_PARAMS_MAP = { 19 | 'REDIS_URL': 'url', 20 | 'REDIS_HOST': 'host', 21 | 'REDIS_PORT': 'port', 22 | } 23 | 24 | 25 | def get_redis_from_settings(settings): 26 | """Returns a redis client instance from given Scrapy settings object. 27 | 28 | This function uses ``get_client`` to instantiate the client and uses 29 | ``DEFAULT_PARAMS`` global as defaults values for the parameters. You can 30 | override them using the ``REDIS_PARAMS`` setting. 31 | 32 | Parameters 33 | ---------- 34 | settings : Settings 35 | A scrapy settings object. See the supported settings below. 36 | 37 | Returns 38 | ------- 39 | server 40 | Redis client instance. 41 | 42 | Other Parameters 43 | ---------------- 44 | REDIS_URL : str, optional 45 | Server connection URL. 46 | REDIS_HOST : str, optional 47 | Server host. 48 | REDIS_PORT : str, optional 49 | Server port. 50 | REDIS_PARAMS : dict, optional 51 | Additional client parameters. 52 | 53 | """ 54 | params = DEFAULT_PARAMS.copy() 55 | params.update(settings.getdict('REDIS_PARAMS')) 56 | # XXX: Deprecate REDIS_* settings. 57 | for source, dest in SETTINGS_PARAMS_MAP.items(): 58 | val = settings.get(source) 59 | if val: 60 | params[dest] = val 61 | 62 | # Allow ``redis_cls`` to be a path to a class. 63 | if isinstance(params.get('redis_cls'), six.string_types): 64 | params['redis_cls'] = load_object(params['redis_cls']) 65 | 66 | return get_redis(**params) 67 | 68 | 69 | # Backwards compatible alias. 70 | from_settings = get_redis_from_settings 71 | 72 | 73 | def get_redis(**kwargs): 74 | """Returns a redis client instance. 75 | 76 | Parameters 77 | ---------- 78 | redis_cls : class, optional 79 | Defaults to ``redis.StrictRedis``. 80 | url : str, optional 81 | If given, ``redis_cls.from_url`` is used to instantiate the class. 82 | **kwargs 83 | Extra parameters to be passed to the ``redis_cls`` class. 84 | 85 | Returns 86 | ------- 87 | server 88 | Redis client instance. 
89 | 90 | """ 91 | redis_cls = kwargs.pop('redis_cls', DEFAULT_REDIS_CLS) 92 | url = kwargs.pop('url', None) 93 | if url: 94 | return redis_cls.from_url(url, **kwargs) 95 | else: 96 | return redis_cls(**kwargs) 97 | 98 | -------------------------------------------------------------------------------- /scrapy_eagle/dashboard/green_threads/stats.py: -------------------------------------------------------------------------------- 1 | from gevent import monkey 2 | monkey.patch_all() 3 | 4 | import gevent 5 | import gevent.pool 6 | 7 | from scrapy_eagle.dashboard import settings 8 | from scrapy_eagle.dashboard.utils.processkit import get_resources_info_from_pid, get_resources_info_from_server 9 | 10 | def send_redis_queue_info(socketio, redis_conn, spiders, queue_info_global): 11 | 12 | while True: 13 | 14 | queues = [] 15 | 16 | for spider in spiders: 17 | queues.append( 18 | { 19 | 'name': spider, 20 | 'size': int(redis_conn.llen('{spider}:requests'.format(spider=spider))) 21 | } 22 | ) 23 | 24 | # Don't asign directly to maintain the reference to the global object 25 | queue_info_global.clear() 26 | queue_info_global.extend(queues) 27 | 28 | socketio.emit('redis_queue_info', {'data': queues}, namespace="/queues", broadcast=True) 29 | 30 | gevent.sleep(1) 31 | 32 | def send_resources_info(socketio, subprocess_pids, public_ip): 33 | 34 | while True: 35 | 36 | dict_info_pid_greenlet = gevent.spawn(get_resources_info_from_pid) 37 | dict_info_host_greenlet = gevent.spawn(get_resources_info_from_server) 38 | 39 | subprocess_info_greenlets = [] 40 | 41 | for pid, spider, command, base_dir, created_at in subprocess_pids: 42 | 43 | # We pass all the parameters that we like to keep instead 44 | # of simply use a .update() here because the returned instance 45 | # is a Greenlet instead of a dict. 46 | 47 | info_greenlet = gevent.spawn( 48 | get_resources_info_from_pid, 49 | pid=pid, 50 | spider=spider, 51 | command=command, 52 | base_dir=base_dir, 53 | created_at=created_at, 54 | ) 55 | 56 | subprocess_info_greenlets.append(info_greenlet) 57 | 58 | dict_info_pid_greenlet.join() 59 | dict_info = dict_info_pid_greenlet.get() 60 | dict_info['public_ip'] = public_ip 61 | 62 | dict_info_host_greenlet.join() 63 | dict_info_host = dict_info_host_greenlet.get() 64 | dict_info.update(dict_info_host) 65 | 66 | gevent.joinall(subprocess_info_greenlets) 67 | dict_info['sub'] = [greenlet.get() for greenlet in subprocess_info_greenlets] 68 | 69 | # When get_resources_info try to access a PID that dont exists any more it 70 | # return None, here we remove those results. It happen because it takes 71 | # sometime to subprocess_pids remove PIDs that finishs. 72 | dict_info['sub'] = [x for x in dict_info['sub'] if x] 73 | 74 | _spiders = settings.get_spiders() 75 | _commands = settings.get_commands() 76 | 77 | dict_info['spiders'] = _spiders or [] 78 | dict_info['commands'] = _commands or [] 79 | 80 | print('\n\ndict_info: ', dict_info, '\n\n') 81 | 82 | socketio.emit('resources_info', {'data': dict_info}, namespace="/resources", broadcast=True) 83 | 84 | gevent.sleep(1) -------------------------------------------------------------------------------- /scrapy_eagle/dashboard/utils/processkit.py: -------------------------------------------------------------------------------- 1 | #! 
/usr/bin/env python3 2 | # -*- coding:utf-8 -*- 3 | 4 | import os 5 | import subprocess 6 | from datetime import datetime 7 | 8 | import psutil 9 | import gevent 10 | 11 | from scrapy_eagle.dashboard.green_threads import heartbeat 12 | 13 | 14 | def new_subprocess(base_dir, subprocess_pids, queue_info_global, command=None, spider=None, buffers={}): 15 | 16 | if not command: 17 | command = ['python', '-u', 'generator.py'] 18 | # command = ['galculator'] 19 | # command = ['/usr/bin/scrapy-py35', 'crawl', '{spider}'.format(spider)] 20 | 21 | with subprocess.Popen( 22 | command, 23 | cwd=base_dir, 24 | stdout=subprocess.PIPE, 25 | bufsize=1, 26 | universal_newlines=True 27 | ) as p: 28 | 29 | # Turn it JSON serializable 30 | created_at = datetime.utcnow().isoformat() 31 | 32 | identifier = (p.pid, spider, " ".join(command), base_dir, created_at) 33 | 34 | subprocess_pids.add(identifier) 35 | 36 | buffers[p.pid] = {'finished': False, 'lines': []} 37 | 38 | if spider: 39 | gevent.spawn( 40 | heartbeat.heartbeat_subprocess, 41 | p.pid, 42 | spider, 43 | max_seconds_idle=20, 44 | max_size_limit=15, 45 | queue_info_global=queue_info_global 46 | ) 47 | 48 | for line in p.stdout: 49 | 50 | # TODO: remove empty lines 51 | 52 | if len(line.strip()) > 0: 53 | 54 | buffers[p.pid]['lines'].append(line) 55 | 56 | # print(line, end='', flush=True) 57 | 58 | buffers[p.pid]['finished'] = True 59 | 60 | subprocess_pids.remove(identifier) 61 | 62 | 63 | def _get_info_from_pid(pid=None): 64 | 65 | if not pid: 66 | pid = os.getpid() 67 | 68 | process = psutil.Process(pid) 69 | 70 | mem = process.memory_info()[0] / float(2 ** 20) 71 | mem = float('{0:.2f}'.format(mem)) 72 | 73 | cpu = process.cpu_percent(interval=0.5) 74 | 75 | return pid, mem, cpu 76 | 77 | 78 | def get_resources_info_from_server(): 79 | 80 | cpus = psutil.cpu_percent(interval=0.5, percpu=True) 81 | 82 | # Mem results return in bytes 83 | vmem = psutil.virtual_memory() 84 | 85 | total = vmem.total 86 | total = (total / 1024.0) / 1024.0 87 | 88 | available = vmem.available 89 | available = (available / 1024.0) / 1024.0 90 | 91 | used = total - available 92 | 93 | return { 94 | 'cpus': cpus, 95 | 'memory_total_mb': float('{0:.2f}'.format(total)), 96 | 'memory_available_mb': float('{0:.2f}'.format(available)), 97 | 'memory_used_server_mb': float('{0:.2f}'.format(used)) 98 | } 99 | 100 | 101 | def get_resources_info_from_pid(pid=None, *args, **kwargs): 102 | 103 | try: 104 | 105 | pid, memory_used_mb, cpu_percent = _get_info_from_pid(pid=pid) 106 | 107 | result = { 108 | 'pid': pid, 109 | 'memory_used_mb': memory_used_mb, 110 | 'cpu_percent': cpu_percent, 111 | } 112 | 113 | result.update(kwargs) 114 | 115 | return result 116 | 117 | except psutil.NoSuchProcess: 118 | print('TODO: an error here') 119 | -------------------------------------------------------------------------------- /scrapy_eagle/dashboard/views/processes.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import signal 4 | 5 | import flask 6 | import gevent 7 | 8 | from scrapy_eagle.dashboard.utils import processkit 9 | from scrapy_eagle.dashboard import settings 10 | 11 | 12 | processes = flask.Blueprint('processes', __name__) 13 | 14 | 15 | @processes.route('/exec_command') 16 | def exec_command(): 17 | 18 | gevent.spawn( 19 | processkit.new_subprocess, 20 | base_dir='.', 21 | subprocess_pids=settings.subprocess_pids, 22 | queue_info_global=settings.queue_info_global, 23 | buffers=settings.buffers 24 | ) 25 | 26 | 
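    # new_subprocess was spawned on a greenlet above, so this endpoint returns immediately,
    # e.g. GET /processes/exec_command -> {"status": true}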
result = { 27 | 'status': True 28 | } 29 | 30 | return flask.Response( 31 | response=json.dumps(result, sort_keys=True), 32 | status=200, 33 | mimetype="application/json" 34 | ) 35 | 36 | 37 | @processes.route('/read_buffer/') 38 | def read_buffer(pid): 39 | 40 | if not settings.buffers.get(pid): 41 | return flask.Response( 42 | response=json.dumps( 43 | {'status': False, 'msg': 'PID Not Found'}, 44 | sort_keys=True 45 | ), 46 | status=200, 47 | mimetype="application/json" 48 | ) 49 | 50 | def generate(): 51 | 52 | sent = 0 53 | 54 | while not settings.buffers[pid]['finished']: 55 | 56 | for i, row in enumerate(settings.buffers[pid]['lines'][sent:]): 57 | 58 | sent += 1 59 | 60 | yield row+'
    ' 61 | 62 | gevent.sleep(0.5) 63 | 64 | return flask.Response( 65 | response=generate(), 66 | status=200, 67 | mimetype="text/html" 68 | ) 69 | 70 | 71 | @processes.route('/kill_subprocess/') 72 | def kill_subprocess(pid): 73 | 74 | safe = False 75 | 76 | for _pid, _, _, _, _ in settings.subprocess_pids: 77 | 78 | if pid == _pid: 79 | safe = True 80 | break 81 | 82 | if safe: 83 | os.kill(pid, signal.SIGHUP) 84 | 85 | result = { 86 | 'status': True, 87 | 'msg': 'SIGHUP signal sent to PID {0}'.format(pid) 88 | } 89 | 90 | else: 91 | result = { 92 | 'status': False, 93 | 'msg': 'PID Not Found' 94 | } 95 | 96 | return flask.Response( 97 | response=json.dumps(result, sort_keys=True), 98 | status=200, 99 | mimetype="application/json" 100 | ) 101 | 102 | 103 | @processes.route('/start_spider/') 104 | def start_spider(spider): 105 | 106 | _config = settings.get_config_file() 107 | 108 | command = [_config.get('scrapy', 'binary'), 'crawl', spider] 109 | 110 | # TODO: Verify if base_dir is set before use it 111 | 112 | gevent.spawn( 113 | processkit.new_subprocess, 114 | base_dir=_config.get('scrapy', 'base_dir'), 115 | command=command, 116 | spider=spider, 117 | subprocess_pids=settings.subprocess_pids, 118 | queue_info_global=settings.queue_info_global, 119 | buffers=settings.buffers 120 | ) 121 | 122 | result = { 123 | 'status': True 124 | } 125 | 126 | return flask.Response( 127 | response=json.dumps(result, sort_keys=True), 128 | status=200, 129 | mimetype="application/json" 130 | ) 131 | -------------------------------------------------------------------------------- /scrapy_eagle/dashboard/templates/static/js/vendor/jquery.navgoco.min.js: -------------------------------------------------------------------------------- 1 | /* 2 | * jQuery Navgoco Menus Plugin v0.2.1 (2014-04-11) 3 | * https://github.com/tefra/navgoco 4 | * 5 | * Copyright (c) 2014 Chris T (@tefra) 6 | * BSD - https://github.com/tefra/navgoco/blob/master/LICENSE-BSD 7 | */ 8 | !function(a){"use strict";var b=function(b,c,d){return this.el=b,this.$el=a(b),this.options=c,this.uuid=this.$el.attr("id")?this.$el.attr("id"):d,this.state={},this.init(),this};b.prototype={init:function(){var b=this;b._load(),b.$el.find("ul").each(function(c){var d=a(this);d.attr("data-index",c),b.options.save&&b.state.hasOwnProperty(c)?(d.parent().addClass(b.options.openClass),d.show()):d.parent().hasClass(b.options.openClass)?(d.show(),b.state[c]=1):d.hide()});var c=a("").prepend(b.options.caretHtml),d=b.$el.find("li > a");b._trigger(c,!1),b._trigger(d,!0),b.$el.find("li:has(ul) > a").prepend(c)},_trigger:function(b,c){var d=this;b.on("click",function(b){b.stopPropagation();var e=c?a(this).next():a(this).parent().next(),f=!1;if(c){var g=a(this).attr("href");f=void 0===g||""===g||"#"===g}if(e=e.length>0?e:!1,d.options.onClickBefore.call(this,b,e),!c||e&&f)b.preventDefault(),d._toggle(e,e.is(":hidden")),d._save();else if(d.options.accordion){var h=d.state=d._parents(a(this));d.$el.find("ul").filter(":visible").each(function(){var b=a(this),c=b.attr("data-index");h.hasOwnProperty(c)||d._toggle(b,!1)}),d._save()}d.options.onClickAfter.call(this,b,e)})},_toggle:function(b,c){var d=this,e=b.attr("data-index"),f=b.parent();if(d.options.onToggleBefore.call(this,b,c),c){if(f.addClass(d.options.openClass),b.slideDown(d.options.slide),d.state[e]=1,d.options.accordion){var g=d.state=d._parents(b);g[e]=d.state[e]=1,d.$el.find("ul").filter(":visible").each(function(){var b=a(this),c=b.attr("data-index");g.hasOwnProperty(c)||d._toggle(b,!1)})}}else 
f.removeClass(d.options.openClass),b.slideUp(d.options.slide),d.state[e]=0;d.options.onToggleAfter.call(this,b,c)},_parents:function(b,c){var d={},e=b.parent(),f=e.parents("ul");return f.each(function(){var b=a(this),e=b.attr("data-index");return e?void(d[e]=c?b:1):!1}),d},_save:function(){if(this.options.save){var b={};for(var d in this.state)1===this.state[d]&&(b[d]=1);c[this.uuid]=this.state=b,a.cookie(this.options.cookie.name,JSON.stringify(c),this.options.cookie)}},_load:function(){if(this.options.save){if(null===c){var b=a.cookie(this.options.cookie.name);c=b?JSON.parse(b):{}}this.state=c.hasOwnProperty(this.uuid)?c[this.uuid]:{}}},toggle:function(b){var c=this,d=arguments.length;if(1>=d)c.$el.find("ul").each(function(){var d=a(this);c._toggle(d,b)});else{var e,f={},g=Array.prototype.slice.call(arguments,1);d--;for(var h=0;d>h;h++){e=g[h];var i=c.$el.find('ul[data-index="'+e+'"]').first();if(i&&(f[e]=i,b)){var j=c._parents(i,!0);for(var k in j)f.hasOwnProperty(k)||(f[k]=j[k])}}for(e in f)c._toggle(f[e],b)}c._save()},destroy:function(){a.removeData(this.$el),this.$el.find("li:has(ul) > a").unbind("click"),this.$el.find("li:has(ul) > a > span").unbind("click")}},a.fn.navgoco=function(c){if("string"==typeof c&&"_"!==c.charAt(0)&&"init"!==c)var d=!0,e=Array.prototype.slice.call(arguments,1);else c=a.extend({},a.fn.navgoco.defaults,c||{}),a.cookie||(c.save=!1);return this.each(function(f){var g=a(this),h=g.data("navgoco");h||(h=new b(this,d?a.fn.navgoco.defaults:c,f),g.data("navgoco",h)),d&&h[c].apply(h,e)})};var c=null;a.fn.navgoco.defaults={caretHtml:"",accordion:!1,openClass:"open",save:!0,cookie:{name:"navgoco",expires:!1,path:"/"},slide:{duration:400,easing:"swing"},onClickBefore:a.noop,onClickAfter:a.noop,onToggleBefore:a.noop,onToggleAfter:a.noop}}(jQuery); -------------------------------------------------------------------------------- /scrapy_eagle/dashboard/react-src/components/jobs/JobsConfig.jsx: -------------------------------------------------------------------------------- 1 | import React from 'react' 2 | 3 | import { connect } from 'react-redux' 4 | //import PureRenderMixin from 'react-addons-pure-render-mixin' 5 | 6 | import JobsItem from './JobsItem.jsx' 7 | 8 | require('./JobsConfig.scss'); 9 | 10 | class JobsConfig extends React.Component { 11 | 12 | constructor(props){ 13 | super(props); 14 | //this.shouldComponentUpdate = PureRenderMixin.shouldComponentUpdate.bind(this); 15 | this.state = {}; 16 | } 17 | 18 | componentDidMount(){ 19 | this.updateSpiders(); 20 | } 21 | 22 | updateSpiders(){ 23 | 24 | } 25 | 26 | componentWillReceiveProps(nextProps) { 27 | // console.log('entro componentWillReceiveProps'); 28 | } 29 | 30 | shouldComponentUpdate(nextProps, nextState) { 31 | return true; 32 | //return nextProps.id !== this.props.id; 33 | } 34 | 35 | render() { 36 | const {jobs} = this.props; 37 | 38 | // console.log('render!'); 39 | 40 | var toggle_class = 'odd'; 41 | 42 | // https://github.com/facebook/immutable-js/issues/667#issuecomment-220223640 43 | var list_spiders = jobs.entrySeq().map(([key, value]) => { 44 | 45 | if (value.job_type == 'spider') { 46 | 47 | toggle_class = (toggle_class == 'odd') ? 'even' : 'odd'; 48 | 49 | return ; 55 | } 56 | 57 | }); 58 | 59 | var list_commands = jobs.entrySeq().map(([key, value]) => { 60 | 61 | if (value.job_type == 'command') { 62 | 63 | toggle_class = (toggle_class == 'odd') ? 'even' : 'odd'; 64 | 65 | return ; 71 | } 72 | 73 | }); 74 | 75 | return ( 76 |
      {/*
        Renders three panels:

        "Jobs Configuration": a table listing {list_spiders}
        "Commands Configuration": a table listing {list_commands}

        "Legends":
          Frequency: amount of time, in minutes, defining how often this action is triggered. Ex.: 60 means every hour.
          Max Concurrency: the maximum number of servers that may run this action at the same time.
          Min Concurrency: only dispatch this job when a minimum amount of resources is available.
          Priority: the highest number is chosen when the system has to decide between equal opportunities.
          Max Memory: processes are killed when they reach this threshold (in megabytes) and may be reallocated to another server or to the same one.
          Start URLs: a list of URLs to use as starting points, one per line.
          Last started at: the last time this job was triggered.
      */}
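      {/*
        For reference, each entry in {jobs} follows the shape served by /jobs/list
        (defaults from dashboard/views/jobs.py; the key name here is illustrative):

        "example.com": {
          "active": false, "job_type": "spider",
          "min_concurrency": 1, "max_concurrency": 5,
          "max_memory_mb": 200, "priority": 1, "frequency_minutes": 60,
          "start_urls": [], "last_started_at": "<ISO-8601>", "next_execution_at": "<ISO-8601>"
        }
      */}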
    104 | ); 105 | } 106 | 107 | } 108 | 109 | var mapDispatchToProps = function(dispatch){ 110 | return { 111 | dispatch 112 | } 113 | }; 114 | 115 | export default connect( 116 | (state) => { 117 | return { 118 | jobs: state.jobs 119 | } 120 | }, 121 | mapDispatchToProps 122 | )(JobsConfig) -------------------------------------------------------------------------------- /scrapy_eagle/dashboard/main.py: -------------------------------------------------------------------------------- 1 | from gevent import monkey 2 | monkey.patch_all() 3 | 4 | import os 5 | import sys 6 | import signal 7 | import threading 8 | 9 | import flask 10 | import gevent 11 | 12 | from flask_cors import CORS 13 | from flask_socketio import SocketIO 14 | 15 | try: 16 | import configparser 17 | except ImportError: 18 | import ConfigParser as configparser 19 | 20 | from scrapy_eagle.dashboard import settings 21 | from scrapy_eagle.dashboard import memory 22 | from scrapy_eagle.dashboard.green_threads import heartbeat, stats, find_new_spiders, find_new_commands, executor 23 | from scrapy_eagle.dashboard.utils import processkit 24 | 25 | 26 | app = flask.Flask(__name__, static_folder='templates/static') 27 | 28 | 29 | def main(): 30 | 31 | # Install the arguments and config file inside the config module 32 | _, _ = settings.setup() 33 | 34 | 35 | def shutdown(): 36 | 37 | # Send a signal to all opened subprocess, closing them. 38 | for pid, _, _, _, _ in settings.subprocess_pids: 39 | 40 | print('killing subprocess: {pid}'.format(pid=pid)) 41 | 42 | os.kill(pid, signal.SIGHUP) 43 | 44 | print('\nshutting down {0}...'.format(threading.currentThread().getName())) 45 | 46 | sys.exit(0) 47 | 48 | 49 | def start_periodics(socketio): 50 | 51 | redis_conn = memory.get_connection() 52 | public_ip = settings.get_public_ip() 53 | hostname = settings.get_hostname() 54 | 55 | for i in range(3): 56 | gevent.spawn( 57 | processkit.new_subprocess, 58 | base_dir='.', 59 | subprocess_pids=settings.subprocess_pids, 60 | queue_info_global=settings.queue_info_global, 61 | buffers=settings.buffers 62 | ) 63 | 64 | gevent.spawn(heartbeat.heartbeat_servers, redis_conn, public_ip, hostname) 65 | gevent.spawn(stats.send_resources_info, socketio, settings.subprocess_pids, public_ip) 66 | gevent.spawn(executor.evaluation_loop) 67 | gevent.spawn(find_new_spiders) 68 | gevent.spawn(find_new_commands) 69 | 70 | 71 | def entry_point(): 72 | 73 | # Graceful shutdown when kill are received 74 | signal.signal(signal.SIGTERM, lambda sig, frame: shutdown()) 75 | 76 | # Graceful shutdown when terminal session are closed 77 | signal.signal(signal.SIGHUP, lambda sig, frame: shutdown()) 78 | 79 | main() 80 | 81 | try: 82 | 83 | _config = settings.get_config_file() 84 | 85 | app.config['SECRET_KEY'] = _config.get('server', 'cookie_secret_key') 86 | app.config['DEBUG'] = _config.getboolean('server', 'debug', fallback=True) 87 | 88 | from scrapy_eagle.dashboard.views import servers, processes, root, jobs, react_app 89 | 90 | app.register_blueprint(root.root, url_prefix='/') 91 | app.register_blueprint(react_app.react_app, url_prefix='/app') 92 | app.register_blueprint(servers.servers, url_prefix='/servers') 93 | app.register_blueprint(processes.processes, url_prefix='/processes') 94 | app.register_blueprint(jobs.jobs, url_prefix='/jobs') 95 | 96 | CORS(app) 97 | 98 | socketio = SocketIO(app, async_mode='gevent') 99 | 100 | start_periodics(socketio) 101 | 102 | # use_reloader: avoid Flask execute twice 103 | socketio.run( 104 | app=app, 105 | 
host=_config.get('server', 'host', fallback='0.0.0.0'), 106 | port=_config.getint('server', 'port', fallback=5000), 107 | use_reloader=False 108 | ) 109 | 110 | except (KeyboardInterrupt, SystemExit): 111 | 112 | shutdown() 113 | 114 | 115 | if __name__ == "__main__": 116 | 117 | entry_point() 118 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | node_modules/ 2 | **/static/js/bundle.js.map 3 | 4 | # Created by https://www.gitignore.io/api/pycharm,python,sublimetext,komodoedit,vim,linux 5 | 6 | ### PyCharm ### 7 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and Webstorm 8 | # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 9 | 10 | # User-specific stuff: 11 | .idea 12 | .idea/workspace.xml 13 | .idea/tasks.xml 14 | .idea/dictionaries 15 | .idea/vcs.xml 16 | .idea/jsLibraryMappings.xml 17 | 18 | # Sensitive or high-churn files: 19 | .idea/dataSources.ids 20 | .idea/dataSources.xml 21 | .idea/dataSources.local.xml 22 | .idea/sqlDataSources.xml 23 | .idea/dynamic.xml 24 | .idea/uiDesigner.xml 25 | 26 | # Gradle: 27 | .idea/gradle.xml 28 | .idea/libraries 29 | 30 | # Mongo Explorer plugin: 31 | .idea/mongoSettings.xml 32 | 33 | ## File-based project format: 34 | *.iws 35 | 36 | ## Plugin-specific files: 37 | 38 | # IntelliJ 39 | /out/ 40 | 41 | # mpeltonen/sbt-idea plugin 42 | .idea_modules/ 43 | 44 | # JIRA plugin 45 | atlassian-ide-plugin.xml 46 | 47 | # Crashlytics plugin (for Android Studio and IntelliJ) 48 | com_crashlytics_export_strings.xml 49 | crashlytics.properties 50 | crashlytics-build.properties 51 | fabric.properties 52 | 53 | 54 | ### Python ### 55 | # Byte-compiled / optimized / DLL files 56 | __pycache__/ 57 | *.py[cod] 58 | *$py.class 59 | 60 | # C extensions 61 | *.so 62 | 63 | # Distribution / packaging 64 | .Python 65 | env/ 66 | build/ 67 | develop-eggs/ 68 | dist/ 69 | downloads/ 70 | eggs/ 71 | .eggs/ 72 | lib/ 73 | lib64/ 74 | parts/ 75 | sdist/ 76 | var/ 77 | *.egg-info/ 78 | .installed.cfg 79 | *.egg 80 | 81 | # PyInstaller 82 | # Usually these files are written by a python script from a template 83 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
84 | *.manifest 85 | *.spec 86 | 87 | # Installer logs 88 | pip-log.txt 89 | pip-delete-this-directory.txt 90 | 91 | # Unit test / coverage reports 92 | htmlcov/ 93 | .tox/ 94 | .coverage 95 | .coverage.* 96 | .cache 97 | nosetests.xml 98 | coverage.xml 99 | *,cover 100 | .hypothesis/ 101 | 102 | # Translations 103 | *.mo 104 | *.pot 105 | 106 | # Django stuff: 107 | *.log 108 | local_settings.py 109 | 110 | # Flask instance folder 111 | instance/ 112 | 113 | # Scrapy stuff: 114 | .scrapy 115 | 116 | # Sphinx documentation 117 | docs/_build/ 118 | 119 | # PyBuilder 120 | target/ 121 | 122 | # IPython Notebook 123 | .ipynb_checkpoints 124 | 125 | # pyenv 126 | .python-version 127 | 128 | # celery beat schedule file 129 | celerybeat-schedule 130 | 131 | # dotenv 132 | .env 133 | 134 | # virtualenv 135 | venv/ 136 | ENV/ 137 | 138 | # Spyder project settings 139 | .spyderproject 140 | 141 | # Rope project settings 142 | .ropeproject 143 | 144 | 145 | ### SublimeText ### 146 | # cache files for sublime text 147 | *.tmlanguage.cache 148 | *.tmPreferences.cache 149 | *.stTheme.cache 150 | 151 | # workspace files are user-specific 152 | *.sublime-workspace 153 | 154 | # project files should be checked into the repository, unless a significant 155 | # proportion of contributors will probably not be using SublimeText 156 | # *.sublime-project 157 | 158 | # sftp configuration file 159 | sftp-config.json 160 | 161 | 162 | ### KomodoEdit ### 163 | *.komodoproject 164 | .komodotools 165 | 166 | 167 | ### Vim ### 168 | # swap 169 | [._]*.s[a-w][a-z] 170 | [._]s[a-w][a-z] 171 | # session 172 | Session.vim 173 | # temporary 174 | .netrwhist 175 | *~ 176 | # auto-generated tag files 177 | tags 178 | 179 | 180 | ### Linux ### 181 | *~ 182 | 183 | # temporary files which can be created if a process still has a handle open of a deleted file 184 | .fuse_hidden* 185 | 186 | # KDE directory preferences 187 | .directory 188 | 189 | # Linux trash folder which might appear on any partition or disk 190 | .Trash-* 191 | 192 | 193 | -------------------------------------------------------------------------------- /scrapy_eagle/dashboard/react-src/components/servers/ServerNode.jsx: -------------------------------------------------------------------------------- 1 | var React = require('react'); 2 | var ServerSubProcess = require('./ServerSubProcess.jsx'); 3 | 4 | var ServerNode = React.createClass({ 5 | getInitialState: function() { 6 | return { 7 | pid: "", 8 | public_ip: "", 9 | cpu_percent: "", 10 | memory_available_mb: "", 11 | memory_total_mb: "", 12 | memory_used_mb: "", 13 | memory_used_server_mb: "", 14 | cpus: [], 15 | subprocesses: [], 16 | spiders: [] 17 | }; 18 | }, 19 | componentWillMount: function() { 20 | 21 | this.socket = io.connect(window.location.protocol + "//" + this.props.public_ip + ":" + location.port + "/resources"); 22 | this.socket.on('resources_info', function (msg) { 23 | 24 | var buff = "[ "; 25 | for(var i = 0; i < msg.data.cpus.length; i++){ 26 | if(i+1 == msg.data.cpus.length){ 27 | buff += msg.data.cpus[i] + " "; 28 | 29 | } else { 30 | buff += msg.data.cpus[i] + " / "; 31 | } 32 | } 33 | buff += "]"; 34 | 35 | this.setState({ 36 | pid: msg.data.pid, 37 | public_ip: msg.data.public_ip, 38 | cpu_percent: msg.data.cpu_percent, 39 | memory_available_mb: msg.data.memory_available_mb, 40 | memory_total_mb: msg.data.memory_total_mb, 41 | memory_used_mb: msg.data.memory_used_mb, 42 | memory_used_server_mb: msg.data.memory_used_server_mb, 43 | cpus: buff, 44 | subprocesses: msg.data.sub, 45 | 
spiders: msg.data.spiders 46 | }); 47 | 48 | // console.log(msg.data.cpus); 49 | 50 | }.bind(this)); 51 | 52 | }, 53 | componentWillUnmount: function(){ 54 | 55 | this.socket.disconnect(); 56 | 57 | }, 58 | onClickExecCommand: function(e){ 59 | 60 | $.get(window.location.protocol + "//" + this.state.public_ip + ":" + location.port + "/processes/exec_command", function(data) { 61 | 62 | }); 63 | 64 | }, 65 | onClickStartWorker: function(e){ 66 | 67 | $.get(window.location.protocol + "//" + this.state.public_ip + ":" + location.port + "/processes/start_spider/" + this.state.selected_spider, function(data) { 68 | 69 | }); 70 | 71 | }, 72 | onChangeDataProvider: function(e){ 73 | 74 | this.setState({'selected_spider': e.target.value}); 75 | 76 | }, 77 | render: function(){ 78 | 79 | var listSubProcesses = this.state.subprocesses.map(function (item, i) { 80 | return ; 90 | }.bind(this)); 91 | 92 | var listSpiders = this.state.spiders.map(function (item, i) { 93 | return ( 94 | 95 | ); 96 | }.bind(this)); 97 | 98 | return ( 99 |
      {/*
        Server node panel:
          IP: {this.props.public_ip} ({this.props.hostname})
          PID: {this.state.pid}
          CPU Server: {this.state.cpus}%
          Memory Used Server: {this.state.memory_used_server_mb}mb
          CPU Process: {this.state.cpu_percent}%
          Memory Used Process: {this.state.memory_used_mb}mb
          Memory Available: {this.state.memory_available_mb}mb
          Memory Total: {this.state.memory_total_mb}mb
        plus the controls wired to the handlers above (onChangeDataProvider with {listSpiders},
        onClickStartWorker, onClickExecCommand) and the subprocess list:
          {listSubProcesses}
      */}
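      {/*
        For reference, the 'resources_info' payload consumed in componentWillMount above has the shape
        (field names taken from that handler; values illustrative):

        { data: { pid, public_ip, cpu_percent, memory_available_mb, memory_total_mb,
                  memory_used_mb, memory_used_server_mb, cpus: [...], sub: [...], spiders: [...] } }
      */}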
  • 119 | ); 120 | 121 | } 122 | }); 123 | 124 | module.exports = ServerNode; 125 | -------------------------------------------------------------------------------- /scrapy_eagle/worker/dupefilter.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import time 3 | 4 | from scrapy.dupefilters import BaseDupeFilter 5 | from scrapy.utils.request import request_fingerprint 6 | 7 | from .connection import get_redis_from_settings 8 | 9 | 10 | DEFAULT_DUPEFILTER_KEY = "dupefilter:%(timestamp)s" 11 | 12 | logger = logging.getLogger(__name__) 13 | 14 | 15 | class RFPDupeFilter(BaseDupeFilter): 16 | """Redis-based request duplicates filter. 17 | 18 | This class can also be used with default Scrapy's scheduler. 19 | 20 | """ 21 | 22 | logger = logger 23 | 24 | def __init__(self, server, key, debug=False): 25 | """Initialize the duplicates filter. 26 | 27 | Parameters 28 | ---------- 29 | server : redis.StrictRedis 30 | The redis server instance. 31 | key : str 32 | Redis key Where to store fingerprints. 33 | debug : bool, optional 34 | Whether to log filtered requests. 35 | 36 | """ 37 | self.server = server 38 | self.key = key 39 | self.debug = debug 40 | self.logdupes = True 41 | 42 | @classmethod 43 | def from_settings(cls, settings): 44 | """Returns an instance from given settings. 45 | 46 | This uses by default the key ``dupefilter:``. When using the 47 | ``scrapy_redis.scheduler.Scheduler`` class, this method is not used as 48 | it needs to pass the spider name in the key. 49 | 50 | Parameters 51 | ---------- 52 | settings : scrapy.settings.Settings 53 | 54 | Returns 55 | ------- 56 | RFPDupeFilter 57 | A RFPDupeFilter instance. 58 | 59 | 60 | """ 61 | server = get_redis_from_settings(settings) 62 | # XXX: This creates one-time key. needed to support to use this 63 | # class as standalone dupefilter with scrapy's default scheduler 64 | # if scrapy passes spider on open() method this wouldn't be needed 65 | # TODO: Use SCRAPY_JOB env as default and fallback to timestamp. 66 | key = DEFAULT_DUPEFILTER_KEY % {'timestamp': int(time.time())} 67 | debug = settings.getbool('DUPEFILTER_DEBUG') 68 | return cls(server, key=key, debug=debug) 69 | 70 | @classmethod 71 | def from_crawler(cls, crawler): 72 | """Returns instance from crawler. 73 | 74 | Parameters 75 | ---------- 76 | crawler : scrapy.crawler.Crawler 77 | 78 | Returns 79 | ------- 80 | RFPDupeFilter 81 | Instance of RFPDupeFilter. 82 | 83 | """ 84 | return cls.from_settings(crawler.settings) 85 | 86 | def request_seen(self, request): 87 | """Returns True if request was already seen. 88 | 89 | Parameters 90 | ---------- 91 | request : scrapy.http.Request 92 | 93 | Returns 94 | ------- 95 | bool 96 | 97 | """ 98 | fp = self.request_fingerprint(request) 99 | # This returns the number of values added, zero if already exists. 100 | added = self.server.sadd(self.key, fp) 101 | return added == 0 102 | 103 | def request_fingerprint(self, request): 104 | """Returns a fingerprint for a given request. 105 | 106 | Parameters 107 | ---------- 108 | request : scrapy.http.Request 109 | 110 | Returns 111 | ------- 112 | str 113 | 114 | """ 115 | return request_fingerprint(request) 116 | 117 | def close(self, reason=''): 118 | """Delete data on close. Called by Scrapy's scheduler. 
119 | 120 | Parameters 121 | ---------- 122 | reason : str, optional 123 | 124 | """ 125 | self.clear() 126 | 127 | def clear(self): 128 | """Clears fingerprints data.""" 129 | self.server.delete(self.key) 130 | 131 | def log(self, request, spider): 132 | """Logs given request. 133 | 134 | Parameters 135 | ---------- 136 | request : scrapy.http.Request 137 | spider : scrapy.spiders.Spider 138 | 139 | """ 140 | if self.debug: 141 | msg = "Filtered duplicate request: %(request)s" 142 | self.logger.debug(msg, {'request': request}, extra={'spider': spider}) 143 | elif self.logdupes: 144 | msg = ("Filtered duplicate request %(request)s" 145 | " - no more duplicates will be shown" 146 | " (see DUPEFILTER_DEBUG to show all duplicates)") 147 | msg = "Filtered duplicate request: %(request)s" 148 | self.logger.debug(msg, {'request': request}, extra={'spider': spider}) 149 | self.logdupes = False 150 | 151 | -------------------------------------------------------------------------------- /scrapy_eagle/worker/queue.py: -------------------------------------------------------------------------------- 1 | from scrapy.utils.reqser import request_to_dict, request_from_dict 2 | 3 | from . import picklecompat 4 | 5 | 6 | class Base(object): 7 | """Per-spider queue/stack base class""" 8 | 9 | def __init__(self, server, spider, key, serializer=None): 10 | """Initialize per-spider redis queue. 11 | 12 | Parameters: 13 | server -- redis connection 14 | spider -- spider instance 15 | key -- key for this queue (e.g. "%(spider)s:queue") 16 | 17 | """ 18 | if serializer is None: 19 | # Backward compatibility. 20 | # TODO: deprecate pickle. 21 | serializer = picklecompat 22 | if not hasattr(serializer, 'loads'): 23 | raise TypeError("serializer does not implement 'loads' function: %r" 24 | % serializer) 25 | if not hasattr(serializer, 'dumps'): 26 | raise TypeError("serializer '%s' does not implement 'dumps' function: %r" 27 | % serializer) 28 | 29 | self.server = server 30 | self.spider = spider 31 | self.key = key % {'spider': spider.name} 32 | self.serializer = serializer 33 | 34 | def _encode_request(self, request): 35 | """Encode a request object""" 36 | obj = request_to_dict(request, self.spider) 37 | return self.serializer.dumps(obj) 38 | 39 | def _decode_request(self, encoded_request): 40 | """Decode an request previously encoded""" 41 | obj = self.serializer.loads(encoded_request) 42 | return request_from_dict(obj, self.spider) 43 | 44 | def __len__(self): 45 | """Return the length of the queue""" 46 | raise NotImplementedError 47 | 48 | def push(self, request): 49 | """Push a request""" 50 | raise NotImplementedError 51 | 52 | def pop(self, timeout=0): 53 | """Pop a request""" 54 | raise NotImplementedError 55 | 56 | def clear(self): 57 | """Clear queue/stack""" 58 | self.server.delete(self.key) 59 | 60 | 61 | class SpiderQueue(Base): 62 | """Per-spider FIFO queue""" 63 | 64 | def __len__(self): 65 | """Return the length of the queue""" 66 | return self.server.llen(self.key) 67 | 68 | def push(self, request): 69 | """Push a request""" 70 | self.server.lpush(self.key, self._encode_request(request)) 71 | 72 | def pop(self, timeout=0): 73 | """Pop a request""" 74 | if timeout > 0: 75 | data = self.server.brpop(self.key, timeout) 76 | if isinstance(data, tuple): 77 | data = data[1] 78 | else: 79 | data = self.server.rpop(self.key) 80 | if data: 81 | return self._decode_request(data) 82 | 83 | 84 | class SpiderPriorityQueue(Base): 85 | """Per-spider priority queue abstraction using redis' sorted set""" 86 | 87 | 
def __len__(self): 88 | """Return the length of the queue""" 89 | return self.server.zcard(self.key) 90 | 91 | def push(self, request): 92 | """Push a request""" 93 | data = self._encode_request(request) 94 | score = -request.priority 95 | # We don't use zadd method as the order of arguments change depending on 96 | # whether the class is Redis or StrictRedis, and the option of using 97 | # kwargs only accepts strings, not bytes. 98 | self.server.execute_command('ZADD', self.key, score, data) 99 | 100 | def pop(self, timeout=0): 101 | """ 102 | Pop a request 103 | timeout not support in this queue class 104 | """ 105 | # use atomic range/remove using multi/exec 106 | pipe = self.server.pipeline() 107 | pipe.multi() 108 | pipe.zrange(self.key, 0, 0).zremrangebyrank(self.key, 0, 0) 109 | results, count = pipe.execute() 110 | if results: 111 | return self._decode_request(results[0]) 112 | 113 | 114 | class SpiderStack(Base): 115 | """Per-spider stack""" 116 | 117 | def __len__(self): 118 | """Return the length of the stack""" 119 | return self.server.llen(self.key) 120 | 121 | def push(self, request): 122 | """Push a request""" 123 | self.server.lpush(self.key, self._encode_request(request)) 124 | 125 | def pop(self, timeout=0): 126 | """Pop a request""" 127 | if timeout > 0: 128 | data = self.server.blpop(self.key, timeout) 129 | if isinstance(data, tuple): 130 | data = data[1] 131 | else: 132 | data = self.server.lpop(self.key) 133 | 134 | if data: 135 | return self._decode_request(data) 136 | 137 | 138 | __all__ = ['SpiderQueue', 'SpiderPriorityQueue', 'SpiderStack'] 139 | 140 | -------------------------------------------------------------------------------- /scrapy_eagle/dashboard/templates/static/css/main.css: -------------------------------------------------------------------------------- 1 | div#server_set li#server-node { 2 | margin-bottom: 20px; 3 | } 4 | 5 | .active { color: red; } 6 | 7 | header { 8 | background-color: #222426; 9 | height: 60px; 10 | margin-right: 0; 11 | position: absolute; 12 | width: 100%; 13 | z-index: 200; 14 | } 15 | 16 | header div.brand { 17 | padding: 6px 0 0 0; 18 | } 19 | 20 | .flexbox { 21 | display: flex; 22 | overflow: hidden; 23 | flex-direction: row; 24 | min-height: 100vh; 25 | } 26 | 27 | div.subheader { 28 | position: absolute; 29 | margin-top: 60px; 30 | background-color: #2A2D2F; 31 | width: 100%; 32 | height: 46px; 33 | color: #00B280; 34 | z-index: 200; 35 | padding-top: 11px; 36 | } 37 | 38 | aside.sidebar { 39 | color: #001f3f; 40 | min-height: 100%; 41 | padding: 114px 0 0 10px; 42 | background-color: #DDFFDD; 43 | flex: 0 0 280px; 44 | } 45 | 46 | section.main-content-wrapper { 47 | padding: 114px 10px 10px 10px; 48 | /*border: 1px solid red;*/ 49 | min-height: 100%; 50 | flex: 1; 51 | } 52 | 53 | .sidebar-header { 54 | color: #6f737e; 55 | font-weight: 600; 56 | line-height: 20px; 57 | margin: 0; 58 | padding: 10px 10px 5px; 59 | text-transform: uppercase; 60 | } 61 | 62 | .sidebar .nav a { 63 | font-weight: 600; 64 | text-decoration: none; 65 | } 66 | .sidebar .nav i { 67 | font-size: 1em; 68 | margin-right: 5px; 69 | } 70 | .sidebar .nav .nav-sub { 71 | display: none; 72 | list-style: outside none none; 73 | padding: 0; 74 | } 75 | .sidebar .nav .nav-sub li > a { 76 | display: block; 77 | font-size: 0.813em; 78 | padding: 8px 0 8px 10px; 79 | } 80 | .sidebar .nav > li > .nav-sub > li > a { 81 | padding-left: 22px; 82 | } 83 | .sidebar .nav > li > .nav-sub > li > .nav-sub > li > a { 84 | padding-left: 55px; 85 | } 86 | .sidebar .nav > 
li > .nav-sub > li > .nav-sub > li > .nav-sub > li > a { 87 | padding-left: 65px; 88 | } 89 | .sidebar .nav > li > .nav-sub > li > .nav-sub > li > .nav-sub > li > .nav-sub > li > a { 90 | padding-left: 70px; 91 | } 92 | .sidebar .nav > li > .nav-sub > li > .nav-sub > li > .nav-sub > li > .nav-sub > li > .nav-sub > li > a { 93 | padding-left: 75px; 94 | } 95 | .sidebar-mini .sidebar .nav > li > .nav-sub > li > a { 96 | padding-left: 25px; 97 | } 98 | .sidebar-mini .sidebar .nav > li > .nav-sub > li > .nav-sub > li > a { 99 | padding-left: 35px; 100 | } 101 | .sidebar-mini .sidebar .nav > li > .nav-sub > li > .nav-sub > li > .nav-sub > li > a { 102 | padding-left: 45px; 103 | } 104 | .sidebar-mini .sidebar .nav > li > .nav-sub > li > .nav-sub > li > .nav-sub > li > .nav-sub > li > a { 105 | padding-left: 55px; 106 | } 107 | .sidebar .nav .nav-sub .nav-dropdown > a { 108 | padding-right: 30px; 109 | } 110 | .sidebar .nav .nav-sub > .open > a, .sidebar .nav .nav-sub > .open > a:focus, .sidebar .nav .nav-sub > .open > a:hover { 111 | background-color: transparent; 112 | border-color: transparent; 113 | } 114 | .sidebar .nav-pills { 115 | margin-left: 5px; 116 | margin-right: 12px; 117 | } 118 | .sidebar .nav-pills > li > a { 119 | font-size: 0.875em; 120 | padding: 9px 10px; 121 | } 122 | 123 | .sidebar-left .nav > li.open > a, 124 | .sidebar-left .nav > li > a:hover { 125 | background-color: #ffffff; 126 | color: #1d2939; 127 | } 128 | 129 | .sidebar-mini .sidebar-left .nav > li.nav-dropdown-open > a, 130 | .sidebar-mini .sidebar-left .nav > li:hover > a { 131 | background-color: #fff; 132 | color: #1d2939; 133 | } 134 | 135 | .nav-pills .nav-item.open .nav-link, 136 | .nav-pills .nav-item.open .nav-link:focus, 137 | .nav-pills .nav-item.open .nav-link:hover { 138 | background-color: #29d1ca; 139 | color: #fff; 140 | cursor: pointer; 141 | } 142 | 143 | .nav-pills .nav-link.active, 144 | .nav-pills .nav-link.active:focus, 145 | .nav-pills .nav-link.active:hover { 146 | background-color: #27b6af; 147 | color: #fff; 148 | cursor: pointer; 149 | } 150 | 151 | .sidebar-left a { 152 | color: #1f7e9a; 153 | } 154 | 155 | .sidebar-left a:focus, 156 | .sidebar-left a:hover { 157 | background-color: transparent; 158 | color: #001f3f; 159 | } 160 | 161 | .sidebar-left .active > a, 162 | .sidebar-left .active > a:focus, 163 | .sidebar-left .active > a:hover { 164 | /* Cor do item ativo dentro da categoria */ 165 | color: #1d2939; 166 | } 167 | 168 | .sidebar-mini .sidebar-left .nav > li.open > a { 169 | background-color: transparent; 170 | color: pink; 171 | } 172 | .sidebar-left .nav > li > a:focus { 173 | /* A cor que fica o texto depois de clicar na categoria (focus) */ 174 | background-color: #29d1ca; 175 | color: #fff; 176 | } 177 | 178 | .sidebar .nav-pills > li > a > .badge { 179 | margin: 3px 0; 180 | } 181 | 182 | .pull-right { 183 | float: right !important; 184 | } 185 | 186 | .nav-pills > li > a > .tag { 187 | margin-top: 2px; 188 | font-size: 80%; 189 | padding: 0.25em 0.4em 0.28em; 190 | } 191 | 192 | div.breadcrumbs span a, 193 | div.breadcrumbs { 194 | color: #d4d4d4; 195 | font-size: 14px; 196 | } 197 | 198 | div.breadcrumbs span:first-child a { 199 | color: #00B280; 200 | } -------------------------------------------------------------------------------- /scrapy_eagle/worker/spiders.py: -------------------------------------------------------------------------------- 1 | from scrapy import signals 2 | from scrapy.exceptions import DontCloseSpider 3 | from scrapy.spiders import Spider, 
CrawlSpider 4 | 5 | from . import connection 6 | 7 | 8 | # Default batch size matches default concurrent requests setting. 9 | DEFAULT_START_URLS_BATCH_SIZE = 16 10 | DEFAULT_START_URLS_KEY = '%(name)s:start_urls' 11 | 12 | 13 | class DistributedMixin(object): 14 | """Mixin class to implement reading urls from a redis queue.""" 15 | # Per spider redis key, default to DEFAULT_KEY. 16 | redis_key = None 17 | # Fetch this amount of start urls when idle. Default to DEFAULT_BATCH_SIZE. 18 | redis_batch_size = None 19 | # Redis client instance. 20 | server = None 21 | 22 | def start_requests(self): 23 | """Returns a batch of start requests from redis.""" 24 | return self.next_requests() 25 | 26 | def setup_redis(self, crawler=None): 27 | """Setup redis connection and idle signal. 28 | 29 | This should be called after the spider has set its crawler object. 30 | """ 31 | if self.server is not None: 32 | return 33 | 34 | if crawler is None: 35 | # We allow optional crawler argument to keep backwards 36 | # compatibility. 37 | # XXX: Raise a deprecation warning. 38 | crawler = getattr(self, 'crawler', None) 39 | 40 | if crawler is None: 41 | raise ValueError("crawler is required") 42 | 43 | settings = crawler.settings 44 | 45 | if self.redis_key is None: 46 | self.redis_key = settings.get( 47 | 'REDIS_START_URLS_KEY', DEFAULT_START_URLS_KEY, 48 | ) 49 | 50 | self.redis_key = self.redis_key % {'name': self.name} 51 | 52 | if not self.redis_key.strip(): 53 | raise ValueError("redis_key must not be empty") 54 | 55 | if self.redis_batch_size is None: 56 | self.redis_batch_size = settings.getint( 57 | 'REDIS_START_URLS_BATCH_SIZE', DEFAULT_START_URLS_BATCH_SIZE, 58 | ) 59 | 60 | try: 61 | self.redis_batch_size = int(self.redis_batch_size) 62 | except (TypeError, ValueError): 63 | raise ValueError("redis_batch_size must be an integer") 64 | 65 | self.logger.info("Reading start URLs from redis key '%(redis_key)s' " 66 | "(batch size: %(redis_batch_size)s)", self.__dict__) 67 | 68 | self.server = connection.from_settings(crawler.settings) 69 | # The idle signal is called when the spider has no requests left, 70 | # that's when we will schedule new requests from redis queue 71 | crawler.signals.connect(self.spider_idle, signal=signals.spider_idle) 72 | 73 | def next_requests(self): 74 | """Returns a request to be scheduled or none.""" 75 | use_set = self.settings.getbool('REDIS_START_URLS_AS_SET') 76 | fetch_one = self.server.spop if use_set else self.server.lpop 77 | # XXX: Do we need to use a timeout here? 78 | found = 0 79 | while found < self.redis_batch_size: 80 | data = fetch_one(self.redis_key) 81 | if data: 82 | data = data.decode('utf-8') 83 | else: 84 | # Queue empty. 85 | break 86 | req = self.make_request_from_data(data) 87 | if req: 88 | yield req 89 | found += 1 90 | else: 91 | self.logger.debug("Request not made from data: %r", data) 92 | 93 | if found: 94 | self.logger.debug("Read %s requests from '%s'", found, self.redis_key) 95 | 96 | def make_request_from_data(self, data): 97 | # By default, data is an URL. 98 | if '://' in data: 99 | return self.make_requests_from_url(data) 100 | else: 101 | self.logger.error("Unexpected URL from '%s': %r", self.redis_key, data) 102 | 103 | def schedule_next_requests(self): 104 | """Schedules a request if available""" 105 | for req in self.next_requests(): 106 | self.crawler.engine.crawl(req, spider=self) 107 | 108 | def spider_idle(self): 109 | """Schedules a request if available, otherwise waits.""" 110 | # XXX: Handle a sentinel to close the spider. 
111 | self.schedule_next_requests() 112 | raise DontCloseSpider 113 | 114 | 115 | class DistributedSpider(DistributedMixin, Spider): 116 | """Spider that reads urls from redis queue when idle.""" 117 | 118 | @classmethod 119 | def from_crawler(self, crawler, *args, **kwargs): 120 | obj = super(DistributedSpider, self).from_crawler(crawler, *args, **kwargs) 121 | obj.setup_redis(crawler) 122 | return obj 123 | 124 | 125 | class DistributedCrawlSpider(DistributedMixin, CrawlSpider): 126 | """Spider that reads urls from redis queue when idle.""" 127 | 128 | @classmethod 129 | def from_crawler(self, crawler, *args, **kwargs): 130 | obj = super(DistributedCrawlSpider, self).from_crawler(crawler, *args, **kwargs) 131 | obj.setup_redis(crawler) 132 | return obj 133 | 134 | -------------------------------------------------------------------------------- /scrapy_eagle/dashboard/react-src/components/App.jsx: -------------------------------------------------------------------------------- 1 | import React from 'react' 2 | import { Link, IndexLink } from 'react-router' 3 | import { connect } from 'react-redux' 4 | import Breadcrumbs from 'react-breadcrumbs' 5 | 6 | require('./App.scss'); 7 | 8 | class App extends React.Component { 9 | constructor(props){ 10 | super(props); 11 | } 12 | 13 | componentWillMount(){ 14 | this.intervals = []; 15 | } 16 | 17 | setInterval() { 18 | this.intervals.push(setInterval.apply(null, arguments)); 19 | } 20 | 21 | componentWillUnmount(){ 22 | this.intervals.forEach(clearInterval); 23 | 24 | // Ref: https://facebook.github.io/react/tips/initial-ajax.html 25 | this.clientsRequest.abort(); 26 | } 27 | 28 | ajax_get_jobs_info(){ 29 | 30 | var that = this; 31 | 32 | this.clientsRequest = $.ajax({ 33 | url: window.location.protocol + "//" + document.domain + ":" + location.port + "/jobs/list", 34 | type: 'GET', 35 | dataType: 'json', 36 | cache: false 37 | }).done((data) => { 38 | 39 | $.each(data, (key, value) => { 40 | // console.log(key, value); 41 | 42 | that.props.dispatch( 43 | { 44 | type: 'UPDATE_SPIDER_INFO', 45 | spider_id: key, 46 | frequency_minutes: value.frequency_minutes, 47 | last_started_at: value.last_started_at, 48 | max_concurrency: value.max_concurrency, 49 | min_concurrency: value.min_concurrency, 50 | max_memory_mb: value.max_memory_mb, 51 | priority: value.priority, 52 | job_type: value.job_type, 53 | active: value.active, 54 | start_urls: value.start_urls 55 | } 56 | ); 57 | 58 | }) 59 | 60 | }).always(() => { 61 | // that.setState({'server_set': server_set_new}); 62 | }); 63 | 64 | } 65 | 66 | componentDidMount(){ 67 | this.ajax_get_jobs_info(); 68 | this.setInterval(this.ajax_get_jobs_info.bind(this), 5000); 69 | } 70 | 71 | render(){ 72 | const { servers_qty } = this.props; 73 | return ( 74 |
      {/*
        Application shell: header/brand area, <Breadcrumbs/> (imported above), a commented-out
        legacy navigation block ("Distributed Scrapy", a {servers_qty} counter whose onClick calls
        this.props.SET_SERVER_QTY(7), and links to /, /servers/monitoring and /spiders/config),
        then the routed content via {this.props.children} and the remaining layout markup
        (sidebar navigation, footer).
      */}
    153 | ); 154 | } 155 | } 156 | 157 | var mapDispatchToProps = function(dispatch){ 158 | return { 159 | dispatch 160 | } 161 | }; 162 | 163 | export default connect( 164 | (state) => { 165 | return { 166 | servers_qty: state.servers.servers_qty 167 | } 168 | }, 169 | mapDispatchToProps 170 | )(App) -------------------------------------------------------------------------------- /scrapy_eagle/worker/scheduler.py: -------------------------------------------------------------------------------- 1 | import importlib 2 | import six 3 | 4 | from scrapy.utils.misc import load_object 5 | 6 | from . import connection 7 | 8 | 9 | class DistributedScheduler(object): 10 | """Redis-based scheduler""" 11 | 12 | def __init__(self, server, 13 | persist=False, 14 | flush_on_start=False, 15 | queue_key='%(spider)s:requests', 16 | queue_cls='scrapy_eagle.worker.queue.SpiderPriorityQueue', 17 | dupefilter_key='%(spider)s:dupefilter', 18 | dupefilter_cls='scrapy_eagle.worker.dupefilter.RFPDupeFilter', 19 | idle_before_close=0, 20 | serializer=None): 21 | """Initialize scheduler. 22 | 23 | Parameters 24 | ---------- 25 | server : Redis 26 | The redis server instance. 27 | persist : bool 28 | Whether to flush requests when closing. Default is False. 29 | flush_on_start : bool 30 | Whether to flush requests on start. Default is False. 31 | queue_key : str 32 | Requests queue key. 33 | queue_cls : str 34 | Importable path to the queue class. 35 | dupefilter_key : str 36 | Duplicates filter key. 37 | dupefilter_cls : str 38 | Importable path to the dupefilter class. 39 | idle_before_close : int 40 | Timeout before giving up. 41 | 42 | """ 43 | if idle_before_close < 0: 44 | raise TypeError("idle_before_close cannot be negative") 45 | 46 | self.server = server 47 | self.persist = persist 48 | self.flush_on_start = flush_on_start 49 | self.queue_key = queue_key 50 | self.queue_cls = queue_cls 51 | self.dupefilter_cls = dupefilter_cls 52 | self.dupefilter_key = dupefilter_key 53 | self.idle_before_close = idle_before_close 54 | self.serializer = serializer 55 | self.stats = None 56 | 57 | def __len__(self): 58 | return len(self.queue) 59 | 60 | @classmethod 61 | def from_settings(cls, settings): 62 | kwargs = { 63 | 'persist': settings.getbool('SCHEDULER_PERSIST'), 64 | 'flush_on_start': settings.getbool('SCHEDULER_FLUSH_ON_START'), 65 | 'idle_before_close': settings.getint('SCHEDULER_IDLE_BEFORE_CLOSE'), 66 | } 67 | 68 | # If these values are missing, it means we want to use the defaults. 69 | optional = { 70 | # TODO: Use custom prefixes for this settings to note that are 71 | # specific to scrapy-redis. 72 | 'queue_key': 'SCHEDULER_QUEUE_KEY', 73 | 'queue_cls': 'SCHEDULER_QUEUE_CLASS', 74 | 'dupefilter_key': 'SCHEDULER_DUPEFILTER_KEY', 75 | # We use the default setting name to keep compatibility. 76 | 'dupefilter_cls': 'DUPEFILTER_CLASS', 77 | 'serializer': 'SCHEDULER_SERIALIZER', 78 | } 79 | for name, setting_name in optional.items(): 80 | val = settings.get(setting_name) 81 | if val: 82 | kwargs[name] = val 83 | 84 | # Support serializer as a path to a module. 85 | if isinstance(kwargs.get('serializer'), six.string_types): 86 | kwargs['serializer'] = importlib.import_module(kwargs['serializer']) 87 | 88 | server = connection.from_settings(settings) 89 | # Ensure the connection is working. 
90 | server.ping() 91 | 92 | return cls(server=server, **kwargs) 93 | 94 | @classmethod 95 | def from_crawler(cls, crawler): 96 | instance = cls.from_settings(crawler.settings) 97 | # FIXME: for now, stats are only supported from this constructor 98 | instance.stats = crawler.stats 99 | return instance 100 | 101 | def open(self, spider): 102 | self.spider = spider 103 | 104 | try: 105 | self.queue = load_object(self.queue_cls)( 106 | server=self.server, 107 | spider=spider, 108 | key=self.queue_key % {'spider': spider.name}, 109 | serializer=self.serializer, 110 | ) 111 | except TypeError as e: 112 | raise ValueError("Failed to instantiate queue class '%s': %s", 113 | self.queue_cls, e) 114 | 115 | try: 116 | self.df = load_object(self.dupefilter_cls)( 117 | server=self.server, 118 | key=self.dupefilter_key % {'spider': spider.name}, 119 | debug=spider.settings.getbool('DUPEFILTER_DEBUG'), 120 | ) 121 | except TypeError as e: 122 | raise ValueError("Failed to instantiate dupefilter class '%s': %s", 123 | self.dupefilter_cls, e) 124 | 125 | if self.flush_on_start: 126 | self.flush() 127 | # notice if there are requests already in the queue to resume the crawl 128 | if len(self.queue): 129 | spider.log("Resuming crawl (%d requests scheduled)" % len(self.queue)) 130 | 131 | def close(self, reason): 132 | if not self.persist: 133 | self.flush() 134 | 135 | def flush(self): 136 | self.df.clear() 137 | self.queue.clear() 138 | 139 | def enqueue_request(self, request): 140 | if not request.dont_filter and self.df.request_seen(request): 141 | self.df.log(request, self.spider) 142 | return False 143 | if self.stats: 144 | self.stats.inc_value('scheduler/enqueued/redis', spider=self.spider) 145 | self.queue.push(request) 146 | return True 147 | 148 | def next_request(self): 149 | block_pop_timeout = self.idle_before_close 150 | request = self.queue.pop(block_pop_timeout) 151 | if request and self.stats: 152 | self.stats.inc_value('scheduler/dequeued/redis', spider=self.spider) 153 | return request 154 | 155 | def has_pending_requests(self): 156 | return len(self) > 0 157 | 158 | -------------------------------------------------------------------------------- /scrapy_eagle/dashboard/views/jobs.py: -------------------------------------------------------------------------------- 1 | import json 2 | from collections import OrderedDict 3 | from datetime import datetime, timedelta 4 | 5 | import flask 6 | 7 | from scrapy_eagle.dashboard import settings 8 | from scrapy_eagle.dashboard.memory import get_job_object, update_job_object 9 | 10 | 11 | jobs = flask.Blueprint('jobs', __name__) 12 | 13 | 14 | @jobs.route('/update', methods=['POST']) 15 | def update(): 16 | 17 | #TODO: Ensure that the incoming request comes from the same IP (Security) 18 | 19 | result = {} 20 | error = False 21 | 22 | key, job_type, active, frequency_minutes, max_concurrency = (None, None, None, None, None) 23 | min_concurrency, priority, max_memory_mb, start_urls = (None, None, None, None) 24 | 25 | try: 26 | 27 | key = flask.request.form.get('key', None) 28 | job_type = flask.request.form.get('job_type', None) 29 | frequency_minutes = int(flask.request.form.get('frequency_minutes', None)) 30 | max_concurrency = int(flask.request.form.get('max_concurrency', None)) 31 | min_concurrency = int(flask.request.form.get('min_concurrency', None)) 32 | priority = int(flask.request.form.get('priority', None)) 33 | max_memory_mb = int(flask.request.form.get('max_memory_mb', None)) 34 | start_urls = flask.request.form.get('start_urls', None) 
35 | 36 | if flask.request.form.get('active', None) == 'false': 37 | active = False 38 | elif flask.request.form.get('active', None) == 'true': 39 | active = True 40 | else: 41 | active = False 42 | 43 | # Never trust in the user input type 44 | except ValueError: 45 | error = True 46 | result.update({ 47 | 'status': 'error', 48 | 'msg': 'You sent wrong datatypes, like a letter when it should be numeric.' 49 | }) 50 | 51 | if not error: 52 | 53 | if not all([key, job_type, frequency_minutes, max_concurrency, min_concurrency, priority, max_memory_mb]): 54 | error = True 55 | result.update({ 56 | 'status': 'error', 57 | 'msg': 'You are missing some information, please check your form.' 58 | }) 59 | 60 | elif not start_urls and job_type == 'spider': 61 | error = True 62 | result.update({ 63 | 'status': 'error', 64 | 'msg': 'You should provide the Start URLs information for spiders.' 65 | }) 66 | 67 | else: 68 | 69 | actual_obj = get_job_object(key=key) 70 | 71 | # A brand new 72 | if not actual_obj: 73 | actual_obj = {} 74 | else: 75 | current_frequency = actual_obj['frequency_minutes'] 76 | 77 | actual_obj.update({ 78 | 'active': active, 79 | 'job_type': job_type, 80 | 'frequency_minutes': frequency_minutes, 81 | 'max_concurrency': max_concurrency, 82 | 'min_concurrency': min_concurrency, 83 | 'priority': priority, 84 | 'max_memory_mb': max_memory_mb 85 | }) 86 | 87 | # If the frequency change, recalculate the next execution 88 | if current_frequency != frequency_minutes: 89 | actual_obj['next_execution_at'] = (datetime.utcnow() + timedelta(minutes=frequency_minutes)).isoformat() 90 | 91 | if job_type == 'spider': 92 | actual_obj.update({'start_urls': [x for x in start_urls.split("\n") if x]}) 93 | 94 | update_job_object(key=key, fields=actual_obj) 95 | 96 | if not error: 97 | result.update({ 98 | 'status': 'ok' 99 | }) 100 | 101 | return flask.Response( 102 | response=json.dumps(result, sort_keys=True), 103 | status=200, 104 | mimetype="application/json" 105 | ) 106 | 107 | 108 | @jobs.route('/list', methods=['GET']) 109 | def listing(): 110 | 111 | _spiders = settings.get_spiders() 112 | _commands = settings.get_commands() 113 | 114 | # When the system is starting up, spiders may return empty because 115 | # we're using async execution `green_threads.find_new_spiders`. 
116 | if not _spiders: 117 | return flask.Response( 118 | response=json.dumps({}, sort_keys=True), 119 | status=200, 120 | mimetype="application/json" 121 | ) 122 | 123 | _spiders.sort() 124 | 125 | d = OrderedDict() 126 | 127 | for s in _spiders: 128 | 129 | obj = get_job_object(key=s) 130 | 131 | if obj: 132 | d[s] = obj 133 | else: 134 | # Jobs without previous information, using default config 135 | d[s] = {} 136 | d[s]['active'] = False 137 | d[s]['job_type'] = 'spider' 138 | d[s]['min_concurrency'] = 1 139 | d[s]['max_concurrency'] = 5 140 | d[s]['max_memory_mb'] = 200 141 | d[s]['priority'] = 1 142 | d[s]['frequency_minutes'] = 60 143 | d[s]['start_urls'] = [] 144 | d[s]['last_started_at'] = datetime.utcnow().isoformat() 145 | d[s]['next_execution_at'] = (datetime.utcnow() + timedelta(minutes=d[s]['frequency_minutes'])).isoformat() 146 | 147 | for file_name in _commands: 148 | 149 | obj = get_job_object(key=file_name) 150 | 151 | if obj: 152 | d[file_name] = obj 153 | 154 | else: 155 | d[file_name] = {} 156 | d[file_name]['active'] = False 157 | d[file_name]['job_type'] = 'command' 158 | d[file_name]['min_concurrency'] = 1 159 | d[file_name]['max_concurrency'] = 1 160 | d[file_name]['max_memory_mb'] = 50 161 | d[file_name]['priority'] = 1 162 | d[file_name]['frequency_minutes'] = 60 163 | d[file_name]['last_started_at'] = None 164 | d[file_name]['next_execution_at'] = None 165 | 166 | return flask.Response( 167 | response=json.dumps(d, sort_keys=True), 168 | status=200, 169 | mimetype="application/json" 170 | ) 171 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | .. image:: docs/images/logo_readme.jpg 2 | ====================================== 3 | 4 | .. image:: https://travis-ci.org/rafaelcapucho/scrapy-eagle.svg?branch=master 5 | :target: https://travis-ci.org/rafaelcapucho/scrapy-eagle 6 | 7 | .. image:: https://img.shields.io/pypi/v/scrapy-eagle.svg 8 | :target: https://pypi.python.org/pypi/scrapy-eagle 9 | :alt: PyPI Version 10 | 11 | .. image:: https://img.shields.io/pypi/pyversions/scrapy-eagle.svg 12 | :target: https://pypi.python.org/pypi/scrapy-eagle 13 | 14 | .. image:: https://landscape.io/github/rafaelcapucho/scrapy-eagle/master/landscape.svg?style=flat 15 | :target: https://landscape.io/github/rafaelcapucho/scrapy-eagle/master 16 | :alt: Code Quality Status 17 | 18 | .. image:: https://requires.io/github/rafaelcapucho/scrapy-eagle/requirements.svg?branch=master 19 | :target: https://requires.io/github/rafaelcapucho/scrapy-eagle/requirements/?branch=master 20 | :alt: Requirements Status 21 | 22 | Scrapy Eagle is a tool that allow us to run any Scrapy_ based project in a distributed fashion and monitor how it is going on and how many resources it is consuming on each server. 23 | 24 | .. _Scrapy: http://scrapy.org 25 | 26 | **This project is Under Development, don't use it yet** 27 | 28 | .. image:: https://badge.waffle.io/rafaelcapucho/scrapy-eagle.svg?label=ready&title=Ready 29 | :target: https://waffle.io/rafaelcapucho/scrapy-eagle 30 | :alt: 'Stories in Ready' 31 | 32 | Requeriments 33 | ------------ 34 | 35 | Scrapy Eagle uses Redis_ as Distributed Queue, so you will need a redis instance running. 36 | 37 | .. _Redis: http://mail.python.org/pipermail/doc-sig/ 38 | 39 | Installation 40 | ------------ 41 | 42 | It could be easily made by running the code bellow, 43 | 44 | .. 
code-block:: console 45 | 46 | $ virtualenv eagle_venv; cd eagle_venv; source bin/activate 47 | $ pip install scrapy-eagle 48 | 49 | You should create one ``configparser`` configuration file (e.g. in /etc/scrapy-eagle.ini) containing: 50 | 51 | .. code-block:: console 52 | 53 | [redis] 54 | host = 127.0.0.1 55 | port = 6379 56 | db = 0 57 | ;password = someverysecretpass 58 | 59 | [server] 60 | debug = True 61 | cookie_secret_key = ha74h3hdh42a 62 | host = 0.0.0.0 63 | port = 5000 64 | 65 | [scrapy] 66 | binary = /project_venv/bin/scrapy 67 | base_dir = /project_venv/project_scrapy/project 68 | 69 | [commands] 70 | binary = /project_venv/bin/python3 71 | base_dir = /project_venv/project_scrapy/project/commands 72 | 73 | Then you will be able to execute the `eagle_server` command like, 74 | 75 | .. code-block:: console 76 | 77 | eagle_server --config-file=/etc/scrapy-eagle.ini 78 | 79 | Changes into your Scrapy project 80 | -------------------------------- 81 | 82 | Enable the components in your `settings.py` of your Scrapy project: 83 | 84 | .. code-block:: python 85 | 86 | # Enables scheduling storing requests queue in redis. 87 | SCHEDULER = "scrapy_eagle.worker.scheduler.DistributedScheduler" 88 | 89 | # Ensure all spiders share same duplicates filter through redis. 90 | DUPEFILTER_CLASS = "scrapy_eagle.worker.dupefilter.RFPDupeFilter" 91 | 92 | # Schedule requests using a priority queue. (default) 93 | SCHEDULER_QUEUE_CLASS = "scrapy_eagle.worker.queue.SpiderPriorityQueue" 94 | 95 | # Schedule requests using a queue (FIFO). 96 | SCHEDULER_QUEUE_CLASS = "scrapy_eagle.worker.queue.SpiderQueue" 97 | 98 | # Schedule requests using a stack (LIFO). 99 | SCHEDULER_QUEUE_CLASS = "scrapy_eagle.worker.queue.SpiderStack" 100 | 101 | # Max idle time to prevent the spider from being closed when distributed crawling. 102 | # This only works if queue class is SpiderQueue or SpiderStack, 103 | # and may also block the same time when your spider start at the first time (because the queue is empty). 104 | SCHEDULER_IDLE_BEFORE_CLOSE = 0 105 | 106 | # Specify the host and port to use when connecting to Redis (optional). 107 | REDIS_HOST = 'localhost' 108 | REDIS_PORT = 6379 109 | 110 | # Specify the full Redis URL for connecting (optional). 111 | # If set, this takes precedence over the REDIS_HOST and REDIS_PORT settings. 112 | REDIS_URL = "redis://user:pass@hostname:6379" 113 | 114 | Once the configuration is finished, you should adapt each spider to use our Mixin: 115 | 116 | .. code-block:: python 117 | 118 | from scrapy.spiders import CrawlSpider, Rule 119 | from scrapy_eagle.worker.spiders import DistributedMixin 120 | 121 | class YourSpider(DistributedMixin, CrawlSpider): 122 | 123 | name = "domain.com" 124 | 125 | # start_urls = ['http://www.domain.com/'] 126 | redis_key = 'domain.com:start_urls' 127 | 128 | rules = ( 129 | Rule(...), 130 | Rule(...), 131 | ) 132 | 133 | def _set_crawler(self, crawler): 134 | CrawlSpider._set_crawler(self, crawler) 135 | DistributedMixin.setup_redis(self) 136 | 137 | Feeding a Spider from Redis 138 | --------------------------- 139 | 140 | The class `scrapy_eagle.worker.spiders.DistributedMixin` enables a spider to read the 141 | urls from redis. The urls in the redis queue will be processed one 142 | after another. 
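If you prefer to seed the queue from Python rather than the command line shown below, a minimal redis-py sketch (assuming the default ``<spider name>:start_urls`` key used by ``DistributedMixin``) would be:

.. code-block:: python

    import redis

    r = redis.StrictRedis(host='localhost', port=6379, db=0)

    # LPUSH matches the default list-based queue; if you enable
    # REDIS_START_URLS_AS_SET in your settings, use r.sadd(...) instead.
    r.lpush('domain.com:start_urls', 'http://www.domain.com/')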
143 | 144 | Then, push urls to redis:: 145 | 146 | redis-cli lpush domain.com:start_urls http://domain.com/ 147 | 148 | Dashboard Development 149 | --------------------- 150 | 151 | If you would like to change the client-side then you'll need to have NPM_ installed because we use ReactJS_ to build our interface. Installing all dependencies locally: 152 | 153 | .. _ReactJS: https://facebook.github.io/react/ 154 | .. _NPM: https://www.npmjs.com/ 155 | 156 | .. code-block:: console 157 | 158 | cd scrapy-eagle/dashboard 159 | npm install 160 | 161 | Then you can run ``npm start`` to compile and start monitoring any changes and recompiling automatically. 162 | 163 | To generate the production version, run ``npm run build``. 164 | 165 | To be easier to test the Dashboard you could use one simple http server instead of run the ``eagle_server``, like: 166 | 167 | .. code-block:: console 168 | 169 | sudo npm install -g http-server 170 | cd scrapy-eagle/dashboard 171 | http-server templates/ 172 | 173 | It would be available for you at http://127.0.0.1:8080 174 | 175 | **Note**: Until now the Scrapy Eagle is mostly based on https://github.com/rolando/scrapy-redis. 176 | -------------------------------------------------------------------------------- /scrapy_eagle/dashboard/react-src/components/jobs/JobsItem.jsx: -------------------------------------------------------------------------------- 1 | import React from 'react' 2 | import { connect } from 'react-redux' 3 | 4 | import cx from 'classnames' 5 | import Switch from 'react-switchery' 6 | 7 | class BaseComponent extends React.Component { 8 | _bind(...methods) { 9 | methods.forEach( (method) => this[method] = this[method].bind(this) ); 10 | } 11 | } 12 | 13 | class JobsItem extends React.Component { 14 | 15 | constructor(props){ 16 | super(props); 17 | // this._bind('_handleClick', '_handleFoo'); 18 | this.handleSave = this.handleSave.bind(this); 19 | this.onBlurFrequency = this.onBlurFrequency.bind(this); 20 | this.onBlurMaxConcurrency = this.onBlurMaxConcurrency.bind(this); 21 | this.onBlurMinConcurrency = this.onBlurMinConcurrency.bind(this); 22 | this.onChangePriority = this.onChangePriority.bind(this); 23 | this.onBlurMaxMemory = this.onBlurMaxMemory.bind(this); 24 | this.onBlurStartURLs = this.onBlurStartURLs.bind(this); 25 | this.handleSave = this.handleSave.bind(this); 26 | this.state = { 27 | 'key': this.props.id, 28 | 'active': this.props.value.active, 29 | 'job_type': this.props.value.job_type, 30 | 'frequency_minutes': this.props.value.frequency_minutes, 31 | 'max_concurrency': this.props.value.max_concurrency, 32 | 'min_concurrency': this.props.value.min_concurrency, 33 | 'priority': this.props.value.priority, 34 | 'max_memory_mb': this.props.value.max_memory_mb, 35 | }; 36 | 37 | if(this.props.value.start_urls){ 38 | this.state['start_urls'] = this.format_start_urls(this.props.value.start_urls); 39 | } 40 | 41 | } 42 | 43 | format_start_urls(mylist){ 44 | let buff = ""; 45 | mylist.forEach(elem => { 46 | buff += elem + "\n"; 47 | }) 48 | return buff; 49 | } 50 | 51 | onBlurFrequency(e){ this.setState({'frequency_minutes': $.trim(e.target.value)}) } 52 | onBlurMaxConcurrency(e){ this.setState({'max_concurrency': $.trim(e.target.value)}) } 53 | onBlurMinConcurrency(e){ this.setState({'min_concurrency': $.trim(e.target.value)}) } 54 | onChangePriority(e){ this.setState({'priority': e.target.value}) } 55 | onBlurMaxMemory(e){ this.setState({'max_memory_mb': $.trim(e.target.value)}) } 56 | onBlurStartURLs(e){ this.setState({'start_urls': 
$.trim(e.target.value)}) } 57 | 58 | handleSave(){ 59 | 60 | $.ajax({ 61 | url: window.location.protocol + "//" + document.domain + ":" + location.port + "/jobs/update", 62 | type: 'POST', 63 | dataType: 'json', 64 | data: this.state, 65 | }).done((data) => { 66 | 67 | if(data.status == 'error'){ 68 | alert(data.msg); 69 | } else if(data.status == 'ok'){ 70 | 71 | } 72 | 73 | }).fail(() => { 74 | alert('The request failed, please try again.'); 75 | }).always(() => { 76 | // that.setState({}); 77 | }); 78 | 79 | } 80 | 81 | SwitchonChange(value) { 82 | console.log(value); 83 | } 84 | 85 | render(){ 86 | 87 | var show_start_urls = () => { 88 | 89 | if(this.state.job_type == 'spider') { 90 | return ( 91 |
          {/* a "Start URLs" textarea bound to onBlurStartURLs */}
        )
      }

    };

    return (
      {/*
        Job row markup: the job name {this.state.key}, an active <Switch> wired to SwitchonChange,
        inputs bound to onBlurFrequency, onBlurMaxConcurrency and onBlurMinConcurrency,
        a Priority <select> bound to onChangePriority,
        a Max Memory input bound to onBlurMaxMemory
        ("The processes are killed when reach this threshold (megabytes)."),
        {show_start_urls()}, the last-started info ("16 minutes ago"),
        and a Save control wired to handleSave.
      */}
    184 | ); 185 | } 186 | 187 | } 188 | 189 | var mapDispatchToProps = function(dispatch){ 190 | return { 191 | dispatch 192 | } 193 | }; 194 | 195 | export default connect( 196 | (state) => { 197 | return { 198 | //jobs: state.jobs 199 | } 200 | }, 201 | mapDispatchToProps 202 | )(JobsItem) --------------------------------------------------------------------------------