├── scrapy_eagle ├── __init__.py ├── worker │ ├── __init__.py │ ├── picklecompat.py │ ├── connection.py │ ├── dupefilter.py │ ├── queue.py │ ├── spiders.py │ └── scheduler.py └── dashboard │ ├── __init__.py │ ├── views │ ├── __init__.py │ ├── root.py │ ├── react_app.py │ ├── servers.py │ ├── processes.py │ └── jobs.py │ ├── .babelrc │ ├── templates │ ├── static │ │ ├── css │ │ │ ├── bundle.css.map │ │ │ ├── bundle.css │ │ │ └── main.css │ │ ├── img │ │ │ └── system-logo.jpg │ │ └── js │ │ │ └── vendor │ │ │ └── jquery.navgoco.min.js │ └── index.html │ ├── react-src │ ├── components │ │ ├── Home.jsx │ │ ├── jobs │ │ │ ├── Root.jsx │ │ │ ├── JobsConfig.scss │ │ │ ├── JobsConfig.jsx │ │ │ └── JobsItem.jsx │ │ ├── servers │ │ │ ├── Root.jsx │ │ │ ├── ServerSubProcess.jsx │ │ │ ├── ServerSet.jsx │ │ │ └── ServerNode.jsx │ │ ├── ListItem.jsx │ │ ├── App.scss │ │ ├── List.jsx │ │ └── App.jsx │ ├── services │ │ └── httpservice.js │ ├── reducers │ │ ├── servers.jsx │ │ └── jobs.jsx │ └── main.jsx │ ├── utils │ ├── __init__.py │ ├── spiderskit.py │ ├── commandskit.py │ ├── ip.py │ └── processkit.py │ ├── green_threads │ ├── __init__.py │ ├── heartbeat.py │ ├── executor.py │ └── stats.py │ ├── webpack.config.dev.js │ ├── webpack.config.prod.js │ ├── package.json │ ├── settings.py │ ├── memory.py │ └── main.py ├── docs └── images │ └── logo_readme.jpg ├── .travis.yml ├── requirements.txt ├── MANIFEST.in ├── pytest.ini ├── tox.ini ├── generator.py ├── setup.py ├── tests └── test_queue.py ├── .gitignore └── README.rst /scrapy_eagle/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /scrapy_eagle/worker/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /scrapy_eagle/dashboard/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /scrapy_eagle/dashboard/views/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /scrapy_eagle/dashboard/.babelrc: -------------------------------------------------------------------------------- 1 | { 2 | "presets": ["react", "es2015"] 3 | } 4 | -------------------------------------------------------------------------------- /docs/images/logo_readme.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rafaelcapucho/scrapy-eagle/HEAD/docs/images/logo_readme.jpg -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | python: 3 | - "3.5" 4 | install: "pip install -r requirements.txt" 5 | script: nosetests 6 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | flask 2 | pymongo 3 | requests 4 | redis 5 | scrapy>=1.1.0 6 | flask-socketio 7 | flask-cors 8 | gevent 9 | psutil -------------------------------------------------------------------------------- /scrapy_eagle/dashboard/templates/static/css/bundle.css.map: 
-------------------------------------------------------------------------------- 1 | {"version":3,"sources":[],"names":[],"mappings":"","file":"../css/bundle.css","sourceRoot":""} -------------------------------------------------------------------------------- /scrapy_eagle/dashboard/templates/static/img/system-logo.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rafaelcapucho/scrapy-eagle/HEAD/scrapy_eagle/dashboard/templates/static/img/system-logo.jpg -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | graft docs 2 | 3 | include *.in 4 | include *.ini 5 | include *.rst 6 | include *.txt 7 | 8 | recursive-include scrapy_eagle/dashboard/templates * 9 | 10 | global-exclude __pycache__ *.py[cod] 11 | -------------------------------------------------------------------------------- /scrapy_eagle/dashboard/views/root.py: -------------------------------------------------------------------------------- 1 | import json 2 | import flask 3 | 4 | 5 | root = flask.Blueprint('root', __name__) 6 | 7 | 8 | @root.route('/') 9 | def index(): 10 | 11 | return flask.redirect('/app') 12 | -------------------------------------------------------------------------------- /pytest.ini: -------------------------------------------------------------------------------- 1 | [pytest] 2 | norecursedirs = 3 | .* 4 | dist 5 | build 6 | python_files = 7 | test_*.py 8 | *_test.py 9 | tests.py 10 | ignore = 11 | setup.py 12 | addopts = 13 | -rxEfsw -v 14 | -------------------------------------------------------------------------------- /scrapy_eagle/dashboard/react-src/components/Home.jsx: -------------------------------------------------------------------------------- 1 | var React = require('react'); 2 | 3 | var Home = React.createClass({ 4 | render: function() { 5 | return
<div>App Home</div>
6 | } 7 | }); 8 | 9 | module.exports = Home; 10 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | [tox] 2 | envlist = py{35}-scrapy{11} 3 | 4 | [testenv] 5 | basepython = 6 | py35: python3.5 7 | deps = 8 | -rrequirements.txt 9 | commands = 10 | scrapy11: pip install scrapy>=1.1,<1.2 11 | {posargs:py.test} 12 | -------------------------------------------------------------------------------- /scrapy_eagle/dashboard/react-src/components/jobs/Root.jsx: -------------------------------------------------------------------------------- 1 | import React from 'react' 2 | 3 | export default class SpiderRoot extends React.Component { 4 | constructor(props){ 5 | super(props); 6 | } 7 | 8 | render(){ 9 | return this.props.children; 10 | } 11 | } 12 | -------------------------------------------------------------------------------- /scrapy_eagle/dashboard/react-src/components/servers/Root.jsx: -------------------------------------------------------------------------------- 1 | import React from 'react' 2 | 3 | export default class ServerRoot extends React.Component { 4 | constructor(props){ 5 | super(props); 6 | } 7 | 8 | render(){ 9 | return this.props.children; 10 | } 11 | } 12 | -------------------------------------------------------------------------------- /scrapy_eagle/dashboard/views/react_app.py: -------------------------------------------------------------------------------- 1 | import flask 2 | 3 | 4 | react_app = flask.Blueprint('app', __name__) 5 | 6 | 7 | @react_app.route('/', defaults={'path': ''}) 8 | @react_app.route('/') 9 | def app(path): 10 | return flask.render_template('index.html') 11 | -------------------------------------------------------------------------------- /scrapy_eagle/worker/picklecompat.py: -------------------------------------------------------------------------------- 1 | """A pickle wrapper module with protocol=-1 by default.""" 2 | 3 | try: 4 | import cPickle as pickle # PY2 5 | except ImportError: 6 | import pickle 7 | 8 | 9 | def loads(s): 10 | return pickle.loads(s) 11 | 12 | 13 | def dumps(obj): 14 | return pickle.dumps(obj, protocol=-1) 15 | -------------------------------------------------------------------------------- /scrapy_eagle/dashboard/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | from calendar import timegm 3 | 4 | 5 | def iso_to_timestamp(iso): 6 | epoch = timegm(datetime.strptime(iso, "%Y-%m-%dT%H:%M:%S.%f").timetuple()) 7 | assert isinstance(epoch, int) 8 | return epoch 9 | 10 | 11 | def timestamp_to_utc(ts): 12 | return datetime.utcfromtimestamp(ts) 13 | -------------------------------------------------------------------------------- /scrapy_eagle/dashboard/react-src/services/httpservice.js: -------------------------------------------------------------------------------- 1 | var Fetch = require('whatwg-fetch'); 2 | var baseUrl = 'http://localhost:6060'; 3 | 4 | var service = { 5 | get: function(url) { 6 | return fetch(baseUrl + url) 7 | .then(function(response) { 8 | return response.json(); 9 | }); 10 | } 11 | }; 12 | 13 | module.exports = service; 14 | -------------------------------------------------------------------------------- /scrapy_eagle/dashboard/react-src/components/ListItem.jsx: -------------------------------------------------------------------------------- 1 | var React = require('react'); 2 | 3 | var ListItem = 
React.createClass({ 4 | 5 | render: function() { 6 | return ( 7 |
<li> 8 |
        {this.props.memory_used_mb} - {this.props.memory_available_mb} 9 |
      </li>
  • 10 | ); 11 | } 12 | 13 | }); 14 | 15 | module.exports = ListItem; 16 | -------------------------------------------------------------------------------- /generator.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | from time import sleep 4 | 5 | # When the dashboard receives a KeyboardInterrupt 6 | # the subprocess also receive a KeyboardInterrupt 7 | # you could catch or not. 8 | 9 | try: 10 | n = 1 11 | while True: 12 | 13 | print(n) 14 | 15 | n += 1 16 | 17 | #sys.stdout.flush() 18 | 19 | sleep(1) 20 | 21 | if n % 20 == 0: break 22 | 23 | print(' ') 24 | 25 | except (KeyboardInterrupt, SystemExit): 26 | print('fechou') -------------------------------------------------------------------------------- /scrapy_eagle/dashboard/react-src/components/jobs/JobsConfig.scss: -------------------------------------------------------------------------------- 1 | div.scheduler { 2 | 3 | h1 { 4 | //margin: 30px 0 35px 0; 5 | } 6 | 7 | label.col-form-label { 8 | font-size: 80%; 9 | } 10 | 11 | div.odd { 12 | background-color: #3b3e42; 13 | } 14 | 15 | div.even { 16 | background-color: #2a2d2f; 17 | } 18 | 19 | div.jobTitle { 20 | margin: 10px 0 16px 0; 21 | font-size: 85%; 22 | color: #00b280; 23 | font-weight: bold; 24 | } 25 | 26 | div.box-legends { 27 | margin-top: 35px; 28 | 29 | li { 30 | font-size: 12px; 31 | } 32 | 33 | } 34 | } -------------------------------------------------------------------------------- /scrapy_eagle/dashboard/react-src/reducers/servers.jsx: -------------------------------------------------------------------------------- 1 | const initialState = { 2 | servers_qty: 0, 3 | }; 4 | 5 | export const INCREASE_SERVER = 'INCREASE_SERVER'; 6 | export const SET_SERVER_QTY = 'SET_SERVER_QTY'; 7 | 8 | export default function stats(state = initialState, action) { 9 | 10 | switch (action.type) { 11 | 12 | case INCREASE_SERVER: 13 | 14 | return Object.assign({}, state, { 15 | servers_qty: state.servers_qty + 1 16 | }); 17 | 18 | case SET_SERVER_QTY: 19 | 20 | return Object.assign({}, state, { 21 | servers_qty: action.qty 22 | }); 23 | 24 | default: 25 | return state; 26 | } 27 | } -------------------------------------------------------------------------------- /scrapy_eagle/dashboard/utils/spiderskit.py: -------------------------------------------------------------------------------- 1 | import subprocess 2 | 3 | from scrapy_eagle.dashboard import settings 4 | 5 | 6 | def find_spiders(): 7 | 8 | _config = settings.get_config_file() 9 | 10 | base_dir = _config.get('scrapy', 'base_dir') 11 | binary = _config.get('scrapy', 'binary') 12 | 13 | spiders = [] 14 | 15 | with subprocess.Popen( 16 | [binary, 'list'], 17 | cwd=base_dir, 18 | stdout=subprocess.PIPE, 19 | bufsize=1, 20 | universal_newlines=True 21 | ) as p: 22 | for line in p.stdout: 23 | spiders.append(line.strip()) 24 | 25 | return spiders 26 | -------------------------------------------------------------------------------- /scrapy_eagle/dashboard/templates/static/css/bundle.css: -------------------------------------------------------------------------------- 1 | body{background-color:#323539;color:#f5f5f5;font-size:100%;margin:0;padding:0;position:relative;text-rendering:optimizelegibility}a:active,a:hover,a:link,a:visited{color:#fff;outline:medium none;text-decoration:none}h1,h2,h3,h4,h5,h6{color:#f5f5f5;font-family:Montserrat,sans-serif;margin:20px 0 
25px}h1{font-size:1.375em}h2{font-size:1.188em}h3{font-size:1.063em}h4{font-size:.938em}h5{font-size:.813em}h6{font-size:.75em}div.scheduler label.col-form-label{font-size:80%}div.scheduler div.odd{background-color:#3b3e42}div.scheduler div.even{background-color:#2a2d2f}div.scheduler div.jobTitle{margin:10px 0 16px;font-size:85%;color:#00b280;font-weight:700}div.scheduler div.box-legends{margin-top:35px}div.scheduler div.box-legends li{font-size:12px} -------------------------------------------------------------------------------- /scrapy_eagle/dashboard/green_threads/__init__.py: -------------------------------------------------------------------------------- 1 | import gevent 2 | 3 | from scrapy_eagle.dashboard import settings 4 | from scrapy_eagle.dashboard.utils import spiderskit, commandskit 5 | 6 | 7 | def find_new_spiders(): 8 | 9 | while True: 10 | 11 | # Open the process and execute Scrapy's list command 12 | _spiders = spiderskit.find_spiders() 13 | 14 | # Install the list of spiders names 15 | settings._spiders = _spiders 16 | 17 | gevent.sleep(10) 18 | 19 | 20 | def find_new_commands(): 21 | 22 | while True: 23 | 24 | # Monitoring the command folder 25 | _commands = commandskit.find_commands() 26 | 27 | # Install the list of commands names 28 | settings._commands = _commands 29 | 30 | gevent.sleep(5) -------------------------------------------------------------------------------- /scrapy_eagle/dashboard/react-src/components/App.scss: -------------------------------------------------------------------------------- 1 | body { 2 | /*font-size: 12px;*/ 3 | /*font-family: Arial, Verdana, sans-serif;*/ 4 | background-color: #323539; 5 | color: whitesmoke; 6 | font-size: 100%; 7 | margin: 0; 8 | padding: 0; 9 | position: relative; 10 | text-rendering: optimizelegibility; 11 | } 12 | 13 | a:link, a:visited { 14 | color: white; 15 | outline: medium none; 16 | text-decoration: none; 17 | } 18 | a:hover, a:active { 19 | color: white; 20 | outline: medium none; 21 | text-decoration: none; 22 | } 23 | 24 | 25 | h1, h2, h3, h4, h5, h6 { 26 | color: whitesmoke; 27 | font-family: "Montserrat", sans-serif; 28 | margin: 20px 0 25px 0; 29 | 30 | } 31 | 32 | h1 {font-size: 1.375em;} 33 | h2 {font-size: 1.188em;} 34 | h3 {font-size: 1.063em;} 35 | h4 {font-size: 0.938em;} 36 | h5 {font-size: 0.813em;} 37 | h6 {font-size: 0.75em;} -------------------------------------------------------------------------------- /scrapy_eagle/dashboard/utils/commandskit.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from scrapy_eagle.dashboard import settings 4 | 5 | 6 | def load_commands_name(dir): 7 | 8 | if os.path.exists(dir): 9 | 10 | module_names = [] 11 | 12 | for d in os.listdir(dir): 13 | if d.find("__init__") == -1 and d.endswith('.py'): 14 | 15 | # Remove possible spaces 16 | d = d.replace(" ", "") 17 | 18 | # Remove the Extension 19 | d = ".".join(d.split(".")[:-1]) 20 | 21 | module_names.append(d) 22 | 23 | module_names.sort() 24 | 25 | return module_names 26 | 27 | else: 28 | return [] 29 | 30 | 31 | def find_commands(): 32 | 33 | _config = settings.get_config_file() 34 | 35 | base_dir = _config.get('commands', 'base_dir') 36 | 37 | return load_commands_name(dir=base_dir) 38 | -------------------------------------------------------------------------------- /scrapy_eagle/dashboard/webpack.config.dev.js: -------------------------------------------------------------------------------- 1 | var webpack = require('webpack'); 2 | var path = 
require('path'); 3 | 4 | var ExtractTextPlugin = require('extract-text-webpack-plugin'); 5 | 6 | var BUILD_JS_DIR = path.resolve(__dirname, 'templates/static/js'); 7 | var APP_DIR = path.resolve(__dirname, 'react-src'); 8 | 9 | var config = { 10 | entry: APP_DIR + '/main.jsx', 11 | output: { 12 | path: BUILD_JS_DIR, 13 | filename: 'bundle.js' 14 | }, 15 | module : { 16 | loaders : [ 17 | { 18 | test : /\.jsx?/, 19 | include : APP_DIR, 20 | loader : 'babel' 21 | }, 22 | { 23 | test: /\.scss$/, 24 | //loaders: ['style', 'css', 'sass'] 25 | loader: ExtractTextPlugin.extract('css!sass') 26 | } 27 | ] 28 | }, 29 | plugins: [ 30 | new ExtractTextPlugin('../css/bundle.css', { 31 | allChunks: true 32 | }) 33 | ] 34 | }; 35 | 36 | module.exports = config; -------------------------------------------------------------------------------- /scrapy_eagle/dashboard/react-src/components/List.jsx: -------------------------------------------------------------------------------- 1 | var React = require('react'); 2 | var ListItem = require('./ListItem.jsx'); 3 | var HTTP = require('../services/httpservice'); 4 | 5 | var List = React.createClass({ 6 | getInitialState: function() { 7 | return {resources: []}; 8 | }, 9 | componentWillMount: function() { 10 | 11 | this.socket = io.connect('http://127.0.0.1:5000/resources'); 12 | this.socket.on('resources_info', function (msg) { 13 | this.setState({resources: msg.data.sub}); 14 | }.bind(this)); 15 | 16 | }, 17 | render: function() { 18 | /*var listItems = this.state.resources.map(function(item) { 19 | return ; 23 | }); 24 | 25 | return ();*/ 26 | } 27 | }); 28 | 29 | module.exports = List; 30 | -------------------------------------------------------------------------------- /scrapy_eagle/dashboard/views/servers.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | 3 | import json 4 | import flask 5 | 6 | from scrapy_eagle.dashboard.memory import get_connection 7 | 8 | 9 | servers = flask.Blueprint('servers', __name__) 10 | 11 | 12 | @servers.route('/list') 13 | def listing(): 14 | 15 | now = datetime.now() 16 | 17 | redis_conn = get_connection() 18 | 19 | _servers = redis_conn.zrangebyscore('eagle_servers', now.timestamp(), max='+inf') 20 | 21 | results = [] 22 | 23 | for entry in _servers: 24 | parts = entry.decode('utf-8').split("-") 25 | ip, hostname = parts[0], "-".join(parts[1:]) 26 | results.append({'public_ip': ip, 'hostname': hostname}) 27 | 28 | # Sets in Redis usually returns in random order, sort by hostname 29 | results = sorted(results, key=lambda x: x['hostname']) 30 | 31 | return flask.Response( 32 | response=json.dumps(results, sort_keys=True), 33 | status=200, 34 | mimetype="application/json" 35 | ) 36 | -------------------------------------------------------------------------------- /scrapy_eagle/dashboard/webpack.config.prod.js: -------------------------------------------------------------------------------- 1 | var webpack = require('webpack'); 2 | var path = require('path'); 3 | 4 | var ExtractTextPlugin = require('extract-text-webpack-plugin'); 5 | 6 | var BUILD_JS_DIR = path.resolve(__dirname, 'templates/static/js'); 7 | var APP_DIR = path.resolve(__dirname, 'react-src'); 8 | 9 | var config = { 10 | entry: APP_DIR + '/main.jsx', 11 | output: { 12 | path: BUILD_JS_DIR, 13 | filename: 'bundle.js' 14 | }, 15 | plugins: [ 16 | new webpack.optimize.OccurrenceOrderPlugin(), 17 | new webpack.DefinePlugin({ 18 | 'process.env': { 19 | 'NODE_ENV': JSON.stringify('production') 20 | } 
21 | }), 22 | new webpack.optimize.UglifyJsPlugin({ 23 | compressor: { 24 | warnings: false 25 | } 26 | }), 27 | new ExtractTextPlugin('../css/bundle.css', { 28 | allChunks: true 29 | }) 30 | ], 31 | module : { 32 | loaders : [ 33 | { 34 | test : /\.jsx?/, 35 | include : APP_DIR, 36 | loader : 'babel' 37 | }, 38 | { 39 | test: /\.scss$/, 40 | //loaders: ['style', 'css', 'sass'] 41 | loader: ExtractTextPlugin.extract('css!sass') 42 | } 43 | ] 44 | } 45 | }; 46 | 47 | module.exports = config; -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import os 4 | import io 5 | from setuptools import setup, find_packages 6 | 7 | 8 | LONG_DESC = open(os.path.join(os.path.dirname(__file__), 'README.rst')).read() 9 | 10 | 11 | def read_file(filename): 12 | with io.open(filename) as fp: 13 | return fp.read().strip() 14 | 15 | 16 | def read_requirements(filename): 17 | return [line.strip() for line in read_file(filename).splitlines() 18 | if not line.startswith('#')] 19 | 20 | 21 | setup(name='scrapy-eagle', 22 | version='0.0.37', 23 | description='Run Scrapy Distributed', 24 | long_description=LONG_DESC, 25 | author='Rafael Alfredo Capucho', 26 | author_email='rafael.capucho@gmail.com', 27 | url='http://github.com/rafaelcapucho/scrapy-eagle', 28 | packages=find_packages(), 29 | license='BSD', 30 | install_requires=read_requirements('requirements.txt'), 31 | include_package_data=True, 32 | entry_points={ 33 | 'console_scripts': ['eagle_server=scrapy_eagle.dashboard.main:entry_point'], 34 | }, 35 | classifiers=[ 36 | 'Development Status :: 3 - Alpha', 37 | 'Framework :: Scrapy', 38 | 'Programming Language :: Python', 39 | 'Programming Language :: Python :: 3.5', 40 | 'Intended Audience :: Developers', 41 | ], 42 | ) 43 | -------------------------------------------------------------------------------- /tests/test_queue.py: -------------------------------------------------------------------------------- 1 | import mock 2 | 3 | from scrapy import Spider 4 | from scrapy.http import Request 5 | 6 | from scrapy_eagle.worker.queue import Base 7 | 8 | 9 | class TestBaseQueue(object): 10 | 11 | def setup(self): 12 | self.server = mock.Mock() 13 | self.spider = Spider(name='foo') 14 | self.spider.parse_method = lambda x: x 15 | self.key = 'key' 16 | self.q = Base(self.server, self.spider, self.key) 17 | 18 | def test_encode_decode_requests(self, q=None): 19 | if q is None: 20 | q = self.q 21 | req = Request('http://example.com', 22 | callback=self.spider.parse, 23 | meta={'foo': 'bar'}) 24 | out = q._decode_request(q._encode_request(req)) 25 | assert req.url == out.url 26 | assert req.meta == out.meta 27 | assert req.callback == out.callback 28 | 29 | def test_custom_serializer(self): 30 | serializer = mock.Mock() 31 | serializer.dumps = mock.Mock(side_effect=lambda x: x) 32 | serializer.loads = mock.Mock(side_effect=lambda x: x) 33 | q = Base(self.server, self.spider, self.key, serializer=serializer) 34 | self.test_encode_decode_requests(q) 35 | assert serializer.dumps.call_count == 1 36 | assert serializer.loads.call_count == 1 37 | 38 | -------------------------------------------------------------------------------- /scrapy_eagle/dashboard/react-src/main.jsx: -------------------------------------------------------------------------------- 1 | import React from 'react' 2 | import { render } from 'react-dom' 3 | import { Router, Route, IndexRoute, 
browserHistory } from 'react-router' 4 | 5 | import { createStore, combineReducers } from 'redux' 6 | import { Provider } from 'react-redux' 7 | 8 | import App from './components/App.jsx' 9 | import Home from './components/Home.jsx' 10 | import ServerSet from './components/servers/ServerSet.jsx' 11 | import ServerRoot from './components/servers/Root.jsx' 12 | 13 | import JobsConfig from './components/jobs/JobsConfig.jsx' 14 | import JobsRoot from './components/jobs/Root.jsx' 15 | 16 | import servers from './reducers/servers.jsx' 17 | import jobs from './reducers/jobs.jsx' 18 | 19 | var reducers = combineReducers({ 20 | servers: servers, 21 | jobs: jobs 22 | }); 23 | 24 | const store = createStore(reducers); 25 | 26 | render(( 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | ), document.getElementById('app')); 47 | -------------------------------------------------------------------------------- /scrapy_eagle/dashboard/green_threads/heartbeat.py: -------------------------------------------------------------------------------- 1 | import os 2 | import signal 3 | from datetime import datetime, timedelta 4 | 5 | import gevent 6 | 7 | 8 | def heartbeat_servers(redis_conn, ip, hostname): 9 | 10 | while True: 11 | 12 | future = datetime.now() + timedelta(seconds=6) 13 | 14 | redis_conn.zadd( 15 | 'eagle_servers', 16 | '{ip}-{hostname}'.format(ip=ip, hostname=hostname), 17 | int(future.timestamp()) 18 | ) 19 | 20 | # now = datetime.now() 21 | # servers = redis_conn.zrangebyscore('servers', now.timestamp(), max='+inf') 22 | 23 | gevent.sleep(3) 24 | 25 | 26 | def heartbeat_subprocess(pid, spider, max_seconds_idle, max_size_limit, queue_info_global): 27 | 28 | last_processed = None 29 | 30 | max_size = 0 31 | 32 | while True: 33 | 34 | size = None 35 | for entry in queue_info_global: 36 | if entry['name'] == spider: 37 | size = entry['size'] 38 | 39 | if size > 0: 40 | last_processed = datetime.now() 41 | 42 | if size > max_size: 43 | max_size = size 44 | 45 | if last_processed: 46 | diff = datetime.now() - last_processed 47 | 48 | # print('\nlast_processed_secs: ', diff.seconds, ' maxsize: ', max_size, ' size: ', size, '\n\n') 49 | 50 | if diff.seconds > max_seconds_idle and max_size > max_size_limit: 51 | 52 | os.kill(pid, signal.SIGHUP) 53 | 54 | break 55 | 56 | gevent.sleep(2) 57 | -------------------------------------------------------------------------------- /scrapy_eagle/dashboard/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "react-scrapy-eagle", 3 | "version": "1.0.0", 4 | "description": "", 5 | "main": "index.js", 6 | "scripts": { 7 | "start:babel": "watchify react-src/main.jsx -v -t [ babelify --presets [ es2015 react ] ] -o templates/static/js/bundle.js", 8 | "build:dev": "./node_modules/webpack/bin/webpack.js -d --progress --colors --config webpack.config.dev.js", 9 | "build:prod": "NODE_ENV=production ./node_modules/webpack/bin/webpack.js -p --progress --colors --config webpack.config.prod.js", 10 | "start": "npm run build:dev -- --watch", 11 | "build": "npm run build:prod", 12 | "test": "echo \"Error: no test specified\" && exit 1" 13 | }, 14 | "author": "Rafael Capucho", 15 | "license": "ISC", 16 | "dependencies": { 17 | "babel-loader": "^6.2.4", 18 | "babel-preset-es2015": "^6.9.0", 19 | "babel-preset-react": "^6.11.1", 20 | "babelify": "^7.3.0", 21 | "classnames": "^2.2.5", 22 | "css-loader": "^0.23.1", 23 | "extract-text-webpack-plugin": "^1.0.1", 24 | "immutable": 
"^3.8.1", 25 | "moment": "^2.14.1", 26 | "node-sass": "^3.8.0", 27 | "react": "^15.3.1", 28 | "react-addons-pure-render-mixin": "^15.3.1", 29 | "react-breadcrumbs": "^1.3.16", 30 | "react-dom": "^15.3.1", 31 | "react-redux": "^4.4.5", 32 | "react-router": "^2.6.1", 33 | "react-switchery": "^1.0.0", 34 | "redux": "^3.5.2", 35 | "sass-loader": "^4.0.0", 36 | "style-loader": "^0.13.1", 37 | "watchify": "^3.7.0", 38 | "webpack": "^1.13.1", 39 | "whatwg-fetch": "^1.0.0" 40 | } 41 | } 42 | -------------------------------------------------------------------------------- /scrapy_eagle/dashboard/react-src/reducers/jobs.jsx: -------------------------------------------------------------------------------- 1 | import { Record, OrderedMap, List } from 'immutable'; 2 | 3 | const JobRecord = Record({ 4 | active: undefined, // true or false 5 | frequency_minutes: undefined, 6 | last_started_at: undefined, 7 | max_concurrency: undefined, 8 | min_concurrency: undefined, 9 | max_memory_mb: undefined, 10 | priority: 0, 11 | job_type: undefined, // 'spider' or 'command' 12 | start_urls: new List() 13 | }); 14 | 15 | class JobInfo extends JobRecord { 16 | getPriority(){ 17 | return this.priority; 18 | } 19 | } 20 | 21 | const SpidersMap = OrderedMap({}); 22 | 23 | export default (state = SpidersMap, action) => { 24 | 25 | switch (action.type) { 26 | 27 | case 'UPDATE_SPIDER_INFO': 28 | 29 | // Check if there's already one Record from this Spider 30 | if(!state.has(action.spider_id)){ 31 | state = state.set(action.spider_id, new JobInfo()); 32 | } 33 | 34 | return state.update(action.spider_id, 35 | (spider_record) => 36 | spider_record.merge({ 37 | 'priority': action.priority, 38 | 'frequency_minutes': action.frequency_minutes, 39 | 'last_started_at': action.last_started_at, 40 | 'max_concurrency': action.max_concurrency, 41 | 'min_concurrency': action.min_concurrency, 42 | 'max_memory_mb': action.max_memory_mb, 43 | 'job_type': action.job_type, 44 | 'start_urls': action.start_urls, 45 | 'active': action.active 46 | }) 47 | ); 48 | 49 | default: 50 | return state; 51 | } 52 | } -------------------------------------------------------------------------------- /scrapy_eagle/dashboard/utils/ip.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | 3 | import os 4 | import re 5 | import requests 6 | import random 7 | 8 | def get_hostname(): 9 | 10 | return os.uname()[1] 11 | 12 | def get_external_ip(): 13 | 14 | source_list = [ 15 | 'http://ip.dnsexit.com', 16 | 'http://ifconfig.me/ip', 17 | 'http://ipecho.net/plain', 18 | 'http://ipogre.com/linux.php', 19 | 'http://myexternalip.com/raw', 20 | 'http://icanhazip.com/', 21 | 'http://httpbin.org/ip' 22 | ] 23 | 24 | headers = {'User-agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:24.0) Gecko/20100101 Firefox/24.0'} 25 | 26 | for i in range(len(source_list)): 27 | 28 | target = random.choice(source_list) 29 | 30 | try: 31 | 32 | content = requests.get(target, headers=headers, timeout=6, verify=False) 33 | 34 | m = re.search( 35 | '(?:[\d]{1,3})\.(?:[\d]{1,3})\.(?:[\d]{1,3})\.(?:[\d]{1,3})', 36 | content.text 37 | ) 38 | 39 | ip = m.group(0) 40 | 41 | if len(ip) > 0: 42 | return ip 43 | 44 | # Without Internet 45 | except requests.exceptions.ConnectionError as e: 46 | 47 | # Only interested in there kind of error 48 | if str(e).find("Temporary failure in name resolution") > -1: 49 | return None 50 | 51 | # Timeout 52 | except requests.exceptions.RequestException: 53 | # Try next 54 | source_list.pop(i) 55 | 56 | 
except Exception: 57 | continue 58 | 59 | 60 | return None 61 | -------------------------------------------------------------------------------- /scrapy_eagle/dashboard/react-src/components/servers/ServerSubProcess.jsx: -------------------------------------------------------------------------------- 1 | var React = require('react'); 2 | var moment = require('moment'); 3 | 4 | var ServerSubProcess = React.createClass({ 5 | 6 | getInitialState: function() { 7 | return {link_open_buffer: ""}; 8 | }, 9 | onClickKill: function(){ 10 | 11 | $.get(window.location.protocol+"//"+this.props.public_ip+":"+location.port+"/processes/kill_subprocess/"+this.props.pid, function(data) { 12 | 13 | }); 14 | 15 | }, 16 | componentDidMount: function(){ 17 | this.setState({'link_open_buffer': window.location.protocol+"//"+this.props.public_ip+":"+location.port+"/processes/read_buffer/"+this.props.pid}); 18 | }, 19 | render: function(){ 20 | 21 | var created_at = moment.utc(this.props.created_at); 22 | var fromNow = created_at.fromNow(); 23 | 24 | return ( 25 |
<li> 26 |
      <ul> 27 |
        <li>Command: {this.props.command}</li> 28 |
        <li>PID: {this.props.pid}</li> 29 |
        <li>CPU: {this.props.cpu_percent}%</li> 30 |
        <li>Memory Used: {this.props.memory_used_mb}mb</li> 31 |
        <li>Spider: {this.props.spider}</li> 32 |
        <li>Base Dir: {this.props.base_dir}</li> 33 |
        <li>Created At: {fromNow}</li> 34 | 35 | 36 | 37 | 38 |
      </ul> 39 |
    </li>
  • 40 | ); 41 | } 42 | 43 | }); 44 | 45 | /*var Link = React.createClass({ 46 | 47 | render: function(){ 48 | return ( 49 | 50 | 51 | 52 | ); 53 | } 54 | 55 | });*/ 56 | 57 | module.exports = ServerSubProcess; 58 | -------------------------------------------------------------------------------- /scrapy_eagle/dashboard/settings.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | try: 4 | import configparser 5 | except ImportError: 6 | import ConfigParser as configparser 7 | 8 | from scrapy_eagle.dashboard.utils import ip 9 | 10 | buffers = {} 11 | 12 | queue_info_global = [] 13 | 14 | subprocess_pids = set() 15 | 16 | # Never import these directly 17 | # Use get_config_file and get_args instead 18 | _args = None 19 | _config = None 20 | _public_ip = None 21 | _hostname = None 22 | _spiders = None 23 | _commands = None 24 | 25 | 26 | def setup_configuration(config_file=None): 27 | 28 | global _config 29 | 30 | _config = configparser.RawConfigParser() 31 | _config.read(config_file) 32 | 33 | globals()['_config'] = _config 34 | 35 | return _config 36 | 37 | 38 | def setup(config_file=None, output=True): 39 | 40 | global _args, _config, _public_ip, _hostname 41 | 42 | parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) 43 | parser.add_argument('-c', '--config-file', help='Config file path.') 44 | 45 | _args = parser.parse_args() 46 | 47 | if not _args.config_file and not config_file: 48 | print('You should specify a config file using --config-file parameter.') 49 | exit(0) 50 | 51 | _config = setup_configuration(config_file=_args.config_file or config_file) 52 | 53 | if output: 54 | print('discovering your external entrypoint address... ', end='', flush=True) 55 | 56 | _public_ip = ip.get_external_ip() 57 | 58 | if output: 59 | print(_public_ip) 60 | 61 | _hostname = ip.get_hostname() 62 | 63 | return _args, _config 64 | 65 | 66 | def get_public_ip(): 67 | return _public_ip 68 | 69 | 70 | def get_hostname(): 71 | return _hostname 72 | 73 | 74 | def get_config_file(): 75 | return _config 76 | 77 | 78 | def get_args(): 79 | return _args 80 | 81 | 82 | def get_spiders(): 83 | return _spiders 84 | 85 | 86 | def get_commands(): 87 | return _commands -------------------------------------------------------------------------------- /scrapy_eagle/dashboard/memory.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | import redis 4 | 5 | from scrapy_eagle.dashboard.settings import get_config_file 6 | 7 | redis_pool = None 8 | 9 | 10 | def init_memory(): 11 | 12 | global redis_pool 13 | 14 | config = get_config_file() 15 | 16 | redis_pool = redis.ConnectionPool( 17 | host=config['redis']['host'], 18 | port=config['redis']['port'], 19 | db=config['redis']['db'], 20 | password=config.get('redis', 'password', fallback='') 21 | ) 22 | 23 | 24 | def get_redis_pool(): 25 | return redis_pool 26 | 27 | 28 | def get_connection(): 29 | 30 | if not redis_pool: 31 | init_memory() 32 | 33 | return redis.Redis(connection_pool=redis_pool) 34 | 35 | 36 | def get_job_object(key): 37 | 38 | redis_conn = get_connection() 39 | 40 | json_obj = redis_conn.get('eagle_jobs:{key}'.format(key=key)) 41 | 42 | if json_obj: 43 | return json.loads(json_obj.decode('utf-8')) 44 | else: 45 | return None 46 | 47 | def update_job_object(key, fields): 48 | 49 | redis_conn = get_connection() 50 | 51 | serialized = json.dumps(fields, sort_keys=True) 52 | 53 | 
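    # Persist the job as a JSON blob under the Redis key 'eagle_jobs:<key>', the same key get_job_object reads.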
redis_conn.set('eagle_jobs:{key}'.format(key=key), serialized) 54 | 55 | if __name__ == "__main__": 56 | 57 | from scrapy_eagle.dashboard.settings import setup_configuration 58 | 59 | _config = setup_configuration(config_file='/etc/scrapy-eagle.ini') 60 | 61 | init_memory() 62 | 63 | o = get_job_object(key='epocacosmeticos.com.br') 64 | 65 | print(o) 66 | 67 | d = { 68 | "active": True, 69 | "max_memory_mb": 220, 70 | "job_type": "spider", 71 | "last_started_at": "2016-08-31T04:17:51.200187", 72 | "priority": 6, 73 | "start_urls": [ 74 | "http://epocacosmeticos.com.br/", 75 | "http://www.epocacosmeticos.com.br/perfumes" 76 | ], 77 | "max_concurrency": 4, 78 | "min_concurrency": 1, 79 | "frequency_minutes": 1440 80 | } 81 | 82 | update_job_object(key='epocacosmeticos.com.br', fields=d) 83 | 84 | print(get_job_object(key='epocacosmeticos.com.br')) -------------------------------------------------------------------------------- /scrapy_eagle/dashboard/green_threads/executor.py: -------------------------------------------------------------------------------- 1 | import gevent 2 | from datetime import datetime, timedelta 3 | 4 | from scrapy_eagle.dashboard import settings 5 | from scrapy_eagle.dashboard.memory import get_job_object, update_job_object 6 | from scrapy_eagle.dashboard.utils import iso_to_timestamp, timestamp_to_utc, processkit 7 | 8 | 9 | def evaluation_loop(): 10 | 11 | while True: 12 | 13 | _spiders = settings.get_spiders() 14 | _commands = settings.get_commands() 15 | 16 | # When the system is starting up, spiders/commands may return empty because 17 | # we're using async execution `green_threads.find_new_spiders`. 18 | if _spiders and _commands: 19 | 20 | for key in _spiders + _commands: 21 | obj = get_job_object(key=key) 22 | 23 | if obj and obj.get('next_execution_at'): 24 | 25 | next_execution_at = timestamp_to_utc(iso_to_timestamp(obj['next_execution_at'])) 26 | 27 | now = datetime.utcnow() 28 | 29 | if next_execution_at < now: 30 | 31 | dispatch(key=key, register=obj) 32 | 33 | gevent.sleep(3) 34 | 35 | 36 | def dispatch(key, register): 37 | 38 | _config = settings.get_config_file() 39 | 40 | register['last_started_at'] = datetime.utcnow().isoformat() 41 | register['next_execution_at'] = (datetime.utcnow() + timedelta(minutes=register['frequency_minutes'])).isoformat() 42 | 43 | if register['job_type'] == "spider": 44 | command = [_config.get('scrapy', 'binary'), 'crawl', key] 45 | base_dir = _config.get('scrapy', 'base_dir') 46 | spider = True 47 | 48 | elif register['job_type'] == "command": 49 | command = [_config.get('commands', 'binary'), '-u', key + '.py'] 50 | base_dir = _config.get('commands', 'base_dir') 51 | spider = False 52 | 53 | gevent.spawn( 54 | processkit.new_subprocess, 55 | base_dir=base_dir, 56 | command=command, 57 | spider=spider, 58 | subprocess_pids=settings.subprocess_pids, 59 | queue_info_global=settings.queue_info_global, 60 | buffers=settings.buffers 61 | ) 62 | 63 | update_job_object(key=key, fields=register) 64 | -------------------------------------------------------------------------------- /scrapy_eagle/dashboard/react-src/components/servers/ServerSet.jsx: -------------------------------------------------------------------------------- 1 | import React from 'react' 2 | import { connect } from 'react-redux' 3 | 4 | var ServerNode = require('./ServerNode.jsx'); 5 | 6 | var SetIntervalMixin = { 7 | componentWillMount: function() { 8 | this.intervals = []; 9 | }, 10 | setInterval: function() { 11 | this.intervals.push(setInterval.apply(null, 
arguments)); 12 | }, 13 | componentWillUnmount: function() { 14 | this.intervals.forEach(clearInterval); 15 | } 16 | }; 17 | 18 | var ServerSet = React.createClass({ 19 | 20 | mixins: [SetIntervalMixin], 21 | 22 | getInitialState: function() { 23 | return {server_set: new Array()}; 24 | }, 25 | 26 | componentDidMount:function(){ 27 | this.setInterval(this.updateServers, 3000); 28 | }, 29 | 30 | updateServers: function() { 31 | 32 | var that = this; 33 | 34 | var server_set_new = new Array(); 35 | 36 | this.serversRequest = $.ajax({ 37 | url: window.location.protocol + "//" + document.domain + ":"+ location.port +"/servers/list", 38 | type: 'GET', 39 | dataType: 'json', 40 | cache: false 41 | }).done(function(data) { 42 | 43 | data.forEach(function(elem, index){ 44 | server_set_new.push({public_ip: elem.public_ip, hostname: elem.hostname}); 45 | }) 46 | 47 | }).always(function () { 48 | that.setState({'server_set': server_set_new}); 49 | that.props.set_server_qty(server_set_new.length); 50 | }); 51 | 52 | }, 53 | 54 | componentWillUnmount: function() { 55 | // Ref: https://facebook.github.io/react/tips/initial-ajax.html 56 | this.serversRequest.abort(); 57 | }, 58 | render: function(){ 59 | var listServers = this.state.server_set.map(function(item) { 60 | return ; 64 | }); 65 | 66 | return ( 67 |
68 |
      <div>
        <h1>ServerSet</h1> 69 |
        <ul>{listServers}</ul> 70 |
      </div>
    71 | ); 72 | } 73 | }); 74 | 75 | 76 | var mapDispatchToProps = function(dispatch){ 77 | return { 78 | dispatch, 79 | set_server_qty: (qty) => { dispatch({type: 'SET_SERVER_QTY', qty: qty}); } 80 | } 81 | }; 82 | 83 | export default connect( 84 | (state) => { return {} }, 85 | mapDispatchToProps 86 | )(ServerSet) 87 | -------------------------------------------------------------------------------- /scrapy_eagle/dashboard/templates/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | Scrapy Eagle 8 | 9 | 10 | 11 | 12 | 15 | 16 | 17 | 18 | 19 |
    20 |
    21 | 22 | Scrapy-Eagle Home 23 | 24 |
    25 |
    26 | 27 |
    28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 68 | 69 | 70 | 71 | -------------------------------------------------------------------------------- /scrapy_eagle/worker/connection.py: -------------------------------------------------------------------------------- 1 | import redis 2 | import six 3 | 4 | from scrapy.utils.misc import load_object 5 | 6 | 7 | DEFAULT_REDIS_CLS = redis.StrictRedis 8 | 9 | 10 | # Sane connection defaults. 11 | DEFAULT_PARAMS = { 12 | 'socket_timeout': 30, 13 | 'socket_connect_timeout': 30, 14 | 'retry_on_timeout': True, 15 | } 16 | 17 | # Shortcut maps 'setting name' -> 'parmater name'. 18 | SETTINGS_PARAMS_MAP = { 19 | 'REDIS_URL': 'url', 20 | 'REDIS_HOST': 'host', 21 | 'REDIS_PORT': 'port', 22 | } 23 | 24 | 25 | def get_redis_from_settings(settings): 26 | """Returns a redis client instance from given Scrapy settings object. 27 | 28 | This function uses ``get_client`` to instantiate the client and uses 29 | ``DEFAULT_PARAMS`` global as defaults values for the parameters. You can 30 | override them using the ``REDIS_PARAMS`` setting. 31 | 32 | Parameters 33 | ---------- 34 | settings : Settings 35 | A scrapy settings object. See the supported settings below. 36 | 37 | Returns 38 | ------- 39 | server 40 | Redis client instance. 41 | 42 | Other Parameters 43 | ---------------- 44 | REDIS_URL : str, optional 45 | Server connection URL. 46 | REDIS_HOST : str, optional 47 | Server host. 48 | REDIS_PORT : str, optional 49 | Server port. 50 | REDIS_PARAMS : dict, optional 51 | Additional client parameters. 52 | 53 | """ 54 | params = DEFAULT_PARAMS.copy() 55 | params.update(settings.getdict('REDIS_PARAMS')) 56 | # XXX: Deprecate REDIS_* settings. 57 | for source, dest in SETTINGS_PARAMS_MAP.items(): 58 | val = settings.get(source) 59 | if val: 60 | params[dest] = val 61 | 62 | # Allow ``redis_cls`` to be a path to a class. 63 | if isinstance(params.get('redis_cls'), six.string_types): 64 | params['redis_cls'] = load_object(params['redis_cls']) 65 | 66 | return get_redis(**params) 67 | 68 | 69 | # Backwards compatible alias. 70 | from_settings = get_redis_from_settings 71 | 72 | 73 | def get_redis(**kwargs): 74 | """Returns a redis client instance. 75 | 76 | Parameters 77 | ---------- 78 | redis_cls : class, optional 79 | Defaults to ``redis.StrictRedis``. 80 | url : str, optional 81 | If given, ``redis_cls.from_url`` is used to instantiate the class. 82 | **kwargs 83 | Extra parameters to be passed to the ``redis_cls`` class. 84 | 85 | Returns 86 | ------- 87 | server 88 | Redis client instance. 
89 | 90 | """ 91 | redis_cls = kwargs.pop('redis_cls', DEFAULT_REDIS_CLS) 92 | url = kwargs.pop('url', None) 93 | if url: 94 | return redis_cls.from_url(url, **kwargs) 95 | else: 96 | return redis_cls(**kwargs) 97 | 98 | -------------------------------------------------------------------------------- /scrapy_eagle/dashboard/green_threads/stats.py: -------------------------------------------------------------------------------- 1 | from gevent import monkey 2 | monkey.patch_all() 3 | 4 | import gevent 5 | import gevent.pool 6 | 7 | from scrapy_eagle.dashboard import settings 8 | from scrapy_eagle.dashboard.utils.processkit import get_resources_info_from_pid, get_resources_info_from_server 9 | 10 | def send_redis_queue_info(socketio, redis_conn, spiders, queue_info_global): 11 | 12 | while True: 13 | 14 | queues = [] 15 | 16 | for spider in spiders: 17 | queues.append( 18 | { 19 | 'name': spider, 20 | 'size': int(redis_conn.llen('{spider}:requests'.format(spider=spider))) 21 | } 22 | ) 23 | 24 | # Don't asign directly to maintain the reference to the global object 25 | queue_info_global.clear() 26 | queue_info_global.extend(queues) 27 | 28 | socketio.emit('redis_queue_info', {'data': queues}, namespace="/queues", broadcast=True) 29 | 30 | gevent.sleep(1) 31 | 32 | def send_resources_info(socketio, subprocess_pids, public_ip): 33 | 34 | while True: 35 | 36 | dict_info_pid_greenlet = gevent.spawn(get_resources_info_from_pid) 37 | dict_info_host_greenlet = gevent.spawn(get_resources_info_from_server) 38 | 39 | subprocess_info_greenlets = [] 40 | 41 | for pid, spider, command, base_dir, created_at in subprocess_pids: 42 | 43 | # We pass all the parameters that we like to keep instead 44 | # of simply use a .update() here because the returned instance 45 | # is a Greenlet instead of a dict. 46 | 47 | info_greenlet = gevent.spawn( 48 | get_resources_info_from_pid, 49 | pid=pid, 50 | spider=spider, 51 | command=command, 52 | base_dir=base_dir, 53 | created_at=created_at, 54 | ) 55 | 56 | subprocess_info_greenlets.append(info_greenlet) 57 | 58 | dict_info_pid_greenlet.join() 59 | dict_info = dict_info_pid_greenlet.get() 60 | dict_info['public_ip'] = public_ip 61 | 62 | dict_info_host_greenlet.join() 63 | dict_info_host = dict_info_host_greenlet.get() 64 | dict_info.update(dict_info_host) 65 | 66 | gevent.joinall(subprocess_info_greenlets) 67 | dict_info['sub'] = [greenlet.get() for greenlet in subprocess_info_greenlets] 68 | 69 | # When get_resources_info try to access a PID that dont exists any more it 70 | # return None, here we remove those results. It happen because it takes 71 | # sometime to subprocess_pids remove PIDs that finishs. 72 | dict_info['sub'] = [x for x in dict_info['sub'] if x] 73 | 74 | _spiders = settings.get_spiders() 75 | _commands = settings.get_commands() 76 | 77 | dict_info['spiders'] = _spiders or [] 78 | dict_info['commands'] = _commands or [] 79 | 80 | print('\n\ndict_info: ', dict_info, '\n\n') 81 | 82 | socketio.emit('resources_info', {'data': dict_info}, namespace="/resources", broadcast=True) 83 | 84 | gevent.sleep(1) -------------------------------------------------------------------------------- /scrapy_eagle/dashboard/utils/processkit.py: -------------------------------------------------------------------------------- 1 | #! 
/usr/bin/env python3 2 | # -*- coding:utf-8 -*- 3 | 4 | import os 5 | import subprocess 6 | from datetime import datetime 7 | 8 | import psutil 9 | import gevent 10 | 11 | from scrapy_eagle.dashboard.green_threads import heartbeat 12 | 13 | 14 | def new_subprocess(base_dir, subprocess_pids, queue_info_global, command=None, spider=None, buffers={}): 15 | 16 | if not command: 17 | command = ['python', '-u', 'generator.py'] 18 | # command = ['galculator'] 19 | # command = ['/usr/bin/scrapy-py35', 'crawl', '{spider}'.format(spider)] 20 | 21 | with subprocess.Popen( 22 | command, 23 | cwd=base_dir, 24 | stdout=subprocess.PIPE, 25 | bufsize=1, 26 | universal_newlines=True 27 | ) as p: 28 | 29 | # Turn it JSON serializable 30 | created_at = datetime.utcnow().isoformat() 31 | 32 | identifier = (p.pid, spider, " ".join(command), base_dir, created_at) 33 | 34 | subprocess_pids.add(identifier) 35 | 36 | buffers[p.pid] = {'finished': False, 'lines': []} 37 | 38 | if spider: 39 | gevent.spawn( 40 | heartbeat.heartbeat_subprocess, 41 | p.pid, 42 | spider, 43 | max_seconds_idle=20, 44 | max_size_limit=15, 45 | queue_info_global=queue_info_global 46 | ) 47 | 48 | for line in p.stdout: 49 | 50 | # TODO: remove empty lines 51 | 52 | if len(line.strip()) > 0: 53 | 54 | buffers[p.pid]['lines'].append(line) 55 | 56 | # print(line, end='', flush=True) 57 | 58 | buffers[p.pid]['finished'] = True 59 | 60 | subprocess_pids.remove(identifier) 61 | 62 | 63 | def _get_info_from_pid(pid=None): 64 | 65 | if not pid: 66 | pid = os.getpid() 67 | 68 | process = psutil.Process(pid) 69 | 70 | mem = process.memory_info()[0] / float(2 ** 20) 71 | mem = float('{0:.2f}'.format(mem)) 72 | 73 | cpu = process.cpu_percent(interval=0.5) 74 | 75 | return pid, mem, cpu 76 | 77 | 78 | def get_resources_info_from_server(): 79 | 80 | cpus = psutil.cpu_percent(interval=0.5, percpu=True) 81 | 82 | # Mem results return in bytes 83 | vmem = psutil.virtual_memory() 84 | 85 | total = vmem.total 86 | total = (total / 1024.0) / 1024.0 87 | 88 | available = vmem.available 89 | available = (available / 1024.0) / 1024.0 90 | 91 | used = total - available 92 | 93 | return { 94 | 'cpus': cpus, 95 | 'memory_total_mb': float('{0:.2f}'.format(total)), 96 | 'memory_available_mb': float('{0:.2f}'.format(available)), 97 | 'memory_used_server_mb': float('{0:.2f}'.format(used)) 98 | } 99 | 100 | 101 | def get_resources_info_from_pid(pid=None, *args, **kwargs): 102 | 103 | try: 104 | 105 | pid, memory_used_mb, cpu_percent = _get_info_from_pid(pid=pid) 106 | 107 | result = { 108 | 'pid': pid, 109 | 'memory_used_mb': memory_used_mb, 110 | 'cpu_percent': cpu_percent, 111 | } 112 | 113 | result.update(kwargs) 114 | 115 | return result 116 | 117 | except psutil.NoSuchProcess: 118 | print('TODO: an error here') 119 | -------------------------------------------------------------------------------- /scrapy_eagle/dashboard/views/processes.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import signal 4 | 5 | import flask 6 | import gevent 7 | 8 | from scrapy_eagle.dashboard.utils import processkit 9 | from scrapy_eagle.dashboard import settings 10 | 11 | 12 | processes = flask.Blueprint('processes', __name__) 13 | 14 | 15 | @processes.route('/exec_command') 16 | def exec_command(): 17 | 18 | gevent.spawn( 19 | processkit.new_subprocess, 20 | base_dir='.', 21 | subprocess_pids=settings.subprocess_pids, 22 | queue_info_global=settings.queue_info_global, 23 | buffers=settings.buffers 24 | ) 25 | 26 | 
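    # new_subprocess was spawned on a greenlet above, so this endpoint returns immediately,
    # e.g. GET /processes/exec_command -> {"status": true}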
result = { 27 | 'status': True 28 | } 29 | 30 | return flask.Response( 31 | response=json.dumps(result, sort_keys=True), 32 | status=200, 33 | mimetype="application/json" 34 | ) 35 | 36 | 37 | @processes.route('/read_buffer/') 38 | def read_buffer(pid): 39 | 40 | if not settings.buffers.get(pid): 41 | return flask.Response( 42 | response=json.dumps( 43 | {'status': False, 'msg': 'PID Not Found'}, 44 | sort_keys=True 45 | ), 46 | status=200, 47 | mimetype="application/json" 48 | ) 49 | 50 | def generate(): 51 | 52 | sent = 0 53 | 54 | while not settings.buffers[pid]['finished']: 55 | 56 | for i, row in enumerate(settings.buffers[pid]['lines'][sent:]): 57 | 58 | sent += 1 59 | 60 | yield row+'
    ' 61 | 62 | gevent.sleep(0.5) 63 | 64 | return flask.Response( 65 | response=generate(), 66 | status=200, 67 | mimetype="text/html" 68 | ) 69 | 70 | 71 | @processes.route('/kill_subprocess/') 72 | def kill_subprocess(pid): 73 | 74 | safe = False 75 | 76 | for _pid, _, _, _, _ in settings.subprocess_pids: 77 | 78 | if pid == _pid: 79 | safe = True 80 | break 81 | 82 | if safe: 83 | os.kill(pid, signal.SIGHUP) 84 | 85 | result = { 86 | 'status': True, 87 | 'msg': 'SIGHUP signal sent to PID {0}'.format(pid) 88 | } 89 | 90 | else: 91 | result = { 92 | 'status': False, 93 | 'msg': 'PID Not Found' 94 | } 95 | 96 | return flask.Response( 97 | response=json.dumps(result, sort_keys=True), 98 | status=200, 99 | mimetype="application/json" 100 | ) 101 | 102 | 103 | @processes.route('/start_spider/') 104 | def start_spider(spider): 105 | 106 | _config = settings.get_config_file() 107 | 108 | command = [_config.get('scrapy', 'binary'), 'crawl', spider] 109 | 110 | # TODO: Verify if base_dir is set before use it 111 | 112 | gevent.spawn( 113 | processkit.new_subprocess, 114 | base_dir=_config.get('scrapy', 'base_dir'), 115 | command=command, 116 | spider=spider, 117 | subprocess_pids=settings.subprocess_pids, 118 | queue_info_global=settings.queue_info_global, 119 | buffers=settings.buffers 120 | ) 121 | 122 | result = { 123 | 'status': True 124 | } 125 | 126 | return flask.Response( 127 | response=json.dumps(result, sort_keys=True), 128 | status=200, 129 | mimetype="application/json" 130 | ) 131 | -------------------------------------------------------------------------------- /scrapy_eagle/dashboard/templates/static/js/vendor/jquery.navgoco.min.js: -------------------------------------------------------------------------------- 1 | /* 2 | * jQuery Navgoco Menus Plugin v0.2.1 (2014-04-11) 3 | * https://github.com/tefra/navgoco 4 | * 5 | * Copyright (c) 2014 Chris T (@tefra) 6 | * BSD - https://github.com/tefra/navgoco/blob/master/LICENSE-BSD 7 | */ 8 | !function(a){"use strict";var b=function(b,c,d){return this.el=b,this.$el=a(b),this.options=c,this.uuid=this.$el.attr("id")?this.$el.attr("id"):d,this.state={},this.init(),this};b.prototype={init:function(){var b=this;b._load(),b.$el.find("ul").each(function(c){var d=a(this);d.attr("data-index",c),b.options.save&&b.state.hasOwnProperty(c)?(d.parent().addClass(b.options.openClass),d.show()):d.parent().hasClass(b.options.openClass)?(d.show(),b.state[c]=1):d.hide()});var c=a("").prepend(b.options.caretHtml),d=b.$el.find("li > a");b._trigger(c,!1),b._trigger(d,!0),b.$el.find("li:has(ul) > a").prepend(c)},_trigger:function(b,c){var d=this;b.on("click",function(b){b.stopPropagation();var e=c?a(this).next():a(this).parent().next(),f=!1;if(c){var g=a(this).attr("href");f=void 0===g||""===g||"#"===g}if(e=e.length>0?e:!1,d.options.onClickBefore.call(this,b,e),!c||e&&f)b.preventDefault(),d._toggle(e,e.is(":hidden")),d._save();else if(d.options.accordion){var h=d.state=d._parents(a(this));d.$el.find("ul").filter(":visible").each(function(){var b=a(this),c=b.attr("data-index");h.hasOwnProperty(c)||d._toggle(b,!1)}),d._save()}d.options.onClickAfter.call(this,b,e)})},_toggle:function(b,c){var d=this,e=b.attr("data-index"),f=b.parent();if(d.options.onToggleBefore.call(this,b,c),c){if(f.addClass(d.options.openClass),b.slideDown(d.options.slide),d.state[e]=1,d.options.accordion){var g=d.state=d._parents(b);g[e]=d.state[e]=1,d.$el.find("ul").filter(":visible").each(function(){var b=a(this),c=b.attr("data-index");g.hasOwnProperty(c)||d._toggle(b,!1)})}}else 
f.removeClass(d.options.openClass),b.slideUp(d.options.slide),d.state[e]=0;d.options.onToggleAfter.call(this,b,c)},_parents:function(b,c){var d={},e=b.parent(),f=e.parents("ul");return f.each(function(){var b=a(this),e=b.attr("data-index");return e?void(d[e]=c?b:1):!1}),d},_save:function(){if(this.options.save){var b={};for(var d in this.state)1===this.state[d]&&(b[d]=1);c[this.uuid]=this.state=b,a.cookie(this.options.cookie.name,JSON.stringify(c),this.options.cookie)}},_load:function(){if(this.options.save){if(null===c){var b=a.cookie(this.options.cookie.name);c=b?JSON.parse(b):{}}this.state=c.hasOwnProperty(this.uuid)?c[this.uuid]:{}}},toggle:function(b){var c=this,d=arguments.length;if(1>=d)c.$el.find("ul").each(function(){var d=a(this);c._toggle(d,b)});else{var e,f={},g=Array.prototype.slice.call(arguments,1);d--;for(var h=0;d>h;h++){e=g[h];var i=c.$el.find('ul[data-index="'+e+'"]').first();if(i&&(f[e]=i,b)){var j=c._parents(i,!0);for(var k in j)f.hasOwnProperty(k)||(f[k]=j[k])}}for(e in f)c._toggle(f[e],b)}c._save()},destroy:function(){a.removeData(this.$el),this.$el.find("li:has(ul) > a").unbind("click"),this.$el.find("li:has(ul) > a > span").unbind("click")}},a.fn.navgoco=function(c){if("string"==typeof c&&"_"!==c.charAt(0)&&"init"!==c)var d=!0,e=Array.prototype.slice.call(arguments,1);else c=a.extend({},a.fn.navgoco.defaults,c||{}),a.cookie||(c.save=!1);return this.each(function(f){var g=a(this),h=g.data("navgoco");h||(h=new b(this,d?a.fn.navgoco.defaults:c,f),g.data("navgoco",h)),d&&h[c].apply(h,e)})};var c=null;a.fn.navgoco.defaults={caretHtml:"",accordion:!1,openClass:"open",save:!0,cookie:{name:"navgoco",expires:!1,path:"/"},slide:{duration:400,easing:"swing"},onClickBefore:a.noop,onClickAfter:a.noop,onToggleBefore:a.noop,onToggleAfter:a.noop}}(jQuery); -------------------------------------------------------------------------------- /scrapy_eagle/dashboard/react-src/components/jobs/JobsConfig.jsx: -------------------------------------------------------------------------------- 1 | import React from 'react' 2 | 3 | import { connect } from 'react-redux' 4 | //import PureRenderMixin from 'react-addons-pure-render-mixin' 5 | 6 | import JobsItem from './JobsItem.jsx' 7 | 8 | require('./JobsConfig.scss'); 9 | 10 | class JobsConfig extends React.Component { 11 | 12 | constructor(props){ 13 | super(props); 14 | //this.shouldComponentUpdate = PureRenderMixin.shouldComponentUpdate.bind(this); 15 | this.state = {}; 16 | } 17 | 18 | componentDidMount(){ 19 | this.updateSpiders(); 20 | } 21 | 22 | updateSpiders(){ 23 | 24 | } 25 | 26 | componentWillReceiveProps(nextProps) { 27 | // console.log('entro componentWillReceiveProps'); 28 | } 29 | 30 | shouldComponentUpdate(nextProps, nextState) { 31 | return true; 32 | //return nextProps.id !== this.props.id; 33 | } 34 | 35 | render() { 36 | const {jobs} = this.props; 37 | 38 | // console.log('render!'); 39 | 40 | var toggle_class = 'odd'; 41 | 42 | // https://github.com/facebook/immutable-js/issues/667#issuecomment-220223640 43 | var list_spiders = jobs.entrySeq().map(([key, value]) => { 44 | 45 | if (value.job_type == 'spider') { 46 | 47 | toggle_class = (toggle_class == 'odd') ? 'even' : 'odd'; 48 | 49 | return ; 55 | } 56 | 57 | }); 58 | 59 | var list_commands = jobs.entrySeq().map(([key, value]) => { 60 | 61 | if (value.job_type == 'command') { 62 | 63 | toggle_class = (toggle_class == 'odd') ? 'even' : 'odd'; 64 | 65 | return ; 71 | } 72 | 73 | }); 74 | 75 | return ( 76 |
      {/*
        Renders three panels:

        "Jobs Configuration": a table listing {list_spiders}
        "Commands Configuration": a table listing {list_commands}

        "Legends":
          Frequency: amount of time, in minutes, defining how often this action is triggered. Ex.: 60 means every hour.
          Max Concurrency: the maximum number of servers that may run this action at the same time.
          Min Concurrency: only dispatch this job when a minimum amount of resources is available.
          Priority: the highest number is chosen when the system has to decide between equal opportunities.
          Max Memory: processes are killed when they reach this threshold (in megabytes) and may be reallocated to another server or to the same one.
          Start URLs: a list of URLs to use as starting points, one per line.
          Last started at: the last time this job was triggered.
      */}
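      {/*
        For reference, each entry in {jobs} follows the shape served by /jobs/list
        (defaults from dashboard/views/jobs.py; the key name here is illustrative):

        "example.com": {
          "active": false, "job_type": "spider",
          "min_concurrency": 1, "max_concurrency": 5,
          "max_memory_mb": 200, "priority": 1, "frequency_minutes": 60,
          "start_urls": [], "last_started_at": "<ISO-8601>", "next_execution_at": "<ISO-8601>"
        }
      */}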
    104 | ); 105 | } 106 | 107 | } 108 | 109 | var mapDispatchToProps = function(dispatch){ 110 | return { 111 | dispatch 112 | } 113 | }; 114 | 115 | export default connect( 116 | (state) => { 117 | return { 118 | jobs: state.jobs 119 | } 120 | }, 121 | mapDispatchToProps 122 | )(JobsConfig) -------------------------------------------------------------------------------- /scrapy_eagle/dashboard/main.py: -------------------------------------------------------------------------------- 1 | from gevent import monkey 2 | monkey.patch_all() 3 | 4 | import os 5 | import sys 6 | import signal 7 | import threading 8 | 9 | import flask 10 | import gevent 11 | 12 | from flask_cors import CORS 13 | from flask_socketio import SocketIO 14 | 15 | try: 16 | import configparser 17 | except ImportError: 18 | import ConfigParser as configparser 19 | 20 | from scrapy_eagle.dashboard import settings 21 | from scrapy_eagle.dashboard import memory 22 | from scrapy_eagle.dashboard.green_threads import heartbeat, stats, find_new_spiders, find_new_commands, executor 23 | from scrapy_eagle.dashboard.utils import processkit 24 | 25 | 26 | app = flask.Flask(__name__, static_folder='templates/static') 27 | 28 | 29 | def main(): 30 | 31 | # Install the arguments and config file inside the config module 32 | _, _ = settings.setup() 33 | 34 | 35 | def shutdown(): 36 | 37 | # Send a signal to all opened subprocess, closing them. 38 | for pid, _, _, _, _ in settings.subprocess_pids: 39 | 40 | print('killing subprocess: {pid}'.format(pid=pid)) 41 | 42 | os.kill(pid, signal.SIGHUP) 43 | 44 | print('\nshutting down {0}...'.format(threading.currentThread().getName())) 45 | 46 | sys.exit(0) 47 | 48 | 49 | def start_periodics(socketio): 50 | 51 | redis_conn = memory.get_connection() 52 | public_ip = settings.get_public_ip() 53 | hostname = settings.get_hostname() 54 | 55 | for i in range(3): 56 | gevent.spawn( 57 | processkit.new_subprocess, 58 | base_dir='.', 59 | subprocess_pids=settings.subprocess_pids, 60 | queue_info_global=settings.queue_info_global, 61 | buffers=settings.buffers 62 | ) 63 | 64 | gevent.spawn(heartbeat.heartbeat_servers, redis_conn, public_ip, hostname) 65 | gevent.spawn(stats.send_resources_info, socketio, settings.subprocess_pids, public_ip) 66 | gevent.spawn(executor.evaluation_loop) 67 | gevent.spawn(find_new_spiders) 68 | gevent.spawn(find_new_commands) 69 | 70 | 71 | def entry_point(): 72 | 73 | # Graceful shutdown when kill are received 74 | signal.signal(signal.SIGTERM, lambda sig, frame: shutdown()) 75 | 76 | # Graceful shutdown when terminal session are closed 77 | signal.signal(signal.SIGHUP, lambda sig, frame: shutdown()) 78 | 79 | main() 80 | 81 | try: 82 | 83 | _config = settings.get_config_file() 84 | 85 | app.config['SECRET_KEY'] = _config.get('server', 'cookie_secret_key') 86 | app.config['DEBUG'] = _config.getboolean('server', 'debug', fallback=True) 87 | 88 | from scrapy_eagle.dashboard.views import servers, processes, root, jobs, react_app 89 | 90 | app.register_blueprint(root.root, url_prefix='/') 91 | app.register_blueprint(react_app.react_app, url_prefix='/app') 92 | app.register_blueprint(servers.servers, url_prefix='/servers') 93 | app.register_blueprint(processes.processes, url_prefix='/processes') 94 | app.register_blueprint(jobs.jobs, url_prefix='/jobs') 95 | 96 | CORS(app) 97 | 98 | socketio = SocketIO(app, async_mode='gevent') 99 | 100 | start_periodics(socketio) 101 | 102 | # use_reloader: avoid Flask execute twice 103 | socketio.run( 104 | app=app, 105 | 
host=_config.get('server', 'host', fallback='0.0.0.0'), 106 | port=_config.getint('server', 'port', fallback=5000), 107 | use_reloader=False 108 | ) 109 | 110 | except (KeyboardInterrupt, SystemExit): 111 | 112 | shutdown() 113 | 114 | 115 | if __name__ == "__main__": 116 | 117 | entry_point() 118 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | node_modules/ 2 | **/static/js/bundle.js.map 3 | 4 | # Created by https://www.gitignore.io/api/pycharm,python,sublimetext,komodoedit,vim,linux 5 | 6 | ### PyCharm ### 7 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and Webstorm 8 | # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 9 | 10 | # User-specific stuff: 11 | .idea 12 | .idea/workspace.xml 13 | .idea/tasks.xml 14 | .idea/dictionaries 15 | .idea/vcs.xml 16 | .idea/jsLibraryMappings.xml 17 | 18 | # Sensitive or high-churn files: 19 | .idea/dataSources.ids 20 | .idea/dataSources.xml 21 | .idea/dataSources.local.xml 22 | .idea/sqlDataSources.xml 23 | .idea/dynamic.xml 24 | .idea/uiDesigner.xml 25 | 26 | # Gradle: 27 | .idea/gradle.xml 28 | .idea/libraries 29 | 30 | # Mongo Explorer plugin: 31 | .idea/mongoSettings.xml 32 | 33 | ## File-based project format: 34 | *.iws 35 | 36 | ## Plugin-specific files: 37 | 38 | # IntelliJ 39 | /out/ 40 | 41 | # mpeltonen/sbt-idea plugin 42 | .idea_modules/ 43 | 44 | # JIRA plugin 45 | atlassian-ide-plugin.xml 46 | 47 | # Crashlytics plugin (for Android Studio and IntelliJ) 48 | com_crashlytics_export_strings.xml 49 | crashlytics.properties 50 | crashlytics-build.properties 51 | fabric.properties 52 | 53 | 54 | ### Python ### 55 | # Byte-compiled / optimized / DLL files 56 | __pycache__/ 57 | *.py[cod] 58 | *$py.class 59 | 60 | # C extensions 61 | *.so 62 | 63 | # Distribution / packaging 64 | .Python 65 | env/ 66 | build/ 67 | develop-eggs/ 68 | dist/ 69 | downloads/ 70 | eggs/ 71 | .eggs/ 72 | lib/ 73 | lib64/ 74 | parts/ 75 | sdist/ 76 | var/ 77 | *.egg-info/ 78 | .installed.cfg 79 | *.egg 80 | 81 | # PyInstaller 82 | # Usually these files are written by a python script from a template 83 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
84 | *.manifest 85 | *.spec 86 | 87 | # Installer logs 88 | pip-log.txt 89 | pip-delete-this-directory.txt 90 | 91 | # Unit test / coverage reports 92 | htmlcov/ 93 | .tox/ 94 | .coverage 95 | .coverage.* 96 | .cache 97 | nosetests.xml 98 | coverage.xml 99 | *,cover 100 | .hypothesis/ 101 | 102 | # Translations 103 | *.mo 104 | *.pot 105 | 106 | # Django stuff: 107 | *.log 108 | local_settings.py 109 | 110 | # Flask instance folder 111 | instance/ 112 | 113 | # Scrapy stuff: 114 | .scrapy 115 | 116 | # Sphinx documentation 117 | docs/_build/ 118 | 119 | # PyBuilder 120 | target/ 121 | 122 | # IPython Notebook 123 | .ipynb_checkpoints 124 | 125 | # pyenv 126 | .python-version 127 | 128 | # celery beat schedule file 129 | celerybeat-schedule 130 | 131 | # dotenv 132 | .env 133 | 134 | # virtualenv 135 | venv/ 136 | ENV/ 137 | 138 | # Spyder project settings 139 | .spyderproject 140 | 141 | # Rope project settings 142 | .ropeproject 143 | 144 | 145 | ### SublimeText ### 146 | # cache files for sublime text 147 | *.tmlanguage.cache 148 | *.tmPreferences.cache 149 | *.stTheme.cache 150 | 151 | # workspace files are user-specific 152 | *.sublime-workspace 153 | 154 | # project files should be checked into the repository, unless a significant 155 | # proportion of contributors will probably not be using SublimeText 156 | # *.sublime-project 157 | 158 | # sftp configuration file 159 | sftp-config.json 160 | 161 | 162 | ### KomodoEdit ### 163 | *.komodoproject 164 | .komodotools 165 | 166 | 167 | ### Vim ### 168 | # swap 169 | [._]*.s[a-w][a-z] 170 | [._]s[a-w][a-z] 171 | # session 172 | Session.vim 173 | # temporary 174 | .netrwhist 175 | *~ 176 | # auto-generated tag files 177 | tags 178 | 179 | 180 | ### Linux ### 181 | *~ 182 | 183 | # temporary files which can be created if a process still has a handle open of a deleted file 184 | .fuse_hidden* 185 | 186 | # KDE directory preferences 187 | .directory 188 | 189 | # Linux trash folder which might appear on any partition or disk 190 | .Trash-* 191 | 192 | 193 | -------------------------------------------------------------------------------- /scrapy_eagle/dashboard/react-src/components/servers/ServerNode.jsx: -------------------------------------------------------------------------------- 1 | var React = require('react'); 2 | var ServerSubProcess = require('./ServerSubProcess.jsx'); 3 | 4 | var ServerNode = React.createClass({ 5 | getInitialState: function() { 6 | return { 7 | pid: "", 8 | public_ip: "", 9 | cpu_percent: "", 10 | memory_available_mb: "", 11 | memory_total_mb: "", 12 | memory_used_mb: "", 13 | memory_used_server_mb: "", 14 | cpus: [], 15 | subprocesses: [], 16 | spiders: [] 17 | }; 18 | }, 19 | componentWillMount: function() { 20 | 21 | this.socket = io.connect(window.location.protocol + "//" + this.props.public_ip + ":" + location.port + "/resources"); 22 | this.socket.on('resources_info', function (msg) { 23 | 24 | var buff = "[ "; 25 | for(var i = 0; i < msg.data.cpus.length; i++){ 26 | if(i+1 == msg.data.cpus.length){ 27 | buff += msg.data.cpus[i] + " "; 28 | 29 | } else { 30 | buff += msg.data.cpus[i] + " / "; 31 | } 32 | } 33 | buff += "]"; 34 | 35 | this.setState({ 36 | pid: msg.data.pid, 37 | public_ip: msg.data.public_ip, 38 | cpu_percent: msg.data.cpu_percent, 39 | memory_available_mb: msg.data.memory_available_mb, 40 | memory_total_mb: msg.data.memory_total_mb, 41 | memory_used_mb: msg.data.memory_used_mb, 42 | memory_used_server_mb: msg.data.memory_used_server_mb, 43 | cpus: buff, 44 | subprocesses: msg.data.sub, 45 | 
spiders: msg.data.spiders 46 | }); 47 | 48 | // console.log(msg.data.cpus); 49 | 50 | }.bind(this)); 51 | 52 | }, 53 | componentWillUnmount: function(){ 54 | 55 | this.socket.disconnect(); 56 | 57 | }, 58 | onClickExecCommand: function(e){ 59 | 60 | $.get(window.location.protocol + "//" + this.state.public_ip + ":" + location.port + "/processes/exec_command", function(data) { 61 | 62 | }); 63 | 64 | }, 65 | onClickStartWorker: function(e){ 66 | 67 | $.get(window.location.protocol + "//" + this.state.public_ip + ":" + location.port + "/processes/start_spider/" + this.state.selected_spider, function(data) { 68 | 69 | }); 70 | 71 | }, 72 | onChangeDataProvider: function(e){ 73 | 74 | this.setState({'selected_spider': e.target.value}); 75 | 76 | }, 77 | render: function(){ 78 | 79 | var listSubProcesses = this.state.subprocesses.map(function (item, i) { 80 | return ; 90 | }.bind(this)); 91 | 92 | var listSpiders = this.state.spiders.map(function (item, i) { 93 | return ( 94 | 95 | ); 96 | }.bind(this)); 97 | 98 | return ( 99 |
      {/*
        Server node panel:
          IP: {this.props.public_ip} ({this.props.hostname})
          PID: {this.state.pid}
          CPU Server: {this.state.cpus}%
          Memory Used Server: {this.state.memory_used_server_mb}mb
          CPU Process: {this.state.cpu_percent}%
          Memory Used Process: {this.state.memory_used_mb}mb
          Memory Available: {this.state.memory_available_mb}mb
          Memory Total: {this.state.memory_total_mb}mb
        plus the controls wired to the handlers above (onChangeDataProvider with {listSpiders},
        onClickStartWorker, onClickExecCommand) and the subprocess list:
          {listSubProcesses}
      */}
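      {/*
        For reference, the 'resources_info' payload consumed in componentWillMount above has the shape
        (field names taken from that handler; values illustrative):

        { data: { pid, public_ip, cpu_percent, memory_available_mb, memory_total_mb,
                  memory_used_mb, memory_used_server_mb, cpus: [...], sub: [...], spiders: [...] } }
      */}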
  • 119 | ); 120 | 121 | } 122 | }); 123 | 124 | module.exports = ServerNode; 125 | -------------------------------------------------------------------------------- /scrapy_eagle/worker/dupefilter.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import time 3 | 4 | from scrapy.dupefilters import BaseDupeFilter 5 | from scrapy.utils.request import request_fingerprint 6 | 7 | from .connection import get_redis_from_settings 8 | 9 | 10 | DEFAULT_DUPEFILTER_KEY = "dupefilter:%(timestamp)s" 11 | 12 | logger = logging.getLogger(__name__) 13 | 14 | 15 | class RFPDupeFilter(BaseDupeFilter): 16 | """Redis-based request duplicates filter. 17 | 18 | This class can also be used with default Scrapy's scheduler. 19 | 20 | """ 21 | 22 | logger = logger 23 | 24 | def __init__(self, server, key, debug=False): 25 | """Initialize the duplicates filter. 26 | 27 | Parameters 28 | ---------- 29 | server : redis.StrictRedis 30 | The redis server instance. 31 | key : str 32 | Redis key Where to store fingerprints. 33 | debug : bool, optional 34 | Whether to log filtered requests. 35 | 36 | """ 37 | self.server = server 38 | self.key = key 39 | self.debug = debug 40 | self.logdupes = True 41 | 42 | @classmethod 43 | def from_settings(cls, settings): 44 | """Returns an instance from given settings. 45 | 46 | This uses by default the key ``dupefilter:``. When using the 47 | ``scrapy_redis.scheduler.Scheduler`` class, this method is not used as 48 | it needs to pass the spider name in the key. 49 | 50 | Parameters 51 | ---------- 52 | settings : scrapy.settings.Settings 53 | 54 | Returns 55 | ------- 56 | RFPDupeFilter 57 | A RFPDupeFilter instance. 58 | 59 | 60 | """ 61 | server = get_redis_from_settings(settings) 62 | # XXX: This creates one-time key. needed to support to use this 63 | # class as standalone dupefilter with scrapy's default scheduler 64 | # if scrapy passes spider on open() method this wouldn't be needed 65 | # TODO: Use SCRAPY_JOB env as default and fallback to timestamp. 66 | key = DEFAULT_DUPEFILTER_KEY % {'timestamp': int(time.time())} 67 | debug = settings.getbool('DUPEFILTER_DEBUG') 68 | return cls(server, key=key, debug=debug) 69 | 70 | @classmethod 71 | def from_crawler(cls, crawler): 72 | """Returns instance from crawler. 73 | 74 | Parameters 75 | ---------- 76 | crawler : scrapy.crawler.Crawler 77 | 78 | Returns 79 | ------- 80 | RFPDupeFilter 81 | Instance of RFPDupeFilter. 82 | 83 | """ 84 | return cls.from_settings(crawler.settings) 85 | 86 | def request_seen(self, request): 87 | """Returns True if request was already seen. 88 | 89 | Parameters 90 | ---------- 91 | request : scrapy.http.Request 92 | 93 | Returns 94 | ------- 95 | bool 96 | 97 | """ 98 | fp = self.request_fingerprint(request) 99 | # This returns the number of values added, zero if already exists. 100 | added = self.server.sadd(self.key, fp) 101 | return added == 0 102 | 103 | def request_fingerprint(self, request): 104 | """Returns a fingerprint for a given request. 105 | 106 | Parameters 107 | ---------- 108 | request : scrapy.http.Request 109 | 110 | Returns 111 | ------- 112 | str 113 | 114 | """ 115 | return request_fingerprint(request) 116 | 117 | def close(self, reason=''): 118 | """Delete data on close. Called by Scrapy's scheduler. 
119 | 120 | Parameters 121 | ---------- 122 | reason : str, optional 123 | 124 | """ 125 | self.clear() 126 | 127 | def clear(self): 128 | """Clears fingerprints data.""" 129 | self.server.delete(self.key) 130 | 131 | def log(self, request, spider): 132 | """Logs given request. 133 | 134 | Parameters 135 | ---------- 136 | request : scrapy.http.Request 137 | spider : scrapy.spiders.Spider 138 | 139 | """ 140 | if self.debug: 141 | msg = "Filtered duplicate request: %(request)s" 142 | self.logger.debug(msg, {'request': request}, extra={'spider': spider}) 143 | elif self.logdupes: 144 | msg = ("Filtered duplicate request %(request)s" 145 | " - no more duplicates will be shown" 146 | " (see DUPEFILTER_DEBUG to show all duplicates)") 147 | msg = "Filtered duplicate request: %(request)s" 148 | self.logger.debug(msg, {'request': request}, extra={'spider': spider}) 149 | self.logdupes = False 150 | 151 | -------------------------------------------------------------------------------- /scrapy_eagle/worker/queue.py: -------------------------------------------------------------------------------- 1 | from scrapy.utils.reqser import request_to_dict, request_from_dict 2 | 3 | from . import picklecompat 4 | 5 | 6 | class Base(object): 7 | """Per-spider queue/stack base class""" 8 | 9 | def __init__(self, server, spider, key, serializer=None): 10 | """Initialize per-spider redis queue. 11 | 12 | Parameters: 13 | server -- redis connection 14 | spider -- spider instance 15 | key -- key for this queue (e.g. "%(spider)s:queue") 16 | 17 | """ 18 | if serializer is None: 19 | # Backward compatibility. 20 | # TODO: deprecate pickle. 21 | serializer = picklecompat 22 | if not hasattr(serializer, 'loads'): 23 | raise TypeError("serializer does not implement 'loads' function: %r" 24 | % serializer) 25 | if not hasattr(serializer, 'dumps'): 26 | raise TypeError("serializer '%s' does not implement 'dumps' function: %r" 27 | % serializer) 28 | 29 | self.server = server 30 | self.spider = spider 31 | self.key = key % {'spider': spider.name} 32 | self.serializer = serializer 33 | 34 | def _encode_request(self, request): 35 | """Encode a request object""" 36 | obj = request_to_dict(request, self.spider) 37 | return self.serializer.dumps(obj) 38 | 39 | def _decode_request(self, encoded_request): 40 | """Decode an request previously encoded""" 41 | obj = self.serializer.loads(encoded_request) 42 | return request_from_dict(obj, self.spider) 43 | 44 | def __len__(self): 45 | """Return the length of the queue""" 46 | raise NotImplementedError 47 | 48 | def push(self, request): 49 | """Push a request""" 50 | raise NotImplementedError 51 | 52 | def pop(self, timeout=0): 53 | """Pop a request""" 54 | raise NotImplementedError 55 | 56 | def clear(self): 57 | """Clear queue/stack""" 58 | self.server.delete(self.key) 59 | 60 | 61 | class SpiderQueue(Base): 62 | """Per-spider FIFO queue""" 63 | 64 | def __len__(self): 65 | """Return the length of the queue""" 66 | return self.server.llen(self.key) 67 | 68 | def push(self, request): 69 | """Push a request""" 70 | self.server.lpush(self.key, self._encode_request(request)) 71 | 72 | def pop(self, timeout=0): 73 | """Pop a request""" 74 | if timeout > 0: 75 | data = self.server.brpop(self.key, timeout) 76 | if isinstance(data, tuple): 77 | data = data[1] 78 | else: 79 | data = self.server.rpop(self.key) 80 | if data: 81 | return self._decode_request(data) 82 | 83 | 84 | class SpiderPriorityQueue(Base): 85 | """Per-spider priority queue abstraction using redis' sorted set""" 86 | 87 | 
def __len__(self): 88 | """Return the length of the queue""" 89 | return self.server.zcard(self.key) 90 | 91 | def push(self, request): 92 | """Push a request""" 93 | data = self._encode_request(request) 94 | score = -request.priority 95 | # We don't use zadd method as the order of arguments change depending on 96 | # whether the class is Redis or StrictRedis, and the option of using 97 | # kwargs only accepts strings, not bytes. 98 | self.server.execute_command('ZADD', self.key, score, data) 99 | 100 | def pop(self, timeout=0): 101 | """ 102 | Pop a request 103 | timeout not support in this queue class 104 | """ 105 | # use atomic range/remove using multi/exec 106 | pipe = self.server.pipeline() 107 | pipe.multi() 108 | pipe.zrange(self.key, 0, 0).zremrangebyrank(self.key, 0, 0) 109 | results, count = pipe.execute() 110 | if results: 111 | return self._decode_request(results[0]) 112 | 113 | 114 | class SpiderStack(Base): 115 | """Per-spider stack""" 116 | 117 | def __len__(self): 118 | """Return the length of the stack""" 119 | return self.server.llen(self.key) 120 | 121 | def push(self, request): 122 | """Push a request""" 123 | self.server.lpush(self.key, self._encode_request(request)) 124 | 125 | def pop(self, timeout=0): 126 | """Pop a request""" 127 | if timeout > 0: 128 | data = self.server.blpop(self.key, timeout) 129 | if isinstance(data, tuple): 130 | data = data[1] 131 | else: 132 | data = self.server.lpop(self.key) 133 | 134 | if data: 135 | return self._decode_request(data) 136 | 137 | 138 | __all__ = ['SpiderQueue', 'SpiderPriorityQueue', 'SpiderStack'] 139 | 140 | -------------------------------------------------------------------------------- /scrapy_eagle/dashboard/templates/static/css/main.css: -------------------------------------------------------------------------------- 1 | div#server_set li#server-node { 2 | margin-bottom: 20px; 3 | } 4 | 5 | .active { color: red; } 6 | 7 | header { 8 | background-color: #222426; 9 | height: 60px; 10 | margin-right: 0; 11 | position: absolute; 12 | width: 100%; 13 | z-index: 200; 14 | } 15 | 16 | header div.brand { 17 | padding: 6px 0 0 0; 18 | } 19 | 20 | .flexbox { 21 | display: flex; 22 | overflow: hidden; 23 | flex-direction: row; 24 | min-height: 100vh; 25 | } 26 | 27 | div.subheader { 28 | position: absolute; 29 | margin-top: 60px; 30 | background-color: #2A2D2F; 31 | width: 100%; 32 | height: 46px; 33 | color: #00B280; 34 | z-index: 200; 35 | padding-top: 11px; 36 | } 37 | 38 | aside.sidebar { 39 | color: #001f3f; 40 | min-height: 100%; 41 | padding: 114px 0 0 10px; 42 | background-color: #DDFFDD; 43 | flex: 0 0 280px; 44 | } 45 | 46 | section.main-content-wrapper { 47 | padding: 114px 10px 10px 10px; 48 | /*border: 1px solid red;*/ 49 | min-height: 100%; 50 | flex: 1; 51 | } 52 | 53 | .sidebar-header { 54 | color: #6f737e; 55 | font-weight: 600; 56 | line-height: 20px; 57 | margin: 0; 58 | padding: 10px 10px 5px; 59 | text-transform: uppercase; 60 | } 61 | 62 | .sidebar .nav a { 63 | font-weight: 600; 64 | text-decoration: none; 65 | } 66 | .sidebar .nav i { 67 | font-size: 1em; 68 | margin-right: 5px; 69 | } 70 | .sidebar .nav .nav-sub { 71 | display: none; 72 | list-style: outside none none; 73 | padding: 0; 74 | } 75 | .sidebar .nav .nav-sub li > a { 76 | display: block; 77 | font-size: 0.813em; 78 | padding: 8px 0 8px 10px; 79 | } 80 | .sidebar .nav > li > .nav-sub > li > a { 81 | padding-left: 22px; 82 | } 83 | .sidebar .nav > li > .nav-sub > li > .nav-sub > li > a { 84 | padding-left: 55px; 85 | } 86 | .sidebar .nav > 
li > .nav-sub > li > .nav-sub > li > .nav-sub > li > a { 87 | padding-left: 65px; 88 | } 89 | .sidebar .nav > li > .nav-sub > li > .nav-sub > li > .nav-sub > li > .nav-sub > li > a { 90 | padding-left: 70px; 91 | } 92 | .sidebar .nav > li > .nav-sub > li > .nav-sub > li > .nav-sub > li > .nav-sub > li > .nav-sub > li > a { 93 | padding-left: 75px; 94 | } 95 | .sidebar-mini .sidebar .nav > li > .nav-sub > li > a { 96 | padding-left: 25px; 97 | } 98 | .sidebar-mini .sidebar .nav > li > .nav-sub > li > .nav-sub > li > a { 99 | padding-left: 35px; 100 | } 101 | .sidebar-mini .sidebar .nav > li > .nav-sub > li > .nav-sub > li > .nav-sub > li > a { 102 | padding-left: 45px; 103 | } 104 | .sidebar-mini .sidebar .nav > li > .nav-sub > li > .nav-sub > li > .nav-sub > li > .nav-sub > li > a { 105 | padding-left: 55px; 106 | } 107 | .sidebar .nav .nav-sub .nav-dropdown > a { 108 | padding-right: 30px; 109 | } 110 | .sidebar .nav .nav-sub > .open > a, .sidebar .nav .nav-sub > .open > a:focus, .sidebar .nav .nav-sub > .open > a:hover { 111 | background-color: transparent; 112 | border-color: transparent; 113 | } 114 | .sidebar .nav-pills { 115 | margin-left: 5px; 116 | margin-right: 12px; 117 | } 118 | .sidebar .nav-pills > li > a { 119 | font-size: 0.875em; 120 | padding: 9px 10px; 121 | } 122 | 123 | .sidebar-left .nav > li.open > a, 124 | .sidebar-left .nav > li > a:hover { 125 | background-color: #ffffff; 126 | color: #1d2939; 127 | } 128 | 129 | .sidebar-mini .sidebar-left .nav > li.nav-dropdown-open > a, 130 | .sidebar-mini .sidebar-left .nav > li:hover > a { 131 | background-color: #fff; 132 | color: #1d2939; 133 | } 134 | 135 | .nav-pills .nav-item.open .nav-link, 136 | .nav-pills .nav-item.open .nav-link:focus, 137 | .nav-pills .nav-item.open .nav-link:hover { 138 | background-color: #29d1ca; 139 | color: #fff; 140 | cursor: pointer; 141 | } 142 | 143 | .nav-pills .nav-link.active, 144 | .nav-pills .nav-link.active:focus, 145 | .nav-pills .nav-link.active:hover { 146 | background-color: #27b6af; 147 | color: #fff; 148 | cursor: pointer; 149 | } 150 | 151 | .sidebar-left a { 152 | color: #1f7e9a; 153 | } 154 | 155 | .sidebar-left a:focus, 156 | .sidebar-left a:hover { 157 | background-color: transparent; 158 | color: #001f3f; 159 | } 160 | 161 | .sidebar-left .active > a, 162 | .sidebar-left .active > a:focus, 163 | .sidebar-left .active > a:hover { 164 | /* Cor do item ativo dentro da categoria */ 165 | color: #1d2939; 166 | } 167 | 168 | .sidebar-mini .sidebar-left .nav > li.open > a { 169 | background-color: transparent; 170 | color: pink; 171 | } 172 | .sidebar-left .nav > li > a:focus { 173 | /* A cor que fica o texto depois de clicar na categoria (focus) */ 174 | background-color: #29d1ca; 175 | color: #fff; 176 | } 177 | 178 | .sidebar .nav-pills > li > a > .badge { 179 | margin: 3px 0; 180 | } 181 | 182 | .pull-right { 183 | float: right !important; 184 | } 185 | 186 | .nav-pills > li > a > .tag { 187 | margin-top: 2px; 188 | font-size: 80%; 189 | padding: 0.25em 0.4em 0.28em; 190 | } 191 | 192 | div.breadcrumbs span a, 193 | div.breadcrumbs { 194 | color: #d4d4d4; 195 | font-size: 14px; 196 | } 197 | 198 | div.breadcrumbs span:first-child a { 199 | color: #00B280; 200 | } -------------------------------------------------------------------------------- /scrapy_eagle/worker/spiders.py: -------------------------------------------------------------------------------- 1 | from scrapy import signals 2 | from scrapy.exceptions import DontCloseSpider 3 | from scrapy.spiders import Spider, 
CrawlSpider 4 | 5 | from . import connection 6 | 7 | 8 | # Default batch size matches default concurrent requests setting. 9 | DEFAULT_START_URLS_BATCH_SIZE = 16 10 | DEFAULT_START_URLS_KEY = '%(name)s:start_urls' 11 | 12 | 13 | class DistributedMixin(object): 14 | """Mixin class to implement reading urls from a redis queue.""" 15 | # Per spider redis key, default to DEFAULT_KEY. 16 | redis_key = None 17 | # Fetch this amount of start urls when idle. Default to DEFAULT_BATCH_SIZE. 18 | redis_batch_size = None 19 | # Redis client instance. 20 | server = None 21 | 22 | def start_requests(self): 23 | """Returns a batch of start requests from redis.""" 24 | return self.next_requests() 25 | 26 | def setup_redis(self, crawler=None): 27 | """Setup redis connection and idle signal. 28 | 29 | This should be called after the spider has set its crawler object. 30 | """ 31 | if self.server is not None: 32 | return 33 | 34 | if crawler is None: 35 | # We allow optional crawler argument to keep backwards 36 | # compatibility. 37 | # XXX: Raise a deprecation warning. 38 | crawler = getattr(self, 'crawler', None) 39 | 40 | if crawler is None: 41 | raise ValueError("crawler is required") 42 | 43 | settings = crawler.settings 44 | 45 | if self.redis_key is None: 46 | self.redis_key = settings.get( 47 | 'REDIS_START_URLS_KEY', DEFAULT_START_URLS_KEY, 48 | ) 49 | 50 | self.redis_key = self.redis_key % {'name': self.name} 51 | 52 | if not self.redis_key.strip(): 53 | raise ValueError("redis_key must not be empty") 54 | 55 | if self.redis_batch_size is None: 56 | self.redis_batch_size = settings.getint( 57 | 'REDIS_START_URLS_BATCH_SIZE', DEFAULT_START_URLS_BATCH_SIZE, 58 | ) 59 | 60 | try: 61 | self.redis_batch_size = int(self.redis_batch_size) 62 | except (TypeError, ValueError): 63 | raise ValueError("redis_batch_size must be an integer") 64 | 65 | self.logger.info("Reading start URLs from redis key '%(redis_key)s' " 66 | "(batch size: %(redis_batch_size)s)", self.__dict__) 67 | 68 | self.server = connection.from_settings(crawler.settings) 69 | # The idle signal is called when the spider has no requests left, 70 | # that's when we will schedule new requests from redis queue 71 | crawler.signals.connect(self.spider_idle, signal=signals.spider_idle) 72 | 73 | def next_requests(self): 74 | """Returns a request to be scheduled or none.""" 75 | use_set = self.settings.getbool('REDIS_START_URLS_AS_SET') 76 | fetch_one = self.server.spop if use_set else self.server.lpop 77 | # XXX: Do we need to use a timeout here? 78 | found = 0 79 | while found < self.redis_batch_size: 80 | data = fetch_one(self.redis_key) 81 | if data: 82 | data = data.decode('utf-8') 83 | else: 84 | # Queue empty. 85 | break 86 | req = self.make_request_from_data(data) 87 | if req: 88 | yield req 89 | found += 1 90 | else: 91 | self.logger.debug("Request not made from data: %r", data) 92 | 93 | if found: 94 | self.logger.debug("Read %s requests from '%s'", found, self.redis_key) 95 | 96 | def make_request_from_data(self, data): 97 | # By default, data is an URL. 98 | if '://' in data: 99 | return self.make_requests_from_url(data) 100 | else: 101 | self.logger.error("Unexpected URL from '%s': %r", self.redis_key, data) 102 | 103 | def schedule_next_requests(self): 104 | """Schedules a request if available""" 105 | for req in self.next_requests(): 106 | self.crawler.engine.crawl(req, spider=self) 107 | 108 | def spider_idle(self): 109 | """Schedules a request if available, otherwise waits.""" 110 | # XXX: Handle a sentinel to close the spider. 
111 | self.schedule_next_requests() 112 | raise DontCloseSpider 113 | 114 | 115 | class DistributedSpider(DistributedMixin, Spider): 116 | """Spider that reads urls from redis queue when idle.""" 117 | 118 | @classmethod 119 | def from_crawler(self, crawler, *args, **kwargs): 120 | obj = super(DistributedSpider, self).from_crawler(crawler, *args, **kwargs) 121 | obj.setup_redis(crawler) 122 | return obj 123 | 124 | 125 | class DistributedCrawlSpider(DistributedMixin, CrawlSpider): 126 | """Spider that reads urls from redis queue when idle.""" 127 | 128 | @classmethod 129 | def from_crawler(self, crawler, *args, **kwargs): 130 | obj = super(DistributedCrawlSpider, self).from_crawler(crawler, *args, **kwargs) 131 | obj.setup_redis(crawler) 132 | return obj 133 | 134 | -------------------------------------------------------------------------------- /scrapy_eagle/dashboard/react-src/components/App.jsx: -------------------------------------------------------------------------------- 1 | import React from 'react' 2 | import { Link, IndexLink } from 'react-router' 3 | import { connect } from 'react-redux' 4 | import Breadcrumbs from 'react-breadcrumbs' 5 | 6 | require('./App.scss'); 7 | 8 | class App extends React.Component { 9 | constructor(props){ 10 | super(props); 11 | } 12 | 13 | componentWillMount(){ 14 | this.intervals = []; 15 | } 16 | 17 | setInterval() { 18 | this.intervals.push(setInterval.apply(null, arguments)); 19 | } 20 | 21 | componentWillUnmount(){ 22 | this.intervals.forEach(clearInterval); 23 | 24 | // Ref: https://facebook.github.io/react/tips/initial-ajax.html 25 | this.clientsRequest.abort(); 26 | } 27 | 28 | ajax_get_jobs_info(){ 29 | 30 | var that = this; 31 | 32 | this.clientsRequest = $.ajax({ 33 | url: window.location.protocol + "//" + document.domain + ":" + location.port + "/jobs/list", 34 | type: 'GET', 35 | dataType: 'json', 36 | cache: false 37 | }).done((data) => { 38 | 39 | $.each(data, (key, value) => { 40 | // console.log(key, value); 41 | 42 | that.props.dispatch( 43 | { 44 | type: 'UPDATE_SPIDER_INFO', 45 | spider_id: key, 46 | frequency_minutes: value.frequency_minutes, 47 | last_started_at: value.last_started_at, 48 | max_concurrency: value.max_concurrency, 49 | min_concurrency: value.min_concurrency, 50 | max_memory_mb: value.max_memory_mb, 51 | priority: value.priority, 52 | job_type: value.job_type, 53 | active: value.active, 54 | start_urls: value.start_urls 55 | } 56 | ); 57 | 58 | }) 59 | 60 | }).always(() => { 61 | // that.setState({'server_set': server_set_new}); 62 | }); 63 | 64 | } 65 | 66 | componentDidMount(){ 67 | this.ajax_get_jobs_info(); 68 | this.setInterval(this.ajax_get_jobs_info.bind(this), 5000); 69 | } 70 | 71 | render(){ 72 | const { servers_qty } = this.props; 73 | return ( 74 |
      {/*
        Application shell: header/brand area, <Breadcrumbs/> (imported above), a commented-out
        legacy navigation block ("Distributed Scrapy", a {servers_qty} counter whose onClick calls
        this.props.SET_SERVER_QTY(7), and links to /, /servers/monitoring and /spiders/config),
        then the routed content via {this.props.children} and the remaining layout markup
        (sidebar navigation, footer).
      */}
    153 | ); 154 | } 155 | } 156 | 157 | var mapDispatchToProps = function(dispatch){ 158 | return { 159 | dispatch 160 | } 161 | }; 162 | 163 | export default connect( 164 | (state) => { 165 | return { 166 | servers_qty: state.servers.servers_qty 167 | } 168 | }, 169 | mapDispatchToProps 170 | )(App) -------------------------------------------------------------------------------- /scrapy_eagle/worker/scheduler.py: -------------------------------------------------------------------------------- 1 | import importlib 2 | import six 3 | 4 | from scrapy.utils.misc import load_object 5 | 6 | from . import connection 7 | 8 | 9 | class DistributedScheduler(object): 10 | """Redis-based scheduler""" 11 | 12 | def __init__(self, server, 13 | persist=False, 14 | flush_on_start=False, 15 | queue_key='%(spider)s:requests', 16 | queue_cls='scrapy_eagle.worker.queue.SpiderPriorityQueue', 17 | dupefilter_key='%(spider)s:dupefilter', 18 | dupefilter_cls='scrapy_eagle.worker.dupefilter.RFPDupeFilter', 19 | idle_before_close=0, 20 | serializer=None): 21 | """Initialize scheduler. 22 | 23 | Parameters 24 | ---------- 25 | server : Redis 26 | The redis server instance. 27 | persist : bool 28 | Whether to flush requests when closing. Default is False. 29 | flush_on_start : bool 30 | Whether to flush requests on start. Default is False. 31 | queue_key : str 32 | Requests queue key. 33 | queue_cls : str 34 | Importable path to the queue class. 35 | dupefilter_key : str 36 | Duplicates filter key. 37 | dupefilter_cls : str 38 | Importable path to the dupefilter class. 39 | idle_before_close : int 40 | Timeout before giving up. 41 | 42 | """ 43 | if idle_before_close < 0: 44 | raise TypeError("idle_before_close cannot be negative") 45 | 46 | self.server = server 47 | self.persist = persist 48 | self.flush_on_start = flush_on_start 49 | self.queue_key = queue_key 50 | self.queue_cls = queue_cls 51 | self.dupefilter_cls = dupefilter_cls 52 | self.dupefilter_key = dupefilter_key 53 | self.idle_before_close = idle_before_close 54 | self.serializer = serializer 55 | self.stats = None 56 | 57 | def __len__(self): 58 | return len(self.queue) 59 | 60 | @classmethod 61 | def from_settings(cls, settings): 62 | kwargs = { 63 | 'persist': settings.getbool('SCHEDULER_PERSIST'), 64 | 'flush_on_start': settings.getbool('SCHEDULER_FLUSH_ON_START'), 65 | 'idle_before_close': settings.getint('SCHEDULER_IDLE_BEFORE_CLOSE'), 66 | } 67 | 68 | # If these values are missing, it means we want to use the defaults. 69 | optional = { 70 | # TODO: Use custom prefixes for this settings to note that are 71 | # specific to scrapy-redis. 72 | 'queue_key': 'SCHEDULER_QUEUE_KEY', 73 | 'queue_cls': 'SCHEDULER_QUEUE_CLASS', 74 | 'dupefilter_key': 'SCHEDULER_DUPEFILTER_KEY', 75 | # We use the default setting name to keep compatibility. 76 | 'dupefilter_cls': 'DUPEFILTER_CLASS', 77 | 'serializer': 'SCHEDULER_SERIALIZER', 78 | } 79 | for name, setting_name in optional.items(): 80 | val = settings.get(setting_name) 81 | if val: 82 | kwargs[name] = val 83 | 84 | # Support serializer as a path to a module. 85 | if isinstance(kwargs.get('serializer'), six.string_types): 86 | kwargs['serializer'] = importlib.import_module(kwargs['serializer']) 87 | 88 | server = connection.from_settings(settings) 89 | # Ensure the connection is working. 
90 | server.ping() 91 | 92 | return cls(server=server, **kwargs) 93 | 94 | @classmethod 95 | def from_crawler(cls, crawler): 96 | instance = cls.from_settings(crawler.settings) 97 | # FIXME: for now, stats are only supported from this constructor 98 | instance.stats = crawler.stats 99 | return instance 100 | 101 | def open(self, spider): 102 | self.spider = spider 103 | 104 | try: 105 | self.queue = load_object(self.queue_cls)( 106 | server=self.server, 107 | spider=spider, 108 | key=self.queue_key % {'spider': spider.name}, 109 | serializer=self.serializer, 110 | ) 111 | except TypeError as e: 112 | raise ValueError("Failed to instantiate queue class '%s': %s", 113 | self.queue_cls, e) 114 | 115 | try: 116 | self.df = load_object(self.dupefilter_cls)( 117 | server=self.server, 118 | key=self.dupefilter_key % {'spider': spider.name}, 119 | debug=spider.settings.getbool('DUPEFILTER_DEBUG'), 120 | ) 121 | except TypeError as e: 122 | raise ValueError("Failed to instantiate dupefilter class '%s': %s", 123 | self.dupefilter_cls, e) 124 | 125 | if self.flush_on_start: 126 | self.flush() 127 | # notice if there are requests already in the queue to resume the crawl 128 | if len(self.queue): 129 | spider.log("Resuming crawl (%d requests scheduled)" % len(self.queue)) 130 | 131 | def close(self, reason): 132 | if not self.persist: 133 | self.flush() 134 | 135 | def flush(self): 136 | self.df.clear() 137 | self.queue.clear() 138 | 139 | def enqueue_request(self, request): 140 | if not request.dont_filter and self.df.request_seen(request): 141 | self.df.log(request, self.spider) 142 | return False 143 | if self.stats: 144 | self.stats.inc_value('scheduler/enqueued/redis', spider=self.spider) 145 | self.queue.push(request) 146 | return True 147 | 148 | def next_request(self): 149 | block_pop_timeout = self.idle_before_close 150 | request = self.queue.pop(block_pop_timeout) 151 | if request and self.stats: 152 | self.stats.inc_value('scheduler/dequeued/redis', spider=self.spider) 153 | return request 154 | 155 | def has_pending_requests(self): 156 | return len(self) > 0 157 | 158 | -------------------------------------------------------------------------------- /scrapy_eagle/dashboard/views/jobs.py: -------------------------------------------------------------------------------- 1 | import json 2 | from collections import OrderedDict 3 | from datetime import datetime, timedelta 4 | 5 | import flask 6 | 7 | from scrapy_eagle.dashboard import settings 8 | from scrapy_eagle.dashboard.memory import get_job_object, update_job_object 9 | 10 | 11 | jobs = flask.Blueprint('jobs', __name__) 12 | 13 | 14 | @jobs.route('/update', methods=['POST']) 15 | def update(): 16 | 17 | #TODO: Ensure that the incoming request comes from the same IP (Security) 18 | 19 | result = {} 20 | error = False 21 | 22 | key, job_type, active, frequency_minutes, max_concurrency = (None, None, None, None, None) 23 | min_concurrency, priority, max_memory_mb, start_urls = (None, None, None, None) 24 | 25 | try: 26 | 27 | key = flask.request.form.get('key', None) 28 | job_type = flask.request.form.get('job_type', None) 29 | frequency_minutes = int(flask.request.form.get('frequency_minutes', None)) 30 | max_concurrency = int(flask.request.form.get('max_concurrency', None)) 31 | min_concurrency = int(flask.request.form.get('min_concurrency', None)) 32 | priority = int(flask.request.form.get('priority', None)) 33 | max_memory_mb = int(flask.request.form.get('max_memory_mb', None)) 34 | start_urls = flask.request.form.get('start_urls', None) 
35 | 36 | if flask.request.form.get('active', None) == 'false': 37 | active = False 38 | elif flask.request.form.get('active', None) == 'true': 39 | active = True 40 | else: 41 | active = False 42 | 43 | # Never trust in the user input type 44 | except ValueError: 45 | error = True 46 | result.update({ 47 | 'status': 'error', 48 | 'msg': 'You sent wrong datatypes, like a letter when it should be numeric.' 49 | }) 50 | 51 | if not error: 52 | 53 | if not all([key, job_type, frequency_minutes, max_concurrency, min_concurrency, priority, max_memory_mb]): 54 | error = True 55 | result.update({ 56 | 'status': 'error', 57 | 'msg': 'You are missing some information, please check your form.' 58 | }) 59 | 60 | elif not start_urls and job_type == 'spider': 61 | error = True 62 | result.update({ 63 | 'status': 'error', 64 | 'msg': 'You should provide the Start URLs information for spiders.' 65 | }) 66 | 67 | else: 68 | 69 | actual_obj = get_job_object(key=key) 70 | 71 | # A brand new 72 | if not actual_obj: 73 | actual_obj = {} 74 | else: 75 | current_frequency = actual_obj['frequency_minutes'] 76 | 77 | actual_obj.update({ 78 | 'active': active, 79 | 'job_type': job_type, 80 | 'frequency_minutes': frequency_minutes, 81 | 'max_concurrency': max_concurrency, 82 | 'min_concurrency': min_concurrency, 83 | 'priority': priority, 84 | 'max_memory_mb': max_memory_mb 85 | }) 86 | 87 | # If the frequency change, recalculate the next execution 88 | if current_frequency != frequency_minutes: 89 | actual_obj['next_execution_at'] = (datetime.utcnow() + timedelta(minutes=frequency_minutes)).isoformat() 90 | 91 | if job_type == 'spider': 92 | actual_obj.update({'start_urls': [x for x in start_urls.split("\n") if x]}) 93 | 94 | update_job_object(key=key, fields=actual_obj) 95 | 96 | if not error: 97 | result.update({ 98 | 'status': 'ok' 99 | }) 100 | 101 | return flask.Response( 102 | response=json.dumps(result, sort_keys=True), 103 | status=200, 104 | mimetype="application/json" 105 | ) 106 | 107 | 108 | @jobs.route('/list', methods=['GET']) 109 | def listing(): 110 | 111 | _spiders = settings.get_spiders() 112 | _commands = settings.get_commands() 113 | 114 | # When the system is starting up, spiders may return empty because 115 | # we're using async execution `green_threads.find_new_spiders`. 
116 | if not _spiders: 117 | return flask.Response( 118 | response=json.dumps({}, sort_keys=True), 119 | status=200, 120 | mimetype="application/json" 121 | ) 122 | 123 | _spiders.sort() 124 | 125 | d = OrderedDict() 126 | 127 | for s in _spiders: 128 | 129 | obj = get_job_object(key=s) 130 | 131 | if obj: 132 | d[s] = obj 133 | else: 134 | # Jobs without previous information, using default config 135 | d[s] = {} 136 | d[s]['active'] = False 137 | d[s]['job_type'] = 'spider' 138 | d[s]['min_concurrency'] = 1 139 | d[s]['max_concurrency'] = 5 140 | d[s]['max_memory_mb'] = 200 141 | d[s]['priority'] = 1 142 | d[s]['frequency_minutes'] = 60 143 | d[s]['start_urls'] = [] 144 | d[s]['last_started_at'] = datetime.utcnow().isoformat() 145 | d[s]['next_execution_at'] = (datetime.utcnow() + timedelta(minutes=d[s]['frequency_minutes'])).isoformat() 146 | 147 | for file_name in _commands: 148 | 149 | obj = get_job_object(key=file_name) 150 | 151 | if obj: 152 | d[file_name] = obj 153 | 154 | else: 155 | d[file_name] = {} 156 | d[file_name]['active'] = False 157 | d[file_name]['job_type'] = 'command' 158 | d[file_name]['min_concurrency'] = 1 159 | d[file_name]['max_concurrency'] = 1 160 | d[file_name]['max_memory_mb'] = 50 161 | d[file_name]['priority'] = 1 162 | d[file_name]['frequency_minutes'] = 60 163 | d[file_name]['last_started_at'] = None 164 | d[file_name]['next_execution_at'] = None 165 | 166 | return flask.Response( 167 | response=json.dumps(d, sort_keys=True), 168 | status=200, 169 | mimetype="application/json" 170 | ) 171 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | .. image:: docs/images/logo_readme.jpg 2 | ====================================== 3 | 4 | .. image:: https://travis-ci.org/rafaelcapucho/scrapy-eagle.svg?branch=master 5 | :target: https://travis-ci.org/rafaelcapucho/scrapy-eagle 6 | 7 | .. image:: https://img.shields.io/pypi/v/scrapy-eagle.svg 8 | :target: https://pypi.python.org/pypi/scrapy-eagle 9 | :alt: PyPI Version 10 | 11 | .. image:: https://img.shields.io/pypi/pyversions/scrapy-eagle.svg 12 | :target: https://pypi.python.org/pypi/scrapy-eagle 13 | 14 | .. image:: https://landscape.io/github/rafaelcapucho/scrapy-eagle/master/landscape.svg?style=flat 15 | :target: https://landscape.io/github/rafaelcapucho/scrapy-eagle/master 16 | :alt: Code Quality Status 17 | 18 | .. image:: https://requires.io/github/rafaelcapucho/scrapy-eagle/requirements.svg?branch=master 19 | :target: https://requires.io/github/rafaelcapucho/scrapy-eagle/requirements/?branch=master 20 | :alt: Requirements Status 21 | 22 | Scrapy Eagle is a tool that allow us to run any Scrapy_ based project in a distributed fashion and monitor how it is going on and how many resources it is consuming on each server. 23 | 24 | .. _Scrapy: http://scrapy.org 25 | 26 | **This project is Under Development, don't use it yet** 27 | 28 | .. image:: https://badge.waffle.io/rafaelcapucho/scrapy-eagle.svg?label=ready&title=Ready 29 | :target: https://waffle.io/rafaelcapucho/scrapy-eagle 30 | :alt: 'Stories in Ready' 31 | 32 | Requeriments 33 | ------------ 34 | 35 | Scrapy Eagle uses Redis_ as Distributed Queue, so you will need a redis instance running. 36 | 37 | .. _Redis: http://mail.python.org/pipermail/doc-sig/ 38 | 39 | Installation 40 | ------------ 41 | 42 | It could be easily made by running the code bellow, 43 | 44 | .. 
code-block:: console 45 | 46 | $ virtualenv eagle_venv; cd eagle_venv; source bin/activate 47 | $ pip install scrapy-eagle 48 | 49 | You should create one ``configparser`` configuration file (e.g. in /etc/scrapy-eagle.ini) containing: 50 | 51 | .. code-block:: console 52 | 53 | [redis] 54 | host = 127.0.0.1 55 | port = 6379 56 | db = 0 57 | ;password = someverysecretpass 58 | 59 | [server] 60 | debug = True 61 | cookie_secret_key = ha74h3hdh42a 62 | host = 0.0.0.0 63 | port = 5000 64 | 65 | [scrapy] 66 | binary = /project_venv/bin/scrapy 67 | base_dir = /project_venv/project_scrapy/project 68 | 69 | [commands] 70 | binary = /project_venv/bin/python3 71 | base_dir = /project_venv/project_scrapy/project/commands 72 | 73 | Then you will be able to execute the `eagle_server` command like, 74 | 75 | .. code-block:: console 76 | 77 | eagle_server --config-file=/etc/scrapy-eagle.ini 78 | 79 | Changes into your Scrapy project 80 | -------------------------------- 81 | 82 | Enable the components in your `settings.py` of your Scrapy project: 83 | 84 | .. code-block:: python 85 | 86 | # Enables scheduling storing requests queue in redis. 87 | SCHEDULER = "scrapy_eagle.worker.scheduler.DistributedScheduler" 88 | 89 | # Ensure all spiders share same duplicates filter through redis. 90 | DUPEFILTER_CLASS = "scrapy_eagle.worker.dupefilter.RFPDupeFilter" 91 | 92 | # Schedule requests using a priority queue. (default) 93 | SCHEDULER_QUEUE_CLASS = "scrapy_eagle.worker.queue.SpiderPriorityQueue" 94 | 95 | # Schedule requests using a queue (FIFO). 96 | SCHEDULER_QUEUE_CLASS = "scrapy_eagle.worker.queue.SpiderQueue" 97 | 98 | # Schedule requests using a stack (LIFO). 99 | SCHEDULER_QUEUE_CLASS = "scrapy_eagle.worker.queue.SpiderStack" 100 | 101 | # Max idle time to prevent the spider from being closed when distributed crawling. 102 | # This only works if queue class is SpiderQueue or SpiderStack, 103 | # and may also block the same time when your spider start at the first time (because the queue is empty). 104 | SCHEDULER_IDLE_BEFORE_CLOSE = 0 105 | 106 | # Specify the host and port to use when connecting to Redis (optional). 107 | REDIS_HOST = 'localhost' 108 | REDIS_PORT = 6379 109 | 110 | # Specify the full Redis URL for connecting (optional). 111 | # If set, this takes precedence over the REDIS_HOST and REDIS_PORT settings. 112 | REDIS_URL = "redis://user:pass@hostname:6379" 113 | 114 | Once the configuration is finished, you should adapt each spider to use our Mixin: 115 | 116 | .. code-block:: python 117 | 118 | from scrapy.spiders import CrawlSpider, Rule 119 | from scrapy_eagle.worker.spiders import DistributedMixin 120 | 121 | class YourSpider(DistributedMixin, CrawlSpider): 122 | 123 | name = "domain.com" 124 | 125 | # start_urls = ['http://www.domain.com/'] 126 | redis_key = 'domain.com:start_urls' 127 | 128 | rules = ( 129 | Rule(...), 130 | Rule(...), 131 | ) 132 | 133 | def _set_crawler(self, crawler): 134 | CrawlSpider._set_crawler(self, crawler) 135 | DistributedMixin.setup_redis(self) 136 | 137 | Feeding a Spider from Redis 138 | --------------------------- 139 | 140 | The class `scrapy_eagle.worker.spiders.DistributedMixin` enables a spider to read the 141 | urls from redis. The urls in the redis queue will be processed one 142 | after another. 
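If you prefer to seed the queue from Python rather than the command line shown below, a minimal redis-py sketch (assuming the default ``<spider name>:start_urls`` key used by ``DistributedMixin``) would be:

.. code-block:: python

    import redis

    r = redis.StrictRedis(host='localhost', port=6379, db=0)

    # LPUSH matches the default list-based queue; if you enable
    # REDIS_START_URLS_AS_SET in your settings, use r.sadd(...) instead.
    r.lpush('domain.com:start_urls', 'http://www.domain.com/')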
143 | 144 | Then, push urls to redis:: 145 | 146 | redis-cli lpush domain.com:start_urls http://domain.com/ 147 | 148 | Dashboard Development 149 | --------------------- 150 | 151 | If you would like to change the client-side then you'll need to have NPM_ installed because we use ReactJS_ to build our interface. Installing all dependencies locally: 152 | 153 | .. _ReactJS: https://facebook.github.io/react/ 154 | .. _NPM: https://www.npmjs.com/ 155 | 156 | .. code-block:: console 157 | 158 | cd scrapy-eagle/dashboard 159 | npm install 160 | 161 | Then you can run ``npm start`` to compile and start monitoring any changes and recompiling automatically. 162 | 163 | To generate the production version, run ``npm run build``. 164 | 165 | To be easier to test the Dashboard you could use one simple http server instead of run the ``eagle_server``, like: 166 | 167 | .. code-block:: console 168 | 169 | sudo npm install -g http-server 170 | cd scrapy-eagle/dashboard 171 | http-server templates/ 172 | 173 | It would be available for you at http://127.0.0.1:8080 174 | 175 | **Note**: Until now the Scrapy Eagle is mostly based on https://github.com/rolando/scrapy-redis. 176 | -------------------------------------------------------------------------------- /scrapy_eagle/dashboard/react-src/components/jobs/JobsItem.jsx: -------------------------------------------------------------------------------- 1 | import React from 'react' 2 | import { connect } from 'react-redux' 3 | 4 | import cx from 'classnames' 5 | import Switch from 'react-switchery' 6 | 7 | class BaseComponent extends React.Component { 8 | _bind(...methods) { 9 | methods.forEach( (method) => this[method] = this[method].bind(this) ); 10 | } 11 | } 12 | 13 | class JobsItem extends React.Component { 14 | 15 | constructor(props){ 16 | super(props); 17 | // this._bind('_handleClick', '_handleFoo'); 18 | this.handleSave = this.handleSave.bind(this); 19 | this.onBlurFrequency = this.onBlurFrequency.bind(this); 20 | this.onBlurMaxConcurrency = this.onBlurMaxConcurrency.bind(this); 21 | this.onBlurMinConcurrency = this.onBlurMinConcurrency.bind(this); 22 | this.onChangePriority = this.onChangePriority.bind(this); 23 | this.onBlurMaxMemory = this.onBlurMaxMemory.bind(this); 24 | this.onBlurStartURLs = this.onBlurStartURLs.bind(this); 25 | this.handleSave = this.handleSave.bind(this); 26 | this.state = { 27 | 'key': this.props.id, 28 | 'active': this.props.value.active, 29 | 'job_type': this.props.value.job_type, 30 | 'frequency_minutes': this.props.value.frequency_minutes, 31 | 'max_concurrency': this.props.value.max_concurrency, 32 | 'min_concurrency': this.props.value.min_concurrency, 33 | 'priority': this.props.value.priority, 34 | 'max_memory_mb': this.props.value.max_memory_mb, 35 | }; 36 | 37 | if(this.props.value.start_urls){ 38 | this.state['start_urls'] = this.format_start_urls(this.props.value.start_urls); 39 | } 40 | 41 | } 42 | 43 | format_start_urls(mylist){ 44 | let buff = ""; 45 | mylist.forEach(elem => { 46 | buff += elem + "\n"; 47 | }) 48 | return buff; 49 | } 50 | 51 | onBlurFrequency(e){ this.setState({'frequency_minutes': $.trim(e.target.value)}) } 52 | onBlurMaxConcurrency(e){ this.setState({'max_concurrency': $.trim(e.target.value)}) } 53 | onBlurMinConcurrency(e){ this.setState({'min_concurrency': $.trim(e.target.value)}) } 54 | onChangePriority(e){ this.setState({'priority': e.target.value}) } 55 | onBlurMaxMemory(e){ this.setState({'max_memory_mb': $.trim(e.target.value)}) } 56 | onBlurStartURLs(e){ this.setState({'start_urls': 
$.trim(e.target.value)}) } 57 | 58 | handleSave(){ 59 | 60 | $.ajax({ 61 | url: window.location.protocol + "//" + document.domain + ":" + location.port + "/jobs/update", 62 | type: 'POST', 63 | dataType: 'json', 64 | data: this.state, 65 | }).done((data) => { 66 | 67 | if(data.status == 'error'){ 68 | alert(data.msg); 69 | } else if(data.status == 'ok'){ 70 | 71 | } 72 | 73 | }).fail(() => { 74 | alert('The request failed, please try again.'); 75 | }).always(() => { 76 | // that.setState({}); 77 | }); 78 | 79 | } 80 | 81 | SwitchonChange(value) { 82 | console.log(value); 83 | } 84 | 85 | render(){ 86 | 87 | var show_start_urls = () => { 88 | 89 | if(this.state.job_type == 'spider') { 90 | return ( 91 |
          {/* a "Start URLs" textarea bound to onBlurStartURLs */}
        )
      }

    };

    return (
      {/*
        Job row markup: the job name {this.state.key}, an active <Switch> wired to SwitchonChange,
        inputs bound to onBlurFrequency, onBlurMaxConcurrency and onBlurMinConcurrency,
        a Priority <select> bound to onChangePriority,
        a Max Memory input bound to onBlurMaxMemory
        ("The processes are killed when reach this threshold (megabytes)."),
        {show_start_urls()}, the last-started info ("16 minutes ago"),
        and a Save control wired to handleSave.
      */}
    184 | ); 185 | } 186 | 187 | } 188 | 189 | var mapDispatchToProps = function(dispatch){ 190 | return { 191 | dispatch 192 | } 193 | }; 194 | 195 | export default connect( 196 | (state) => { 197 | return { 198 | //jobs: state.jobs 199 | } 200 | }, 201 | mapDispatchToProps 202 | )(JobsItem) --------------------------------------------------------------------------------