├── .gitignore ├── main.js ├── example.js ├── package.json ├── padawans └── wikipedia.js ├── README.md └── modules └── jedi.js /.gitignore: -------------------------------------------------------------------------------- 1 | node_modules/ 2 | .DS_Store 3 | npm-debug.log -------------------------------------------------------------------------------- /main.js: -------------------------------------------------------------------------------- 1 | var jedi = require('./modules/jedi.js'); 2 | 3 | module.exports = jedi; 4 | 5 | -------------------------------------------------------------------------------- /example.js: -------------------------------------------------------------------------------- 1 | var jedi = require('./modules/jedi.js'), 2 | fs = require('fs'); 3 | 4 | 5 | /// REGISTER ALL THE PADAWANS 6 | 7 | var PADAWANS_DIR = "./padawans/"; 8 | 9 | fs.readdirSync(PADAWANS_DIR).forEach(function(padawan){ 10 | require(PADAWANS_DIR + padawan)(jedi); 11 | }); 12 | 13 | jedi.crawl('http://en.wikipedia.org/wiki/Montpellier,_France', function(err, data){ 14 | console.log(data); 15 | }); 16 | 17 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "jedi-crawler", 3 | "version": "0.0.3", 4 | "description": "Lightsabing Node/PhantomJS crawler. 
Crawl almost everything, including AJAX content.", 5 | "main": "main.js", 6 | "scripts": { 7 | "test": "npm test" 8 | }, 9 | "keywords": [ 10 | "phantom", 11 | "scraping", 12 | "scrape", 13 | "crawler", 14 | "crawl", 15 | "parse", 16 | "parser", 17 | "web" 18 | ], 19 | "author": "Nicolas Kermarc", 20 | "license": "MIT", 21 | "dependencies": { 22 | "lodash": "~1.3.1", 23 | "node-phantom": "~0.2.3" 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /padawans/wikipedia.js: -------------------------------------------------------------------------------- 1 | // Wikipedia crawler 2 | module.exports = function(jedi) { 3 | 4 | jedi.registerPadawan({ 5 | // Pattern to match URL 6 | pattern: /en.wikipedia.org\/wiki\//, 7 | // Selectors to be executed 8 | selectors:{ 9 | title:{ 10 | sel: "#firstHeading span", 11 | type: "text" 12 | }, 13 | firstParagraph:{ 14 | sel: "#toc ~ p:first", 15 | type: "text" 16 | } 17 | }, 18 | // You can choose to process the data AFTER being crawled. 19 | postProcessing: function(data) { 20 | /// Do your custom processing on the data processed 21 | data.title = data.title.toUpperCase(); 22 | return data; 23 | } 24 | }); 25 | 26 | }; -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | JEDI CRAWLER 2 | ================================ 3 | 4 | Da fuq? 5 | ------------------------- 6 | 7 | JEDI CRAWLER is a Node/PhantomJS crawler made to scrape pretty much anything from Node, with a really simple syntax. Work in progress ladies 8 | 9 | ``` 10 | npm install jedi-crawler 11 | ``` 12 | 13 | How does it work 14 | ------------------------- 15 | Register padawans to the jedi crawler, that have a pattern to match a URL, and jQuery-style selectors. 
You can also post-process the data if you need to do some treatment (number conversion, etc.) 16 | 17 | wikipedia.js: 18 | 19 | ```javascript 20 | module.exports = function(jedi) { 21 | 22 | jedi.registerPadawan({ 23 | // Pattern to match URL 24 | pattern: /en.wikipedia.org\/wiki\//, 25 | // Selectors to be executed 26 | selectors:{ 27 | title:{ 28 | sel: "#firstHeading span", 29 | type: "text" 30 | }, 31 | firstParagraph:{ 32 | sel: "#toc ~ p:first", 33 | type: "text" 34 | } 35 | }, 36 | // You can choose to process the data AFTER being crawled. 37 | postProcessing: function(data) { 38 | /// Do your custom processing on the data processed 39 | data.title = data.title.toUpperCase(); 40 | return data; 41 | } 42 | }); 43 | 44 | }; 45 | ``` 46 | 47 | For now only two types of selectors are supported: "text" and "src" 48 | 49 | I find having one file per padawan (crawler) pretty cool for code clarity and also padawans need to learn by themselves and be alone 50 | 51 | ``` 52 | npm install jedi-crawler 53 | ``` 54 | 55 | You can then give your padawans to the Jedi by doing 56 | 57 | ```javascript 58 | var jedi = require('jedi-crawler'); 59 | require('./padawans/wikipedia')(jedi); 60 | ``` 61 | 62 | And then you can do 63 | 64 | ```javascript 65 | jedi.crawl('http://en.wikipedia.org/wiki/whatever', function(err, result){ 66 | console.log(err); 67 | console.log(result); 68 | }); 69 | ``` 70 | 71 | The jedi will figure out which padawan to use based on the URL and the pattern you set 72 | 73 | 74 | Special features 75 | ----------------- 76 | 77 | Crawlers start to scrape the page **only once $(document).ready has fired**. 
Our own version of jQuery is injected into the page, but then we also give back the $ to its owner in case they're executing 3rd party libraries to modify the DOM or w/e 78 | 79 | If your selectors matches severals DOM elements, then an array of every value is returned 80 | 81 | Right now, PhantomJS is instantiated with "--load-images=no" option so the page loads faster 82 | 83 | Test it now 84 | ----------------- 85 | Pull that bad boy 86 | Make sure you have PhantomJS installed 87 | Run node main.js -------------------------------------------------------------------------------- /modules/jedi.js: -------------------------------------------------------------------------------- 1 | var phantom = require('node-phantom'), 2 | _ = require('lodash'); 3 | 4 | // Instantiate phantom internal function 5 | var init = function(callback) { 6 | phantom.create(callback, {parameters:{'ignore-ssl-errors':'yes'}}); 7 | }; 8 | 9 | var padawans = []; 10 | 11 | var findMatchingPadawan = function(url) { 12 | var found = _.find(padawans,function(padawan){ 13 | return padawan.pattern.exec(url); 14 | }); 15 | return found; 16 | }; 17 | 18 | 19 | module.exports = { 20 | crawl: function(url, callbackFunction, data) { 21 | 22 | // We need to find a matching padawan that will process teh data feel me? 23 | var padawan = findMatchingPadawan(url); 24 | if (!padawan) { 25 | return callbackFunction("No crawler found for this URL ("+url+")"); 26 | } 27 | 28 | if (!padawan.selectors) { 29 | return callbackFunction('No selectors found for this padawan...') 30 | } 31 | 32 | // let's init that mysterious phantomJS 33 | init(function(err, ph) { 34 | // Create a page 35 | ph.createPage(function(err, page) { 36 | 37 | // by the PhantomJS team; just adjusted it for node-phantom bridge 38 | // https://github.com/ariya/phantomjs/blob/master/examples/waitfor.js 39 | var waitFor = function (testFx, onReady, timeOutMillis) { 40 | var maxtimeOutMillis = timeOutMillis ? 
timeOutMillis : 3000, //< Default Max Timout is 3s 41 | start = new Date().getTime(), 42 | condition = false, 43 | interval = setInterval(function() { 44 | if ( (new Date().getTime() - start < maxtimeOutMillis) && !condition ) { 45 | // If not time-out yet and condition not yet fulfilled 46 | // Adjustement is made here (NK) -- we need to pass a callback 47 | testFx(function(err,result){ 48 | condition = result; 49 | }); 50 | } else { 51 | if(!condition) { 52 | // If condition still not fulfilled (timeout but condition is 'false') 53 | console.log("'waitFor()' timeout"); 54 | ph.exit(); 55 | } else { 56 | // Condition fulfilled (timeout and/or condition is 'true') 57 | console.log("'waitFor()' finished in " + (new Date().getTime() - start) + "ms."); 58 | typeof(onReady) === "string" ? eval(onReady) : onReady(); //< Do what it's supposed to do once the condition is fulfilled 59 | clearInterval(interval); //< Stop this interval 60 | } 61 | } 62 | }, 100); //< repeat check every 100ms 63 | }; 64 | 65 | 66 | // Let's open the URL 67 | page.open(url, function(err, status) { 68 | if (status !== "success") { 69 | ph.exit(); 70 | callbackFunction("An error occured while opening the page with Phantom : "+status); 71 | return; 72 | } 73 | 74 | // Let's include latest jQuery bitches 75 | page.includeJs('https://ajax.googleapis.com/ajax/libs/jquery/2.0.3/jquery.min.js', function(err) { 76 | 77 | // Let's drop a variable when DOM is ready 78 | // We do that straight after including jQ 79 | page.evaluate(function(){ 80 | window.$JEDI = $.noConflict(true); 81 | // Give back $ to its previous owner in case 82 | // they're doing some random shit on the DOM with custom libs 83 | window.$JEDI(document).ready(function(){ 84 | window['JEDI-DOM-LOADED-BITCHES'] = true; 85 | }); 86 | }); 87 | // Wait for the DOM to be loaded! 
88 | waitFor(function(resultCallback){ 89 | return page.evaluate(function(){ 90 | return window['JEDI-DOM-LOADED-BITCHES'] == true; 91 | }, resultCallback); 92 | }, 93 | function(){ 94 | return page.evaluate(function(data){ 95 | // Don't pollute the global space 96 | return (function(){ 97 | var result = {}; 98 | for (var key in data) { 99 | var sel = window.$JEDI(data[key].sel); 100 | var type = data[key].type; 101 | 102 | var getValue = function(sel, type) { 103 | if (type == "text") { 104 | return sel.text(); 105 | } 106 | else if (type == "src") { 107 | return sel.attr('src'); 108 | } 109 | }; 110 | 111 | if (sel.length == 1) { 112 | result[key] = getValue(sel, type); 113 | } 114 | else if (sel.length > 1) { 115 | result[key] = []; 116 | sel.each(function(){ 117 | result[key].push(getValue(window.$JEDI(this), type)); 118 | }); 119 | } 120 | else { 121 | result[key] = null; 122 | } 123 | 124 | } 125 | return result; 126 | })(); 127 | 128 | }, function(err, result) { 129 | // postProcessing is not mandatory and is just identity function if doesnt exist 130 | var postProcessing = padawan.postProcessing ? padawan.postProcessing : function(d) { return d; }; 131 | ph.exit(); 132 | callbackFunction(null, postProcessing(result)); 133 | }, padawan.selectors); 134 | }); 135 | 136 | }); 137 | }); 138 | }); 139 | }); 140 | }, 141 | // No headscrapping for now I'm just pushing that shit in an array 142 | registerPadawan: function(padawan) { 143 | if (!padawan || !padawan.pattern) { 144 | console.error('A pattern is needed dude.'); 145 | return false; 146 | } 147 | 148 | console.log('Registering a padawan that will match '+padawan.pattern); 149 | padawans.push(padawan); 150 | } 151 | }; 152 | --------------------------------------------------------------------------------