├── main.js ├── .travis.yml ├── lib ├── privateMethods │ ├── giveWorker.js │ ├── executeSelenium.js │ └── execute.js ├── publicMethods │ ├── setInterval.js │ ├── processHtml.js │ ├── retries.js │ ├── setProxies.js │ ├── setWorkers.js │ ├── setHeaders.js │ ├── find.js │ ├── selenium.js │ ├── notify.js │ ├── get.js │ └── run.js ├── Siphon.js ├── redis │ ├── flush.js │ ├── length.js │ ├── range.js │ ├── setRedis.js │ └── enqueue.js └── Methods.js ├── .gitignore ├── package.json ├── test ├── testSiphon.js └── testMethods.js └── README.md /main.js: -------------------------------------------------------------------------------- 1 | module.exports = require('./lib/Siphon') -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: node_js 2 | node_js: 3 | - "node" -------------------------------------------------------------------------------- /lib/privateMethods/giveWorker.js: -------------------------------------------------------------------------------- 1 | /** 2 | * @description Stores a GET request on the siphon object 3 | * @param {String} url - The url given to workers 4 | * @return {Object} The siphon object to allow method chaining 5 | */ 6 | function giveWorker(url) { 7 | if (typeof url !== 'string') throw new Error('Please insert URL string into giveWorker method'); 8 | this.workerURL = url; 9 | return this; 10 | } 11 | 12 | module.exports = giveWorker; 13 | -------------------------------------------------------------------------------- /lib/publicMethods/setInterval.js: -------------------------------------------------------------------------------- 1 | /** 2 | * @description Sets an interval between requests 3 | * @param {Number} inteval - The interval in milliseconds between requests 4 | * @return {Object} The siphon object to allow method chaining 5 | */ 6 | function setInterval(interval) { 7 | if (typeof interval !== 'number') throw new Error('Please insert number into .setInterval method'); 8 | this.interval = interval; 9 | return this; 10 | } 11 | 12 | module.exports = setInterval; -------------------------------------------------------------------------------- /lib/publicMethods/processHtml.js: -------------------------------------------------------------------------------- 1 | /** 2 | * @description Allows user to search HTML with Cheerio or Regex 3 | * @param {Function} callback - Passed HTML from request 4 | * @return {Object} The siphon object to allow method chaining 5 | */ 6 | function processHtml(callback) { 7 | if (!(callback instanceof Function)) throw new Error('Please insert callback function into .processHtml method'); 8 | this.html = callback; 9 | return this; 10 | } 11 | 12 | module.exports = processHtml; -------------------------------------------------------------------------------- /lib/publicMethods/retries.js: -------------------------------------------------------------------------------- 1 | /** 2 | * @description Updates the number of tries on the siphon object 3 | * @param {Number} triesToAdd - Allows additional tries for each job (defaults to 1) 4 | * @return {Object} The siphon object to allow method chaining 5 | */ 6 | function retries(triesToAdd = 1) { 7 | if (!Number.isInteger(triesToAdd)) throw new Error('Please insert integer into .retries method'); 8 | this.tries += triesToAdd; 9 | return this; 10 | } 11 | 12 | module.exports = retries; -------------------------------------------------------------------------------- /lib/Siphon.js: -------------------------------------------------------------------------------- 1 | const methods = require('./Methods'); 2 | const request = require('request'); 3 | 4 | /** 5 | * @description Creates siphon object with properties added by methods. 6 | * @return {Object} The siphon object 7 | */ 8 | function siphon() { 9 | return Object.assign({}, { 10 | urls: [], 11 | searchTerms: [], 12 | numWorkers: require('os').cpus().length, 13 | tries: 1, 14 | idle: true, 15 | initial: true, 16 | }, methods); 17 | } 18 | 19 | module.exports = siphon; 20 | -------------------------------------------------------------------------------- /lib/publicMethods/setProxies.js: -------------------------------------------------------------------------------- 1 | /** 2 | * @description Sets the proxy values for IP rotation while scraping 3 | * @param {Array} proxies - Each proxy in the array should be a string of a valid IP address 4 | * @return {Object} The siphon object to allow method chaining 5 | */ 6 | function setProxies(proxies) { 7 | if (!Array.isArray(proxies)) throw new Error('Please insert array of proxy strings into .setProxies method'); 8 | this.proxies = proxies; 9 | return this; 10 | } 11 | 12 | module.exports = setProxies; -------------------------------------------------------------------------------- /lib/publicMethods/setWorkers.js: -------------------------------------------------------------------------------- 1 | /** 2 | * @description Sets the numWorkers property of the siphon object 3 | * @param {Number} num - Allows a customized number of workers to be instantiated (defaults to the number of CPU cores) 4 | * @return {Object} The siphon object to allow method chaining 5 | */ 6 | function setWorkers(num) { 7 | if (!Number.isInteger(num)) throw new Error('Please insert integer into .setWorkers method'); 8 | this.numWorkers = num; 9 | return this; 10 | } 11 | 12 | module.exports = setWorkers; -------------------------------------------------------------------------------- /lib/publicMethods/setHeaders.js: -------------------------------------------------------------------------------- 1 | /** 2 | * @description Sets the headers of the request object on the siphon object 3 | * @param {Object} headers - Headers with their respective values 4 | * @return {Object} The siphon object to allow method chaining 5 | */ 6 | function setHeaders(headers) { 7 | if (typeof headers !== 'object' || headers === null) { 8 | throw new Error('Please insert a valid header object into .setHeaders method'); 9 | } 10 | 11 | this.headers = headers; 12 | return this; 13 | } 14 | 15 | module.exports = setHeaders; -------------------------------------------------------------------------------- /lib/publicMethods/find.js: -------------------------------------------------------------------------------- 1 | /** 2 | * @description Stores all of the matches for a particular string and regular expression 3 | * as an array in the data property of the siphon object 4 | * @param {Regular Expression} regex - The regex used to scrape that defaults to the html property of the siphon object 5 | * @return {Object} The siphon object to allow method chaining 6 | */ 7 | function find(regex) { 8 | if (!(regex instanceof RegExp)) throw new Error('Please insert a regular expression into .find method'); 9 | this.searchTerms.push(regex); 10 | return this; 11 | } 12 | 13 | module.exports = find; -------------------------------------------------------------------------------- /lib/publicMethods/selenium.js: -------------------------------------------------------------------------------- 1 | /** 2 | * @description Optionally allows use of fully rendered DOM for more complex scraping 3 | * @param {String} browser - Can choose 'chrome' or 'firefox' instances 4 | * @param {Function} callback - Provides space for custom selenium logic 5 | * @return {Object} The siphon object to allow method chaining 6 | */ 7 | function selenium(browser, callback) { 8 | if (typeof browser !== 'string' || typeof callback !== 'function') { 9 | throw new Error('Please insert valid browser string as first parameter AND function callback as second parameter for .selenium method'); 10 | } 11 | 12 | this.seleniumOptions = { browser, callback }; 13 | return this; 14 | } 15 | 16 | module.exports = selenium; -------------------------------------------------------------------------------- /lib/publicMethods/notify.js: -------------------------------------------------------------------------------- 1 | /** 2 | * @description Saves the function callback as a property on the siphon object to later be used to 3 | * send a notification when a job finishes 4 | * @param {Function} callback - Applied to the statusMessage object in the 'run' method 5 | * @return {Object} The siphon object to allow method chaining 6 | */ 7 | function notify(callback = setDefault) { 8 | if (!(callback instanceof Function)) throw new Error('Please insert callback function into .notify method'); 9 | this.notifyFunction = callback; 10 | return this; 11 | } 12 | 13 | // Allows nested data to be seen in console 14 | function setDefault(statMsg) { 15 | console.log(JSON.stringify(statMsg)); 16 | } 17 | 18 | module.exports = notify; -------------------------------------------------------------------------------- /lib/redis/flush.js: -------------------------------------------------------------------------------- 1 | /** 2 | * @description Becomes public method after setRedis is called. Empties jobsQueue list. 3 | * @param {Object} siphonObject - Carries 'this' reference from setRedis 4 | * @param {Object} redisClient - Perform Redis CLI commands with client from setRedis 5 | * @param {Object} cluster - Gives access to Node cluster module from setRedis 6 | * @return {Object} The siphon object for method chaining 7 | */ 8 | function flush(siphonObject, redisClient, cluster) { 9 | if (cluster.isMaster) { 10 | redisClient.flushdb((err, reply) => { 11 | if (err) throw new Error(err); 12 | console.log(`jobsQueue is flushed`); 13 | }); 14 | } 15 | 16 | return siphonObject; 17 | } 18 | 19 | module.exports = flush; -------------------------------------------------------------------------------- /lib/redis/length.js: -------------------------------------------------------------------------------- 1 | /** 2 | * @description Becomes public method after setRedis is called. Gives length of jobs queue. 3 | * @param {Object} siphonObject - Carries 'this' reference from setRedis 4 | * @param {Object} redisClient - Perform Redis CLI commands with client from setRedis 5 | * @param {Object} cluster - Gives access to Node cluster module from setRedis 6 | * @return {Object} The siphon object for method chaining 7 | */ 8 | function length(siphonObject, redisClient, cluster) { 9 | if (cluster.isMaster) { 10 | redisClient.llen('jobsQueue', (err, reply) => { 11 | if (err) throw new Error(err); 12 | console.log(`jobsQueue length is ${reply}`); 13 | }); 14 | } 15 | 16 | return siphonObject; 17 | } 18 | 19 | module.exports = length; -------------------------------------------------------------------------------- /lib/publicMethods/get.js: -------------------------------------------------------------------------------- 1 | /** 2 | * @description Stores urls for GET request on the siphon object 3 | * @param {Array || String} urls - The url(s) of the target website(s) 4 | * @return {Object} The siphon object to allow method chaining 5 | */ 6 | function get(urls) { 7 | if (!urls || (!Array.isArray(urls) && typeof urls !== 'string')) throw new Error('Please insert array of URL strings or single URL string into .get method'); 8 | if (typeof urls === 'string') this.urls = [urls]; 9 | else this.urls = removeDuplicateURLs(urls); 10 | return this; 11 | } 12 | 13 | function removeDuplicateURLs(urlArray) { 14 | const results = {}; 15 | urlArray.forEach(ele => results[ele] = 'I exist'); 16 | return Object.keys(results); 17 | } 18 | 19 | module.exports = get; -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Project-specific # 2 | ################### 3 | sandbox 4 | chromedriver 5 | ansible 6 | 7 | # Node # 8 | ################### 9 | node_modules 10 | 11 | # Compiled source # 12 | ################### 13 | *.com 14 | *.class 15 | *.dll 16 | *.exe 17 | *.o 18 | *.so 19 | 20 | # Packages # 21 | ############ 22 | # it's better to unpack these files and commit the raw source 23 | # git has its own built in compression methods 24 | *.7z 25 | *.dmg 26 | *.gz 27 | *.iso 28 | *.jar 29 | *.rar 30 | *.tar 31 | *.zip 32 | 33 | # Logs and databases # 34 | ###################### 35 | *.log 36 | *.sql 37 | *.sqlite 38 | 39 | # OS generated files # 40 | ###################### 41 | .DS_Store 42 | .DS_Store? 43 | ._* 44 | .Spotlight-V100 45 | .Trashes 46 | ehthumbs.db 47 | Thumbs.db -------------------------------------------------------------------------------- /lib/redis/range.js: -------------------------------------------------------------------------------- 1 | /** 2 | * @description Becomes public method after setRedis is called. Shows all items in jobsQueue. 3 | * @param {Object} siphonObject - Carries 'this' reference from setRedis 4 | * @param {Object} redisClient - Perform Redis CLI commands with client from setRedis 5 | * @param {Object} cluster - Gives access to Node cluster module from setRedis 6 | * @return {Object} The siphon object for method chaining 7 | */ 8 | function range(siphonObject, redisClient, cluster) { 9 | if (cluster.isMaster) { 10 | redisClient.lrange(['jobsQueue', 0, -1], (err, reply) => { 11 | if (err) throw new Error(err); 12 | console.log(`jobsQueue contains these items: ${reply}`); 13 | }); 14 | } 15 | 16 | return siphonObject; 17 | } 18 | 19 | module.exports = range; -------------------------------------------------------------------------------- /lib/Methods.js: -------------------------------------------------------------------------------- 1 | const execute = require('./privateMethods/execute'); 2 | const find = require('./publicMethods/find'); 3 | const get = require('./publicMethods/get'); 4 | const giveWorker = require('./privateMethods/giveWorker'); 5 | const notify = require('./publicMethods/notify'); 6 | const processHtml = require('./publicMethods/processHtml'); 7 | const retries = require('./publicMethods/retries'); 8 | const run = require('./publicMethods/run'); 9 | const selenium = require('./publicMethods/selenium'); 10 | const setHeaders = require('./publicMethods/setHeaders'); 11 | const setInterval = require('./publicMethods/setInterval'); 12 | const setProxies = require('./publicMethods/setProxies'); 13 | const setRedis = require('./redis/setRedis'); 14 | const setWorkers = require('./publicMethods/setWorkers'); 15 | 16 | module.exports = { 17 | execute, 18 | find, 19 | get, 20 | giveWorker, 21 | notify, 22 | processHtml, 23 | retries, 24 | run, 25 | selenium, 26 | setHeaders, 27 | setInterval, 28 | setProxies, 29 | setRedis, 30 | setWorkers 31 | }; 32 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "siphonjs", 3 | "version": "1.2.0", 4 | "description": "Node.js data extraction library built for scale", 5 | "homepage": "https://github.com/siphonjs/siphon", 6 | "bugs": { 7 | "url": "https://github.com/siphonjs/siphon/issues" 8 | }, 9 | "constributors": [ 10 | { 11 | "name": "William Galebach", 12 | "email": "william.galebach@gmail.com" 13 | }, 14 | { 15 | "name": "Sooeung Lee", 16 | "email": "sooeung2@gmail.com" 17 | }, 18 | { 19 | "name": "George Norberg", 20 | "email": "george.norberg@gmail.com" 21 | } 22 | ], 23 | "files": [ 24 | "main.js", 25 | "lib" 26 | ], 27 | "main": "main.js", 28 | "repository": { 29 | "type": "git", 30 | "url": "https://github.com/siphonjs/siphon.git" 31 | }, 32 | "scripts": { 33 | "test": "mocha test/*" 34 | }, 35 | "license": "MIT", 36 | "dependencies": { 37 | "request": "^2.79.0" 38 | }, 39 | "optionalDependencies": { 40 | "chai": "^3.5.0", 41 | "mocha": "^3.2.0", 42 | "redis": "^2.6.3", 43 | "selenium-webdriver": "^3.0.1" 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /lib/privateMethods/executeSelenium.js: -------------------------------------------------------------------------------- 1 | /** 2 | * @description If .selenium is called, load web driver and execute with Selenium API 3 | * @param {Object} seleniumOptions - passed in by the user as { browser, callback } 4 | * @param {String} workerURL - acts as ID for current job 5 | * @param {Function} storeFunction - stores data in database according to callback configuration 6 | * @return {Object} The siphon object to allow method chaining 7 | */ 8 | function executeSelenium(seleniumOptions, workerURL, storeFunction) { 9 | const webdriver = require('selenium-webdriver'); 10 | const driver = new webdriver.Builder().forBrowser(seleniumOptions.browser).build(); 11 | driver.get(workerURL); 12 | 13 | seleniumOptions 14 | .callback(driver) 15 | .then((data) => { 16 | process.send({ 17 | type: 'data', 18 | data: { 19 | data, 20 | url: workerURL, 21 | } 22 | }); 23 | }) 24 | .catch((err) => { 25 | process.send({ 26 | type: 'error', 27 | error: { 28 | description: 'Selenium connection error', 29 | error: err, 30 | url: workerURL, 31 | } 32 | }); 33 | }); 34 | } 35 | 36 | module.exports = executeSelenium; 37 | -------------------------------------------------------------------------------- /test/testSiphon.js: -------------------------------------------------------------------------------- 1 | const expect = require('chai').expect; 2 | const siphon = require('../lib/Siphon'); 3 | 4 | let mySiphon; 5 | beforeEach(() => { 6 | mySiphon = siphon(); 7 | }); 8 | 9 | describe('default Siphon Object', () => { 10 | it('can have new properties added to it', () => { 11 | expect(mySiphon).to.be.extensible; 12 | }); 13 | 14 | it('has a URLs property whose default value is an empty array', () => { 15 | expect(mySiphon.urls).to.deep.equal([]); 16 | }); 17 | 18 | it('has a searchTerms property whose default value is an empty array', () => { 19 | expect(mySiphon.searchTerms).to.deep.equal([]); 20 | }); 21 | 22 | it('has a numWorkers property whose default value is the user\'s number of CPU cores', () => { 23 | expect(mySiphon.numWorkers).to.equal(require('os').cpus().length); 24 | }); 25 | 26 | it('has a tries property whose default value is 1', () => { 27 | expect(mySiphon.tries).to.equal(1); 28 | }); 29 | 30 | it('has an idle property whose default value is true', () => { 31 | expect(mySiphon.idle).to.equal(true); 32 | }); 33 | 34 | it('has an initial property whose default value is true', () => { 35 | expect(mySiphon.initial).to.equal(true); 36 | }); 37 | }); 38 | -------------------------------------------------------------------------------- /lib/redis/setRedis.js: -------------------------------------------------------------------------------- 1 | const redis = require('redis'); 2 | const cluster = require('cluster'); 3 | 4 | // Redis methods 5 | const enqueue = require('./enqueue'); 6 | const flush = require('./flush'); 7 | const length = require('./length'); 8 | const range = require('./range'); 9 | 10 | /** 11 | * @description Connects to Redis server, then allows access to queue methods. 12 | * @param {Number} port - Remote Redis server's port 13 | * @param {Number} ip - Remote Redis server's IP address 14 | * @param {String} password - Remote Redis server's password (if applicable) 15 | * @return {Object} The siphon object to allow method chaining 16 | */ 17 | function setRedis(port, ip, password) { 18 | if (cluster.isMaster) { 19 | 20 | // Connect to Redis server 21 | if (port && ip) { 22 | this.client = redis.createClient(port, ip); 23 | if (password) this.client.auth(password); 24 | } else { 25 | this.client = redis.createClient(); 26 | } 27 | 28 | // Debugging listeners 29 | this.client.on('connect', () => console.log('Connected to redis server')); 30 | this.client.on("error", (err) => console.log("Error connecting to redis server " + err)); 31 | } 32 | 33 | this.enqueue = () => enqueue(this, this.client, cluster); 34 | this.flush = () => flush(this, this.client, cluster); 35 | this.length = () => length(this, this.client, cluster); 36 | this.range = () => range(this, this.client, cluster); 37 | 38 | return this; 39 | } 40 | 41 | module.exports = setRedis; 42 | -------------------------------------------------------------------------------- /lib/redis/enqueue.js: -------------------------------------------------------------------------------- 1 | /** 2 | * @description Becomes public method after setRedis is called. Adds jobs to Redis queue. 3 | * @param {Object} siphonObject - Carries 'this' reference from setRedis 4 | * @param {Object} redisClient - Perform Redis CLI commands with client from setRedis 5 | * @param {Object} cluster - Gives access to Node cluster module from setRedis 6 | * @return {Object} The siphon object for method chaining 7 | */ 8 | function enqueue(siphonObject, redisClient, cluster) { 9 | if (cluster.isMaster) { 10 | const job = JSON.stringify(siphonObject, replacer, 2); 11 | redisClient.lpush(['jobsQueue', job], (err, reply) => { 12 | if (err) throw new Error(err); 13 | console.log(`jobsQueue now contains ${reply} jobs`); 14 | }); 15 | } 16 | 17 | return siphonObject; 18 | } 19 | 20 | /** 21 | * @description Used as JSON.stringify parameter to only stringify properties necessary for execution on remote server. 22 | * @param {String} key - If replacer finds object or array, this represents each key 23 | * @param {Any} value - If replacer finds object or array, this represents each value 24 | * @return {Any} Replaces old value. If falsy, removes old value. 25 | */ 26 | function replacer(key, value) { 27 | const omittedProperties = { 28 | client: 'instant lookup', 29 | execute: 'instant lookup', 30 | executeSelenium: 'instant lookup', 31 | get: 'instant lookup', 32 | giveWorker: 'instant lookup', 33 | find: 'instant lookup', 34 | idle: 'instant lookup', 35 | notify: 'instant lookup', 36 | retries: 'instant lookup', 37 | run: 'instant lookup', 38 | selenium: 'instant lookup', 39 | setHeaders: 'instant lookup', 40 | setInterval: 'instant lookup', 41 | setProxies: 'instant lookup', 42 | setWorkers: 'instant lookup', 43 | }; 44 | 45 | if (omittedProperties.hasOwnProperty(key)) return; 46 | if (value instanceof RegExp) return "__REGEXP " + value.toString(); 47 | if (value instanceof Function) return value.toString(); 48 | return value; 49 | } 50 | 51 | module.exports = enqueue; -------------------------------------------------------------------------------- /lib/privateMethods/execute.js: -------------------------------------------------------------------------------- 1 | const request = require('request'); 2 | const executeSelenium = require('./executeSelenium'); 3 | 4 | /** 5 | * @description Executes data extraction in Node workers based on Siphon Object values 6 | * @return {Object} The siphon object to allow method chaining 7 | */ 8 | function execute() { 9 | const workerURL = this.workerURL; 10 | 11 | // Applies Selenium callback if provided 12 | if (this.seleniumOptions) { 13 | executeSelenium(this.seleniumOptions, workerURL); 14 | return this; 15 | } 16 | 17 | // Build up options object for GET request using request module 18 | let requestOptions = { url: workerURL }; 19 | 20 | // Assign headers if provided 21 | if (this.headers) { 22 | requestOptions = Object.assign(requestOptions, { headers: this.headers }); 23 | } 24 | 25 | // Rotate proxies if provided 26 | if (this.proxies && this.proxies[0]) { 27 | curProxy = this.proxies[Math.floor(Math.random() * this.proxies.length)] 28 | requestOptions = Object.assign(requestOptions, { proxy: curProxy }); 29 | } 30 | 31 | // Send GET request with options we just built up 32 | request(requestOptions, (err, response, html) => { 33 | if (err) { 34 | return process.send({ 35 | type: 'error', 36 | error: { 37 | description: 'Error with HTTP request', 38 | error: err, 39 | url: workerURL, 40 | } 41 | }); 42 | }; 43 | 44 | // If user wishes to process entire HTML page directly, apply callback then send processed data to master 45 | if (this.html) { 46 | process.send({ 47 | type: 'data', 48 | data: { 49 | data: this.html(html, response), 50 | url: workerURL, 51 | } 52 | }); 53 | 54 | return this; 55 | } 56 | 57 | // If user employs find method instead of processHTML, store regex matches 58 | const matchArray = []; 59 | this.searchTerms.forEach(regex => { 60 | const matches = html.match(regex); 61 | if (matches) { 62 | delete matches.index; 63 | delete matches.input; 64 | matchArray.push(matches); 65 | } else { 66 | matchArray.push('no matches for regex: ' + regex.toString()); 67 | } 68 | 69 | // Send regex matches to the master 70 | process.send({ 71 | type: 'data', 72 | data: { 73 | data: matchArray, 74 | url: workerURL, 75 | } 76 | }); 77 | }); 78 | 79 | return this; 80 | }); 81 | } 82 | 83 | module.exports = execute; -------------------------------------------------------------------------------- /test/testMethods.js: -------------------------------------------------------------------------------- 1 | const expect = require('chai').expect; 2 | const siphon = require('../lib/Siphon'); 3 | 4 | let mySiphon; 5 | beforeEach(() => { 6 | mySiphon = siphon(); 7 | }); 8 | 9 | describe('default Siphon Object', () => { 10 | it('can have new properties added to it', () => { 11 | expect(mySiphon).to.be.extensible; 12 | }); 13 | }); 14 | 15 | describe('find method', () => { 16 | it('should insert search terms into Siphon Object', () => { 17 | mySiphon.find(/[0-9]{2}\.[0-9]/); 18 | expect(mySiphon.searchTerms).to.deep.equal([/[0-9]{2}\.[0-9]/]); 19 | }); 20 | }); 21 | 22 | describe('get method', () => { 23 | it('should insert single URL string into Siphon Object', () => { 24 | mySiphon.get('https://www.wunderground.com/cgi-bin/findweather/getForecast?query=90025') 25 | expect(mySiphon.urls).to.deep.equal(['https://www.wunderground.com/cgi-bin/findweather/getForecast?query=90025']); 26 | }); 27 | 28 | it('should insert array of URLs into Siphon Object', () => { 29 | mySiphon.get(['https://www.wunderground.com/cgi-bin/findweather/getForecast?query=12345', 'https://www.wunderground.com/cgi-bin/findweather/getForecast?query=12346']); 30 | expect(mySiphon.urls).to.deep.equal(['https://www.wunderground.com/cgi-bin/findweather/getForecast?query=12345', 'https://www.wunderground.com/cgi-bin/findweather/getForecast?query=12346']); 31 | }); 32 | 33 | it('should overwrite Siphon Object urls property when .get method is chained', () => { 34 | mySiphon.get('https://www.wunderground.com/cgi-bin/findweather/getForecast?query=90025') 35 | .get(['https://www.wunderground.com/cgi-bin/findweather/getForecast?query=12345', 'https://www.wunderground.com/cgi-bin/findweather/getForecast?query=12346']); 36 | expect(mySiphon.urls).to.deep.equal(['https://www.wunderground.com/cgi-bin/findweather/getForecast?query=12345', 'https://www.wunderground.com/cgi-bin/findweather/getForecast?query=12346']); 37 | }); 38 | }); 39 | 40 | describe('giveWorker method', () => { 41 | it('should insert URL intended for worker into Siphon Object', () => { 42 | mySiphon.giveWorker('https://www.wunderground.com/cgi-bin/findweather/getForecast?query=90025') 43 | expect(mySiphon.workerURL).to.equal('https://www.wunderground.com/cgi-bin/findweather/getForecast?query=90025'); 44 | }); 45 | }); 46 | 47 | describe('notify method', () => { 48 | it('should add notifyFunction to Siphon Object', () => { 49 | mySiphon.notify(() => console.log('notify user')); 50 | expect(mySiphon.notifyFunction).to.be.an.instanceOf(Function); 51 | }); 52 | }); 53 | 54 | describe('processHtml method', () => { 55 | it('should add html function to Siphon Object', () => { 56 | mySiphon.processHtml(() => console.log('process HTML')); 57 | expect(mySiphon.html).to.be.an.instanceOf(Function); 58 | }); 59 | }); 60 | 61 | describe('retries method', () => { 62 | it('should modify tries in Siphon Object', () => { 63 | mySiphon.retries(3); 64 | expect(mySiphon.tries).to.equal(4); 65 | }); 66 | }); 67 | 68 | describe('selenium method', () => { 69 | it('should add browser and callback to seleniumOptions in Siphon Object', () => { 70 | const seleniumCallback = () => console.log('using selenium'); 71 | mySiphon.selenium('chrome', seleniumCallback); 72 | expect(mySiphon.seleniumOptions).to.deep.equal({ browser: 'chrome', callback: seleniumCallback }); 73 | }); 74 | }); 75 | 76 | describe('setHeaders method', () => { 77 | it('should add headers to Siphon Object', () => { 78 | mySiphon.setHeaders({ 'User-Agent': 'George Soowill' }); 79 | expect(mySiphon.headers).to.deep.equal({ 'User-Agent': 'George Soowill' }); 80 | }); 81 | }); 82 | 83 | describe('setInterval method', () => { 84 | it('should add interval length to Siphon Object', () => { 85 | mySiphon.setInterval(10); 86 | expect(mySiphon.interval).to.equal(10); 87 | }); 88 | }); 89 | 90 | describe('setProxies method', () => { 91 | it('should add proxies to Siphon Object', () => { 92 | mySiphon.setProxies(['192.168.1.0', '123.456.7.8']); 93 | expect(mySiphon.proxies).to.deep.equal(['192.168.1.0', '123.456.7.8']); 94 | }); 95 | }); 96 | 97 | describe('setWorkers method', () => { 98 | it('should modify number of workers in Siphon Object', () => { 99 | mySiphon.setWorkers(2); 100 | expect(mySiphon.numWorkers).to.equal(2); 101 | }); 102 | }); 103 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [![Build Status](https://travis-ci.org/siphonjs/siphon.svg?branch=master)](https://travis-ci.org/siphonjs/siphon) 2 | [![NPM Version](https://img.shields.io/npm/v/siphonjs.svg)](https://www.npmjs.com/package/siphonjs) 3 | [![License](https://img.shields.io/npm/l/siphonjs.svg)](https://www.npmjs.com/package/siphonjs) 4 | 5 | 6 | # Siphon 7 | Siphonjs is a powerful, lightweight data extraction library for Node.js designed to work at scale. 8 | 9 | ## Features 10 | 11 | - Intuitive chainable API 12 | - Fault tolerant with retries and advanced error handling 13 | - Proxies automatically rotated to enable higher volume searches 14 | - Clustered Node.js servers for improved server-side performance 15 | - Custom runtime intervals for throttling to match site limits 16 | - Pre-configured Selenium Web Driver for advanced DOM manipulation 17 | - Pre-configured Redis access for scaling to multiple servers 18 | - Lightweight with no large required dependencies 19 | 20 | ## Install 21 | ``` 22 | $ npm install --save siphonjs 23 | ``` 24 | 25 | ## Usage 26 | 27 | Collect 1000 temperatures in a matter of seconds! 28 | 29 | ``` 30 | const siphon = require('siphonjs'); 31 | 32 | const urls = []; 33 | for (let i = 90025; i < 91025; i++) { 34 | urls.push(`https://www.wunderground.com/cgi-bin/findweather/getForecast?query=${i}`); 35 | } 36 | 37 | siphon() 38 | .get(urls) 39 | .find(/[0-9]{2}\.[0-9]/) 40 | .run() 41 | ``` 42 | 43 | ## Advanced Usage 44 | 45 | Extract faster with remote servers and a Redis queue! We handle horizontal scaling under the hood. 46 | 47 | Controller: 48 | ``` 49 | const siphon = require('siphonjs'); 50 | const request = require('request'); 51 | 52 | // Search 100,000 weather urls in batches of 100 53 | const INCREMENT = 100; 54 | 55 | const siph = siphon() 56 | .setRedis('PORT', 'IP', 'PASSWORD') 57 | .processHtml((html, res) => { 58 | let temp = html.match(/[0-9]{2}\.[0-9]/); 59 | if (!temp) return { zip: null }; 60 | else temp = temp[0]; 61 | if (temp === '10.4') return { zip: null } 62 | let zip = res.req.path.match(/[0-9]{5}/); 63 | if (zip !== null) zip = zip[0]; 64 | return { zip, temp }; 65 | }) 66 | .notify((statMsg) => { 67 | console.log(statMsg); 68 | request.post(*your url here*, { 69 | headers: { 70 | 'Content-type': 'application/json' 71 | }, 72 | body: JSON.stringify(statMsg) 73 | }); 74 | }); 75 | 76 | for (let i = 00000; i < 99999; i += INCREMENT) { 77 | const urls = []; 78 | for (let j = 0; j < INCREMENT; j++) { 79 | let num = (i + j).toString(); 80 | while (num.length < 5) { 81 | num = '0' + num; 82 | } 83 | 84 | urls.push(`https://www.wunderground.com/cgi-bin/findweather/getForecast?query=${num}`); 85 | } 86 | 87 | siph.get(urls).enqueue() 88 | } 89 | ``` 90 | 91 | Workers: 92 | ``` 93 | const siphon = require('siphonjs'); 94 | 95 | siphon() 96 | .setRedis(6379, 192.168.123.456, 'password') 97 | .run() 98 | ``` 99 | 100 | ## Required Dependencies 101 | 102 | - `request` for http request handling 103 | 104 | ## Optional Dependencies 105 | 106 | - `redis` for parallel processing with multiple servers 107 | - `selenium-webdriver` for jobs requiring full client-side rendering 108 | 109 | ## Testing Dependencies 110 | 111 | Simply run "npm test" in your terminal to execute all tests! 112 | 113 | - `mocha` test runner 114 | - `chai` assertion library 115 | 116 | # API 117 | 118 | Using Siphon is simple! Chain as many methods as you'd like. 119 | 120 | ### .find 121 | 122 | Parameter: `regular expression` 123 | 124 | Customize your search with regex. 125 | 126 | ``` 127 | siphon() 128 | .get(urls) 129 | .find(/[0-9]{2}\.[0-9]/) 130 | .run() 131 | ``` 132 | 133 | ### .get 134 | 135 | Parameter: `string OR array of strings` 136 | 137 | Each URL represents a query. 138 | 139 | ``` 140 | siphon() 141 | .get(urls) 142 | .find(/[0-9]{2}\.[0-9]/) 143 | .run() 144 | ``` 145 | 146 | ### .notify 147 | 148 | Parameter: `function` 149 | 150 | Notify is used to both visualize received data and store your data in a database. 151 | If invoked without parameters, this method defaults to console.log with stringified data. 152 | 153 | Here is the structure of the status message: 154 | 155 | ``` 156 | { 157 | id: // unique URL string, 158 | errors: [], 159 | data: [], 160 | } 161 | ``` 162 | 163 | Here is an example with Sequelize's "bulk create" method to store your data: 164 | 165 | ``` 166 | siphon() 167 | .get(urls) 168 | .find(/[0-9]{2}\.[0-9]/) 169 | .notify((statusMessage) => { 170 | Tank.bulkCreate({ processedHtml: statusMessage.data }, (err) => { 171 | if (err) return handleError(err); 172 | }); 173 | }) 174 | .run() 175 | ``` 176 | 177 | ### .processHtml 178 | 179 | Parameter: `function` 180 | 181 | Callback receives entire HTML string. 182 | 183 | ``` 184 | siphon() 185 | .get(urls) 186 | .processHtml((html) => { 187 | console.log(html); 188 | }) 189 | .run() 190 | ``` 191 | 192 | ### .retries 193 | 194 | Parameter: `number` 195 | 196 | If a query fails, this allows more tries on each failed query. 197 | 198 | ``` 199 | siphon() 200 | .get(urls) 201 | .find(/[0-9]{2}\.[0-9]/) 202 | .retries(5) 203 | .run() 204 | ``` 205 | 206 | ### .run 207 | 208 | No parameters. Simply invoke as last method to execute your search on that server! 209 | 210 | ``` 211 | siphon() 212 | .get(urls) 213 | .find(/[0-9]{2}\.[0-9]/) 214 | .run() 215 | ``` 216 | 217 | ### .selenium 218 | 219 | Parameter: `function` 220 | 221 | If you wish to use the power of the Selenium Web Driver, insert all Selenium logic inside of this callback. 222 | 223 | ``` 224 | siphon() 225 | .get(urls) 226 | .find(/[0-9]{2}\.[0-9]/) 227 | .selenium('chrome', (driver) => { 228 | data = driver.findElement({className: 'class-name'}).getText(); 229 | driver.quit(); 230 | return data; 231 | }) 232 | .run() 233 | ``` 234 | 235 | ### .setHeaders 236 | 237 | Parameter: `object` 238 | 239 | Provide headers for GET requests. 240 | 241 | ``` 242 | siphon() 243 | .get(urls) 244 | .find(/[0-9]{2}\.[0-9]/) 245 | .setHeaders({ 'User-Agent': 'George Soowill' }) 246 | .run() 247 | ``` 248 | 249 | ### .setInterval 250 | 251 | Parameter: `number` (milliseconds) 252 | 253 | Sets how often you would like to search again. Great for throttling calls to stay within a website's request limits. 254 | 255 | ``` 256 | siphon() 257 | .get(urls) 258 | .find(/[0-9]{2}\.[0-9]/) 259 | .setInterval(200) 260 | .run() 261 | ``` 262 | 263 | ### .setProxies 264 | 265 | Parameter: `array of strings` 266 | 267 | If you provide more than one proxy, we automatically rotate through them for you! 268 | 269 | ``` 270 | siphon() 271 | .get(urls) 272 | .find(/[0-9]{2}\.[0-9]/) 273 | .setProxies(['192.168.1.2', '123.456.7.8']) 274 | .run() 275 | ``` 276 | 277 | ## .setRedis 278 | 279 | Parameters: `string (PORT), string (Redis IP Address), string (password if applicable)` 280 | 281 | Use a Redis queue to store your queries for later execution. Makes Redis methods below public (enqueue, flush, length, range). 282 | Siphon will automatically configure the 'jobsQueue' list for you. Defaults to your computer's client if no parameters provided. 283 | 284 | Single Computer: 285 | 286 | ``` 287 | siphon() 288 | .get(urls) 289 | .find(/[0-9]{2}\.[0-9]/) 290 | .setRedis() 291 | .enqueue() 292 | .run() 293 | ``` 294 | 295 | Remote Redis server with worker cluster: 296 | 297 | Controller: 298 | ``` 299 | siphon() 300 | .get(urls) 301 | .find(/[0-9]{2}\.[0-9]/) 302 | .setRedis('6379', '188.78.58.162', 'siphontestingnodejs') 303 | .enqueue() 304 | ``` 305 | 306 | Workers: 307 | ``` 308 | siphon() 309 | .setRedis('6379', '188.78.58.162', 'siphontestingnodejs') 310 | .run() 311 | ``` 312 | 313 | ### .enqueue 314 | 315 | Private until .setRedis method is called. No parameters. Stores queries in your Redis server. 316 | 317 | ``` 318 | siphon() 319 | .get(urls) 320 | .setRedis() 321 | .enqueue() 322 | ``` 323 | 324 | ### .flush 325 | 326 | Private until .setRedis method is called. No parameters. Empties Redis server. 327 | 328 | ``` 329 | siphon() 330 | .setRedis() 331 | .flush() 332 | ``` 333 | 334 | ### .length 335 | 336 | Private until .setRedis method is called. No parameters. Gives length of jobs queue. 337 | 338 | ``` 339 | siphon() 340 | .setRedis('6379', '188.78.58.162', 'siphontestingnodejs') 341 | .length() 342 | ``` 343 | 344 | ### .range 345 | 346 | Private until .setRedis method is called. No parameters. Provides list of all jobs in queue. 347 | 348 | ``` 349 | siphon() 350 | .setRedis('6379', '188.78.58.162', 'siphontestingnodejs') 351 | .range() 352 | ``` 353 | 354 | ## Team 355 | 356 | [![Image of Will](https://avatars0.githubusercontent.com/u/7759384?v=3&s=150)](https://github.com/willbach) 357 | [![Image of George](https://avatars3.githubusercontent.com/u/18508195?v=3&s=150)](https://github.com/ganorberg) 358 | [![Image of Soo](https://avatars1.githubusercontent.com/u/15530782?v=3&s=150)](https://github.com/sooeung2) 359 | 360 | ## License 361 | 362 | Released under the [MIT License](https://opensource.org/licenses/mit-license.php). 363 | -------------------------------------------------------------------------------- /lib/publicMethods/run.js: -------------------------------------------------------------------------------- 1 | const cluster = require('cluster'); 2 | const request = require('request'); 3 | 4 | /** 5 | * @description Initializes workers from master and sets logic to execute jobs 6 | */ 7 | function run() { 8 | 9 | // If using Redis... 10 | if (this.client) { 11 | 12 | // Arrow functions maintain 'this' reference 13 | const checkRedis = () => { 14 | setTimeout(() => { 15 | console.log('listening: ', this.idle); 16 | if (this.idle) { 17 | this.idle = false; 18 | this.client.rpop('jobsQueue', (err, reply) => { 19 | if (err) throw new Error ('Error pulling from the Redis queue: ' + err); 20 | 21 | // If response is null, stop operations and reset idle 22 | try { 23 | const response = JSON.parse(reply, reviver); 24 | if (!response.interval) delete this.interval; 25 | Object.assign(this, response); 26 | processJobs(this); 27 | } catch (e) { 28 | this.idle = true; 29 | } 30 | }); 31 | } 32 | 33 | // Recursively call function to continue checks every second 34 | checkRedis()}, 1000); 35 | } 36 | 37 | checkRedis(); 38 | 39 | // Workers should not interact with Redis queue 40 | } else { 41 | this.idle = false; 42 | processJobs(this); 43 | } 44 | } 45 | 46 | /** 47 | * @description Sets up status message, job queue behavior, and logic for executing jobs 48 | * @param {Object} siphonObj - Provides Siphon Object for access to its values 49 | */ 50 | function processJobs(siphonObj) { 51 | if (cluster.isMaster) { 52 | if (siphonObj.searchTerms.length < 1 && !siphonObj.html && !siphonObj.seleniumOptions) return console.error('Please enter a search term, process html function, or selenium function and run again'); 53 | if (siphonObj.urls.length < 1) return console.error('No urls in array'); 54 | 55 | // Store all data along with any uncompleted jobs or errors 56 | const statusMessage = { 57 | id: siphonObj.urls[0], 58 | jobs: {}, 59 | errors: [], 60 | data: [], 61 | jobCount: siphonObj.urls.length 62 | } 63 | 64 | // Due to Selenium and setInterval speed, only one worker is required 65 | if (siphonObj.seleniumOptions || siphonObj.interval) siphonObj.numWorkers = 1; 66 | 67 | // Initiate the cluster 68 | console.log('Master cluster setting up ' + siphonObj.numWorkers + ' workers'); 69 | for (let i = 0; i < siphonObj.numWorkers; i++) { 70 | createWorker(statusMessage, siphonObj); 71 | } 72 | 73 | // Create event listeners for master the first time cluster is initialized 74 | if (siphonObj.initial) { 75 | 76 | // Restart workers on exit event (except for deliberate shutdown) 77 | cluster.on('exit', (worker, code, signal) => { 78 | if (code) createWorker(statusMessage, siphonObj); 79 | }); 80 | 81 | siphonObj.initial = false; 82 | } 83 | 84 | // Initiate interval to distribute jobs according to user. By default, all jobs are assigned at once. 85 | assignJobs(siphonObj.urls, cluster.workers, statusMessage, siphonObj); 86 | siphonObj.urls = []; 87 | 88 | if (!siphonObj.seleniumOptions && !siphonObj.interval) checkJobs(cluster.workers, statusMessage, siphonObj); 89 | 90 | // Workers have a listener for executing and shutting down 91 | } else { 92 | process.on('message', (message) => { 93 | if (message.type === 'execute') { 94 | siphonObj.searchTerms = message.searchTerms.map(term => typeof term === 'string' ? parseRegex(term) : term); 95 | siphonObj.html = parseFunction(message.htmlFunction); 96 | if (message.seleniumOptions) { 97 | siphonObj.seleniumOptions = { 98 | browser: message.seleniumOptions.browser, 99 | callback: parseFunction(message.seleniumOptions.callback), 100 | } 101 | } 102 | 103 | if (message.curInterval) setTimeout( () => siphonObj.giveWorker(message.url).execute(), message.curInterval); 104 | else siphonObj.giveWorker(message.url).execute(); 105 | } 106 | 107 | if (message.type === 'shutdown') { 108 | console.log('shutting down'); 109 | process.exit(0); 110 | } 111 | }); 112 | } 113 | } 114 | 115 | /** 116 | * @description Creates a worker and sets up listeners for communication with master 117 | * @param {Object} statMsg - Provides status message 118 | * @param {Object} siphonObj - Provides Siphon Object for access to its values 119 | */ 120 | function createWorker(statMsg, siphonObj) { 121 | const worker = cluster.fork(); 122 | 123 | worker.on('message', (message) => { 124 | if (message.type === 'data' && statMsg.jobs[message.data.url]) { 125 | statMsg.data.push(message.data); 126 | delete statMsg.jobs[message.data.url]; 127 | console.log(--statMsg.jobCount); 128 | } 129 | 130 | if (message.type === 'error' && statMsg.jobs[message.error.url]) { 131 | --statMsg.jobs[message.error.url]; 132 | 133 | // If job has no tries left, push it to array and delete the job 134 | if (statMsg.jobs[message.error.url] === 0) { 135 | statMsg.errors.push(message.error); 136 | delete statMsg.jobs[message.error.url]; 137 | console.log(--statMsg.jobCount); 138 | } 139 | } 140 | 141 | if (statMsg.jobCount === 0 && siphonObj.idle === false) { 142 | Object.keys(cluster.workers).forEach(worker => { 143 | cluster.workers[worker].send({ 144 | type: 'shutdown', 145 | from: 'master' 146 | }); 147 | }); 148 | 149 | console.log(`All jobs completed with ${statMsg.data.length} data points and ${statMsg.errors.length} errors`); 150 | if (siphonObj.notifyFunction) siphonObj.notifyFunction(statMsg); 151 | 152 | siphonObj.idle = true; 153 | } 154 | }); 155 | } 156 | 157 | /** 158 | * @description Assigns all outstanding jobs to workers on a decreasing interval 159 | * @param {Object} workers - References Node cluster workers 160 | * @param {Object} statMsg - Provides status message 161 | * @param {Object} siphonObj - Provides Siphon Object for later access to its values 162 | */ 163 | function checkJobs(workers, statMsg, siphonObj) { 164 | const urlArray = Object.keys(statMsg.jobs); 165 | let remainCount = urlArray.length; 166 | if (remainCount > 0) { 167 | setTimeout(() => { 168 | assignJobs(urlArray, workers, statMsg, siphonObj); 169 | checkJobs(workers, statMsg, siphonObj); 170 | }, remainCount > 100 ? remainCount * 50: 2000); 171 | } 172 | } 173 | 174 | /** 175 | * @description Master distributes jobs to workers 176 | * @param {Array} urlArray - Provides URLs included in get method 177 | * @param {Object} workers - References Node cluster workers 178 | * @param {Object} statMsg - Provides status message 179 | * @param {Object} siphonObj - Provides Siphon Object for later access to its values 180 | */ 181 | function assignJobs(urlArray, workers, statMsg, siphonObj) { 182 | let num = 0; 183 | while (urlArray[0] !== undefined) { 184 | 185 | Object.keys(workers).forEach((worker) => { 186 | if (urlArray[0] !== undefined) { 187 | let currentJob = urlArray.pop(); 188 | let curInterval = siphonObj.interval ? siphonObj.interval * num++ : false; 189 | if(statMsg.jobs[currentJob] === undefined) statMsg.jobs[currentJob] = siphonObj.tries; 190 | // console.log('this job should be showing up', currentJob,'|||', statMsg.jobs) 191 | 192 | workers[worker].send({ 193 | type: 'execute', url: currentJob, htmlFunction: siphonObj.html, 194 | curInterval, searchTerms: siphonObj.searchTerms, 195 | seleniumOptions: siphonObj.seleniumOptions, from: 'master' 196 | }) 197 | } 198 | }); 199 | } 200 | } 201 | 202 | /** 203 | * @description Used as JSON.parse parameter to convert a RegExp string to a RegExp object 204 | * @param {regexStr} - Stringified regular expression 205 | * @return {RegExp} - A regular expression 206 | */ 207 | function parseRegex(regexStr) { 208 | const m = regexStr.split("__REGEXP ")[1].match(/\/(.*)\/(.*)?/); 209 | return new RegExp(m[1], m[2] || ""); 210 | } 211 | 212 | /** 213 | * @description Used as JSON.parse parameter to convert a function string to a function 214 | * @param {funcStr} - Stringified function 215 | * @return {function} - A revived function 216 | */ 217 | function parseFunction(funcStr) { 218 | return new Function('return ' + funcStr)(); 219 | } 220 | 221 | /** 222 | * @description Used as JSON.parse parameter to convert a function string to a function 223 | * @param {String} key - If reviver finds object or array, this represents each key 224 | * @param {Any} value - If reviver finds object or array, this represents each value 225 | * @return {Any} - Revives notify function with full functionality OR original value 226 | */ 227 | function reviver(key, value) { 228 | return key === 'notifyFunction' ? new Function('return ' + value)() : value; 229 | } 230 | 231 | module.exports = run; --------------------------------------------------------------------------------