├── .babelrc ├── .gitignore ├── .npmignore ├── LICENSE ├── example └── print.js ├── index.js ├── package.json ├── src ├── scrapers │ ├── free-proxy-list.net.js │ ├── gatherproxy.com.js │ ├── hide-my-ip.com.js │ ├── hidemy.name.js │ ├── hidester.com.js │ ├── index.js │ ├── multiproxy.org.js │ ├── nordvpn.com.js │ ├── premproxy.com.js │ ├── proxy24.blogspot.fr.js │ ├── proxydb.net.js │ ├── spys.me.js │ └── text.js ├── util │ ├── fetch-cheerio.js │ ├── fetch-session.js │ ├── promise-lock.js │ └── temp-mail.js └── worker.js └── tools └── build.js /.babelrc: -------------------------------------------------------------------------------- 1 | { 2 | "presets": [ 3 | ["env", { 4 | "targets": { 5 | "node": "current" 6 | } 7 | }] 8 | ], 9 | "plugins": [["transform-object-rest-spread", { "useBuiltIns": true }]] 10 | } -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | ### Proxy Scraper ### 2 | .gatherproxy.account 3 | lib 4 | 5 | ### Node ### 6 | # Logs 7 | logs 8 | *.log 9 | npm-debug.log* 10 | yarn-debug.log* 11 | yarn-error.log* 12 | 13 | # Runtime data 14 | pids 15 | *.pid 16 | *.seed 17 | *.pid.lock 18 | 19 | # Directory for instrumented libs generated by jscoverage/JSCover 20 | lib-cov 21 | 22 | # Coverage directory used by tools like istanbul 23 | coverage 24 | 25 | # nyc test coverage 26 | .nyc_output 27 | 28 | # Grunt intermediate storage (http://gruntjs.com/creating-plugins#storing-task-files) 29 | .grunt 30 | 31 | # Bower dependency directory (https://bower.io/) 32 | bower_components 33 | 34 | # node-waf configuration 35 | .lock-wscript 36 | 37 | # Compiled binary addons (http://nodejs.org/api/addons.html) 38 | build/Release 39 | 40 | # Dependency directories 41 | node_modules/ 42 | jspm_packages/ 43 | 44 | # Typescript v1 declaration files 45 | typings/ 46 | 47 | # Optional npm cache directory 48 | .npm 49 | 50 | # Optional eslint cache 51 | .eslintcache 52 | 53 | # Optional REPL history 54 | .node_repl_history 55 | 56 | # Output of 'npm pack' 57 | *.tgz 58 | 59 | # Yarn Integrity file 60 | .yarn-integrity 61 | 62 | # dotenv environment variables file 63 | .env -------------------------------------------------------------------------------- /.npmignore: -------------------------------------------------------------------------------- 1 | # Ignore everything 2 | /* 3 | 4 | !lib -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2016 David Duarte 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /example/print.js: -------------------------------------------------------------------------------- 1 | import ProxyScraper from '../index.js' 2 | import { Transform as TransformStream } from 'stream' 3 | import { createWriteStream } from 'fs' 4 | 5 | const scraper = new ProxyScraper({ workerCount: 10 }) 6 | 7 | scraper.getProxies(500).then(stream => { 8 | const toJson = new TransformStream({ objectMode: true }) 9 | toJson._transform = function(chunk, enc, cb) { 10 | this.push(`Working ${JSON.stringify(chunk)}\n`) 11 | cb() 12 | } 13 | 14 | stream.on('progress', progress => { 15 | console.log( 16 | `Progress ${progress.percentage.toFixed(2)}% (${progress.tested}/${progress.length}) (Source: ${progress.source})` 17 | ) 18 | }) 19 | 20 | stream.on('end', () => { 21 | console.log('Stopping workers') 22 | scraper.stop() 23 | }) 24 | 25 | const jsonStream = stream.pipe(toJson) 26 | const logStream = createWriteStream('./proxy.log') 27 | jsonStream.pipe(process.stdout) 28 | jsonStream.pipe(logStream) 29 | }) 30 | -------------------------------------------------------------------------------- /index.js: -------------------------------------------------------------------------------- 1 | import scrapers from './src/scrapers' 2 | import Lock from './src/util/promise-lock' 3 | import child from 'child_process' 4 | import fetch from 'node-fetch' 5 | import path from 'path' 6 | import { Readable as ReadableStream } from 'stream' 7 | import debug from 'debug' 8 | import os from 'os' 9 | 10 | const log = debug('proxy-scraper') 11 | 12 | const TYPES = ['http', 'socks'] 13 | const VALID_TYPES = ['socks', 'socks5', 'socks4', 'https', 'http'] 14 | 15 | export default class ProxyScraper { 16 | constructor({ workerCount = os.cpus().length } = {}) { 17 | this._workers = [] 18 | for (let i = 0; i < workerCount; i++) { 19 | log('Spawning worker %d', i) 20 | const worker = child.fork(path.join(__dirname, './src/worker.js'), [i]) 21 | worker.on('error', error => console.error(error)) 22 | this._workers.push(new Lock(worker)) 23 | } 24 | } 25 | 26 | getProxies(timeout) { 27 | return this.scrapProxies().then(proxies => 28 | this.testProxies(timeout, proxies) 29 | ) 30 | } 31 | 32 | testProxies(timeout, proxies) { 33 | log('Testing %d proxies with %d timeout', proxies.length, timeout) 34 | const stream = new ReadableStream({ objectMode: true }) 35 | const proxiesCount = proxies.length 36 | const queue = proxies.slice(0) //Clone it 37 | let testedProxies = 0 38 | stream._read = () => { 39 | for (const worker of this._workers) { 40 | let done = false 41 | const run = () => { 42 | if (queue.length > 0) { 43 | const proxy = queue.pop() 44 | worker 45 | .get(worker => 46 | this._testProxy( 47 | { 48 | url: 'http://example.com/', 49 | proxy: proxy.url(), 50 | timeout 51 | }, 52 | worker 53 | ) 54 | ) 55 | .then(time => { 56 | done = true 57 | proxy.time = time 58 | log('Working proxy: %o', proxy) 59 | stream.push(proxy) 60 | }) 61 | .catch(e => { 62 | if (e.type && e.type == 'missmatch') 63 | log('Content missmatch %o for proxy %o', e, proxy) 64 | }) 65 | .then(() => { 66 | testedProxies++ 67 | if(testedProxies === proxiesCount) 68 | stream.push(null) 69 | 
stream.emit('progress', { 70 | length: proxiesCount, 71 | tested: testedProxies, 72 | remaining: proxiesCount - testedProxies, 73 | percentage: (testedProxies / proxiesCount) * 100, 74 | source: proxy.source 75 | }) 76 | if (!done) run() 77 | }) 78 | } 79 | } 80 | run() 81 | } 82 | } 83 | return fetch('http://example.com/') 84 | .then(res => res.text()) 85 | .then(page => 86 | Promise.all( 87 | this._workers.map(worker => 88 | worker.get(worker => this._setPage(page, worker)) 89 | ) 90 | ) 91 | ) 92 | .then(() => stream) 93 | } 94 | 95 | _testProxy(proxy, worker) { 96 | worker.send({ 97 | event: 'test', 98 | data: proxy 99 | }) 100 | return new Promise((resolve, reject) => { 101 | worker.once('message', data => { 102 | if (data.working) { 103 | resolve(data.time) 104 | } else { 105 | reject(data.error) 106 | } 107 | }) 108 | }) 109 | } 110 | 111 | _setPage(page, worker) { 112 | worker.send({ 113 | event: 'page', 114 | data: page 115 | }) 116 | } 117 | 118 | scrapProxies() { 119 | const proxies = [] 120 | log('Scrapers: %o', Object.keys(ProxyScraper.scrapers)) 121 | for (let scraper in ProxyScraper.scrapers) { 122 | proxies.push( 123 | scrapers 124 | [scraper]() 125 | .then((proxies = []) => { 126 | log('Found %d proxies from %s', proxies.length, scraper) 127 | return proxies 128 | .map(proxy => this._aggregateProxy(proxy, scraper)) 129 | .reduce((prev, next) => prev.concat(next), []) 130 | }) 131 | .catch(e => { 132 | log('Error while scraping proxies with %s\n%o', scraper, e) 133 | return [] 134 | }) 135 | ) 136 | } 137 | return Promise.all(proxies).then(values => 138 | values.reduce((prev, next) => prev.concat(next)) 139 | ) 140 | } 141 | 142 | stop() { 143 | for (const worker of this._workers) { 144 | worker.get(worker => worker.kill()) 145 | } 146 | } 147 | 148 | _aggregateProxy(proxy, source) { 149 | const aproxy = { 150 | source, 151 | url() { 152 | return `${this.type}://${this.ip}:${this.port}` 153 | }, 154 | ...proxy 155 | } 156 | 157 | return VALID_TYPES.includes(aproxy.type) 158 | ? 
aproxy 159 | : TYPES.map(type => ({ 160 | ...aproxy, 161 | type 162 | })) 163 | } 164 | } 165 | 166 | ProxyScraper.scrapers = scrapers 167 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "proxy-scraper", 3 | "version": "2.1.5", 4 | "description": "Scrap proxis from the web", 5 | "main": "lib/index.js", 6 | "files": [ 7 | "lib" 8 | ], 9 | "scripts": { 10 | "print": "DEBUG='proxy-scraper*,-proxy-scraper:worker:*' babel-node example/print.js", 11 | "build": "babel-node tools/build.js", 12 | "prepublish": "npm run build", 13 | "format": "prettier --no-semi --single-quote --use-tabs --write index.js src/*.js src/**/*.js example/*.js tools/*.js" 14 | }, 15 | "repository": { 16 | "type": "git", 17 | "url": "git+https://github.com/DeltaEvo/proxy-scraper.git" 18 | }, 19 | "keywords": [ 20 | "proxy", 21 | "scraper" 22 | ], 23 | "author": "Duarte David ", 24 | "license": "MIT", 25 | "bugs": { 26 | "url": "https://github.com/DeltaEvo/proxy-scraper/issues" 27 | }, 28 | "homepage": "https://github.com/DeltaEvo/proxy-scraper#readme", 29 | "dependencies": { 30 | "cheerio": "^0.22.0", 31 | "debug": "^2.6.8", 32 | "form-data": "^2.1.4", 33 | "node-fetch": "^2.0.0-alpha", 34 | "proxy-agent": "^2.0.0", 35 | "socks-proxy-agent": "DeltaEvo/node-socks-proxy-agent", 36 | "temp-mail": "^2.0.0", 37 | "tough-cookie": "^2.3.2" 38 | }, 39 | "devDependencies": { 40 | "babel-cli": "^6.24.1", 41 | "babel-core": "^6.24.1", 42 | "babel-plugin-external-helpers": "^6.22.0", 43 | "babel-plugin-transform-object-rest-spread": "^6.23.0", 44 | "babel-preset-env": "^1.4.0", 45 | "fs-extra": "^3.0.1", 46 | "prettier": "^1.3.1", 47 | "rollup": "^0.41.6", 48 | "rollup-plugin-babel": "^2.7.1", 49 | "rollup-plugin-local-resolve": "^1.0.7" 50 | } 51 | } 52 | -------------------------------------------------------------------------------- /src/scrapers/free-proxy-list.net.js: -------------------------------------------------------------------------------- 1 | import fetch from 'node-fetch' 2 | import cheerio from '../util/fetch-cheerio' 3 | 4 | const ANONIMITY_LEVELS = ['transparent', 'anonymous', 'elite proxy'] 5 | 6 | const SOURCES = [ 7 | { 8 | url: 'http://www.free-proxy-list.net', 9 | type(element) { 10 | return element.eq(5).text() === 'yes' ? 
'https' : 'http' 11 | }, 12 | anonymity: 4 13 | }, 14 | { 15 | url: 'http://www.socks-proxy.net', 16 | type(element) { 17 | return element.eq(4).text().toLowerCase() 18 | }, 19 | anonymity: 5 20 | } 21 | ] 22 | 23 | export default function scrap() { 24 | return Promise.all( 25 | SOURCES.map(source => 26 | fetch(source.url).then(cheerio()).then($ => 27 | $('#proxylisttable > tbody > tr') 28 | .map((i, e) => { 29 | const element = $(e).find('td') 30 | const ip = element.eq(0).text() 31 | const port = element.eq(1).text() 32 | const country = element.eq(3).text().toUpperCase() 33 | const anonymity = ANONIMITY_LEVELS.indexOf( 34 | element.eq(source.anonymity).text().toLowerCase() 35 | ) 36 | const type = source.type(element) 37 | return { ip, port, country, anonymity, type } 38 | }) 39 | .get() 40 | ) 41 | ) 42 | ).then(datas => datas.reduce((prev, next) => prev.concat(next))) 43 | } 44 | -------------------------------------------------------------------------------- /src/scrapers/gatherproxy.com.js: -------------------------------------------------------------------------------- 1 | import url from 'url' 2 | import nodeFetch from 'node-fetch' 3 | import FormData from 'form-data' 4 | import { Script } from 'vm' 5 | import { extractProxies } from './text' 6 | import cheerio from '../util/fetch-cheerio' 7 | import Session from '../util/fetch-session' 8 | import * as TempMail from '../util/temp-mail' 9 | import debug from 'debug' 10 | import fs from 'fs' 11 | 12 | const log = debug('proxy-scraper:gatherproxy.com') 13 | 14 | const CAPTCHA_REPLACE = { 15 | multiplied: '*', 16 | plus: '+', 17 | minus: '-', 18 | x: '*', 19 | zero: 0, 20 | one: 1, 21 | two: 2, 22 | three: 3, 23 | four: 4, 24 | five: 5, 25 | six: 6, 26 | seven: 7, 27 | eight: 8, 28 | nine: 9 29 | } 30 | 31 | const ANONIMITY_LEVELS = ['transparent', 'anonymous', 'elite'] 32 | 33 | export default function scrap() { 34 | const session = new Session(nodeFetch) 35 | const fetch = (url, options) => session.fetch(url, options) 36 | return getAccount(fetch).then(() => { 37 | let chain = Promise.resolve([]) 38 | for (let level of ANONIMITY_LEVELS) { 39 | chain = chain.then(data => 40 | getProxyListId(fetch, level) 41 | .then(id => downloadProxyList(fetch, level, id)) 42 | .then(text => 43 | data.push( 44 | extractProxies(text, () => ({ 45 | anonimity: ANONIMITY_LEVELS.indexOf(level), 46 | type: 'http' 47 | })) 48 | ) 49 | ) 50 | .then(() => data) 51 | ) 52 | } 53 | return chain.then(datas => datas.reduce((prev, next) => prev.concat(next))) 54 | }) 55 | } 56 | 57 | function getAccount(fetch) { 58 | return loadAccountFromCache() 59 | .then(({ email, password }) => { 60 | log( 61 | 'Loaded account with email %s and password %s from .gatherproxy.account', 62 | email, 63 | password 64 | ) 65 | return login(fetch, email, password) 66 | }) 67 | .catch(() => { 68 | log('Invalid account in .gatherproxy.account, creating new one') 69 | return createAccount(fetch) 70 | .then(({ email, password }) => { 71 | log( 72 | 'Storing account with email %s and password %s in .gatherproxy.account', 73 | email, 74 | password 75 | ) 76 | return storeAccountToCache(email, password).catch(() => ({ 77 | email, 78 | password 79 | })) 80 | }) 81 | .then(({ email, password }) => login(fetch, email, password)) 82 | }) 83 | } 84 | 85 | function loadAccountFromCache() { 86 | return new Promise((resolve, reject) => { 87 | fs.readFile('./.gatherproxy.account', 'utf8', (err, data) => { 88 | if (err) reject(err) 89 | else resolve(JSON.parse(data)) 90 | }) 91 | }) 92 | } 93 | 94 | 
function storeAccountToCache(email, password) { 95 | return new Promise((resolve, reject) => { 96 | fs.writeFile( 97 | './.gatherproxy.account', 98 | JSON.stringify({ email, password }), 99 | err => { 100 | if (err) reject(err) 101 | else resolve({ email, password }) 102 | } 103 | ) 104 | }) 105 | } 106 | 107 | function createAccount(fetch) { 108 | const email = `${Math.random().toString(36).substring(7)}@doanart.com` // @binka.me doesn't receive email 109 | log('Creating account with email %s', email) 110 | const form = new FormData() 111 | form.append('email', email) 112 | return fetch('http://www.gatherproxy.com/subscribe', { 113 | method: 'POST', 114 | body: form, 115 | headers: form.getHeaders() 116 | }) 117 | .then(() => TempMail.poll(email)) 118 | .then( 119 | ([{ mail_text_only }]) => /<p>
Password: (.*)<\/p>/.exec(mail_text_only)[1] 120 | ) 121 | .then(password => { 122 | log('Account %s with password %s created', email, password) 123 | return { email, password } 124 | }) 125 | } 126 | 127 | function login(fetch, email, password) { 128 | log('Logging in with email %s and password %s', email, password) 129 | return fetch('http://www.gatherproxy.com/subscribe/login') 130 | .then(cheerio()) 131 | .then($ => { 132 | const captcha = $('#body > form > .label > .blue').first().text() 133 | 134 | const form = new FormData() 135 | form.append('Username', email) 136 | form.append('Password', password) 137 | form.append('Captcha', solveCaptcha(captcha)) 138 | 139 | return fetch('http://www.gatherproxy.com/subscribe/login', { 140 | method: 'POST', 141 | body: form, 142 | headers: form.getHeaders() 143 | }) 144 | }) 145 | } 146 | 147 | function solveCaptcha(raw) { 148 | const captcha = raw 149 | .split(' ') 150 | .map(part => part.toLowerCase()) 151 | .map(part => (part in CAPTCHA_REPLACE ? CAPTCHA_REPLACE[part] : part)) 152 | 153 | captcha.splice(captcha.indexOf('='), 1) // Remove the = 154 | 155 | const c = captcha.join(' ') 156 | const result = new Script(c).runInNewContext({}) 157 | 158 | log('Captcha %s (raw: %s) solved, result: %d', c, raw, result) 159 | return result 160 | } 161 | 162 | function getProxyListId(fetch, anonimity) { 163 | log('Getting id for anonimity %s', anonimity) 164 | const form = new FormData() 165 | form.append('Uptime', 0) 166 | form.append('Type', anonimity) 167 | return fetch('http://www.gatherproxy.com/proxylist/anonymityplaintext', { 168 | method: 'POST', 169 | body: form, 170 | headers: form.getHeaders(), 171 | redirect: 'manual' 172 | }).then(res => url.parse(res.headers.get('location'), true).query.sid) 173 | } 174 | 175 | function downloadProxyList(fetch, anonimity, id) { 176 | log('Downloading proxy with anonimity %s using id %s', anonimity, id) 177 | const form = new FormData() 178 | form.append('ID', id) 179 | form.append('T', anonimity) 180 | return fetch('http://www.gatherproxy.com/proxylist/downloadproxylist/', { 181 | method: 'POST', 182 | body: form, 183 | headers: form.getHeaders() 184 | }).then(res => res.text()) 185 | } 186 | -------------------------------------------------------------------------------- /src/scrapers/hide-my-ip.com.js: -------------------------------------------------------------------------------- 1 | import fetch from 'node-fetch' 2 | 3 | const ANONIMITY_LEVELS = ['Low', 'Med', 'High'] 4 | 5 | export default function scrap(types) { 6 | return fetch('https://www.hide-my-ip.com/proxylist.shtml', { 7 | headers: { 8 | 'User-Agent': 'Mozilla/5.0' 9 | } 10 | }) 11 | .then(res => res.text()) 12 | .then(text => 13 | /var json =..(\[(?:.*)\]);/.exec( 14 | text.split('\n').join(' ') 15 | ) 16 | ) 17 | .then(match => (match ? 
JSON.parse(match[1]) : [])) 18 | .then(proxies => 19 | proxies.map(proxy => ({ 20 | ip: proxy.i, 21 | port: parseInt(proxy.p), 22 | country: proxy.c.n.toUpperCase(), 23 | type: proxy.tp.toLowerCase(), 24 | anonimity: ANONIMITY_LEVELS.indexOf(proxy.a) 25 | })) 26 | ) 27 | } 28 | -------------------------------------------------------------------------------- /src/scrapers/hidemy.name.js: -------------------------------------------------------------------------------- 1 | import fetch from 'node-fetch' 2 | import cheerio from '../util/fetch-cheerio' 3 | import { validIp, validPort } from './text' 4 | 5 | export default function scrap() { 6 | const proxies = [] 7 | for (let i = 0; i < 20; i++) { 8 | proxies.push( 9 | fetch(`https://hidemy.name/en/proxy-list/?start=${i * 64}`, { 10 | headers: { 11 | 'User-Agent': 'Mozilla/5.0' 12 | } 13 | }) 14 | .then(cheerio()) 15 | .then($ => { 16 | return $('.proxy__t > tbody > tr') 17 | .map((i, e) => { 18 | const element = $(e).find('td') 19 | const ip = element.eq(0).text() 20 | const port = element.eq(1).text() 21 | const type = element.eq(4).text().split(',')[0].toLowerCase() 22 | if (validIp(ip) && validPort(port)) return { ip, port, type } 23 | }) 24 | .filter(e => e !== undefined) 25 | .get() 26 | }) 27 | ) 28 | } 29 | return Promise.all(proxies).then(datas => 30 | datas.reduce((prev, next) => prev.concat(next)) 31 | ) 32 | } 33 | -------------------------------------------------------------------------------- /src/scrapers/hidester.com.js: -------------------------------------------------------------------------------- 1 | import fetch from 'node-fetch' 2 | 3 | const ANONIMITY_LEVELS = ['transparent', 'anonymous', 'elite'] 4 | 5 | export default function scrap(types) { 6 | return fetch( 7 | 'https://hidester.com/proxydata/php/data.php?mykey=csv&gproxy=2', 8 | { 9 | headers: { 10 | Referer: 'https://hidester.com/proxylist/' 11 | } 12 | } 13 | ) 14 | .then(req => req.json()) 15 | .then(datas => 16 | datas.map(({ IP: ip, PORT: port, type, country, anonymity }) => ({ 17 | ip, 18 | port, 19 | type, 20 | country, 21 | anonimity: ANONIMITY_LEVELS.indexOf(anonymity.toLowerCase()) 22 | })) 23 | ) 24 | } 25 | -------------------------------------------------------------------------------- /src/scrapers/index.js: -------------------------------------------------------------------------------- 1 | import freeproxylist from './free-proxy-list.net' 2 | import gatherproxy from './gatherproxy.com' 3 | import hidemyip from './hide-my-ip.com' 4 | import hidemyname from './hidemy.name' 5 | import hidester from './hidester.com' 6 | import multiproxy from './multiproxy.org' 7 | import nordvpn from './nordvpn.com' 8 | import premproxy from './premproxy.com' 9 | import proxy24 from './proxy24.blogspot.fr' 10 | import proxydb from './proxydb.net' 11 | import spys from './spys.me' 12 | 13 | export default { 14 | 'free-proxy-list.net': freeproxylist, 15 | 'gatherproxy.com': gatherproxy, 16 | 'hide-my-ip.com': hidemyip, 17 | 'hidemy.name': hidemyname, 18 | 'hidester.com': hidester, 19 | 'multiproxy.org': multiproxy, 20 | 'nordvpn.com': nordvpn, 21 | 'premproxy.com': premproxy, 22 | 'proxy24.blogspot.fr': proxy24, 23 | 'proxydb.net': proxydb, 24 | 'spys.me': spys 25 | } 26 | -------------------------------------------------------------------------------- /src/scrapers/multiproxy.org.js: -------------------------------------------------------------------------------- 1 | import createTextScraper from './text' 2 | 3 | export default 
createTextScraper('http://multiproxy.org/txt_all/proxy.txt') 4 | -------------------------------------------------------------------------------- /src/scrapers/nordvpn.com.js: -------------------------------------------------------------------------------- 1 | import fetch from 'node-fetch' 2 | import Session from '../util/fetch-session' 3 | 4 | export default function scrap() { 5 | const session = new Session(fetch) 6 | 7 | return session 8 | .fetch('https://nordvpn.com/free-proxy-list/') 9 | .then(() => 10 | session.fetch( 11 | 'https://nordvpn.com/wp-admin/admin-ajax.php?searchParameters%5B0%5D%5Bname%5D=proxy-country&searchParameters%5B0%5D%5Bvalue%5D=&searchParameters%5B1%5D%5Bname%5D=proxy-ports&searchParameters%5B1%5D%5Bvalue%5D=&offset=0&limit=100000&action=getProxies', 12 | { 13 | method: 'POST' 14 | } 15 | ) 16 | ) 17 | .then(res => res.json()) 18 | .then(datas => 19 | datas.map(({ ip, port, country, type }) => ({ 20 | ip, 21 | port, 22 | country, 23 | type: type.toLowerCase() 24 | })) 25 | ) 26 | } 27 | -------------------------------------------------------------------------------- /src/scrapers/premproxy.com.js: -------------------------------------------------------------------------------- 1 | import fetch from 'node-fetch' 2 | import cheerio from '../util/fetch-cheerio' 3 | 4 | const ANONIMITY_LEVELS = ['transparent', 'anonymous', 'high-anonymous'] 5 | 6 | const SOURCES = [ 7 | { 8 | url: 'https://premproxy.com/list/', 9 | data(element) { 10 | return { 11 | type: 'http', 12 | anonymity: element.eq(1).text().trim().toLowerCase() 13 | } 14 | }, 15 | pages: 20 16 | }, 17 | { 18 | url: 'https://premproxy.com/socks-list/', 19 | data(element) { 20 | return { 21 | type: element.eq(1).text().trim().toLowerCase() 22 | } 23 | }, 24 | pages: 5 25 | } 26 | ] 27 | 28 | export default function scrap() { 29 | return Promise.all( 30 | SOURCES.map(source => { 31 | const promises = [] 32 | for (let i = 1; i <= source.pages; i++) { 33 | promises.push( 34 | fetch(`${source.url}/${i > 9 ? 
i : '0' + i}.htm`) 35 | .then(cheerio()) 36 | .then($ => 37 | $('#proxylist > tr:not(.list_sorted)') 38 | .map((i, e) => { 39 | const element = $(e).find('td') 40 | const [ip, port] = element.eq(0).text().split(':') 41 | const data = source.data(element) 42 | const country = element.eq(3).text().toUpperCase() 43 | return { ip, port, country, ...data } 44 | }) 45 | .get() 46 | ) 47 | ) 48 | } 49 | return Promise.all(promises).then(values => 50 | values.reduce((prev, next) => prev.concat(next)) 51 | ) 52 | }) 53 | ).then(values => values.reduce((prev, next) => prev.concat(next))) 54 | } 55 | -------------------------------------------------------------------------------- /src/scrapers/proxy24.blogspot.fr.js: -------------------------------------------------------------------------------- 1 | import fetch from 'node-fetch' 2 | import cheerio from '../util/fetch-cheerio' 3 | import { extractProxies } from './text' 4 | 5 | const SOURCES = [ 6 | { 7 | url: 'http://proxyserverlist-24.blogspot.fr/', 8 | type: 'http', 9 | selector: '.post-body > pre > span > span:nth-child(2)' 10 | }, 11 | { 12 | url: 'http://sslproxies24.blogspot.fr/', 13 | type: 'https', 14 | selector: '.post-body > pre > span > span' 15 | }, 16 | { 17 | url: 'http://vip-socks24.blogspot.com/', 18 | type: 'socks5', 19 | selector: '.post-body > textarea' 20 | } 21 | ] 22 | 23 | export default function scrap() { 24 | const result = [] 25 | for (const source of SOURCES) { 26 | result.push( 27 | fetch(source.url) 28 | .then(cheerio()) 29 | .then($ => 30 | Promise.all( 31 | $('.post-title > a') 32 | .map((i, e) => 33 | fetch($(e).attr('href')) 34 | .then(cheerio()) 35 | .then($ => $(source.selector).eq(0).text()) 36 | .then(proxies => 37 | extractProxies(proxies, () => ({ type: source.type })) 38 | ) 39 | ) 40 | .get() 41 | ).then(datas => datas.reduce((prev, next) => prev.concat(next))) 42 | ) 43 | ) 44 | } 45 | return Promise.all(result).then(datas => 46 | datas.reduce((prev, next) => prev.concat(next)) 47 | ) 48 | } 49 | -------------------------------------------------------------------------------- /src/scrapers/proxydb.net.js: -------------------------------------------------------------------------------- 1 | import fetch from 'node-fetch' 2 | import cheerio from '../util/fetch-cheerio' 3 | 4 | export default function scrap() { 5 | const promises = [] 6 | for (let offset = 0; offset < 1000; offset += 50) { 7 | promises.push( 8 | fetch(`http://proxydb.net/?limit=50&offset=${offset}`) 9 | .then(cheerio()) 10 | .then($ => { 11 | if (!$) return [] 12 | return $('table > tbody > tr') 13 | .map((i, element) => { 14 | let url = $(element).find('td > a').first().text().trim() 15 | let ip = /(\d+\.){3}\d+/.exec(url)[0] 16 | let port = /\d+$/.exec(url)[0] 17 | 18 | return { ip, port } 19 | }) 20 | .get() 21 | }) 22 | ) 23 | } 24 | return Promise.all(promises).then( 25 | values => values.reduce((prev, next) => prev.concat(next)), 26 | [] 27 | ) 28 | } 29 | -------------------------------------------------------------------------------- /src/scrapers/spys.me.js: -------------------------------------------------------------------------------- 1 | import createTextScraper from './text' 2 | 3 | const ANONIMITY_LEVELS = ['N', 'A', 'H'] 4 | 5 | export default createTextScraper('http://spys.me/proxy.txt', data => { 6 | const [country, anonymity, ssl = 'N'] = data[0].split('-') 7 | return { 8 | type: ssl.charAt(0) === 'S' ? 
'https' : 'http', 9 | country, 10 | anonymity: ANONIMITY_LEVELS.indexOf(anonymity) 11 | } 12 | }) 13 | -------------------------------------------------------------------------------- /src/scrapers/text.js: -------------------------------------------------------------------------------- 1 | import fetch from 'node-fetch' 2 | 3 | export default function createTextScraper(url, aggregator) { 4 | return function scrap() { 5 | return fetch(url) 6 | .then(res => res.text()) 7 | .then(text => extractProxies(text, aggregator)) 8 | } 9 | } 10 | 11 | export function extractProxies(text, aggregator = function() {}) { 12 | return text 13 | .split('\n') 14 | .map(proxy => { 15 | const more = proxy.split(' ') 16 | const [ip, p] = more[0].split(':') 17 | const port = parseInt(p) 18 | if (validIp(ip) && validPort(port)) { 19 | const agg = aggregator(more.slice(1)) 20 | return { 21 | ip, 22 | port, 23 | ...agg 24 | } 25 | } 26 | }) 27 | .filter(proxy => proxy !== undefined) 28 | } 29 | 30 | export function validIp(ip) { 31 | return true 32 | } 33 | 34 | export function validPort(port) { 35 | return port > 0 && port <= 65535 36 | } 37 | -------------------------------------------------------------------------------- /src/util/fetch-cheerio.js: -------------------------------------------------------------------------------- 1 | import $ from 'cheerio' 2 | 3 | export default function cheerio(options) { 4 | return req => req.text().then(body => $.load(body, options)) 5 | } 6 | -------------------------------------------------------------------------------- /src/util/fetch-session.js: -------------------------------------------------------------------------------- 1 | import { CookieJar } from 'tough-cookie' 2 | 3 | export default class Session { 4 | constructor(fetch) { 5 | this.jar = new CookieJar() 6 | this._fetch = fetch 7 | } 8 | 9 | fetch(url, opts = {}) { 10 | return this.getCookies(url).then(cookies => { 11 | return this._fetch( 12 | url, 13 | Object.assign(opts, { 14 | headers: Object.assign(opts.headers || {}, { 15 | cookie: cookies.join('; ') 16 | }) 17 | }) 18 | ).then(res => { 19 | let cookies = res.headers.get('set-cookie') 20 | cookies = (cookies && cookies.split(',')) || [] 21 | cookies.map(cookie => this.jar.setCookie(cookie, url, () => null)) 22 | return res 23 | }) 24 | }) 25 | } 26 | 27 | getCookies(url) { 28 | return new Promise((resolve, reject) => { 29 | this.jar.getCookies(url, (err, cookies) => { 30 | if (err) reject(err) 31 | else resolve(cookies) 32 | }) 33 | }) 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /src/util/promise-lock.js: -------------------------------------------------------------------------------- 1 | export default class Lock { 2 | constructor(inner) { 3 | this.inner = Promise.resolve(inner) 4 | } 5 | 6 | get(callback) { 7 | const p = this.inner.then(inner => { 8 | const result = callback(inner) 9 | return Promise.resolve(result).then( 10 | result => ({ inner, result }), 11 | err => ({ inner, err }) 12 | ) 13 | }) 14 | this.inner = p.then(({ inner }) => inner) 15 | return p.then(({ result, err }) => (err ? 
Promise.reject(err) : result)) 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /src/util/temp-mail.js: -------------------------------------------------------------------------------- 1 | import { getInbox } from 'temp-mail' 2 | import debug from 'debug' 3 | 4 | const log = debug('proxy-scraper:temp-mail') 5 | 6 | const NOOP = function() {} 7 | 8 | export * from 'temp-mail' 9 | 10 | export function poll(email, interval = 5000, iterations = 20) { 11 | return new Promise((resolve, reject) => { 12 | let count = 0 13 | const id = setInterval(() => { 14 | if (count < iterations) { 15 | count++ 16 | log( 17 | 'Polling inbox for mail %s iteration %d, max: %d', 18 | email, 19 | count, 20 | iterations 21 | ) 22 | getInbox(email) 23 | .then(mails => { 24 | resolve(mails) 25 | clearInterval(id) 26 | }) 27 | .catch(NOOP) //Ignore Error 28 | } else { 29 | clearInterval(id) 30 | reject(new Error('Timeout exceeded, no messages found.')) 31 | } 32 | }, interval) 33 | }) 34 | } 35 | -------------------------------------------------------------------------------- /src/worker.js: -------------------------------------------------------------------------------- 1 | import ProxyAgent from 'proxy-agent' 2 | import fetch from 'node-fetch' 3 | import debug from 'debug' 4 | import { parse } from 'url' 5 | const log = debug(`proxy-scraper:worker:${process.argv[2]}`) 6 | 7 | log('Worker started !') 8 | 9 | let page 10 | 11 | process.on('message', msg => { 12 | switch (msg.event) { 13 | case 'test': 14 | testProxy(msg.data) 15 | break 16 | case 'page': 17 | page = msg.data 18 | break 19 | } 20 | }) 21 | 22 | function testProxy({ url, method, proxy, timeout }) { 23 | log('testing %s with proxy %s and %s ms of timeout', url, proxy, timeout) 24 | const startTime = Date.now() 25 | const p = parse(proxy) 26 | p.timeout = timeout 27 | fetch(url, { 28 | method: method || 'GET', 29 | agent: new ProxyAgent(p), 30 | timeout 31 | }) 32 | .then(res => { 33 | if (page) { 34 | return res.text().then(p => { 35 | if (p != page) { 36 | const e = new Error('Page content missmatch') 37 | e.type = 'missmatch' 38 | throw e 39 | } 40 | }) 41 | } 42 | }) 43 | .then(() => process.send({ working: true, time: Date.now() - startTime })) 44 | .catch(error => process.send({ error, working: false })) 45 | } 46 | -------------------------------------------------------------------------------- /tools/build.js: -------------------------------------------------------------------------------- 1 | import data from 'babel-preset-env/data/plugins.json' 2 | import { rollup } from 'rollup' 3 | import babel from 'rollup-plugin-babel' 4 | import resolve from 'rollup-plugin-local-resolve' 5 | import pkg from '../package.json' 6 | import fs from 'fs-extra' 7 | import { join, dirname } from 'path' 8 | import { transformFile } from 'babel-core' 9 | 10 | const VERSIONS = Object.values(data) 11 | .map(p => p.node) 12 | .filter((e, i, array) => e && array.indexOf(e) == i) 13 | .sort() 14 | .reverse() 15 | 16 | const NODE_MODULES = [ 17 | 'vm', 18 | 'fs', 19 | 'os', 20 | 'path', 21 | 'child_process', 22 | 'url', 23 | 'stream' 24 | ] 25 | 26 | const FOLDER = 'lib' 27 | 28 | const WORKER = 'src/worker.js' 29 | 30 | function generateLoader(versions) { 31 | const checks = VERSIONS.map( 32 | ver => `if(v>=${ver})module.exports=require('${versions.get(ver)}')\n` 33 | ).reduce((prev, next) => (prev ? 
`${prev}else ${next}` : next)) 34 | return `var v=parseFloat(process.versions.node)\n${checks}` 35 | } 36 | 37 | function build() { 38 | const versions = new Map() 39 | let chain = Promise.resolve() 40 | for (const ver of VERSIONS) { 41 | versions.set(ver, `./${ver}.js`) 42 | chain = chain.then(() => console.log(`Building for node ${ver}`)).then(() => 43 | rollup({ 44 | entry: './index.js', 45 | external: Object.keys(pkg.dependencies).concat(NODE_MODULES), 46 | plugins: [ 47 | resolve(), 48 | babel({ 49 | babelrc: false, 50 | exclude: 'node_modules/**', 51 | presets: [ 52 | [ 53 | 'env', 54 | { 55 | targets: { 56 | node: ver 57 | }, 58 | modules: false 59 | } 60 | ] 61 | ], 62 | plugins: [ 63 | 'external-helpers', 64 | ['transform-object-rest-spread', { useBuiltIns: true }] 65 | ] 66 | }) 67 | ] 68 | }).then(bundle => 69 | bundle.write({ 70 | dest: join(FOLDER, `${ver}.js`), 71 | format: 'cjs', 72 | sourceMap: true 73 | }) 74 | ) 75 | ) 76 | } 77 | return chain.then(() => versions) 78 | } 79 | 80 | function copyWorker() { 81 | return new Promise((resolve, reject) => 82 | transformFile( 83 | WORKER, 84 | { 85 | babelrc: false, 86 | ast: false, 87 | sourceMaps: true, 88 | presets: [ 89 | [ 90 | 'env', 91 | { 92 | targets: { 93 | node: 0.12 94 | } 95 | } 96 | ] 97 | ], 98 | plugins: [['transform-object-rest-spread', { useBuiltIns: true }]] 99 | }, 100 | (err, result) => (err ? reject(err) : resolve(result)) 101 | ) 102 | ).then(({ code, map }) => 103 | Promise.all([ 104 | fs.outputFile(join(FOLDER, WORKER), code), 105 | fs.outputJson(join(FOLDER, WORKER + '.map'), map) 106 | ]) 107 | ) 108 | } 109 | fs 110 | .remove(FOLDER) 111 | .then(build) 112 | .then(generateLoader) 113 | .then( 114 | loader => 115 | new Promise((resolve, reject) => 116 | fs.writeFile( 117 | join(FOLDER, 'index.js'), 118 | loader, 119 | err => (err ? reject(err) : resolve()) 120 | ) 121 | ) 122 | ) 123 | .then(copyWorker) 124 | .catch(e => console.error(e)) 125 | --------------------------------------------------------------------------------
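
Each module in src/scrapers exports a function that resolves to an array of plain proxy objects ({ ip, port } plus optional type, country and anonymity fields), and src/scrapers/index.js maps a source name to that function; index.js then normalizes every entry through _aggregateProxy, expanding entries whose type is missing or unknown into one http and one socks candidate. A minimal sketch of an extra plain-text source built on the existing createTextScraper helper — the file name and URL below are purely illustrative, not a real endpoint:

// src/scrapers/example.org.js (hypothetical source, shown only to illustrate the scraper contract)
import createTextScraper from './text'

// createTextScraper fetches the URL, keeps every line whose first token parses
// as ip:port with a valid port, and merges whatever the aggregator returns
// into each proxy object.
export default createTextScraper('http://example.org/proxies.txt', () => ({
	type: 'http'
}))

It would then be registered next to the existing entries in src/scrapers/index.js, e.g. an import of the module and an 'example.org' key pointing at it.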
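
src/worker.js implements a small two-message IPC protocol: a 'page' message stores the reference body that proxied responses are compared against, and a 'test' message fetches the given URL through the given proxy and replies with { working: true, time } or { working: false, error }. A rough sketch of driving one worker directly, assuming it runs through the repo's babel setup (e.g. babel-node) or as the compiled lib/src/worker.js; the proxy address is a placeholder, not a known proxy:

import fetch from 'node-fetch'
import child from 'child_process'

fetch('http://example.com/')
	.then(res => res.text())
	.then(page => {
		const worker = child.fork('./src/worker.js', ['0'])
		// Reference page used by the worker to detect proxies that tamper with content.
		worker.send({ event: 'page', data: page })
		// Ask for a single test; the proxy URL below is a placeholder.
		worker.send({
			event: 'test',
			data: { url: 'http://example.com/', proxy: 'http://127.0.0.1:8080', timeout: 5000 }
		})
		worker.once('message', result => {
			console.log(result.working ? `working in ${result.time} ms` : 'not working')
			worker.kill()
		})
	})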
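
The Lock class in src/util/promise-lock.js is what keeps each child process handling one test at a time: every get(callback) chains onto the promise left by the previous call, so callbacks run strictly in submission order while each caller still receives that callback's own result or rejection. A standalone sketch of the behaviour, using a plain object in place of a worker and importing the class straight from the source tree (e.g. under babel-node from the repo root):

import Lock from './src/util/promise-lock'

const lock = new Lock({ count: 0 })

// The second callback only starts once the first one's promise has settled,
// even though both get() calls are issued synchronously.
lock
	.get(res => new Promise(resolve => setTimeout(() => resolve(++res.count), 100)))
	.then(value => console.log('first finished with', value)) // logs 1 first
lock
	.get(res => ++res.count)
	.then(value => console.log('second finished with', value)) // then logs 2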