├── .babelrc ├── .gitignore ├── .npmignore ├── LICENSE ├── example └── print.js ├── index.js ├── package.json ├── src ├── scrapers │ ├── free-proxy-list.net.js │ ├── gatherproxy.com.js │ ├── hide-my-ip.com.js │ ├── hidemy.name.js │ ├── hidester.com.js │ ├── index.js │ ├── multiproxy.org.js │ ├── nordvpn.com.js │ ├── premproxy.com.js │ ├── proxy24.blogspot.fr.js │ ├── proxydb.net.js │ ├── spys.me.js │ └── text.js ├── util │ ├── fetch-cheerio.js │ ├── fetch-session.js │ ├── promise-lock.js │ └── temp-mail.js └── worker.js └── tools └── build.js /.babelrc: -------------------------------------------------------------------------------- 1 | { 2 | "presets": [ 3 | ["env", { 4 | "targets": { 5 | "node": "current" 6 | } 7 | }] 8 | ], 9 | "plugins": [["transform-object-rest-spread", { "useBuiltIns": true }]] 10 | } -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | ### Proxy Scraper ### 2 | .gatherproxy.account 3 | lib 4 | 5 | ### Node ### 6 | # Logs 7 | logs 8 | *.log 9 | npm-debug.log* 10 | yarn-debug.log* 11 | yarn-error.log* 12 | 13 | # Runtime data 14 | pids 15 | *.pid 16 | *.seed 17 | *.pid.lock 18 | 19 | # Directory for instrumented libs generated by jscoverage/JSCover 20 | lib-cov 21 | 22 | # Coverage directory used by tools like istanbul 23 | coverage 24 | 25 | # nyc test coverage 26 | .nyc_output 27 | 28 | # Grunt intermediate storage (http://gruntjs.com/creating-plugins#storing-task-files) 29 | .grunt 30 | 31 | # Bower dependency directory (https://bower.io/) 32 | bower_components 33 | 34 | # node-waf configuration 35 | .lock-wscript 36 | 37 | # Compiled binary addons (http://nodejs.org/api/addons.html) 38 | build/Release 39 | 40 | # Dependency directories 41 | node_modules/ 42 | jspm_packages/ 43 | 44 | # Typescript v1 declaration files 45 | typings/ 46 | 47 | # Optional npm cache directory 48 | .npm 49 | 50 | # Optional eslint cache 51 | .eslintcache 52 | 53 | # Optional REPL history 54 | .node_repl_history 55 | 56 | # Output of 'npm pack' 57 | *.tgz 58 | 59 | # Yarn Integrity file 60 | .yarn-integrity 61 | 62 | # dotenv environment variables file 63 | .env -------------------------------------------------------------------------------- /.npmignore: -------------------------------------------------------------------------------- 1 | # Ignore everything 2 | /* 3 | 4 | !lib -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2016 David Duarte 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /example/print.js: -------------------------------------------------------------------------------- 1 | import ProxyScraper from '../index.js' 2 | import { Transform as TransformStream } from 'stream' 3 | import { createWriteStream } from 'fs' 4 | 5 | const scraper = new ProxyScraper({ workerCount: 10 }) 6 | 7 | scraper.getProxies(500).then(stream => { 8 | const toJson = new TransformStream({ objectMode: true }) 9 | toJson._transform = function(chunk, enc, cb) { 10 | this.push(`Working ${JSON.stringify(chunk)}\n`) 11 | cb() 12 | } 13 | 14 | stream.on('progress', progress => { 15 | console.log( 16 | `Progress ${progress.percentage.toFixed(2)}% (${progress.tested}/${progress.length}) (Source: ${progress.source})` 17 | ) 18 | }) 19 | 20 | stream.on('end', () => { 21 | console.log('Stopping workers') 22 | scraper.stop() 23 | }) 24 | 25 | const jsonStream = stream.pipe(toJson) 26 | const logStream = createWriteStream('./proxy.log') 27 | jsonStream.pipe(process.stdout) 28 | jsonStream.pipe(logStream) 29 | }) 30 | -------------------------------------------------------------------------------- /index.js: -------------------------------------------------------------------------------- 1 | import scrapers from './src/scrapers' 2 | import Lock from './src/util/promise-lock' 3 | import child from 'child_process' 4 | import fetch from 'node-fetch' 5 | import path from 'path' 6 | import { Readable as ReadableStream } from 'stream' 7 | import debug from 'debug' 8 | import os from 'os' 9 | 10 | const log = debug('proxy-scraper') 11 | 12 | const TYPES = ['http', 'socks'] 13 | const VALID_TYPES = ['socks', 'socks5', 'socks4', 'https', 'http'] 14 | 15 | export default class ProxyScraper { 16 | constructor({ workerCount = os.cpus().length } = {}) { 17 | this._workers = [] 18 | for (let i = 0; i < workerCount; i++) { 19 | log('Spawning worker %d', i) 20 | const worker = child.fork(path.join(__dirname, './src/worker.js'), [i]) 21 | worker.on('error', error => console.error(error)) 22 | this._workers.push(new Lock(worker)) 23 | } 24 | } 25 | 26 | getProxies(timeout) { 27 | return this.scrapProxies().then(proxies => 28 | this.testProxies(timeout, proxies) 29 | ) 30 | } 31 | 32 | testProxies(timeout, proxies) { 33 | log('Testing %d proxies with %d timeout', proxies.length, timeout) 34 | const stream = new ReadableStream({ objectMode: true }) 35 | const proxiesCount = proxies.length 36 | const queue = proxies.slice(0) //Clone it 37 | let testedProxies = 0 38 | stream._read = () => { 39 | for (const worker of this._workers) { 40 | let done = false 41 | const run = () => { 42 | if (queue.length > 0) { 43 | const proxy = queue.pop() 44 | worker 45 | .get(worker => 46 | this._testProxy( 47 | { 48 | url: 'http://example.com/', 49 | proxy: proxy.url(), 50 | timeout 51 | }, 52 | worker 53 | ) 54 | ) 55 | .then(time => { 56 | done = true 57 | proxy.time = time 58 | log('Working proxy: %o', proxy) 59 | stream.push(proxy) 60 | }) 61 | .catch(e => { 62 | if (e.type && e.type == 'missmatch') 63 | log('Content missmatch %o for proxy %o', e, proxy) 64 | }) 65 | .then(() => { 66 | testedProxies++ 67 | if(testedProxies === proxiesCount) 68 | stream.push(null) 69 | 
stream.emit('progress', { 70 | length: proxiesCount, 71 | tested: testedProxies, 72 | remaining: proxiesCount - testedProxies, 73 | percentage: (testedProxies / proxiesCount) * 100, 74 | source: proxy.source 75 | }) 76 | if (!done) run() 77 | }) 78 | } 79 | } 80 | run() 81 | } 82 | } 83 | return fetch('http://example.com/') 84 | .then(res => res.text()) 85 | .then(page => 86 | Promise.all( 87 | this._workers.map(worker => 88 | worker.get(worker => this._setPage(page, worker)) 89 | ) 90 | ) 91 | ) 92 | .then(() => stream) 93 | } 94 | 95 | _testProxy(proxy, worker) { 96 | worker.send({ 97 | event: 'test', 98 | data: proxy 99 | }) 100 | return new Promise((resolve, reject) => { 101 | worker.once('message', data => { 102 | if (data.working) { 103 | resolve(data.time) 104 | } else { 105 | reject(data.error) 106 | } 107 | }) 108 | }) 109 | } 110 | 111 | _setPage(page, worker) { 112 | worker.send({ 113 | event: 'page', 114 | data: page 115 | }) 116 | } 117 | 118 | scrapProxies() { 119 | const proxies = [] 120 | log('Scrapers: %o', Object.keys(ProxyScraper.scrapers)) 121 | for (let scraper in ProxyScraper.scrapers) { 122 | proxies.push( 123 | scrapers 124 | [scraper]() 125 | .then((proxies = []) => { 126 | log('Found %d proxies from %s', proxies.length, scraper) 127 | return proxies 128 | .map(proxy => this._aggregateProxy(proxy, scraper)) 129 | .reduce((prev, next) => prev.concat(next), []) 130 | }) 131 | .catch(e => { 132 | log('Error while scraping proxies with %s\n%o', scraper, e) 133 | return [] 134 | }) 135 | ) 136 | } 137 | return Promise.all(proxies).then(values => 138 | values.reduce((prev, next) => prev.concat(next)) 139 | ) 140 | } 141 | 142 | stop() { 143 | for (const worker of this._workers) { 144 | worker.get(worker => worker.kill()) 145 | } 146 | } 147 | 148 | _aggregateProxy(proxy, source) { 149 | const aproxy = { 150 | source, 151 | url() { 152 | return `${this.type}://${this.ip}:${this.port}` 153 | }, 154 | ...proxy 155 | } 156 | 157 | return VALID_TYPES.includes(aproxy.type) 158 | ? 
aproxy 159 | : TYPES.map(type => ({ 160 | ...aproxy, 161 | type 162 | })) 163 | } 164 | } 165 | 166 | ProxyScraper.scrapers = scrapers 167 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "proxy-scraper", 3 | "version": "2.1.5", 4 | "description": "Scrap proxis from the web", 5 | "main": "lib/index.js", 6 | "files": [ 7 | "lib" 8 | ], 9 | "scripts": { 10 | "print": "DEBUG='proxy-scraper*,-proxy-scraper:worker:*' babel-node example/print.js", 11 | "build": "babel-node tools/build.js", 12 | "prepublish": "npm run build", 13 | "format": "prettier --no-semi --single-quote --use-tabs --write index.js src/*.js src/**/*.js example/*.js tools/*.js" 14 | }, 15 | "repository": { 16 | "type": "git", 17 | "url": "git+https://github.com/DeltaEvo/proxy-scraper.git" 18 | }, 19 | "keywords": [ 20 | "proxy", 21 | "scraper" 22 | ], 23 | "author": "Duarte David ", 24 | "license": "MIT", 25 | "bugs": { 26 | "url": "https://github.com/DeltaEvo/proxy-scraper/issues" 27 | }, 28 | "homepage": "https://github.com/DeltaEvo/proxy-scraper#readme", 29 | "dependencies": { 30 | "cheerio": "^0.22.0", 31 | "debug": "^2.6.8", 32 | "form-data": "^2.1.4", 33 | "node-fetch": "^2.0.0-alpha", 34 | "proxy-agent": "^2.0.0", 35 | "socks-proxy-agent": "DeltaEvo/node-socks-proxy-agent", 36 | "temp-mail": "^2.0.0", 37 | "tough-cookie": "^2.3.2" 38 | }, 39 | "devDependencies": { 40 | "babel-cli": "^6.24.1", 41 | "babel-core": "^6.24.1", 42 | "babel-plugin-external-helpers": "^6.22.0", 43 | "babel-plugin-transform-object-rest-spread": "^6.23.0", 44 | "babel-preset-env": "^1.4.0", 45 | "fs-extra": "^3.0.1", 46 | "prettier": "^1.3.1", 47 | "rollup": "^0.41.6", 48 | "rollup-plugin-babel": "^2.7.1", 49 | "rollup-plugin-local-resolve": "^1.0.7" 50 | } 51 | } 52 | -------------------------------------------------------------------------------- /src/scrapers/free-proxy-list.net.js: -------------------------------------------------------------------------------- 1 | import fetch from 'node-fetch' 2 | import cheerio from '../util/fetch-cheerio' 3 | 4 | const ANONIMITY_LEVELS = ['transparent', 'anonymous', 'elite proxy'] 5 | 6 | const SOURCES = [ 7 | { 8 | url: 'http://www.free-proxy-list.net', 9 | type(element) { 10 | return element.eq(5).text() === 'yes' ? 
'https' : 'http' 11 | }, 12 | anonymity: 4 13 | }, 14 | { 15 | url: 'http://www.socks-proxy.net', 16 | type(element) { 17 | return element.eq(4).text().toLowerCase() 18 | }, 19 | anonymity: 5 20 | } 21 | ] 22 | 23 | export default function scrap() { 24 | return Promise.all( 25 | SOURCES.map(source => 26 | fetch(source.url).then(cheerio()).then($ => 27 | $('#proxylisttable > tbody > tr') 28 | .map((i, e) => { 29 | const element = $(e).find('td') 30 | const ip = element.eq(0).text() 31 | const port = element.eq(1).text() 32 | const country = element.eq(3).text().toUpperCase() 33 | const anonymity = ANONIMITY_LEVELS.indexOf( 34 | element.eq(source.anonymity).text().toLowerCase() 35 | ) 36 | const type = source.type(element) 37 | return { ip, port, country, anonymity, type } 38 | }) 39 | .get() 40 | ) 41 | ) 42 | ).then(datas => datas.reduce((prev, next) => prev.concat(next))) 43 | } 44 | -------------------------------------------------------------------------------- /src/scrapers/gatherproxy.com.js: -------------------------------------------------------------------------------- 1 | import url from 'url' 2 | import nodeFetch from 'node-fetch' 3 | import FormData from 'form-data' 4 | import { Script } from 'vm' 5 | import { extractProxies } from './text' 6 | import cheerio from '../util/fetch-cheerio' 7 | import Session from '../util/fetch-session' 8 | import * as TempMail from '../util/temp-mail' 9 | import debug from 'debug' 10 | import fs from 'fs' 11 | 12 | const log = debug('proxy-scraper:gatherproxy.com') 13 | 14 | const CAPTCHA_REPLACE = { 15 | multiplied: '*', 16 | plus: '+', 17 | minus: '-', 18 | x: '*', 19 | zero: 0, 20 | one: 1, 21 | two: 2, 22 | three: 3, 23 | four: 4, 24 | five: 5, 25 | six: 6, 26 | seven: 7, 27 | eight: 8, 28 | nine: 9 29 | } 30 | 31 | const ANONIMITY_LEVELS = ['transparent', 'anonymous', 'elite'] 32 | 33 | export default function scrap() { 34 | const session = new Session(nodeFetch) 35 | const fetch = (url, options) => session.fetch(url, options) 36 | return getAccount(fetch).then(() => { 37 | let chain = Promise.resolve([]) 38 | for (let level of ANONIMITY_LEVELS) { 39 | chain = chain.then(data => 40 | getProxyListId(fetch, level) 41 | .then(id => downloadProxyList(fetch, level, id)) 42 | .then(text => 43 | data.push( 44 | extractProxies(text, () => ({ 45 | anonimity: ANONIMITY_LEVELS.indexOf(level), 46 | type: 'http' 47 | })) 48 | ) 49 | ) 50 | .then(() => data) 51 | ) 52 | } 53 | return chain.then(datas => datas.reduce((prev, next) => prev.concat(next))) 54 | }) 55 | } 56 | 57 | function getAccount(fetch) { 58 | return loadAccountFromCache() 59 | .then(({ email, password }) => { 60 | log( 61 | 'Loaded account with email %s and password %s from .gatherproxy.account', 62 | email, 63 | password 64 | ) 65 | return login(fetch, email, password) 66 | }) 67 | .catch(() => { 68 | log('Invalid account in .gatherproxy.account, creating new one') 69 | return createAccount(fetch) 70 | .then(({ email, password }) => { 71 | log( 72 | 'Storing account with email %s and password %s in .gatherproxy.account', 73 | email, 74 | password 75 | ) 76 | return storeAccountToCache(email, password).catch(() => ({ 77 | email, 78 | password 79 | })) 80 | }) 81 | .then(({ email, password }) => login(fetch, email, password)) 82 | }) 83 | } 84 | 85 | function loadAccountFromCache() { 86 | return new Promise((resolve, reject) => { 87 | fs.readFile('./.gatherproxy.account', 'utf8', (err, data) => { 88 | if (err) reject(err) 89 | else resolve(JSON.parse(data)) 90 | }) 91 | }) 92 | } 93 | 94 | 
function storeAccountToCache(email, password) { 95 | return new Promise((resolve, reject) => { 96 | fs.writeFile( 97 | './.gatherproxy.account', 98 | JSON.stringify({ email, password }), 99 | err => { 100 | if (err) reject(err) 101 | else resolve({ email, password }) 102 | } 103 | ) 104 | }) 105 | } 106 | 107 | function createAccount(fetch) { 108 | const email = `${Math.random().toString(36).substring(7)}@doanart.com` // @binka.me doesn't receive email 109 | log('Creating account with email %s', email) 110 | const form = new FormData() 111 | form.append('email', email) 112 | return fetch('http://www.gatherproxy.com/subscribe', { 113 | method: 'POST', 114 | body: form, 115 | headers: form.getHeaders() 116 | }) 117 | .then(() => TempMail.poll(email)) 118 | .then( 119 | ([{ mail_text_only }]) => /<p>
Password: (.*)<\/p>/.exec(mail_text_only)[1] 120 | ) 121 | .then(password => { 122 | log('Account %s with password %s created', email, password) 123 | return { email, password } 124 | }) 125 | } 126 | 127 | function login(fetch, email, password) { 128 | log('Logging in with email %s and password %s', email, password) 129 | return fetch('http://www.gatherproxy.com/subscribe/login') 130 | .then(cheerio()) 131 | .then($ => { 132 | const captcha = $('#body > form > .label > .blue').first().text() 133 | 134 | const form = new FormData() 135 | form.append('Username', email) 136 | form.append('Password', password) 137 | form.append('Captcha', solveCaptcha(captcha)) 138 | 139 | return fetch('http://www.gatherproxy.com/subscribe/login', { 140 | method: 'POST', 141 | body: form, 142 | headers: form.getHeaders() 143 | }) 144 | }) 145 | } 146 | 147 | function solveCaptcha(raw) { 148 | const captcha = raw 149 | .split(' ') 150 | .map(part => part.toLowerCase()) 151 | .map(part => (part in CAPTCHA_REPLACE ? CAPTCHA_REPLACE[part] : part)) 152 | 153 | captcha.splice(captcha.indexOf('='), 1) // Remove the = 154 | 155 | const c = captcha.join(' ') 156 | const result = new Script(c).runInNewContext({}) 157 | 158 | log('Captcha %s (raw: %s) solved, result: %d', c, raw, result) 159 | return result 160 | } 161 | 162 | function getProxyListId(fetch, anonimity) { 163 | log('Getting id for anonimity %s', anonimity) 164 | const form = new FormData() 165 | form.append('Uptime', 0) 166 | form.append('Type', anonimity) 167 | return fetch('http://www.gatherproxy.com/proxylist/anonymityplaintext', { 168 | method: 'POST', 169 | body: form, 170 | headers: form.getHeaders(), 171 | redirect: 'manual' 172 | }).then(res => url.parse(res.headers.get('location'), true).query.sid) 173 | } 174 | 175 | function downloadProxyList(fetch, anonimity, id) { 176 | log('Downloading proxy with anonimity %s using id %s', anonimity, id) 177 | const form = new FormData() 178 | form.append('ID', id) 179 | form.append('T', anonimity) 180 | return fetch('http://www.gatherproxy.com/proxylist/downloadproxylist/', { 181 | method: 'POST', 182 | body: form, 183 | headers: form.getHeaders() 184 | }).then(res => res.text()) 185 | } 186 | -------------------------------------------------------------------------------- /src/scrapers/hide-my-ip.com.js: -------------------------------------------------------------------------------- 1 | import fetch from 'node-fetch' 2 | 3 | const ANONIMITY_LEVELS = ['Low', 'Med', 'High'] 4 | 5 | export default function scrap(types) { 6 | return fetch('https://www.hide-my-ip.com/proxylist.shtml', { 7 | headers: { 8 | 'User-Agent': 'Mozilla/5.0' 9 | } 10 | }) 11 | .then(res => res.text()) 12 | .then(text => 13 | /var json =..(\[(?:.*)\]);/.exec( 14 | text.split('\n').join(' ') 15 | ) 16 | ) 17 | .then(match => (match ? 
JSON.parse(match[1]) : [])) 18 | .then(proxies => 19 | proxies.map(proxy => ({ 20 | ip: proxy.i, 21 | port: parseInt(proxy.p), 22 | country: proxy.c.n.toUpperCase(), 23 | type: proxy.tp.toLowerCase(), 24 | anonimity: ANONIMITY_LEVELS.indexOf(proxy.a) 25 | })) 26 | ) 27 | } 28 | -------------------------------------------------------------------------------- /src/scrapers/hidemy.name.js: -------------------------------------------------------------------------------- 1 | import fetch from 'node-fetch' 2 | import cheerio from '../util/fetch-cheerio' 3 | import { validIp, validPort } from './text' 4 | 5 | export default function scrap() { 6 | const proxies = [] 7 | for (let i = 0; i < 20; i++) { 8 | proxies.push( 9 | fetch(`https://hidemy.name/en/proxy-list/?start=${i * 64}`, { 10 | headers: { 11 | 'User-Agent': 'Mozilla/5.0' 12 | } 13 | }) 14 | .then(cheerio()) 15 | .then($ => { 16 | return $('.proxy__t > tbody > tr') 17 | .map((i, e) => { 18 | const element = $(e).find('td') 19 | const ip = element.eq(0).text() 20 | const port = element.eq(1).text() 21 | const type = element.eq(4).text().split(',')[0].toLowerCase() 22 | if (validIp(ip) && validPort(port)) return { ip, port, type } 23 | }) 24 | .filter(e => e !== undefined) 25 | .get() 26 | }) 27 | ) 28 | } 29 | return Promise.all(proxies).then(datas => 30 | datas.reduce((prev, next) => prev.concat(next)) 31 | ) 32 | } 33 | -------------------------------------------------------------------------------- /src/scrapers/hidester.com.js: -------------------------------------------------------------------------------- 1 | import fetch from 'node-fetch' 2 | 3 | const ANONIMITY_LEVELS = ['transparent', 'anonymous', 'elite'] 4 | 5 | export default function scrap(types) { 6 | return fetch( 7 | 'https://hidester.com/proxydata/php/data.php?mykey=csv&gproxy=2', 8 | { 9 | headers: { 10 | Referer: 'https://hidester.com/proxylist/' 11 | } 12 | } 13 | ) 14 | .then(req => req.json()) 15 | .then(datas => 16 | datas.map(({ IP: ip, PORT: port, type, country, anonymity }) => ({ 17 | ip, 18 | port, 19 | type, 20 | country, 21 | anonimity: ANONIMITY_LEVELS.indexOf(anonymity.toLowerCase()) 22 | })) 23 | ) 24 | } 25 | -------------------------------------------------------------------------------- /src/scrapers/index.js: -------------------------------------------------------------------------------- 1 | import freeproxylist from './free-proxy-list.net' 2 | import gatherproxy from './gatherproxy.com' 3 | import hidemyip from './hide-my-ip.com' 4 | import hidemyname from './hidemy.name' 5 | import hidester from './hidester.com' 6 | import multiproxy from './multiproxy.org' 7 | import nordvpn from './nordvpn.com' 8 | import premproxy from './premproxy.com' 9 | import proxy24 from './proxy24.blogspot.fr' 10 | import proxydb from './proxydb.net' 11 | import spys from './spys.me' 12 | 13 | export default { 14 | 'free-proxy-list.net': freeproxylist, 15 | 'gatherproxy.com': gatherproxy, 16 | 'hide-my-ip.com': hidemyip, 17 | 'hidemy.name': hidemyname, 18 | 'hidester.com': hidester, 19 | 'multiproxy.org': multiproxy, 20 | 'nordvpn.com': nordvpn, 21 | 'premproxy.com': premproxy, 22 | 'proxy24.blogspot.fr': proxy24, 23 | 'proxydb.net': proxydb, 24 | 'spys.me': spys 25 | } 26 | -------------------------------------------------------------------------------- /src/scrapers/multiproxy.org.js: -------------------------------------------------------------------------------- 1 | import createTextScraper from './text' 2 | 3 | export default 
createTextScraper('http://multiproxy.org/txt_all/proxy.txt') 4 | -------------------------------------------------------------------------------- /src/scrapers/nordvpn.com.js: -------------------------------------------------------------------------------- 1 | import fetch from 'node-fetch' 2 | import Session from '../util/fetch-session' 3 | 4 | export default function scrap() { 5 | const session = new Session(fetch) 6 | 7 | return session 8 | .fetch('https://nordvpn.com/free-proxy-list/') 9 | .then(() => 10 | session.fetch( 11 | 'https://nordvpn.com/wp-admin/admin-ajax.php?searchParameters%5B0%5D%5Bname%5D=proxy-country&searchParameters%5B0%5D%5Bvalue%5D=&searchParameters%5B1%5D%5Bname%5D=proxy-ports&searchParameters%5B1%5D%5Bvalue%5D=&offset=0&limit=100000&action=getProxies', 12 | { 13 | method: 'POST' 14 | } 15 | ) 16 | ) 17 | .then(res => res.json()) 18 | .then(datas => 19 | datas.map(({ ip, port, country, type }) => ({ 20 | ip, 21 | port, 22 | country, 23 | type: type.toLowerCase() 24 | })) 25 | ) 26 | } 27 | -------------------------------------------------------------------------------- /src/scrapers/premproxy.com.js: -------------------------------------------------------------------------------- 1 | import fetch from 'node-fetch' 2 | import cheerio from '../util/fetch-cheerio' 3 | 4 | const ANONIMITY_LEVELS = ['transparent', 'anonymous', 'high-anonymous'] 5 | 6 | const SOURCES = [ 7 | { 8 | url: 'https://premproxy.com/list/', 9 | data(element) { 10 | return { 11 | type: 'http', 12 | anonymity: element.eq(1).text().trim().toLowerCase() 13 | } 14 | }, 15 | pages: 20 16 | }, 17 | { 18 | url: 'https://premproxy.com/socks-list/', 19 | data(element) { 20 | return { 21 | type: element.eq(1).text().trim().toLowerCase() 22 | } 23 | }, 24 | pages: 5 25 | } 26 | ] 27 | 28 | export default function scrap() { 29 | return Promise.all( 30 | SOURCES.map(source => { 31 | const promises = [] 32 | for (let i = 1; i <= source.pages; i++) { 33 | promises.push( 34 | fetch(`${source.url}/${i > 9 ? 
i : '0' + i}.htm`) 35 | .then(cheerio()) 36 | .then($ => 37 | $('#proxylist > tr:not(.list_sorted)') 38 | .map((i, e) => { 39 | const element = $(e).find('td') 40 | const [ip, port] = element.eq(0).text().split(':') 41 | const data = source.data(element) 42 | const country = element.eq(3).text().toUpperCase() 43 | return { ip, port, country, ...data } 44 | }) 45 | .get() 46 | ) 47 | ) 48 | } 49 | return Promise.all(promises).then(values => 50 | values.reduce((prev, next) => prev.concat(next)) 51 | ) 52 | }) 53 | ).then(values => values.reduce((prev, next) => prev.concat(next))) 54 | } 55 | -------------------------------------------------------------------------------- /src/scrapers/proxy24.blogspot.fr.js: -------------------------------------------------------------------------------- 1 | import fetch from 'node-fetch' 2 | import cheerio from '../util/fetch-cheerio' 3 | import { extractProxies } from './text' 4 | 5 | const SOURCES = [ 6 | { 7 | url: 'http://proxyserverlist-24.blogspot.fr/', 8 | type: 'http', 9 | selector: '.post-body > pre > span > span:nth-child(2)' 10 | }, 11 | { 12 | url: 'http://sslproxies24.blogspot.fr/', 13 | type: 'https', 14 | selector: '.post-body > pre > span > span' 15 | }, 16 | { 17 | url: 'http://vip-socks24.blogspot.com/', 18 | type: 'socks5', 19 | selector: '.post-body > textarea' 20 | } 21 | ] 22 | 23 | export default function scrap() { 24 | const result = [] 25 | for (const source of SOURCES) { 26 | result.push( 27 | fetch(source.url) 28 | .then(cheerio()) 29 | .then($ => 30 | Promise.all( 31 | $('.post-title > a') 32 | .map((i, e) => 33 | fetch($(e).attr('href')) 34 | .then(cheerio()) 35 | .then($ => $(source.selector).eq(0).text()) 36 | .then(proxies => 37 | extractProxies(proxies, () => ({ type: source.type })) 38 | ) 39 | ) 40 | .get() 41 | ).then(datas => datas.reduce((prev, next) => prev.concat(next))) 42 | ) 43 | ) 44 | } 45 | return Promise.all(result).then(datas => 46 | datas.reduce((prev, next) => prev.concat(next)) 47 | ) 48 | } 49 | -------------------------------------------------------------------------------- /src/scrapers/proxydb.net.js: -------------------------------------------------------------------------------- 1 | import fetch from 'node-fetch' 2 | import cheerio from '../util/fetch-cheerio' 3 | 4 | export default function scrap() { 5 | const promises = [] 6 | for (let offset = 0; offset < 1000; offset += 50) { 7 | promises.push( 8 | fetch(`http://proxydb.net/?limit=50&offset=${offset}`) 9 | .then(cheerio()) 10 | .then($ => { 11 | if (!$) return [] 12 | return $('table > tbody > tr') 13 | .map((i, element) => { 14 | let url = $(element).find('td > a').first().text().trim() 15 | let ip = /(\d+\.){3}\d+/.exec(url)[0] 16 | let port = /\d+$/.exec(url)[0] 17 | 18 | return { ip, port } 19 | }) 20 | .get() 21 | }) 22 | ) 23 | } 24 | return Promise.all(promises).then( 25 | values => values.reduce((prev, next) => prev.concat(next)), 26 | [] 27 | ) 28 | } 29 | -------------------------------------------------------------------------------- /src/scrapers/spys.me.js: -------------------------------------------------------------------------------- 1 | import createTextScraper from './text' 2 | 3 | const ANONIMITY_LEVELS = ['N', 'A', 'H'] 4 | 5 | export default createTextScraper('http://spys.me/proxy.txt', data => { 6 | const [country, anonymity, ssl = 'N'] = data[0].split('-') 7 | return { 8 | type: ssl.charAt(0) === 'S' ? 
'https' : 'http', 9 | country, 10 | anonymity: ANONIMITY_LEVELS.indexOf(anonymity) 11 | } 12 | }) 13 | -------------------------------------------------------------------------------- /src/scrapers/text.js: -------------------------------------------------------------------------------- 1 | import fetch from 'node-fetch' 2 | 3 | export default function createTextScraper(url, aggregator) { 4 | return function scrap() { 5 | return fetch(url) 6 | .then(res => res.text()) 7 | .then(text => extractProxies(text, aggregator)) 8 | } 9 | } 10 | 11 | export function extractProxies(text, aggregator = function() {}) { 12 | return text 13 | .split('\n') 14 | .map(proxy => { 15 | const more = proxy.split(' ') 16 | const [ip, p] = more[0].split(':') 17 | const port = parseInt(p) 18 | if (validIp(ip) && validPort(port)) { 19 | const agg = aggregator(more.slice(1)) 20 | return { 21 | ip, 22 | port, 23 | ...agg 24 | } 25 | } 26 | }) 27 | .filter(proxy => proxy !== undefined) 28 | } 29 | 30 | export function validIp(ip) { 31 | return true 32 | } 33 | 34 | export function validPort(port) { 35 | return port > 0 && port <= 65535 36 | } 37 | -------------------------------------------------------------------------------- /src/util/fetch-cheerio.js: -------------------------------------------------------------------------------- 1 | import $ from 'cheerio' 2 | 3 | export default function cheerio(options) { 4 | return req => req.text().then(body => $.load(body, options)) 5 | } 6 | -------------------------------------------------------------------------------- /src/util/fetch-session.js: -------------------------------------------------------------------------------- 1 | import { CookieJar } from 'tough-cookie' 2 | 3 | export default class Session { 4 | constructor(fetch) { 5 | this.jar = new CookieJar() 6 | this._fetch = fetch 7 | } 8 | 9 | fetch(url, opts = {}) { 10 | return this.getCookies(url).then(cookies => { 11 | return this._fetch( 12 | url, 13 | Object.assign(opts, { 14 | headers: Object.assign(opts.headers || {}, { 15 | cookie: cookies.join('; ') 16 | }) 17 | }) 18 | ).then(res => { 19 | let cookies = res.headers.get('set-cookie') 20 | cookies = (cookies && cookies.split(',')) || [] 21 | cookies.map(cookie => this.jar.setCookie(cookie, url, () => null)) 22 | return res 23 | }) 24 | }) 25 | } 26 | 27 | getCookies(url) { 28 | return new Promise((resolve, reject) => { 29 | this.jar.getCookies(url, (err, cookies) => { 30 | if (err) reject(err) 31 | else resolve(cookies) 32 | }) 33 | }) 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /src/util/promise-lock.js: -------------------------------------------------------------------------------- 1 | export default class Lock { 2 | constructor(inner) { 3 | this.inner = Promise.resolve(inner) 4 | } 5 | 6 | get(callback) { 7 | const p = this.inner.then(inner => { 8 | const result = callback(inner) 9 | return Promise.resolve(result).then( 10 | result => ({ inner, result }), 11 | err => ({ inner, err }) 12 | ) 13 | }) 14 | this.inner = p.then(({ inner }) => inner) 15 | return p.then(({ result, err }) => (err ? 
Promise.reject(err) : result)) 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /src/util/temp-mail.js: -------------------------------------------------------------------------------- 1 | import { getInbox } from 'temp-mail' 2 | import debug from 'debug' 3 | 4 | const log = debug('proxy-scraper:temp-mail') 5 | 6 | const NOOP = function() {} 7 | 8 | export * from 'temp-mail' 9 | 10 | export function poll(email, interval = 5000, iterations = 20) { 11 | return new Promise((resolve, reject) => { 12 | let count = 0 13 | const id = setInterval(() => { 14 | if (count < iterations) { 15 | count++ 16 | log( 17 | 'Polling inbox for mail %s iteration %d, max: %d', 18 | email, 19 | count, 20 | iterations 21 | ) 22 | getInbox(email) 23 | .then(mails => { 24 | resolve(mails) 25 | clearInterval(id) 26 | }) 27 | .catch(NOOP) //Ignore Error 28 | } else { 29 | clearInterval(id) 30 | reject(new Error('Timeout exceeded, no messages found.')) 31 | } 32 | }, interval) 33 | }) 34 | } 35 | -------------------------------------------------------------------------------- /src/worker.js: -------------------------------------------------------------------------------- 1 | import ProxyAgent from 'proxy-agent' 2 | import fetch from 'node-fetch' 3 | import debug from 'debug' 4 | import { parse } from 'url' 5 | const log = debug(`proxy-scraper:worker:${process.argv[2]}`) 6 | 7 | log('Worker started !') 8 | 9 | let page 10 | 11 | process.on('message', msg => { 12 | switch (msg.event) { 13 | case 'test': 14 | testProxy(msg.data) 15 | break 16 | case 'page': 17 | page = msg.data 18 | break 19 | } 20 | }) 21 | 22 | function testProxy({ url, method, proxy, timeout }) { 23 | log('testing %s with proxy %s and %s ms of timeout', url, proxy, timeout) 24 | const startTime = Date.now() 25 | const p = parse(proxy) 26 | p.timeout = timeout 27 | fetch(url, { 28 | method: method || 'GET', 29 | agent: new ProxyAgent(p), 30 | timeout 31 | }) 32 | .then(res => { 33 | if (page) { 34 | return res.text().then(p => { 35 | if (p != page) { 36 | const e = new Error('Page content missmatch') 37 | e.type = 'missmatch' 38 | throw e 39 | } 40 | }) 41 | } 42 | }) 43 | .then(() => process.send({ working: true, time: Date.now() - startTime })) 44 | .catch(error => process.send({ error, working: false })) 45 | } 46 | -------------------------------------------------------------------------------- /tools/build.js: -------------------------------------------------------------------------------- 1 | import data from 'babel-preset-env/data/plugins.json' 2 | import { rollup } from 'rollup' 3 | import babel from 'rollup-plugin-babel' 4 | import resolve from 'rollup-plugin-local-resolve' 5 | import pkg from '../package.json' 6 | import fs from 'fs-extra' 7 | import { join, dirname } from 'path' 8 | import { transformFile } from 'babel-core' 9 | 10 | const VERSIONS = Object.values(data) 11 | .map(p => p.node) 12 | .filter((e, i, array) => e && array.indexOf(e) == i) 13 | .sort() 14 | .reverse() 15 | 16 | const NODE_MODULES = [ 17 | 'vm', 18 | 'fs', 19 | 'os', 20 | 'path', 21 | 'child_process', 22 | 'url', 23 | 'stream' 24 | ] 25 | 26 | const FOLDER = 'lib' 27 | 28 | const WORKER = 'src/worker.js' 29 | 30 | function generateLoader(versions) { 31 | const checks = VERSIONS.map( 32 | ver => `if(v>=${ver})module.exports=require('${versions.get(ver)}')\n` 33 | ).reduce((prev, next) => (prev ? 
`${prev}else ${next}` : next)) 34 | return `var v=parseFloat(process.versions.node)\n${checks}` 35 | } 36 | 37 | function build() { 38 | const versions = new Map() 39 | let chain = Promise.resolve() 40 | for (const ver of VERSIONS) { 41 | versions.set(ver, `./${ver}.js`) 42 | chain = chain.then(() => console.log(`Building for node ${ver}`)).then(() => 43 | rollup({ 44 | entry: './index.js', 45 | external: Object.keys(pkg.dependencies).concat(NODE_MODULES), 46 | plugins: [ 47 | resolve(), 48 | babel({ 49 | babelrc: false, 50 | exclude: 'node_modules/**', 51 | presets: [ 52 | [ 53 | 'env', 54 | { 55 | targets: { 56 | node: ver 57 | }, 58 | modules: false 59 | } 60 | ] 61 | ], 62 | plugins: [ 63 | 'external-helpers', 64 | ['transform-object-rest-spread', { useBuiltIns: true }] 65 | ] 66 | }) 67 | ] 68 | }).then(bundle => 69 | bundle.write({ 70 | dest: join(FOLDER, `${ver}.js`), 71 | format: 'cjs', 72 | sourceMap: true 73 | }) 74 | ) 75 | ) 76 | } 77 | return chain.then(() => versions) 78 | } 79 | 80 | function copyWorker() { 81 | return new Promise((resolve, reject) => 82 | transformFile( 83 | WORKER, 84 | { 85 | babelrc: false, 86 | ast: false, 87 | sourceMaps: true, 88 | presets: [ 89 | [ 90 | 'env', 91 | { 92 | targets: { 93 | node: 0.12 94 | } 95 | } 96 | ] 97 | ], 98 | plugins: [['transform-object-rest-spread', { useBuiltIns: true }]] 99 | }, 100 | (err, result) => (err ? reject(err) : resolve(result)) 101 | ) 102 | ).then(({ code, map }) => 103 | Promise.all([ 104 | fs.outputFile(join(FOLDER, WORKER), code), 105 | fs.outputJson(join(FOLDER, WORKER + '.map'), map) 106 | ]) 107 | ) 108 | } 109 | fs 110 | .remove(FOLDER) 111 | .then(build) 112 | .then(generateLoader) 113 | .then( 114 | loader => 115 | new Promise((resolve, reject) => 116 | fs.writeFile( 117 | join(FOLDER, 'index.js'), 118 | loader, 119 | err => (err ? reject(err) : resolve()) 120 | ) 121 | ) 122 | ) 123 | .then(copyWorker) 124 | .catch(e => console.error(e)) 125 | --------------------------------------------------------------------------------
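
Each module in src/scrapers exports a function that resolves to an array of plain proxy objects ({ ip, port } plus optional type, country and anonymity fields), and src/scrapers/index.js maps a source name to that function; index.js then normalizes every entry through _aggregateProxy, expanding entries whose type is missing or unknown into one http and one socks candidate. A minimal sketch of an extra plain-text source built on the existing createTextScraper helper — the file name and URL below are purely illustrative, not a real endpoint:

// src/scrapers/example.org.js (hypothetical source, shown only to illustrate the scraper contract)
import createTextScraper from './text'

// createTextScraper fetches the URL, keeps every line whose first token parses
// as ip:port with a valid port, and merges whatever the aggregator returns
// into each proxy object.
export default createTextScraper('http://example.org/proxies.txt', () => ({
	type: 'http'
}))

It would then be registered next to the existing entries in src/scrapers/index.js, e.g. an import of the module and an 'example.org' key pointing at it.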
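
src/worker.js implements a small two-message IPC protocol: a 'page' message stores the reference body that proxied responses are compared against, and a 'test' message fetches the given URL through the given proxy and replies with { working: true, time } or { working: false, error }. A rough sketch of driving one worker directly, assuming it runs through the repo's babel setup (e.g. babel-node) or as the compiled lib/src/worker.js; the proxy address is a placeholder, not a known proxy:

import fetch from 'node-fetch'
import child from 'child_process'

fetch('http://example.com/')
	.then(res => res.text())
	.then(page => {
		const worker = child.fork('./src/worker.js', ['0'])
		// Reference page used by the worker to detect proxies that tamper with content.
		worker.send({ event: 'page', data: page })
		// Ask for a single test; the proxy URL below is a placeholder.
		worker.send({
			event: 'test',
			data: { url: 'http://example.com/', proxy: 'http://127.0.0.1:8080', timeout: 5000 }
		})
		worker.once('message', result => {
			console.log(result.working ? `working in ${result.time} ms` : 'not working')
			worker.kill()
		})
	})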
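
The Lock class in src/util/promise-lock.js is what keeps each child process handling one test at a time: every get(callback) chains onto the promise left by the previous call, so callbacks run strictly in submission order while each caller still receives that callback's own result or rejection. A standalone sketch of the behaviour, using a plain object in place of a worker and importing the class straight from the source tree (e.g. under babel-node from the repo root):

import Lock from './src/util/promise-lock'

const lock = new Lock({ count: 0 })

// The second callback only starts once the first one's promise has settled,
// even though both get() calls are issued synchronously.
lock
	.get(res => new Promise(resolve => setTimeout(() => resolve(++res.count), 100)))
	.then(value => console.log('first finished with', value)) // logs 1 first
lock
	.get(res => ++res.count)
	.then(value => console.log('second finished with', value)) // then logs 2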