├── .idea └── .gitignore ├── .env.example ├── .vscode └── launch.json ├── package.json ├── .gitignore ├── sources.js ├── README.md ├── parsers.js └── index.js /.idea/.gitignore: -------------------------------------------------------------------------------- 1 | *.xml 2 | *.iml 3 | -------------------------------------------------------------------------------- /.env.example: -------------------------------------------------------------------------------- 1 | MONGO_URL = mongodb://localhost:27017 2 | MONGO_DB = proxyList 3 | MONGO_COLLECTION = proxies -------------------------------------------------------------------------------- /.vscode/launch.json: -------------------------------------------------------------------------------- 1 | { 2 | // Use IntelliSense to learn about possible Node.js debug attributes. 3 | // Hover to view descriptions of existing attributes. 4 | // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387 5 | "version": "0.2.0", 6 | "configurations": [ 7 | { 8 | "type": "node", 9 | "request": "launch", 10 | "name": "Launch Program", 11 | "program": "${workspaceRoot}\\index.js" 12 | }, 13 | { 14 | "type": "node", 15 | "request": "launch", 16 | "name": "Proxy Spidey", 17 | "program": "proxySpidey.js", 18 | } 19 | ] 20 | } -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "proxy-list-builder", 3 | "version": "1.0.7", 4 | "description": "Node package to build proxy list from the free proxy listing websites and store them into mongodb", 5 | "main": "index.js", 6 | "scripts": { 7 | "test": "echo \"Error: no test specified\" && exit 1", 8 | "start": "node index.js" 9 | }, 10 | "repository": { 11 | "type": "git", 12 | "url": "git://github.com/asadhaider13/proxy-list-builder.git" 13 | }, 14 | "keywords": [ 15 | "proxies", 16 | "proxy scraper", 17 | "proxy list builder", 18 | "proxies scraper", 19 | "proxy list" 20 | ], 21 | "author": "Asad Haider", 22 | "license": "ISC", 23 | "dependencies": { 24 | "dotenv": "^16.0.3", 25 | "mongodb": "^5.4.0", 26 | "spidey": "^1.0.7" 27 | } 28 | } 29 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Logs 2 | logs 3 | *.log 4 | npm-debug.log* 5 | yarn-debug.log* 6 | yarn-error.log* 7 | 8 | # Runtime data 9 | pids 10 | *.pid 11 | *.seed 12 | *.pid.lock 13 | 14 | # Directory for instrumented libs generated by jscoverage/JSCover 15 | lib-cov 16 | 17 | # Coverage directory used by tools like istanbul 18 | coverage 19 | 20 | # nyc test coverage 21 | .nyc_output 22 | 23 | # Grunt intermediate storage (http://gruntjs.com/creating-plugins#storing-task-files) 24 | .grunt 25 | 26 | # Bower dependency directory (https://bower.io/) 27 | bower_components 28 | 29 | # node-waf configuration 30 | .lock-wscript 31 | 32 | # Compiled binary addons (http://nodejs.org/api/addons.html) 33 | build/Release 34 | 35 | # Dependency directories 36 | node_modules/ 37 | jspm_packages/ 38 | 39 | # Typescript v1 declaration files 40 | typings/ 41 | 42 | # Optional npm cache directory 43 | .npm 44 | 45 | # Optional eslint cache 46 | .eslintcache 47 | 48 | # Optional REPL history 49 | .node_repl_history 50 | 51 | # Output of 'npm pack' 52 | *.tgz 53 | 54 | # Yarn Integrity file 55 | .yarn-integrity 56 | 57 | # dotenv environment variables file 58 | .env 59 | 60 | -------------------------------------------------------------------------------- /sources.js: -------------------------------------------------------------------------------- 1 | const { 2 | parseFreeProxyList, 3 | parseSocksProxy, 4 | parseProxyNova, 5 | } = require("./parsers"); 6 | 7 | module.exports = [ 8 | { 9 | url: [ 10 | "https://free-proxy-list.net/", 11 | "https://www.us-proxy.org/", 12 | "https://www.sslproxies.org/", 13 | ], 14 | parse: parseFreeProxyList, 15 | method: "GET", 16 | }, 17 | { 18 | url: "https://www.socks-proxy.net/", 19 | parse: parseSocksProxy, 20 | method: "GET", 21 | }, 22 | { 23 | url: [ 24 | "https://www.proxynova.com/proxy-server-list/", 25 | "https://www.proxynova.com/proxy-server-list/country-us/", 26 | "https://www.proxynova.com/proxy-server-list/country-ca/", 27 | "https://www.proxynova.com/proxy-server-list/country-br/", 28 | "https://www.proxynova.com/proxy-server-list/country-ve/", 29 | "https://www.proxynova.com/proxy-server-list/country-ar/", 30 | "https://www.proxynova.com/proxy-server-list/country-gb/", 31 | "https://www.proxynova.com/proxy-server-list/country-ru/", 32 | "https://www.proxynova.com/proxy-server-list/country-fr/", 33 | "https://www.proxynova.com/proxy-server-list/country-de/", 34 | "https://www.proxynova.com/proxy-server-list/country-pl/", 35 | "https://www.proxynova.com/proxy-server-list/country-ua/", 36 | "https://www.proxynova.com/proxy-server-list/country-cn/", 37 | "https://www.proxynova.com/proxy-server-list/country-hk/", 38 | "https://www.proxynova.com/proxy-server-list/country-tw/", 39 | "https://www.proxynova.com/proxy-server-list/country-kr/", 40 | "https://www.proxynova.com/proxy-server-list/country-jp/", 41 | "https://www.proxynova.com/proxy-server-list/country-id/", 42 | "https://www.proxynova.com/proxy-server-list/country-th/", 43 | ], 44 | parse: parseProxyNova, 45 | method: "GET", 46 | }, 47 | ]; 48 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Proxy List Builder 2 | 3 | Proxy List Builder is a powerful tool designed to scrape proxy servers from various websites. It enables you to obtain free proxies from different sources and store them in a MongoDB database. By using this tool, you can easily access a list of functional and reliable proxies, which can be useful for different purposes. 4 | 5 | ## Supported Websites 6 | 7 | Proxy List Builder can extract proxies from the following websites: 8 | 9 | - https://free-proxy-list.net/ 10 | - https://www.us-proxy.org/ 11 | - https://www.sslproxies.org/ 12 | - https://www.socks-proxy.net/ 13 | - https://www.proxynova.com/proxy-server-list/ 14 | 15 | ## Installation 16 | 17 | To use Proxy List Builder, you need to clone the repository from Github: 18 | 19 | ```sh 20 | $ git clone https://github.com/asad-haider/proxy-list-builder 21 | ``` 22 | 23 | After cloning, navigate to the project directory and install the required dependencies using the following command: 24 | 25 | ```sh 26 | $ npm install 27 | ``` 28 | 29 | ## Usage 30 | 31 | To start using Proxy List Builder, you need to run the following command in the terminal: 32 | 33 | ```sh 34 | $ node index.js 35 | ``` 36 | 37 | This will execute the script, and you can monitor the output to see the status of the scraping process. 38 | 39 | ## Development 40 | 41 | If you want to contribute to the development of this tool, you can visit the project's public repository on Github. You can clone the repository, make changes, and submit a pull request. 42 | 43 | ## TODO and Roadmap 44 | 45 | The Proxy List Builder project has some features that we plan to add in the future, including: 46 | 47 | - Adding support for more websites. 48 | - Checking whether a proxy is working or not. 49 | - Check response time of proxy. 50 | 51 | ## Updates 52 | 53 | The Proxy List Builder project has recently undergone major updates, including: 54 | 55 | - The tool was rewritten using the Spidey framework, which is more efficient and reliable. 56 | - We removed websites that have invalid proxies or are no longer operational. 57 | - We made significant code changes and cleanup to improve the overall quality of the codebase. -------------------------------------------------------------------------------- /parsers.js: -------------------------------------------------------------------------------- 1 | const parseFreeProxyList = ($, meta) => { 2 | const proxies = []; 3 | $("table tbody tr").each((index, element) => { 4 | try { 5 | proxies.push({ 6 | ...meta, 7 | ip: $(element).find("td:nth-child(1)").text(), 8 | port: $(element).find("td:nth-child(2)").text(), 9 | country: $(element).find("td:nth-child(4)").text(), 10 | anonymity: $(element).find("td:nth-child(5)").text(), 11 | type: 12 | $(element).find("td:nth-child(7)").text().toLowerCase() === "yes" 13 | ? "HTTPS" 14 | : "HTTP", 15 | }); 16 | } catch (error) {} 17 | }); 18 | return proxies; 19 | }; 20 | 21 | const parseSocksProxy = ($, meta) => { 22 | const proxies = []; 23 | $("table tbody tr").each((index, element) => { 24 | try { 25 | proxies.push({ 26 | ...meta, 27 | ip: $(element).find("td:nth-child(1)").text(), 28 | port: $(element).find("td:nth-child(2)").text(), 29 | country: $(element).find("td:nth-child(4)").text(), 30 | anonymity: $(element).find("td:nth-child(6)").text(), 31 | type: $(element).find("td:nth-child(5)").text(), 32 | }); 33 | } catch (error) {} 34 | }); 35 | return proxies; 36 | }; 37 | 38 | const parseProxyNova = function ($, meta) { 39 | const proxies = []; 40 | $("#tbl_proxy_list tbody tr").each(function (index, element) { 41 | try { 42 | let ip = $(element).find("td:nth-child(1) script").html(); 43 | if (ip) { 44 | const regex = /document.write\((.*)\)/g; 45 | const cleanIp = regex.exec(ip)[1]; 46 | ip = eval(cleanIp); 47 | } else { 48 | ip = $(element).find("td:nth-child(1)").text(); 49 | } 50 | proxies.push({ 51 | ...meta, 52 | ip: ip.trim(), 53 | port: $(element).find("td:nth-child(2)").text().trim(), 54 | country: $(element) 55 | .find("td:nth-child(6) a") 56 | .children() 57 | .remove() 58 | .end() 59 | .text() 60 | .trim(), 61 | anonymity: $(element).find("td:nth-child(7)").text().trim(), 62 | }); 63 | } catch (error) {} 64 | }); 65 | return proxies; 66 | }; 67 | 68 | module.exports = { 69 | parseFreeProxyList, 70 | parseSocksProxy, 71 | parseProxyNova, 72 | }; 73 | -------------------------------------------------------------------------------- /index.js: -------------------------------------------------------------------------------- 1 | const { Spidey, DiscardItemError } = require("spidey"); 2 | const sources = require("./sources"); 3 | const dotnev = require("dotenv"); 4 | const { MongoClient } = require("mongodb"); 5 | dotnev.config(); 6 | 7 | class ValidationPipeline { 8 | constructor(options) {} 9 | 10 | isValidHost(ip) { 11 | const ipRegex = new RegExp( 12 | "^([1-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|" + 13 | "25[0-5])\\.([0-9]|[1-9][0-9]|1[0-9]{2}|" + 14 | "2[0-4][0-9]|25[0-5])\\.([0-9]|[1-9][0-9]|" + 15 | "1[0-9]{2}|2[0-4][0-9]|25[0-5])\\.([1-9]|" + 16 | "[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|" + 17 | "25[0-5])$" 18 | ); 19 | return ipRegex.test(ip); 20 | } 21 | 22 | isValidPort(port) { 23 | const portRegex = new RegExp( 24 | "^(0|[1-9]\\d{0,3}|[1-5]\\d{4}|6[0-4]\\d{3}" + 25 | "|65[0-4]\\d{2}|655[0-2]\\d|6553[0-5])$" 26 | ); 27 | return portRegex.test(port); 28 | } 29 | 30 | process(data) { 31 | if (!data.url) throw new DiscardItemError("Missing or Invalid URL"); 32 | if (this.isValidHost(data.ip) && this.isValidPort(data.port)) return data; 33 | else throw new DiscardItemError("Invalid IP or Port"); 34 | } 35 | } 36 | 37 | class ManipulatePipeline { 38 | constructor(options) {} 39 | 40 | process(data) { 41 | data.port = parseInt(data.port, 10); 42 | if (data.type) data.type = data.type.toUpperCase(); 43 | if (data.anonymity) data.anonymity = data.anonymity.toUpperCase(); 44 | return data; 45 | } 46 | } 47 | 48 | class MongoPipeline { 49 | proxiesCol; 50 | options; 51 | 52 | constructor(options) { 53 | this.options = options; 54 | this.client = new MongoClient(options.mongoUrl, { 55 | useNewUrlParser: true, 56 | useUnifiedTopology: true, 57 | }); 58 | } 59 | 60 | async start() { 61 | await this.client.connect(); 62 | const db = this.client.db(this.options.database); 63 | this.proxiesCol = db.collection(this.options.collection); 64 | } 65 | 66 | async process(data) { 67 | const query = { 68 | ip: data.ip, 69 | port: data.port, 70 | }; 71 | const update = { 72 | $set: data, 73 | $setOnInsert: { 74 | crawledAt: new Date(), 75 | }, 76 | }; 77 | await this.proxiesCol.updateOne(query, update, { upsert: true }); 78 | return data; 79 | } 80 | 81 | async complete() { 82 | await this.client.close(); 83 | } 84 | } 85 | 86 | class ProxySpidey extends Spidey { 87 | constructor() { 88 | super({ 89 | retries: 3, 90 | concurrency: 10, 91 | mongoUrl: process.env.MONGO_URL, 92 | database: process.env.MONGO_DB, 93 | collection: process.env.MONGO_COLLECTION, 94 | pipelines: [ValidationPipeline, ManipulatePipeline, MongoPipeline], 95 | }); 96 | } 97 | 98 | start() { 99 | for (const source of sources) { 100 | const urls = this.castArray(source.url); 101 | urls.forEach((url) => { 102 | this.request( 103 | { 104 | url, 105 | method: source.method, 106 | body: source.body, 107 | meta: { parse: source.parse }, 108 | }, 109 | this.parse.bind(this) 110 | ); 111 | }); 112 | } 113 | } 114 | 115 | parse(response) { 116 | const url = response.url; 117 | const parser = response.meta.parse; 118 | const proxies = parser(response.$, { 119 | url, 120 | }); 121 | proxies.forEach((proxy) => this.save(proxy)); 122 | } 123 | 124 | castArray(...args) { 125 | return args[0] instanceof Array ? args[0] : args; 126 | } 127 | } 128 | 129 | new ProxySpidey().start(); 130 | --------------------------------------------------------------------------------