├── README.md
└── proxy-scraper.py

/README.md:
--------------------------------------------------------------------------------

# Proxy Scraper & Checker & Free List

An easy proxy scraper and checker, plus a publicly available proxy list.

```
usage: proxy-scraper.py [-h] [-c] -o OUTPUT [-t THREADS] [--timeout TIMEOUT]
                        [--http] [--check-with-website CHECK_WITH_WEBSITE]
                        [--country] [--connection-time] [-f] [-i]

optional arguments:
  -h, --help            show this help message and exit
  -c, --check           Check the scraped proxies
  -o OUTPUT, --output OUTPUT
                        Output file
  -t THREADS, --threads THREADS
                        Checker threads count (default: 20)
  --timeout TIMEOUT     Checker timeout in seconds (default: 5)
  --http                Check proxies for HTTP instead of HTTPS
  --check-with-website CHECK_WITH_WEBSITE
                        Website to connect to through the proxy. If it doesn't
                        return HTTP 200, the proxy is considered dead
                        (default: httpbin.org/ip)
  --country             Locate and print country (requires maxminddb-geolite2)
  --connection-time     Print connection time information
  -f, --write-immediately
                        Force flush the output file every time
  -i, --extra-information
                        Print last updated time and configuration description
```

This runs on my server:
```
python3 /root/proxy-scraper.py --check -t 300 --timeout 5 --check-with-website httpbin.org/ip --country --connection-time --extra-information --output /home/admin/web/cagriari.com/public_html/fresh_proxy.txt
```

<br>
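
The script needs Python 3 and the `requests` package; the `--country` option additionally needs `maxminddb-geolite2`. A minimal setup could look like this (the exact commands depend on your environment):
```
pip3 install requests maxminddb-geolite2
```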

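To run it on a schedule yourself, the same command can go in a crontab; an entry along these lines (paths are from my setup, adjust them to yours) refreshes the list every 6 hours:
```
0 */6 * * * python3 /root/proxy-scraper.py --check -t 300 --timeout 5 --check-with-website httpbin.org/ip --country --connection-time --extra-information --output /home/admin/web/cagriari.com/public_html/fresh_proxy.txt
```
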
~Hourly~ Daily updated & checked proxy list: https://cagriari.com/fresh_proxy.txt

~I'm no longer running it on my server as it doesn't cover the server costs.~

I changed the interval from hourly to every 6 hours to decrease server load.
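
A minimal sketch of using a checked list from Python with `requests` (this assumes the list was saved as `fresh_proxy.txt` and produced with `--check --country --connection-time`, so data lines look like `1.2.3.4:8080|US|0.42s` and comment lines start with `#`):

```
import random
import requests

# Read the list, skip comment lines, and keep only the ip:port part of each entry
with open("fresh_proxy.txt") as f:
    candidates = [line.split("|")[0].strip() for line in f
                  if line.strip() and not line.startswith("#")]

# Pick a random proxy and route a request through it, the same way the checker does
proxy = random.choice(candidates)
proxies = {"http": "http://" + proxy, "https": "http://" + proxy}
print(requests.get("https://httpbin.org/ip", proxies=proxies, timeout=5).json())
```
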
--------------------------------------------------------------------------------
/proxy-scraper.py:
--------------------------------------------------------------------------------
#!/usr/bin/python3

import sys

if sys.version_info[0] < 3:
    print("This script needs Python 3")
    exit()

import requests, re, queue, threading, datetime, argparse

parser = argparse.ArgumentParser()
parser.add_argument('-c', '--check', help="Check the scraped proxies", action='store_true')
parser.add_argument('-o', '--output', help="Output file", required=True)
parser.add_argument('-t', '--threads', type=int, default=20, help="Checker threads count")
parser.add_argument('--timeout', type=int, default=5, help="Checker timeout in seconds")
parser.add_argument('--http', help="Check proxies for HTTP instead of HTTPS", action='store_true')
parser.add_argument('--check-with-website', help="Website to connect to through the proxy. If it doesn't return HTTP 200, the proxy is considered dead", default="httpbin.org/ip")
parser.add_argument('--country', help="Locate and print country (requires maxminddb-geolite2)", action='store_true')
parser.add_argument('--connection-time', help="Print connection time information", action='store_true')
parser.add_argument('-f', '--write-immediately', help="Force flush the output file every time", action='store_true')
parser.add_argument('-i', '--extra-information', help="Print last updated time and configuration description", action='store_true')
parserx = parser.parse_args()
threads = parserx.threads
https = not parserx.http
timeout = parserx.timeout
reader = None
if parserx.country:
    try:
        from geolite2 import geolite2
        reader = geolite2.reader()
    except ImportError:
        print("Error: maxminddb-geolite2 is not installed. Please try without the --country option or install that package.")
        exit()

proxies = []


def fetchAndParseProxies(url, custom_regex):
    # Download one source and append every ip:port match to the shared proxies list.
    global proxies
    n = 0
    try:
        proxylist = requests.get(url, timeout=5).text
    except requests.RequestException:
        # A dead or unreachable source should not kill the whole scraper thread.
        sys.stdout.write("{0: >5} proxies fetched from {1} (request failed)\n".format(0, url))
        return
    proxylist = proxylist.replace('null', '"N/A"')
    custom_regex = custom_regex.replace('%ip%', '([0-9]{1,3}\\.[0-9]{1,3}\\.[0-9]{1,3}\\.[0-9]{1,3})')
    custom_regex = custom_regex.replace('%port%', '([0-9]{1,5})')
    for proxy in re.findall(re.compile(custom_regex), proxylist):
        proxies.append(proxy[0] + ":" + proxy[1])
        n += 1
    sys.stdout.write("{0: >5} proxies fetched from {1}\n".format(n, url))


# Each source is a (URL, regex template) pair; %ip% and %port% in the template are
# replaced with capturing groups before matching, so plain-text, JSON and HTML
# listings can all be handled by the same parser.
proxysources = [
    ["http://spys.me/proxy.txt", "%ip%:%port% "],
    ["http://www.httptunnel.ge/ProxyListForFree.aspx", " target=\"_new\">%ip%:%port%"],
    ["https://raw.githubusercontent.com/sunny9577/proxy-scraper/master/proxies.json", "\"ip\":\"%ip%\",\"port\":\"%port%\","],
    ["https://raw.githubusercontent.com/fate0/proxylist/master/proxy.list", '"host": "%ip%".*?"country": "(.*?){2}",.*?"port": %port%'],
    ["https://raw.githubusercontent.com/clarketm/proxy-list/master/proxy-list.txt", '%ip%:%port% (.*?){2}-.-S \\+'],
    ["https://raw.githubusercontent.com/opsxcq/proxy-list/master/list.txt", '%ip%", "type": "http", "port": %port%'],
    ["https://www.us-proxy.org/", "%ip%<\\/td>%port%<\\/td>(.*?){2}<\\/td>.*?<\\/td>.*?<\\/td>.*?<\\/td>(.*?)<\\/td>.*?<\\/td><\\/tr>"],
    ["https://free-proxy-list.net/", "%ip%<\\/td>%port%<\\/td>(.*?){2}<\\/td>.*?<\\/td>.*?<\\/td>.*?<\\/td>(.*?)<\\/td>.*?<\\/td><\\/tr>"],
    ["https://www.sslproxies.org/", "%ip%<\\/td>%port%<\\/td>(.*?){2}<\\/td>.*?<\\/td>.*?<\\/td>.*?<\\/td>(.*?)<\\/td>.*?<\\/td><\\/tr>"],
    ["https://www.proxy-list.download/api/v0/get?l=en&t=https", '"IP": "%ip%", "PORT": "%port%",'],
    ["https://api.proxyscrape.com/?request=getproxies&proxytype=http&timeout=6000&country=all&ssl=yes&anonymity=all", "%ip%:%port%"],
    ["https://raw.githubusercontent.com/TheSpeedX/SOCKS-List/master/http.txt", "%ip%:%port%"],
    ["https://raw.githubusercontent.com/shiftytr/proxy-list/master/proxy.txt", "%ip%:%port%"],
    ["https://proxylist.icu/", "%ip%:%port%http<"],
    ["https://proxylist.icu/proxy/1", "%ip%:%port%http<"],
    ["https://proxylist.icu/proxy/2", "%ip%:%port%http<"],
    ["https://proxylist.icu/proxy/3", "%ip%:%port%http<"],
    ["https://proxylist.icu/proxy/4", "%ip%:%port%http<"],
    ["https://proxylist.icu/proxy/5", "%ip%:%port%http<"],
    ["https://www.hide-my-ip.com/proxylist.shtml", '"i":"%ip%","p":"%port%",'],
    ["https://raw.githubusercontent.com/scidam/proxy-list/master/proxy.json", '"ip": "%ip%",\n.*?"port": "%port%",']
]

sourcethreads = []
for source in proxysources:
    t = threading.Thread(target=fetchAndParseProxies, args=(source[0], source[1]))
    sourcethreads.append(t)
    t.start()

for t in sourcethreads:
    t.join()

proxies_unique = list(set(proxies))
print("{0: >5} proxies fetched total, {1} unique.".format(len(proxies), len(proxies_unique)))
proxies = proxies_unique

f = open(parserx.output, "w")
if parserx.extra_information:
    f.write("# Last updated: {}\n".format(datetime.datetime.now().strftime("%d-%m-%Y %H:%M:%S")))
    if parserx.check:
        f.write("# {}, {}-second timeout\n".format("HTTPS" if https else "HTTP", timeout))
    f.write("# https://github.com/sh4dowb/proxy-scraper\n\n")
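
# With --check, worker threads pull proxies from a shared queue and request the
# --check-with-website target (httpbin.org/ip by default) through each one; a proxy
# that answers with HTTP 200 is written as an "ip:port|COUNTRY|connection time" line.
# Without --check, the raw scraped list is written as plain "ip:port" lines.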

class BadProxy(Exception):
    # Raised when the test website does not return HTTP 200 through the proxy.
    pass


if parserx.check:
    print("Checking with {} threads ({}, {} seconds timeout)".format(threads, "HTTPS" if https else "HTTP", timeout))
    q = queue.Queue()
    for x in proxies:
        q.put([x, "N/A"])
    dead = 0
    alive = 0

    def checkProxies():
        global q
        global dead
        global alive
        global f
        global proxies
        global timeout
        while not q.empty():
            proxy = q.get()
            try:
                resp = requests.get(("https" if https else "http") + "://" + parserx.check_with_website, proxies={'http': 'http://' + proxy[0], 'https': 'http://' + proxy[0]}, timeout=timeout)
                if resp.status_code != 200:
                    raise BadProxy
                if parserx.country:
                    try:
                        proxy[1] = reader.get(proxy[0].split(':')[0])['country']['iso_code']
                    except (KeyError, IndexError, TypeError):
                        pass
                f.write("{}|{}|{:.2f}s\n".format(proxy[0], proxy[1], resp.elapsed.total_seconds()))
                if parserx.write_immediately or alive % 30 == 0:
                    f.flush()
                alive += 1
            except Exception:
                dead += 1

            sys.stdout.write("\rChecked {:.2f}% - (Alive: {} - Dead: {})".format((alive + dead) / len(proxies) * 100, alive, dead))
            sys.stdout.flush()

    threadsl = []
    for i in range(0, threads):
        t = threading.Thread(target=checkProxies)
        t.start()
        threadsl.append(t)
    for t in threadsl:
        t.join()

    sys.stdout.write("\rCompleted - Alive: {} - Dead: {}          \n".format(alive, dead))
    print("")
else:
    for proxy in proxies:
        f.write("{}\n".format(proxy))

f.close()

--------------------------------------------------------------------------------