├── Dockerfile ├── Makefile ├── README.md ├── gimmeproxy.py ├── haproxy.cfg ├── parse_proxy_list.py ├── proxies.txt ├── requirements.txt └── run.sh /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.5 2 | MAINTAINER Johannes Gontrum 3 | 4 | ENV CHECK_URL "https://www.google.com" 5 | ENV CHECK_FOR "initHistory" 6 | ENV PROXY_TIMEOUT "10.0" 7 | ENV PROXY_FILE "/scripts/files/proxies.txt" 8 | 9 | RUN mkdir -p /scripts 10 | RUN mkdir -p /scripts/files 11 | 12 | COPY gimmeproxy.py /scripts/gimmeproxy.py 13 | COPY parse_proxy_list.py /scripts/parse_proxy_list.py 14 | COPY haproxy.cfg /scripts/haproxy.cfg 15 | COPY requirements.txt /scripts/requirements.txt 16 | COPY run.sh /scripts/run.sh 17 | COPY proxies.txt /scripts/files/proxies.txt 18 | 19 | RUN echo deb http://httpredir.debian.org/debian jessie-backports main | sed 's/\(.*\)-sloppy \(.*\)/&@\1 \2/' | tr @ '\n' | tee /etc/apt/sources.list.d/backports.list 20 | 21 | RUN apt-get update 22 | RUN apt-get install -y --force-yes iptables zlib1g zlib1g-dev haproxy -t jessie-backports --fix-missing 23 | RUN apt-get clean 24 | 25 | RUN pip install -r /scripts/requirements.txt 26 | 27 | RUN chmod -R 777 /scripts 28 | RUN chmod -R 777 /etc/haproxy 29 | 30 | CMD ["/scripts/run.sh"] 31 | EXPOSE 5566 32 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | build: 2 | docker build -t jgontrum/rotatingproxy . 3 | 4 | run: 5 | docker run -d --restart=always -v /root/logs:/scripts/files --name rotatingproxy -e "CHECK_URL=https://www.immobilienscout24.de" -e "CHECK_FOR=IS24" -e "PROXY_TIMEOUT=10.0" --privileged jgontrum/rotatingproxy 2>/dev/null >/dev/null 6 | 7 | run_rm: 8 | docker run -v /root/logs:/scripts/files --rm --name rotatingproxy -e "CHECK_URL=https://www.immobilienscout24.de" -e "CHECK_FOR=IS24" -e "PROXY_TIMEOUT=10.0" --privileged jgontrum/rotatingproxy 9 | 10 | stop: 11 | docker kill rotatingproxy 2>/dev/null; true 12 | docker rm rotatingproxy 2>/dev/null; true 13 | 14 | all: build stop run 15 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Proxies Rotator 2 | Dockerfile for Proxy Rotation 3 | 4 | 1) Pulls proxies from GimmeProxy.com 5 | 2) Test the proxies against a configurable URL 6 | 3) Rotate the proxies using HAProxy (round robin) 7 | 4) Repeat every 5 minutes 8 | 5) Expose port 5566 as a proxy 9 | 10 | ## Settings 11 | 12 | You can set the URL that is used for testing the proxy. 13 | This can be beneficial since some proxies may block certain websites. 14 | 15 | In addition, you must set a keyword for which we search in the returned 16 | website. This prevents the usage of proxies that always return a default 17 | site for every URL. 18 | 19 | Another setting is the timeout. If a proxy takes longer than this amount 20 | in seconds to retrieve the given URL, we will ignore it. 21 | 22 | Default values: 23 | 24 | CHECK_URL = http://www.google.com 25 | 26 | CHECK_FOR = Google Inc. 27 | 28 | PROXY_TIMEOUT = 10.0 29 | 30 | ## Usage 31 | ``` 32 | docker pull jgontrum/rotatingproxy 33 | 34 | docker run -d --name rotatingproxy -p 127.0.0.0:5566:5566 --privileged jgontrum/rotatingproxy 35 | 36 | OR: 37 | 38 | docker run --rm --name rotatingproxy -e "CHECK_URL=https://www.immobilienscout24.de" -e "CHECK_FOR=IS24" -e "PROXY_TIMEOUT=10.0" --privileged --net=host jgontrum/rotatingproxy 39 | ``` 40 | 41 | ## Testing 42 | ``` 43 | curl --proxy 127.0.0.1:5566 http://www.google.com 44 | ``` 45 | -------------------------------------------------------------------------------- /gimmeproxy.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import json 3 | import re 4 | import os 5 | from time import sleep, time 6 | from aiohttp import ClientSession, ProxyConnector, Timeout 7 | import requests 8 | 9 | number_of_proxies = 100 10 | request_proxies = 15 11 | timeout = os.environ.get("PROXY_TIMEOUT") 12 | timeout = float(timeout) if timeout else 10.0 13 | filepath = os.environ.get("PROXY_FILE") or "./proxies.txt" 14 | 15 | proxies = [] 16 | new_proxies = [] 17 | ips = set() 18 | 19 | proxy_provider_url = "http://gimmeproxy.com/api/getProxy?get=true&" +\ 20 | "protocol=http&supportsHttps=true" 21 | test_url = os.environ.get("CHECK_URL") or "https://www.google.com" 22 | test_for = os.environ.get("CHECK_FOR") or "initHistory" 23 | 24 | async def fetch(proxy): 25 | conn = ProxyConnector(proxy=proxy) 26 | with Timeout(timeout): 27 | try: 28 | async with ClientSession(connector=conn) as session: 29 | async with session.get(test_url) as r: 30 | if r.status == 200: 31 | text = str(await r.read()) 32 | if re.search(test_for, str(await r.read())): 33 | return True 34 | except Exception: 35 | pass 36 | 37 | async def test_server(proxy): 38 | t = time() 39 | if not await fetch(proxy['address']): 40 | return 41 | proxy['time'] = time() - t 42 | new_proxies.append(proxy) 43 | 44 | 45 | def get_proxy_servers(r): 46 | responses = [requests.get(proxy_provider_url).text for i in range(r)] 47 | 48 | for resp in responses: 49 | try: 50 | response = json.loads(resp) 51 | if response.get('status') == 429: 52 | print("Too many requests :(") 53 | continue 54 | 55 | if response['ipPort'] in ips: 56 | continue 57 | ips.add(response['ipPort']) 58 | 59 | proxies.append({ 60 | 'ipPort': response['ipPort'], 61 | 'address': response['curl'], 62 | 'time': -1, 63 | }) 64 | except Exception: 65 | continue 66 | 67 | 68 | def test_proxy_servers(): 69 | loop = asyncio.get_event_loop() 70 | 71 | tasks = [] 72 | for proxy in proxies: 73 | task = asyncio.ensure_future(test_server(proxy)) 74 | tasks.append(task) 75 | loop.run_until_complete(asyncio.wait(tasks)) 76 | 77 | 78 | def sort_proxies(): 79 | global proxies 80 | prx = sorted(new_proxies, key=lambda x: x['time']) 81 | proxies = prx[:number_of_proxies] 82 | 83 | 84 | def load_old_proxies(): 85 | try: 86 | with open(filepath, 'r') as f: 87 | for line in f: 88 | line = line.strip() 89 | proxies.append({ 90 | 'ipPort': line, 91 | 'address': "http://%s" % line, 92 | 'time': -1, 93 | }) 94 | ips.add("http://%s" % line) 95 | except: 96 | pass 97 | print("Loaded %s old proxies." % len(proxies)) 98 | 99 | if __name__ == '__main__': 100 | load_old_proxies() 101 | get_proxy_servers(request_proxies) 102 | test_proxy_servers() 103 | sort_proxies() 104 | 105 | new = set([proxy['ipPort'].strip() for proxy in proxies]) 106 | 107 | with open(filepath, 'w') as f: 108 | for proxy in new: 109 | f.write("%s\n" % proxy) 110 | print("Saved %s new proxies." % len(new)) 111 | -------------------------------------------------------------------------------- /haproxy.cfg: -------------------------------------------------------------------------------- 1 | global 2 | log 127.0.0.1 local0 3 | log 127.0.0.1 local1 notice 4 | log /var/log/haproxy.log local0 debug 5 | #log loghost local0 info 6 | maxconn 4096 7 | #chroot /usr/share/haproxy 8 | user haproxy 9 | group haproxy 10 | daemon 11 | debug 12 | #quiet 13 | stats socket /tmp/haproxy 14 | 15 | defaults 16 | log global 17 | mode http 18 | option httplog 19 | option dontlognull 20 | retries 3 21 | option redispatch 22 | maxconn 2000 23 | contimeout 5000 24 | clitimeout 50000 25 | srvtimeout 50000 26 | 27 | frontend rotating_proxies 28 | bind *:5566 29 | default_backend tor 30 | option http_proxy 31 | 32 | backend tor 33 | balance roundrobin 34 | -------------------------------------------------------------------------------- /parse_proxy_list.py: -------------------------------------------------------------------------------- 1 | s = """ 2 | global 3 | log 127.0.0.1 local0 4 | log 127.0.0.1 local1 notice 5 | #local0.* /var/log/haproxy.log 6 | #log /var/log/haproxy.log local0 debug 7 | #log loghost local0 info 8 | maxconn 4096 9 | #chroot /usr/share/haproxy 10 | user haproxy 11 | group haproxy 12 | daemon 13 | debug 14 | #quiet 15 | stats socket /tmp/haproxy 16 | 17 | defaults 18 | log global 19 | mode http 20 | option httplog 21 | option dontlognull 22 | retries 3 23 | option redispatch 24 | maxconn 2000 25 | timeout connect 5000 26 | timeout client 50000 27 | timeout server 50000 28 | #clitimeout 50000 29 | #srvtimeout 50000 30 | 31 | frontend rotating_proxies 32 | bind *:5566 33 | default_backend tor 34 | option http_proxy 35 | 36 | backend tor 37 | balance roundrobin """ 38 | 39 | import os 40 | 41 | hostCount = 1 42 | 43 | filepath = os.environ.get("PROXY_FILE") or "./proxies.txt" 44 | for line in open(filepath): 45 | line = line.strip() 46 | if len(line) > 0: 47 | results = ("\nserver srv{hostCount} {line} " + 48 | "weight 1 maxconn 100 check").format(hostCount=hostCount, 49 | line=line) 50 | s += results 51 | hostCount += 1 52 | 53 | with open("/etc/haproxy/haproxy.cfg", "w") as text_file: 54 | text_file.write(s) 55 | with open("/scripts/haproxy.cfg", "w") as text_file: 56 | text_file.write(s) 57 | -------------------------------------------------------------------------------- /proxies.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jgontrum/proxies-rotator/76e7c07a5f076dc0e05305364c5b37bd11076d39/proxies.txt -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | aiohttp==0.22.5 2 | asyncio==3.4.3 3 | requests==2.11.1 4 | -------------------------------------------------------------------------------- /run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | echo "Press [CTRL+C] to stop.." 3 | while : 4 | do 5 | echo "Downloading Proxies" 6 | python /scripts/gimmeproxy.py >> /scripts/files/rotatingproxy.log 7 | iptables -I INPUT -p tcp --dport $PORT 5566 -j DROP 8 | sleep 1 9 | python /scripts/parse_proxy_list.py 10 | service haproxy restart 11 | iptables -D INPUT -p tcp --dport $PORT 5566 -j DROP 12 | echo "Sleeping for 10 minutes" 13 | sleep 600 14 | done 15 | --------------------------------------------------------------------------------