├── credentials.json
├── png
│   ├── terminal.png
│   └── scraper-architecture.png
├── requirements.txt
├── csv
│   └── sample_jobs.csv
├── app.yaml
├── README.md
├── utils.py
├── slave.py
└── master.py
--------------------------------------------------------------------------------
/credentials.json:
--------------------------------------------------------------------------------
{
    Replace with your authentication credentials JSON file. ;)
}
--------------------------------------------------------------------------------
/png/terminal.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/juanluisrto/Scraping-orchestra/HEAD/png/terminal.png
--------------------------------------------------------------------------------
/png/scraper-architecture.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/juanluisrto/Scraping-orchestra/HEAD/png/scraper-architecture.png
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
Flask
google==3.0.0
pandas
google-cloud-logging
google-cloud-storage
gunicorn
gcsfs
pexpect
--------------------------------------------------------------------------------
/csv/sample_jobs.csv:
--------------------------------------------------------------------------------
,query,start,end
0,Apple,2019-01-01,2019-01-06
1,Tesla,2019-03-01,2019-03-03
2,Space-X,2019-02-01,2019-02-03
3,Warren Buffett,2019-03-01,2019-03-03
--------------------------------------------------------------------------------
/app.yaml:
--------------------------------------------------------------------------------
runtime: python38
entrypoint: gunicorn -b :$PORT slave:app --timeout 360000 --preload
env_variables:
  BUCKET: your-bucket.appspot.com  # gcloud bucket where the outputs are stored
  CLOUD: "True"                    # tells the slave whether it is running locally or in the cloud
instance_class: B2                 # B instances have no automatic scaling, which is necessary to restart them manually
manual_scaling:
  instances: 1                     # the number of instances must be defined explicitly
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Scraping-orchestra
A scraping master-slave system based on Google App Engine.

This repository showcases an approach for orchestrating, from a local process, a scraper deployed on Google App Engine.
The proposal is a workaround for the **HTTP 429 Too Many Requests** error.
The main idea is to redeploy the scraper to get a new IP address whenever the error shows up.

### Medium article
Take a look at the [article](https://juanluisrto.medium.com/scraping-google-search-without-getting-caught-e43bb91b363e?sk=944b7dc0368b04345a9ad2a2416b311d) I published about this.

### System architecture
![System architecture](/png/scraper-architecture.png)

### Running locally
To test this locally, clone the repo and run:
* `pip install -r requirements.txt`
* `python master.py` in one terminal
* `gunicorn -b :8080 slave:app --timeout 360000 --preload` in a different terminal

The output of the master looks like this:

![Master terminal output](/png/terminal.png)
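
You can also drive the slave directly over HTTP, mirroring what `master.py` does internally. A minimal sketch with `requests` (it assumes the slave is listening on `localhost:8080` and uses a job taken from `csv/sample_jobs.csv`):

```python
import requests

base = "http://localhost:8080"

requests.get(f"{base}/start")  # spawns the scraping child process
job = {"query": "Apple", "start": "2019-01-01", "end": "2019-01-06"}
requests.get(f"{base}/job", params=job)  # enqueues one job
print(requests.get(f"{base}/state").text)  # "busy" while scraping, "idle" once done
```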
--------------------------------------------------------------------------------
/utils.py:
--------------------------------------------------------------------------------
from google.cloud import logging as gc_logging
import pandas as pd
from googlesearch import search, get_tbs
import os, logging

class GCloudConnection:

    def __init__(self, URL, LOG_NAME):
        # env variable declared only for gcloud authentication during local tests; not needed on deployed instances
        os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = './credentials.json'
        logging.getLogger().setLevel(logging.INFO)
        self.connect_cloud_services(LOG_NAME)
        self.URL = URL

    def connect_cloud_services(self, LOG_NAME):
        # hook the gcloud logger into the default logging module
        logging_client = gc_logging.Client()
        logging_client.get_default_handler()
        logging_client.setup_logging()
        logging_client.logger(LOG_NAME)

class Scraper:

    def scrape(self, job, number_of_urls=10):
        # runs the same query once per day in the [from_date, to_date] range
        query, from_date, to_date = job.values()  # relies on the job dict being ordered: query, start, end
        urls = []
        for d in pd.date_range(from_date, to_date):
            tbs = get_tbs(from_date=d, to_date=d)  # restricts results to a single day
            results = search(query, tbs=tbs, pause=2, stop=number_of_urls)
            for url in results:
                urls.append({"date": d.date(), "url": url})
        return pd.DataFrame(urls, columns=["date", "url"])

    def filename(self, job):
        query, from_date, to_date = job.values()
        filename = f"{query}_{from_date}_{to_date}.csv"
        return filename
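
# A quick usage sketch (illustrative only; it assumes Google is not currently
# rate-limiting this IP, which is precisely what the rest of the repo works around):
#
#   scraper = Scraper()
#   job = {"query": "Apple", "start": "2019-01-01", "end": "2019-01-06"}
#   df = scraper.scrape(job, number_of_urls=5)
#   df.to_csv(scraper.filename(job))  # -> Apple_2019-01-01_2019-01-06.csv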
--------------------------------------------------------------------------------
/slave.py:
--------------------------------------------------------------------------------
from flask import Flask, request
import logging, os, time
from multiprocessing import Process, Pipe
from utils import GCloudConnection, Scraper

class Slave(GCloudConnection):

    def __init__(self, URL):
        GCloudConnection.__init__(self, URL, LOG_NAME="slave-scraper")
        self.state = "idle"
        self.parent, self.child = Pipe()
        self.scraper = Scraper()

    def store(self, df, filename):
        bucket = self.URL  # bucket where results are stored
        url = f"gs://{bucket}/csv/{filename}" if "CLOUD" in os.environ else f"./csv/{filename}"
        df.to_csv(url)
        logging.info(f"{filename} stored successfully")

    def scrape(self, job):
        self.child.send("busy")  # updates the state to stop receiving more jobs
        try:
            df = self.scraper.scrape(job)
            self.child.send("idle")
        except Exception as ex:
            self.child.send("scraping-detected")
            logging.error(f"Job {job} failed with an error: {ex}")
            df = "Failed"
        return df  # returns the job output, or "Failed" if an error arose

    def run(self, pipe):
        self.child = pipe
        while True:
            job = self.child.recv()
            if job is not None:
                logging.info(f"Running job: {job}")
                df = self.scrape(job)
                if str(df) != "Failed":
                    self.store(df, self.scraper.filename(job))
            else:
                time.sleep(3)


app = Flask(__name__)

@app.route('/start')
def start_child_process():
    # Gunicorn does not allow creating new processes before the app is created, so the child is spawned via this route
    url = os.getenv("BUCKET")
    global slave
    slave = Slave(url)
    p = Process(target=slave.run, args=[slave.child])
    p.start()
    logging.info("Scraper running")
    return "Scraper running"

@app.route('/job')
def process_job():
    slave.parent.send(request.args)  # sends a job to the scraper through the "parent" end of the pipe
    return f"Job {request.args} started"

@app.route('/state')
def current_state():
    try:
        if slave.parent.poll(timeout=3):  # checks if there are new messages from the child process
            slave.state = slave.parent.recv()  # updates the state in that case
        return slave.state
    except Exception:  # `slave` is not defined yet (or the pipe is unavailable)
        return "not-started"

if __name__ == "__main__":
    app.run(host='127.0.0.1', port=8080, debug=True)
--------------------------------------------------------------------------------
/master.py:
--------------------------------------------------------------------------------
import logging, pexpect
import requests
from urllib.parse import urlencode
import os, time, pandas as pd
from utils import GCloudConnection


class Master(GCloudConnection):

    def __init__(self, URL):
        GCloudConnection.__init__(self, URL, LOG_NAME="master-scraper")
        self.pending_jobs = []
        self.current_job = None
        self.is_restarting = False

    def restart_machine(self):
        # runs gcloud locally to manually redeploy the instance (and thus obtain a new IP)
        try:
            logging.info("Re-deploying instance")
            deploy = pexpect.spawn('gcloud app deploy app.yaml --version v1')
            deploy.expect(r'Do you want to continue \(Y/n\)\?')  # expect() takes a regex, so the prompt is escaped
            deploy.sendline('Y')
            deploy.expect("Deployed service", timeout=100)
            self.is_restarting = True
        except Exception as e:
            self.is_restarting = False
            logging.error(f"Problem re-deploying: {e}")

    def start(self):
        try:
            requests.get(f"{self.URL}/start", timeout=3)
        except Exception:
            logging.error("Slave not running")

    def check_slave_state(self):
        try:
            response = requests.get(f"{self.URL}/state", timeout=10)
            state = response.content.decode("utf-8")
        except Exception:
            state = "no-answer"
        return state

    def send_job(self, job):
        url = self.URL + "/job?" + urlencode(job)
        logging.info(f"Sending job = {job} to {url}")
        requests.get(url, timeout=10)

    def orchestrate(self):
        while len(self.pending_jobs) > 0:
            state = self.check_slave_state()
            logging.info(f"Current state of slave: {state}")
            next_job_ready = False  # stays False while the state is "busy" or "no-answer"
            if state == "not-started":
                self.start()
            elif state == "scraping-detected" and not self.is_restarting:  # error 429 in the slave
                self.pending_jobs.insert(0, self.current_job)  # requeue the interrupted job
                self.restart_machine()
            elif state == "idle":
                self.is_restarting = False  # the (possibly redeployed) instance is reachable again
                next_job_ready = True
            if next_job_ready:
                self.current_job = self.pending_jobs.pop(0)
                self.send_job(self.current_job)
            time.sleep(3)

    def import_jobs(self):
        df_jobs = pd.read_csv("./csv/sample_jobs.csv", index_col=0)
        self.pending_jobs = list(df_jobs.to_dict("index").values())


if __name__ == "__main__":
    url = os.getenv("URL")
    if url is None:
        url = "http://localhost:8080"  # local mode
    master = Master(url)
    master.import_jobs()
    master.orchestrate()
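
# To orchestrate a slave deployed on App Engine instead of a local one, point
# the master at the service URL before launching (illustrative; the actual URL
# depends on your gcloud project):
#
#   URL=https://your-project.appspot.com python master.py
--------------------------------------------------------------------------------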