├── credentials.json
├── png
│   ├── terminal.png
│   └── scraper-architecture.png
├── requirements.txt
├── csv
│   └── sample_jobs.csv
├── app.yaml
├── README.md
├── utils.py
├── slave.py
└── master.py
--------------------------------------------------------------------------------
/credentials.json:
--------------------------------------------------------------------------------
{
    Replace with your authentication credentials JSON file. ;)
}
--------------------------------------------------------------------------------
/png/terminal.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/juanluisrto/Scraping-orchestra/HEAD/png/terminal.png
--------------------------------------------------------------------------------
/png/scraper-architecture.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/juanluisrto/Scraping-orchestra/HEAD/png/scraper-architecture.png
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
Flask
google==3.0.0
pandas
google-cloud-logging
google-cloud-storage
gunicorn
gcsfs
pexpect
--------------------------------------------------------------------------------
/csv/sample_jobs.csv:
--------------------------------------------------------------------------------
,query,start,end
0,Apple,2019-01-01,2019-01-06
1,Tesla,2019-03-01,2019-03-03
2,Space-X,2019-02-01,2019-02-03
3,Warren Buffett,2019-03-01,2019-03-03
--------------------------------------------------------------------------------
/app.yaml:
--------------------------------------------------------------------------------
runtime: python38
entrypoint: gunicorn -b :$PORT slave:app --timeout 360000 --preload
env_variables:
  BUCKET: your-bucket.appspot.com  # gcloud bucket where the outputs are stored
  CLOUD: "True"                    # tells the slave whether it is running locally or in the cloud
instance_class: B2                 # B instances have no automatic scaling, which is necessary to restart them manually
manual_scaling:
  instances: 1                     # the number of instances must be defined explicitly
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Scraping-orchestra
A scraping master-slave system based on Google App Engine.

This repository showcases an approach for orchestrating, from a local process, a scraper deployed on Google App Engine.
The proposal is a workaround for the **HTTP 429 Too Many Requests** error.
The main idea is to redeploy the scraper to get a new IP address whenever the error shows up.

### Medium article
Take a look at the [article](https://juanluisrto.medium.com/scraping-google-search-without-getting-caught-e43bb91b363e?sk=944b7dc0368b04345a9ad2a2416b311d) I published about this.

### System architecture
![System architecture](/png/scraper-architecture.png)

### Running locally
To test this locally, clone the repo and run:
* `pip install -r requirements.txt`
* `python master.py` in one terminal
* `gunicorn -b :8080 slave:app --timeout 360000 --preload` in a different terminal

The output of the master looks like this:

![Master terminal output](/png/terminal.png)
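
You can also drive the slave directly over HTTP, mirroring what `master.py` does internally. A minimal sketch with `requests` (it assumes the slave is listening on `localhost:8080` and uses a job taken from `csv/sample_jobs.csv`):

```python
import requests

base = "http://localhost:8080"

requests.get(f"{base}/start")  # spawns the scraping child process
job = {"query": "Apple", "start": "2019-01-01", "end": "2019-01-06"}
requests.get(f"{base}/job", params=job)  # enqueues one job
print(requests.get(f"{base}/state").text)  # "busy" while scraping, "idle" once done
```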
--------------------------------------------------------------------------------
/utils.py:
--------------------------------------------------------------------------------
from google.cloud import logging as gc_logging
import pandas as pd
from googlesearch import search, get_tbs
import os, logging

class GCloudConnection:

    def __init__(self, URL, LOG_NAME):
        # env variable declared only for gcloud authentication during local tests; not needed on deployed instances
        os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = './credentials.json'
        logging.getLogger().setLevel(logging.INFO)
        self.connect_cloud_services(LOG_NAME)
        self.URL = URL

    def connect_cloud_services(self, LOG_NAME):
        # hook the gcloud logger into the default logging module
        logging_client = gc_logging.Client()
        logging_client.get_default_handler()
        logging_client.setup_logging()
        logging_client.logger(LOG_NAME)

class Scraper:

    def scrape(self, job, number_of_urls=10):
        # runs the same query once per day in the [from_date, to_date] range
        query, from_date, to_date = job.values()  # relies on the job dict being ordered: query, start, end
        urls = []
        for d in pd.date_range(from_date, to_date):
            tbs = get_tbs(from_date=d, to_date=d)  # restricts results to a single day
            results = search(query, tbs=tbs, pause=2, stop=number_of_urls)
            for url in results:
                urls.append({"date": d.date(), "url": url})
        return pd.DataFrame(urls, columns=["date", "url"])

    def filename(self, job):
        query, from_date, to_date = job.values()
        filename = f"{query}_{from_date}_{to_date}.csv"
        return filename
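
# A quick usage sketch (illustrative only; it assumes Google is not currently
# rate-limiting this IP, which is precisely what the rest of the repo works around):
#
#   scraper = Scraper()
#   job = {"query": "Apple", "start": "2019-01-01", "end": "2019-01-06"}
#   df = scraper.scrape(job, number_of_urls=5)
#   df.to_csv(scraper.filename(job))  # -> Apple_2019-01-01_2019-01-06.csv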
--------------------------------------------------------------------------------
/slave.py:
--------------------------------------------------------------------------------
from flask import Flask, request
import logging, os, time
from multiprocessing import Process, Pipe
from utils import GCloudConnection, Scraper

class Slave(GCloudConnection):

    def __init__(self, URL):
        GCloudConnection.__init__(self, URL, LOG_NAME="slave-scraper")
        self.state = "idle"
        self.parent, self.child = Pipe()
        self.scraper = Scraper()

    def store(self, df, filename):
        bucket = self.URL  # bucket where results are stored
        url = f"gs://{bucket}/csv/{filename}" if "CLOUD" in os.environ else f"./csv/{filename}"
        df.to_csv(url)
        logging.info(f"{filename} stored successfully")

    def scrape(self, job):
        self.child.send("busy")  # updates the state to stop receiving more jobs
        try:
            df = self.scraper.scrape(job)
            self.child.send("idle")
        except Exception as ex:
            self.child.send("scraping-detected")
            logging.error(f"Job {job} failed with an error: {ex}")
            df = "Failed"
        return df  # returns the job output, or "Failed" if an error arose

    def run(self, pipe):
        self.child = pipe
        while True:
            job = self.child.recv()
            if job is not None:
                logging.info(f"Running job: {job}")
                df = self.scrape(job)
                if str(df) != "Failed":
                    self.store(df, self.scraper.filename(job))
            else:
                time.sleep(3)


app = Flask(__name__)

@app.route('/start')
def start_child_process():
    # Gunicorn does not allow creating new processes before the app is created, so the child is spawned via this route
    url = os.getenv("BUCKET")
    global slave
    slave = Slave(url)
    p = Process(target=slave.run, args=[slave.child])
    p.start()
    logging.info("Scraper running")
    return "Scraper running"

@app.route('/job')
def process_job():
    slave.parent.send(request.args)  # sends a job to the scraper through the "parent" end of the pipe
    return f"Job {request.args} started"

@app.route('/state')
def current_state():
    try:
        if slave.parent.poll(timeout=3):  # checks if there are new messages from the child process
            slave.state = slave.parent.recv()  # updates the state in that case
        return slave.state
    except Exception:  # `slave` is not defined yet (or the pipe is unavailable)
        return "not-started"

if __name__ == "__main__":
    app.run(host='127.0.0.1', port=8080, debug=True)
--------------------------------------------------------------------------------
/master.py:
--------------------------------------------------------------------------------
import logging, pexpect
import requests
from urllib.parse import urlencode
import os, time, pandas as pd
from utils import GCloudConnection


class Master(GCloudConnection):

    def __init__(self, URL):
        GCloudConnection.__init__(self, URL, LOG_NAME="master-scraper")
        self.pending_jobs = []
        self.current_job = None
        self.is_restarting = False

    def restart_machine(self):
        # runs gcloud locally to manually redeploy the instance (and thus obtain a new IP)
        try:
            logging.info("Re-deploying instance")
            deploy = pexpect.spawn('gcloud app deploy app.yaml --version v1')
            deploy.expect(r'Do you want to continue \(Y/n\)\?')  # expect() takes a regex, so the prompt is escaped
            deploy.sendline('Y')
            deploy.expect("Deployed service", timeout=100)
            self.is_restarting = True
        except Exception as e:
            self.is_restarting = False
            logging.error(f"Problem re-deploying: {e}")

    def start(self):
        try:
            requests.get(f"{self.URL}/start", timeout=3)
        except Exception:
            logging.error("Slave not running")

    def check_slave_state(self):
        try:
            response = requests.get(f"{self.URL}/state", timeout=10)
            state = response.content.decode("utf-8")
        except Exception:
            state = "no-answer"
        return state

    def send_job(self, job):
        url = self.URL + "/job?" + urlencode(job)
        logging.info(f"Sending job = {job} to {url}")
        requests.get(url, timeout=10)

    def orchestrate(self):
        while len(self.pending_jobs) > 0:
            state = self.check_slave_state()
            logging.info(f"Current state of slave: {state}")
            next_job_ready = False  # stays False while the state is "busy" or "no-answer"
            if state == "not-started":
                self.start()
            elif state == "scraping-detected" and not self.is_restarting:  # error 429 in the slave
                self.pending_jobs.insert(0, self.current_job)  # requeue the interrupted job
                self.restart_machine()
            elif state == "idle":
                self.is_restarting = False  # the (possibly redeployed) instance is reachable again
                next_job_ready = True
            if next_job_ready:
                self.current_job = self.pending_jobs.pop(0)
                self.send_job(self.current_job)
            time.sleep(3)

    def import_jobs(self):
        df_jobs = pd.read_csv("./csv/sample_jobs.csv", index_col=0)
        self.pending_jobs = list(df_jobs.to_dict("index").values())


if __name__ == "__main__":
    url = os.getenv("URL")
    if url is None:
        url = "http://localhost:8080"  # local mode
    master = Master(url)
    master.import_jobs()
    master.orchestrate()
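
# To orchestrate a slave deployed on App Engine instead of a local one, point
# the master at the service URL before launching (illustrative; the actual URL
# depends on your gcloud project):
#
#   URL=https://your-project.appspot.com python master.py
--------------------------------------------------------------------------------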