├── .gitignore
├── README.md
└── scrape_google.py

/.gitignore:
--------------------------------------------------------------------------------
chromedriver*
*.json

# IDE configuration
.idea/*
workspace.xml
tasks.xml
*__pycache__*

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Google Jobs Scraper

### About the project
> Simply put, the script scrapes all jobs from all pages (first to last available page) on [https://careers.google.com/jobs](https://careers.google.com/jobs#t=sq&li=20&st=0&jlo=all) and returns the result as a JSON string; you then get a JSON file containing all of the scraped data.

### How to Run the Program

1. Download and install [Python 3](https://www.python.org/)
2. Install the requirements
   ```
   pip install requests beautifulsoup4 selenium
   ```
3. Download the latest release of [ChromeDriver](https://sites.google.com/a/chromium.org/chromedriver/downloads) for your OS
4. Extract chromedriver and move it to the same directory as the `scrape_google.py` file
5. Finally, run `scrape_google.py`
   ```
   python scrape_google.py
   ```

**Note:** For Windows users, please check [this video](https://drive.google.com/open?id=0BzTpKjilS_t0WU5sOGV3TS01d0U)

### Structure of JSON output
```
{
    "total": "total_count",
    "jobs": [
        {
            "job_id": "id1",
            "title": "title1",
            "location": "location1",
            "intro": "introduction1",
            "resps": "responsibilities1",
            "quals": "qualifications1"
        },
        {
            "job_id": "id2",
            "title": "title2",
            "location": "location2",
            "intro": "introduction2",
            "resps": "responsibilities2",
            "quals": "qualifications2"
        },
        ...
    ]
}
```
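
For illustration, here is a minimal sketch of how the resulting `data.json` file could be read back and inspected; the file name and keys match the script and the structure above.

```
import json

# Load the output written by scrape_google.py
with open('data.json') as f:
    data = json.load(f)

print('Total jobs scraped: {}'.format(data['total']))

# Show the title and location of each scraped job
for job in data['jobs']:
    print('{} - {}'.format(job['title'], job['location']))
```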

--------------------------------------------------------------------------------
/scrape_google.py:
--------------------------------------------------------------------------------
import requests
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from bs4 import BeautifulSoup
import json
import time


jobs = []
total = 0


def scrape(start_url):
    """Prepare the page URLs and scrape all results pages."""

    # Scrape all pages
    start = time.time()
    count_pages = 0
    while True:
        website_url = start_url.replace('st=0', 'st={}'.format(count_pages))
        try:
            parse(website_url)
        except TimeoutException:
            # No job cards appeared within the timeout,
            # so assume the last page has been passed
            browser.quit()
            print('All data successfully scraped!')
            end = time.time()
            print('Time: {} minutes \n'.format(round((end - start) / 60, 1)))
            break
        count_pages += 20

    # Bundle all jobs and the total count
    data = {
        'total': total,
        'jobs': jobs
    }
    return json.dumps(data)
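
# Note on pagination: the careers page appears to load 20 results at a time
# (li=20 in the URL) and to select a page through the "st" offset in the URL
# fragment, so st=0, st=20, st=40, ... correspond to pages 1, 2, 3, ...
# scrape() keeps raising the offset by 20 until a page shows no job cards and
# the explicit wait in parse() raises TimeoutException, which ends the loop.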


def parse(jobs_page):
    """Parse a jobs listing page and collect the URLs of all jobs on it."""
    global total

    # Open the listing page
    browser.get(jobs_page)

    # Wait until the job cards on the page have loaded
    WebDriverWait(browser, 20).until(
        EC.presence_of_all_elements_located((By.CLASS_NAME, 'GXRRIBB-e-G'))
    )

    body = browser.page_source

    # Parse the page HTML to extract the job cards
    soup = BeautifulSoup(body, 'html.parser')
    jobs_content = soup.select('.GXRRIBB-e-G')

    # Add the number of jobs found on this page to the total
    total += len(jobs_content)

    # Get all job links (URLs) from the job cards (jobs_content)
    jobs_urls = []
    for job in jobs_content:
        job_header = job.select_one('h2 a')

        # Keep only jobs posted by Google itself (skip DeepMind postings)
        # so that the HTML structure of the detail page stays the same
        company = job.select_one(
            'div.sr-content div.summary .secondary-text').get_text()
        if company != 'DeepMind':
            job_link = 'https://careers.google.com/jobs'\
                + job_header.get('href')
            jobs_urls.append(job_link)

    parse_jobs(jobs_urls)


def parse_jobs(jobs_urls):
    """Open every job URL in a new tab and parse the job details."""
    global jobs

    # Open the job URLs one by one to get each page's HTML
    jobs_html = []
    for url in jobs_urls:
        browser.execute_script("window.open('{}', 'new_window')".format(url))
        browser.switch_to.window(browser.window_handles[1])

        # Wait until the job details have loaded
        try:
            WebDriverWait(browser, 20).until(
                EC.element_to_be_clickable((By.CSS_SELECTOR, 'a.secondary-text'))
            )
        except TimeoutException:
            # Skip this job if its page does not load in time
            browser.close()
            browser.switch_to.window(browser.window_handles[0])
            continue
        # Alternative wait:
        # WebDriverWait(browser, 20).until(
        #     EC.presence_of_all_elements_located(
        #         (By.CSS_SELECTOR, '.description-section p'))
        # )

        jobs_html.append(browser.page_source)

        # Close the current window and switch back to the main window
        browser.close()
        browser.switch_to.window(browser.window_handles[0])

    # Parse each page's HTML to get the job data
    jobs_list = []
    for job in jobs_html:
        soup = BeautifulSoup(job, 'html.parser')
        job_id = soup.find(
            'div', attrs={'itemtype': 'http://schema.org/JobPosting'}).get('id')
        job_title = soup.select_one(
            'div.card-company-job-details > h1 a.title.text').get_text()
        location = soup.select_one(
            'div.card-company-job-details .details-panel > a').get_text()
        desc = soup.select_one(
            'div.detail-item .description-section.text.with-benefits').get_text()
        resp_qual = soup.select(
            'div.detail-item .description-section .GXRRIBB-S-c '
            '.description-content')
        resp = resp_qual[0].get_text()
        qual = resp_qual[1].get_text()

        job_dict = {
            'job_id': job_id,
            'title': job_title,
            'location': location,
            'intro': desc,
            'resps': resp,
            'quals': qual
        }

        jobs_list.append(job_dict)

    # Append this page's jobs to all jobs scraped so far
    jobs += jobs_list


if __name__ == '__main__':
    print('Start scraping all Google jobs ...')

    # Prepare the URL: build the query string with requests, then turn it into
    # a URL fragment (the careers site reads its parameters from the hash)
    base_url = 'https://careers.google.com/jobs'
    params = {
        't': 'sq',
        'li': '20',
        'st': '0',
        'jlo': 'all'
    }
    result = requests.get(url=base_url, params=params)
    url = result.url.replace('?', '#')

    print('Please do not close the Chrome window. '
          'It will be closed automatically when scraping is finished.')
    print('This process may take more than 10 minutes')

    chrome_options = webdriver.ChromeOptions()
    # Run Chrome in headless mode to hide the UI (uncomment to enable)
    # chrome_options.add_argument('--headless')

    # Create and open a new instance of the Chrome driver
    browser = webdriver.Chrome(options=chrome_options)

    data_json = scrape(url)
    print(data_json)
    with open('data.json', 'w') as file:
        json.dump(json.loads(data_json), file)
--------------------------------------------------------------------------------