├── .gitignore
├── README.md
└── scrape_google.py

/.gitignore:
--------------------------------------------------------------------------------
chromedriver*
*.json

# IDE configuration
.idea/*
workspace.xml
tasks.xml
*__pycache__*

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Google Jobs Scraper

### About the project
> Simply put, the script scrapes all jobs from all pages (first to last available page) on [https://careers.google.com/jobs](https://careers.google.com/jobs#t=sq&li=20&st=0&jlo=all) and returns the result as a JSON string; you then get a JSON file containing all of the scraped data.

### How to Run the Program

1. Download and install [Python 3](https://www.python.org/)
2. Install the requirements
   ```
   pip install requests beautifulsoup4 selenium
   ```
3. Download the latest release of [ChromeDriver](https://sites.google.com/a/chromium.org/chromedriver/downloads) for your OS
4. Extract chromedriver and move it to the same directory as the `scrape_google.py` file
5. Finally, run `scrape_google.py`
   ```
   python scrape_google.py
   ```

**Note:** For Windows users, please check [this video](https://drive.google.com/open?id=0BzTpKjilS_t0WU5sOGV3TS01d0U)

### Structure of JSON output
```
{
    "total": "total_count",
    "jobs": [
        {
            "job_id": "id1",
            "title": "title1",
            "location": "location1",
            "intro": "introduction1",
            "resps": "responsibilities1",
            "quals": "qualifications1"
        },
        {
            "job_id": "id2",
            "title": "title2",
            "location": "location2",
            "intro": "introduction2",
            "resps": "responsibilities2",
            "quals": "qualifications2"
        },
        ...
    ]
}
```
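
For illustration, here is a minimal sketch of how the resulting `data.json` file could be read back and inspected; the file name and keys match the script and the structure above.

```
import json

# Load the output written by scrape_google.py
with open('data.json') as f:
    data = json.load(f)

print('Total jobs scraped: {}'.format(data['total']))

# Show the title and location of each scraped job
for job in data['jobs']:
    print('{} - {}'.format(job['title'], job['location']))
```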

--------------------------------------------------------------------------------
/scrape_google.py:
--------------------------------------------------------------------------------
import requests
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from bs4 import BeautifulSoup
import json
import time


jobs = []
total = 0


def scrape(start_url):
    """Prepare the page URLs and scrape all results pages."""

    # Scrape all pages
    start = time.time()
    count_pages = 0
    while True:
        website_url = start_url.replace('st=0', 'st={}'.format(count_pages))
        try:
            parse(website_url)
        except TimeoutException:
            # No job cards appeared within the timeout,
            # so assume the last page has been passed
            browser.quit()
            print('All data successfully scraped!')
            end = time.time()
            print('Time: {} minutes \n'.format(round((end - start) / 60, 1)))
            break
        count_pages += 20

    # Bundle all jobs and the total count
    data = {
        'total': total,
        'jobs': jobs
    }
    return json.dumps(data)
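
# Note on pagination: the careers page appears to load 20 results at a time
# (li=20 in the URL) and to select a page through the "st" offset in the URL
# fragment, so st=0, st=20, st=40, ... correspond to pages 1, 2, 3, ...
# scrape() keeps raising the offset by 20 until a page shows no job cards and
# the explicit wait in parse() raises TimeoutException, which ends the loop.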


def parse(jobs_page):
    """Parse a jobs listing page and collect the URLs of all jobs on it."""
    global total

    # Open the listing page
    browser.get(jobs_page)

    # Wait until the job cards on the page have loaded
    WebDriverWait(browser, 20).until(
        EC.presence_of_all_elements_located((By.CLASS_NAME, 'GXRRIBB-e-G'))
    )

    body = browser.page_source

    # Parse the page HTML to extract the job cards
    soup = BeautifulSoup(body, 'html.parser')
    jobs_content = soup.select('.GXRRIBB-e-G')

    # Add the number of jobs found on this page to the total
    total += len(jobs_content)

    # Get all job links (URLs) from the job cards (jobs_content)
    jobs_urls = []
    for job in jobs_content:
        job_header = job.select_one('h2 a')

        # Keep only jobs posted by Google itself (skip DeepMind postings)
        # so that the HTML structure of the detail page stays the same
        company = job.select_one(
            'div.sr-content div.summary .secondary-text').get_text()
        if company != 'DeepMind':
            job_link = 'https://careers.google.com/jobs'\
                + job_header.get('href')
            jobs_urls.append(job_link)

    parse_jobs(jobs_urls)


def parse_jobs(jobs_urls):
    """Open every job URL in a new tab and parse the job details."""
    global jobs

    # Open the job URLs one by one to get each page's HTML
    jobs_html = []
    for url in jobs_urls:
        browser.execute_script("window.open('{}', 'new_window')".format(url))
        browser.switch_to.window(browser.window_handles[1])

        # Wait until the job details have loaded
        try:
            WebDriverWait(browser, 20).until(
                EC.element_to_be_clickable((By.CSS_SELECTOR, 'a.secondary-text'))
            )
        except TimeoutException:
            # Skip this job if its page does not load in time
            browser.close()
            browser.switch_to.window(browser.window_handles[0])
            continue
        # Alternative wait:
        # WebDriverWait(browser, 20).until(
        #     EC.presence_of_all_elements_located(
        #         (By.CSS_SELECTOR, '.description-section p'))
        # )

        jobs_html.append(browser.page_source)

        # Close the current window and switch back to the main window
        browser.close()
        browser.switch_to.window(browser.window_handles[0])

    # Parse each page's HTML to get the job data
    jobs_list = []
    for job in jobs_html:
        soup = BeautifulSoup(job, 'html.parser')
        job_id = soup.find(
            'div', attrs={'itemtype': 'http://schema.org/JobPosting'}).get('id')
        job_title = soup.select_one(
            'div.card-company-job-details > h1 a.title.text').get_text()
        location = soup.select_one(
            'div.card-company-job-details .details-panel > a').get_text()
        desc = soup.select_one(
            'div.detail-item .description-section.text.with-benefits').get_text()
        resp_qual = soup.select(
            'div.detail-item .description-section .GXRRIBB-S-c '
            '.description-content')
        resp = resp_qual[0].get_text()
        qual = resp_qual[1].get_text()

        job_dict = {
            'job_id': job_id,
            'title': job_title,
            'location': location,
            'intro': desc,
            'resps': resp,
            'quals': qual
        }

        jobs_list.append(job_dict)

    # Append this page's jobs to all jobs scraped so far
    jobs += jobs_list


if __name__ == '__main__':
    print('Start scraping all Google jobs ...')

    # Prepare the URL: build the query string with requests, then turn it into
    # a URL fragment (the careers site reads its parameters from the hash)
    base_url = 'https://careers.google.com/jobs'
    params = {
        't': 'sq',
        'li': '20',
        'st': '0',
        'jlo': 'all'
    }
    result = requests.get(url=base_url, params=params)
    url = result.url.replace('?', '#')

    print('Please do not close the Chrome window. '
          'It will be closed automatically when scraping is finished.')
    print('This process may take more than 10 minutes')

    chrome_options = webdriver.ChromeOptions()
    # Run Chrome in headless mode to hide the UI (uncomment to enable)
    # chrome_options.add_argument('--headless')

    # Create and open a new instance of the Chrome driver
    browser = webdriver.Chrome(options=chrome_options)

    data_json = scrape(url)
    print(data_json)
    with open('data.json', 'w') as file:
        json.dump(json.loads(data_json), file)
--------------------------------------------------------------------------------