├── .gitignore ├── jobs.png ├── README.md └── indeed.py /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__/ 2 | *.lock/ -------------------------------------------------------------------------------- /jobs.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pokemon918/Indeed-Scraping-Selenium/HEAD/jobs.png -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Indeed Scraping with Selenium 2 | 3 | ## 1 Intro: 4 | In this repo you will learn how to perform a basic scrape of Indeed using Selenium 5 | 6 | ## 2 Goals: 7 | The goal of the project is to automate your job search and save the jobs you are interested in to a csv file 8 | 9 | ## 3 Steps: 10 | - Install Selenium and the Chrome WebDriver 11 | - Import libraries 12 | - Change the variables 13 | 14 | ## 4 Final Output: 15 | The final output is a csv file with the links to the jobs you are interested in, the title of the job, the company offering it, the number of days since the release and the condition(s) you require 16 | 17 | You should see something like this: 18 | 19 | ![csv-file](https://github.com/diego-florez/Selenium-Web-Scraping/blob/master/jobs.png) 20 | 21 | ## How it works: 22 | - Install Selenium --> https://pypi.org/project/selenium/ 23 | - Install Chrome WebDriver --> https://www.liquidweb.com/kb/how-to-install-selenium-tools-on-ubuntu-18-04/ 24 | - Go to indeed.py 25 | - Type the indeed domain of your country --> variable "url" 26 | - Type the job you are looking for and the location --> variable "keyword" 27 | - Change "días" to "days" or the translation in your language --> variable "release_date" 28 | - Type your condition --> variable "condition" 29 | -------------------------------------------------------------------------------- /indeed.py: 
-------------------------------------------------------------------------------- 1 | from selenium import webdriver 2 | from selenium.webdriver.common.keys import Keys 3 | from selenium.common.exceptions import NoSuchElementException 4 | from selenium.webdriver.common.by import By 5 | from selenium.webdriver.support.ui import WebDriverWait 6 | from selenium.webdriver.support import expected_conditions as EC 7 | from selenium.common.exceptions import TimeoutException 8 | from selenium.webdriver.common.action_chains import ActionChains 9 | import csv 10 | import re 11 | 12 | #setting driver 13 | options = webdriver.ChromeOptions() 14 | options.add_argument("--incognito") 15 | options.add_argument('--disable-gpu') 16 | 17 | #creating driver 18 | driver = webdriver.Chrome(options=options) 19 | 20 | url = "https://es.indeed.com" 21 | driver.get(url) 22 | 23 | wait = WebDriverWait(driver, 5) 24 | #searching in a bar 25 | search_bar = driver.find_element_by_name("q") 26 | search_bar.clear() 27 | #type your job, also the location is taken --> ex: data analyst dublin 28 | keyword = "data madrid" 29 | print("looking for",keyword) 30 | search_bar.send_keys(keyword) 31 | search_bar.send_keys(Keys.RETURN) 32 | 33 | driver.current_url 34 | 35 | 36 | links = [] 37 | while True: 38 | new_links = wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, ".jobtitle.turnstileLink "))) 39 | links.extend([l.get_attribute("href") for l in new_links]) 40 | 41 | try: #EC needed as otherwise the element was not clickable 42 | next_page = wait.until(EC.element_to_be_clickable((By.XPATH, "//ul[contains(@class, 'agination')]/li[last()]/a"))) 43 | #ActionChains is needed as Indeed opens a small window and it is needed to be closed to continue 44 | ActionChains(driver).move_to_element(next_page).click().perform() 45 | except TimeoutException: 46 | print("links scraped") 47 | break 48 | 49 | offer_links = [] 50 | positions = [] 51 | companies = [] 52 | days = [] 53 | conditions = [] 54 | 
for l in links: 55 | driver.get(l) 56 | #get original link offer 57 | try: 58 | offer_link = driver.find_element_by_xpath("//div[contains(@class, 'icl-u-xs-hide icl-u-lg-block icl-u-lg-textCenter')]/a").get_attribute("href") 59 | offer_links.append(offer_link) 60 | except NoSuchElementException: 61 | offer_links.append("no original link offer") 62 | #find job position 63 | try: 64 | position = driver.find_element_by_xpath("//h3[contains(@class, 'jobsearch-JobInfoHeader-title')]").text 65 | positions.append(position) 66 | except NoSuchElementException: 67 | positions.append("no position description") 68 | #find company 69 | try: 70 | company = driver.find_element_by_xpath("//div[contains(@class, 'icl-u-lg-mr--sm icl-u-xs-mr--xs')]").text 71 | companies.append(company) 72 | except NoSuchElementException: 73 | companies.append("no company description") 74 | #release day 75 | try: 76 | meta = driver.find_element_by_xpath("//div[contains(@class, 'jobsearch-JobMetadataFooter')]").text 77 | #change días to days or your language translation 78 | release_date = "días" 79 | search = re.search(f"(\d+).*({release_date})", str(meta)) 80 | if search: 81 | release = "".join([search.group(1)," ",search.group(2)]) 82 | else: 83 | release = "today/yesterday" 84 | days.append(release) 85 | except NoSuchElementException: 86 | days.append("no release description") 87 | #my condition -- I wanted jobs that included python, change or add the conditions you'd like 88 | condition = "ython" 89 | if condition in driver.page_source: 90 | conditions.append("python") 91 | else: 92 | conditions.append("nop") 93 | 94 | #saving in csv 95 | with open("".join([keyword.replace(" ","-"), ".csv"]), 'w', newline='') as csvfile: 96 | fieldnames = ["indeed_link", "offer_link", "position", "company", "release day", "contains"] 97 | writer = csv.DictWriter(csvfile, fieldnames=fieldnames) 98 | 99 | writer.writeheader() 100 | for link, offer_link, position, company, release, condition in zip(links, 
offer_links, positions, companies, days, conditions): 101 | writer.writerow({"indeed_link":link, "offer_link":offer_link, "position":position, "company":company, "release day":release, "contains":condition}) 102 | print("".join([keyword.replace(" ","-"), ".csv file available"])) 103 | 104 | 105 | --------------------------------------------------------------------------------