├── .gitignore ├── jobs.png ├── README.md └── indeed.py /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__/ 2 | *.lock/ -------------------------------------------------------------------------------- /jobs.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pokemon918/Indeed-Scraping-Selenium/HEAD/jobs.png -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Indeed Scraping with Selenium 2 | 3 | ## 1 Intro: 4 | In this repo you will learn how to perform a basic scrape of Indeed using Selenium 5 | 6 | ## 2 Goals: 7 | The goal of the project is to automate your job search and save the jobs you are interested in to a csv file 8 | 9 | ## 3 Steps: 10 | - Install Selenium and the Chrome WebDriver 11 | - Import libraries 12 | - Change the variables 13 | 14 | ## 4 Final Output: 15 | The final output is a csv file with the links to the jobs you are interested in, the title of the job, the company offering it, the number of days since the release and the condition(s) you require 16 | 17 | You should see something like this: 18 | 19 | ![csv-file](https://github.com/diego-florez/Selenium-Web-Scraping/blob/master/jobs.png) 20 | 21 | ## How it works: 22 | - Install Selenium --> https://pypi.org/project/selenium/ 23 | - Install Chrome WebDriver --> https://www.liquidweb.com/kb/how-to-install-selenium-tools-on-ubuntu-18-04/ 24 | - Go to indeed.py 25 | - Type the indeed domain of your country --> variable "url" 26 | - Type the job you are looking for and the location --> variable "keyword" 27 | - Change "días" to "days" or the translation in your language --> variable "release_date" 28 | - Type your condition --> variable "condition" 29 | -------------------------------------------------------------------------------- /indeed.py: 
-------------------------------------------------------------------------------- 1 | from selenium import webdriver 2 | from selenium.webdriver.common.keys import Keys 3 | from selenium.common.exceptions import NoSuchElementException 4 | from selenium.webdriver.common.by import By 5 | from selenium.webdriver.support.ui import WebDriverWait 6 | from selenium.webdriver.support import expected_conditions as EC 7 | from selenium.common.exceptions import TimeoutException 8 | from selenium.webdriver.common.action_chains import ActionChains 9 | import csv 10 | import re 11 | 12 | #setting driver 13 | options = webdriver.ChromeOptions() 14 | options.add_argument("--incognito") 15 | options.add_argument('--disable-gpu') 16 | 17 | #creating driver 18 | driver = webdriver.Chrome(options=options) 19 | 20 | url = "https://es.indeed.com" 21 | driver.get(url) 22 | 23 | wait = WebDriverWait(driver, 5) 24 | #searching in a bar 25 | search_bar = driver.find_element_by_name("q") 26 | search_bar.clear() 27 | #type your job, also the location is taken --> ex: data analyst dublin 28 | keyword = "data madrid" 29 | print("looking for",keyword) 30 | search_bar.send_keys(keyword) 31 | search_bar.send_keys(Keys.RETURN) 32 | 33 | driver.current_url 34 | 35 | 36 | links = [] 37 | while True: 38 | new_links = wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, ".jobtitle.turnstileLink "))) 39 | links.extend([l.get_attribute("href") for l in new_links]) 40 | 41 | try: #EC needed as otherwise the element was not clickable 42 | next_page = wait.until(EC.element_to_be_clickable((By.XPATH, "//ul[contains(@class, 'agination')]/li[last()]/a"))) 43 | #ActionChains is needed as Indeed opens a small window and it is needed to be closed to continue 44 | ActionChains(driver).move_to_element(next_page).click().perform() 45 | except TimeoutException: 46 | print("links scraped") 47 | break 48 | 49 | offer_links = [] 50 | positions = [] 51 | companies = [] 52 | days = [] 53 | conditions = [] 54 | 
for l in links: 55 | driver.get(l) 56 | #get original link offer 57 | try: 58 | offer_link = driver.find_element_by_xpath("//div[contains(@class, 'icl-u-xs-hide icl-u-lg-block icl-u-lg-textCenter')]/a").get_attribute("href") 59 | offer_links.append(offer_link) 60 | except NoSuchElementException: 61 | offer_links.append("no original link offer") 62 | #find job position 63 | try: 64 | position = driver.find_element_by_xpath("//h3[contains(@class, 'jobsearch-JobInfoHeader-title')]").text 65 | positions.append(position) 66 | except NoSuchElementException: 67 | positions.append("no position description") 68 | #find company 69 | try: 70 | company = driver.find_element_by_xpath("//div[contains(@class, 'icl-u-lg-mr--sm icl-u-xs-mr--xs')]").text 71 | companies.append(company) 72 | except NoSuchElementException: 73 | companies.append("no company description") 74 | #release day 75 | try: 76 | meta = driver.find_element_by_xpath("//div[contains(@class, 'jobsearch-JobMetadataFooter')]").text 77 | #change días to days or your language translation 78 | release_date = "días" 79 | search = re.search(f"(\d+).*({release_date})", str(meta)) 80 | if search: 81 | release = "".join([search.group(1)," ",search.group(2)]) 82 | else: 83 | release = "today/yesterday" 84 | days.append(release) 85 | except NoSuchElementException: 86 | days.append("no release description") 87 | #my condition -- I wanted jobs that included python, change or add the conditions you'd like 88 | condition = "ython" 89 | if condition in driver.page_source: 90 | conditions.append("python") 91 | else: 92 | conditions.append("nop") 93 | 94 | #saving in csv 95 | with open("".join([keyword.replace(" ","-"), ".csv"]), 'w', newline='') as csvfile: 96 | fieldnames = ["indeed_link", "offer_link", "position", "company", "release day", "contains"] 97 | writer = csv.DictWriter(csvfile, fieldnames=fieldnames) 98 | 99 | writer.writeheader() 100 | for link, offer_link, position, company, release, condition in zip(links, 
offer_links, positions, companies, days, conditions): 101 | writer.writerow({"indeed_link":link, "offer_link":offer_link, "position":position, "company":company, "release day":release, "contains":condition}) 102 | print("".join([keyword.replace(" ","-"), ".csv file available"])) 103 | 104 | 105 | --------------------------------------------------------------------------------