├── LinkedInWebcrawler2019.py
├── ProfileCrawler2019.py
├── README.md
└── profileScraper.ipynb

/LinkedInWebcrawler2019.py:
--------------------------------------------------------------------------------
# Load libraries
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time
from bs4 import BeautifulSoup
import pandas as pd

# Collect run parameters
COMPANY = input("Enter Company ID: ")  # Uber = 1815218
USERNAME = input("Enter username: ")
PASSWORD = input("Enter password: ")
EMPLOYEE = 1000  # maximum number of results to collect
linkedin = 'https://www.linkedin.com'

# Open Selenium
browser = webdriver.Firefox()
browser.get(linkedin)
time.sleep(3)

# Identify email and password inputs and enter login information
# (Selenium 3-style *_by_name lookups, matching the 2019 API)
email = browser.find_element_by_name('session_key')
password = browser.find_element_by_name('session_password')
email.send_keys(USERNAME)
password.send_keys(PASSWORD + Keys.RETURN)

# Go to the company people search and scroll to the bottom of the page to load all results
time.sleep(3)
search = "https://www.linkedin.com/search/results/people/?facetCurrentCompany=%5B%22" + str(COMPANY) + "%22%5D"
browser.get(search)
time.sleep(3)
browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")
current_url = 'url_placeholder'  # placeholder for the URL check on the first iteration

# Create empty dataframe
df = pd.DataFrame(columns=['name', 'title', 'location', 'profile'])

# Go through pages and download data
while True:
    # Stop at the 100th page of search results
    if current_url.find('page=100') != -1:
        break
    # Stop if this url has been scraped before (the "next" click did not advance)
    previous_url = current_url
    current_url = browser.current_url
    if current_url == previous_url:
        break

    # Parse the current results page
    page = BeautifulSoup(browser.page_source, 'lxml')
    page_names = page.find_all('span', class_='actor-name')
    page_titles = page.find_all('p', class_='subline-level-1')
    page_locations = page.find_all('p', class_='subline-level-2')
    page_profiles = page.find_all('a', class_='search-result__result-link')

    # Put scraped data into a dataframe
    names = list(map(lambda x: x.text, page_names))
    titles = list(map(lambda x: x.text.replace('\n', ''), page_titles))
    locations = list(map(lambda x: x.text.replace('\n', ''), page_locations))
    profiles = list(map(lambda x: linkedin + x['href'], page_profiles))[::2]  # keep every other match to avoid duplicate links
    temp = pd.DataFrame({'name': names, 'title': titles, 'location': locations, 'profile': profiles})

    # Filter out members who do not share their information
    temp = temp[temp['name'] != 'LinkedIn Member']

    # Append new data to df
    df = df.append(temp)

    # Stop once the number of retrieved records reaches the limit
    if df.shape[0] >= EMPLOYEE:
        break

    # Find the next button and click it
    nextt = browser.find_element_by_class_name('next')
    nextt.click()
    time.sleep(5)

# Reset dataframe index
df = df.reset_index(drop=True)

# Export results
df.to_csv("output_search.csv", index=False)

# Close Selenium
browser.quit()
--------------------------------------------------------------------------------
/ProfileCrawler2019.py:
--------------------------------------------------------------------------------
# Load libraries
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time
from bs4 import BeautifulSoup
import pandas as pd

# Collect login credentials
USERNAME = input("Enter username: ")
PASSWORD = input("Enter password: ")
linkedin = 'https://www.linkedin.com'

# Open Selenium
browser = webdriver.Firefox()
browser.get(linkedin)
time.sleep(3)

# Identify email and password inputs and enter login information
email = browser.find_element_by_name('session_key')
password = browser.find_element_by_name('session_password')
email.send_keys(USERNAME)
password.send_keys(PASSWORD + Keys.RETURN)
time.sleep(3)

# Read the search results produced by LinkedInWebcrawler2019.py
r = pd.read_csv("output_search.csv")

# Flag data-related titles (data / science / machine learning)
def driving(x):
    x = x.lower()
    if 'data' in x or 'scien' in x or 'machine' in x:
        return 1
    else:
        return 0

# Create driver column and keep only the flagged profiles
r['driver'] = list(map(driving, r['title']))
r = r[r.driver != 0]

# Create empty dataframes
Exp_df = pd.DataFrame(columns=['profile', 'exp_title', 'exp_company', 'exp_dates'])
Edu_df = pd.DataFrame(columns=['profile', 'ed_name', 'ed_deg', 'ed_dates'])
Ski_df = pd.DataFrame(columns=['profile', 'skill'])

# Visit every profile link and scrape the experience, education, and skills sections
# for link in r.loc[0:5, 'profile']:
for link in r.loc[:, 'profile']:
    if link == 'https://www.linkedin.com#':  # skip results without a real profile link
        continue
    time.sleep(2)

    # Browse to the profile link loaded from the csv file
    browser.get(link)
    time.sleep(2)

    # Page down several times so the lazily loaded sections render
    for _ in range(5):
        browser.find_element_by_tag_name('body').send_keys(Keys.PAGE_DOWN)
        time.sleep(.75)
    page = BeautifulSoup(browser.page_source, 'lxml')

    # Experience section
    titles = page.find_all('div', class_="pv-entity__position-group-pager")
    companies = page.find_all('span', class_="pv-entity__secondary-title")
    dates = page.find_all('h4', class_="pv-entity__date-range")

    # Put scraped data into Exp_df columns
    arraylen1 = len(titles)
    profile = link
    exp_titles = list(map(lambda x: x.h3.text.strip(), titles))[0:arraylen1]
    exp_companies = list(map(lambda x: x.text.strip(), companies))[0:arraylen1]
    exp_dates = list(map(lambda x: x.text.strip().split('\n')[-1], dates))[0:arraylen1]

    # Education section
    institution = page.find_all('div', class_="pv-entity__degree-info")
    degree = page.find_all('p', class_="pv-entity__degree-name")
    dates = page.find_all('p', class_="pv-entity__dates")

    # Put scraped data into Edu_df columns
    arraylen2 = len(institution)
    ed_name = list(map(lambda x: x.text.strip().split('\n')[-1], institution))[0:arraylen2]
    ed_deg = list(map(lambda x: x.text.strip().split('\n')[-1], degree))[0:arraylen2]
    ed_dates = list(map(lambda x: x.text.strip().split('\n')[-1], dates))[0:arraylen2]
    if len(ed_dates) < arraylen2:
        ed_dates = 'NA'  # dates missing for some entries, so mark them all NA

    # Skills section
    skill = page.find_all('span', class_="pv-skill-category-entity__name-text")
    arraylen3 = len(skill)
    skill = list(map(lambda x: x.text.strip(), skill))[0:arraylen3]

    # Append the scraped sections; skip the profile if the lists do not line up
    try:
        temp1 = pd.DataFrame({'profile': profile, 'exp_title': exp_titles, 'exp_company': exp_companies, 'exp_dates': exp_dates})
        temp2 = pd.DataFrame({'profile': profile, 'ed_name': ed_name, 'ed_deg': ed_deg, 'ed_dates': ed_dates})
        temp3 = pd.DataFrame({'profile': profile, 'skill': skill})
        Exp_df = Exp_df.append(temp1)
        Edu_df = Edu_df.append(temp2)
        Ski_df = Ski_df.append(temp3)
        print(link, 'completed')
    except:
        print(link, 'skipped')
        continue

# Reset dataframe indexes
Exp_df = Exp_df.reset_index(drop=True)
Edu_df = Edu_df.reset_index(drop=True)
Ski_df = Ski_df.reset_index(drop=True)

# Export results (tab-separated)
Exp_df.to_csv("output_experience.csv", index=False, sep='\t', encoding='utf-8')
Edu_df.to_csv("output_education.csv", index=False, sep='\t', encoding='utf-8')
Ski_df.to_csv("output_skills.csv", index=False, sep='\t', encoding='utf-8')

# Close Selenium
browser.quit()
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Web-Scraping-Project
The objective of this project was to explore the data-related job market and its requirements within the industry, particularly at the company Uber. I also wanted to gain more knowledge and experience in web scraping.

LinkedIn is an American company founded in 2002 that operates a social-networking platform for professionals. The platform is used mainly for professional networking: employers post jobs, and job seekers post their resumes or CVs. Most of LinkedIn's revenue eventually came from selling recruiters and sales professionals access to information about its members. As of 2019, LinkedIn had 610 million registered members in 200 countries, of whom more than 250 million were active.

I wrote two Python scripts, "LinkedInWebcrawler2019.py" and "ProfileCrawler2019.py", to responsibly scrape the LinkedIn website. I queried for Data Scientist, Data Engineer, and Data Analyst type roles within the company Uber, using the selenium package to collect information about Uber employees. "LinkedInWebcrawler2019.py" gathers the first CSV file of profile links from the company people search; "ProfileCrawler2019.py" then visits each profile link and gathers the experience, education, and skills sections, exporting them as tab-separated CSV files.
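For reference, here is a minimal sketch of how the exported files might be loaded for analysis; the filenames and separators match what the two scripts write, while the variable names are just illustrative.

```python
import pandas as pd

# Search results written by LinkedInWebcrawler2019.py (comma-separated).
search = pd.read_csv("output_search.csv")

# Profile details written by ProfileCrawler2019.py (tab-separated, utf-8).
experience = pd.read_csv("output_experience.csv", sep="\t", encoding="utf-8")
education = pd.read_csv("output_education.csv", sep="\t", encoding="utf-8")
skills = pd.read_csv("output_skills.csv", sep="\t", encoding="utf-8")

print(experience.head())
```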
I ran my analysis on about 1,000 employee profiles. The analysis code is in the Jupyter notebook "profileScraper.ipynb".

Here is a blog post I wrote about the project and results: http://nycdatascience.com/blog/student-works/web-scraping/web-scraping-linkedin:-exploring-the-background-of-a-data-scientist/
--------------------------------------------------------------------------------