├── classes ├── __init__.py ├── JobScraper.py └── UserScraper.py ├── _config.yml ├── .gitignore ├── captcha_resolver.py ├── conf.json ├── requirements.txt ├── README.md ├── scrape_users.py ├── scrape_jobs.py └── utils.py /classes/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /_config.yml: -------------------------------------------------------------------------------- 1 | theme: jekyll-theme-modernist -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # binary files 2 | *.pyc 3 | *~ 4 | *.ipynb 5 | *myconf*.json 6 | *.log 7 | *.json 8 | 9 | # dirs 10 | *.ipynb* 11 | .venv/ 12 | *log* 13 | -------------------------------------------------------------------------------- /captcha_resolver.py: -------------------------------------------------------------------------------- 1 | import pytesseract 2 | import argparse 3 | from PIL import Image 4 | from subprocess import check_output 5 | 6 | 7 | def resolve(image_path): 8 | print("Resampling the Image") 9 | check_output( 10 | ['convert', image_path, '-resample', '600', image_path]) 11 | return pytesseract.image_to_string(Image.open(image_path)) 12 | 13 | 14 | if __name__ == "__main__": 15 | argparser = argparse.ArgumentParser() 16 | argparser.add_argument('path', help='Captcha file path') 17 | args = argparser.parse_args() 18 | path = args.path 19 | print('Resolving Captcha') 20 | captcha_text = resolve(path) 21 | print('Extracted Text', captcha_text) 22 | -------------------------------------------------------------------------------- /conf.json: -------------------------------------------------------------------------------- 1 | { 2 | "parameters": { 3 | "CHROMEDRIVER_PATH": "/path/to/chromedriver", 4 | "CHROME_PATH": "/path/to/chrome-executable", 5 | "LOG_DIRECTORY": "./logdir/", 6 | "N_PAGES": 1, 7 | "USER_QUERIES": [ 8 | "site:it.linkedin.com/in/ AND \"giurisprudenza\"", 9 | "site:it.linkedin.com/in/ AND \"archeologia\"", 10 | "site:it.linkedin.com/in/ AND \"biotecnologia\"" 11 | ], 12 | "JOB_QUERIES": [ 13 | "laurea giurisprudenza", 14 | "laurea archeologia", 15 | "laurea biotecnologia" 16 | ], 17 | "HOST": "@mongo_host" 18 | }, 19 | "credentials": { 20 | "LINUSERNAME": "user@email.com", 21 | "LINPWD": "linkedinpwd", 22 | "MONGOUSER": "mongouser", 23 | "MONGOPWD": "mongopwd" 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | attrs==19.1.0 2 | backcall==0.1.0 3 | beautifulsoup4==4.7.1 4 | bs4==0.0.1 5 | certifi==2019.3.9 6 | chardet==3.0.4 7 | cssselect==1.0.3 8 | decorator==4.4.0 9 | dnspython==1.16.0 10 | idna==2.8 11 | ipykernel==5.1.0 12 | ipython==7.3.0 13 | ipython-genutils==0.2.0 14 | jedi==0.13.3 15 | jsonschema==3.0.1 16 | jupyter-client==5.2.4 17 | jupyter-core==4.4.0 18 | lxml==4.6.2 19 | numpy==1.16.2 20 | pandas==0.24.2 21 | parsel==1.5.1 22 | parso==0.5.0 23 | pexpect==4.6.0 24 | pickleshare==0.7.5 25 | pillow>=6.2.2 26 | prompt-toolkit==2.0.9 27 | ptyprocess==0.6.0 28 | Pygments==2.3.1 29 | pymongo==3.7.2 30 | pyrsistent==0.14.11 31 | pytesseract==0.2.6 32 | python-dateutil==2.8.0 33 | pytz==2018.9 34 | pyzmq==18.0.1 35 | requests==2.21.0 36 | selenium==3.141.0 37 | six==1.12.0 38 | soupsieve==1.8 39 | tesseract==0.1.3 40 
| tornado==6.0.1 41 | traitlets==4.3.2 42 | urllib3==1.24.2 43 | validator-collection==1.3.3 44 | w3lib==1.20.0 45 | wcwidth==0.1.7 46 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # LinkedIn Scraper 2 | 3 | ## Disclaimer 4 | Scraping data off of LinkedIn is against their User Agreement. This is purely intended for educational purposes. 5 | 6 | ## Acknowledgements 7 | Thanks to David Craven, from whom I took inspiration ([link here](https://www.linkedin.com/pulse/how-easy-scraping-data-from-linkedin-profiles-david-craven/)). 8 | 9 | ## What is this? 10 | This was a tool capable of scraping LinkedIn profiles in 2018/2019. As of today, this repository can only serve as a starting point, and it will most likely not work as expected. 11 | 12 | ## Dependencies 13 | It is based on Selenium and BeautifulSoup. 14 | 15 | ## How to use 16 | Back in the day, you would first download ChromeDriver from [here](http://chromedriver.chromium.org/) and extract it to your favourite location. 17 | Create a python3 virtual environment following [this](https://docs.python.org/3/tutorial/venv.html). 18 | Within the virtual environment, run 19 | ```pip install -r requirements.txt``` 20 | 21 | Edit the `conf.json` config file accordingly, specifying the Chrome binary path, e.g. found by typing 22 | ```which google-chrome``` in a UNIX shell, the ChromeDriver path, the desired queries 23 | and so forth. 24 | 25 | Ultimately, to scrape users, you would have run 26 | ```python scrape_users.py --conf conf.json``` 27 | or jobs 28 | ```python scrape_jobs.py --conf conf.json``` 29 | -------------------------------------------------------------------------------- /classes/JobScraper.py: -------------------------------------------------------------------------------- 1 | """ 2 | A class to define the methods to scrape LinkedIn job web pages 3 | """ 4 | 5 | 6 | class JobScraper(object): 7 | def __init__(self, soup, url, query): 8 | """ 9 | Initialize the class 10 | 11 | :param soup: BeautifulSoup instance 12 | :param url: str job URL to scrape 13 | :param query: str query to perform 14 | """ 15 | self.soup = soup 16 | self.url = url 17 | self.query = query 18 | 19 | def get_job_skills(self): 20 | """ 21 | Get the skills required by the job offer being scraped. 22 | 23 | :return: list of skills 24 | """ 25 | requested_skills = [rq.get_text() for rq in self.soup.find_all( 26 | class_="jobs-ppc-criteria__value")] 27 | return requested_skills 28 | 29 | def get_job_title(self): 30 | """ 31 | Get the job title of the job page being scraped. 32 | Return a string containing the job title. 33 | 34 | :return: str job title 35 | """ 36 | try: 37 | job_title = self.soup.find_all( 38 | class_="jobs-top-card__job-title")[0].get_text() 39 | except IndexError: 40 | job_title = "" 41 | return job_title 42 | 43 | def get_job_location(self): 44 | """ 45 | Get the location of the job offer being scraped. 46 | Return a string containing the location.
47 | 48 | """ 49 | def validate_location(loc): 50 | """ 51 | Validate the location by checking that the string extracted 52 | by the preferred "jobs-top-card__exact-location" HTML class 53 | is not empty, otherwise get the location string from the 54 | "jobs-top-card__bullet" HTML class 55 | 56 | :param loc: str of the location 57 | :return: str location 58 | """ 59 | if loc: 60 | return loc 61 | else: 62 | try: 63 | loc = [l.get_text().strip() 64 | for l in self.soup.find_all( 65 | class_="jobs-top-card__bullet")][0] 66 | except IndexError: 67 | loc = "" 68 | return loc 69 | try: 70 | location = [l.get_text().strip() 71 | for l in self.soup.find_all( 72 | class_="jobs-top-card__exact-location")][0] 73 | except IndexError: 74 | location = "" 75 | return validate_location(location) 76 | 77 | def get_job_data(self): 78 | """ 79 | Get the job data by using the get* methods of the class. 80 | Return a dictionary 81 | 82 | :return: dict job data 83 | """ 84 | skills = self.get_job_skills() 85 | if len(skills) == 0: 86 | return {} 87 | else: 88 | job_data = { 89 | "URL": self.url, 90 | "query": self.query, 91 | "job_title": self.get_job_title(), 92 | "location": self.get_job_location(), 93 | "skills": skills 94 | } 95 | return job_data 96 | -------------------------------------------------------------------------------- /scrape_users.py: -------------------------------------------------------------------------------- 1 | """ 2 | Scrape linkedin URLs by using selenium, to simulate the navigation 3 | (click, scroll) and BeautifulSoup to parse the HTML code of the page 4 | Perform a number of queries and log a number of files 5 | for each scraped user. 6 | Write dataset to mongoDB with the scraped data 7 | 8 | """ 9 | from selenium.webdriver.common.keys import Keys 10 | from selenium.common.exceptions import ElementNotInteractableException 11 | from utils import init_driver, get_profile_urls, login,\ 12 | print_scraped_data, load_config,\ 13 | get_unseen_urls, connect_mongo 14 | from time import sleep 15 | from classes.UserScraper import UserScraper 16 | import argparse 17 | import sys 18 | 19 | 20 | parser = argparse.ArgumentParser( 21 | description=("Scrape linkedin profiles based on the " + 22 | "queries specified in the conf file") 23 | ) 24 | parser.add_argument( 25 | '-c', '--conf', 26 | type=str, 27 | metavar='', 28 | required=True, 29 | help='Specify the path of the configuration file' 30 | ) 31 | args = parser.parse_args() 32 | conf = load_config(args.conf) 33 | parameters = conf["parameters"] 34 | credentials = conf["credentials"] 35 | CHROME_PATH = parameters["CHROME_PATH"] 36 | CHROMEDRIVER_PATH = parameters["CHROMEDRIVER_PATH"] 37 | QUERIES = parameters["USER_QUERIES"] 38 | N_PAGES = parameters["N_PAGES"] 39 | LINUSERNAME = credentials["LINUSERNAME"] 40 | LINPWD = credentials["LINPWD"] 41 | MONGOUSER = credentials["MONGOUSER"] 42 | MONGOPWD = credentials["MONGOPWD"] 43 | HOST = parameters["HOST"] 44 | client = connect_mongo(HOST, MONGOUSER, MONGOPWD) 45 | db = client["linkedin"] 46 | users = db["users"] 47 | driver = init_driver(CHROME_PATH, CHROMEDRIVER_PATH) 48 | driver.get("https://www.linkedin.com") 49 | login(driver, LINUSERNAME, LINPWD) 50 | us = UserScraper(driver) 51 | for query in QUERIES: 52 | driver.get("https://www.google.com") 53 | sleep(2) 54 | search_query = driver.find_element_by_name('q') 55 | try: 56 | search_query.send_keys(query) 57 | except ElementNotInteractableException: 58 | print("ERROR :: Cannot send query. 
Google might be blocking") 59 | sys.exit(1) 60 | sleep(0.5) 61 | search_query.send_keys(Keys.RETURN) 62 | profile_urls = get_profile_urls(driver, N_PAGES) 63 | if len(profile_urls) == 0: 64 | print() 65 | print("WARNING :: " + 66 | "Could not get any URLs for the query\n" + query) 67 | print("Please double-check that Google is not " + 68 | "blocking the query") 69 | continue 70 | unseen_urls = get_unseen_urls(users, profile_urls) 71 | if len(unseen_urls) != 0: 72 | print("INFO :: Resuming from URL", unseen_urls[0]) 73 | else: 74 | print("INFO :: All URLs from " + str(N_PAGES) + 75 | " Google-search page(s) for the query " + query + 76 | " have already been scraped. " + 77 | "Moving onto the next query if any.") 78 | continue 79 | for url in unseen_urls: 80 | user_data = us.scrape_user(query, url) 81 | if user_data and\ 82 | not db["users"].count_documents(user_data, limit=1): 83 | print_scraped_data(user_data) 84 | users.insert_one(user_data) 85 | driver.quit() 86 | -------------------------------------------------------------------------------- /scrape_jobs.py: -------------------------------------------------------------------------------- 1 | """ 2 | Scrape linkedin jobs by using selenium, to simulate the navigation 3 | (click, scroll) and BeautifulSoup to parse the HTML code of the page 4 | Perform a number of queries and log a number of files 5 | for each scraped job offer. 6 | Write dataset to mongoDB with the scraped data 7 | 8 | """ 9 | from selenium.common.exceptions import TimeoutException 10 | from utils import init_driver, get_job_urls, login, print_scraped_data,\ 11 | load_config, get_unseen_urls, scroll_job_panel, connect_mongo 12 | from time import sleep 13 | from bs4 import BeautifulSoup 14 | from classes.JobScraper import JobScraper 15 | import argparse 16 | 17 | 18 | parser = argparse.ArgumentParser( 19 | description=("Scrape linkedin job offers based on the " + 20 | "queries specified in the conf file") 21 | ) 22 | parser.add_argument('-c', '--conf', 23 | type=str, 24 | metavar='', 25 | required=True, 26 | help='Specify the path of the configuration file') 27 | args = parser.parse_args() 28 | conf = load_config(args.conf) 29 | parameters = conf["parameters"] 30 | credentials = conf["credentials"] 31 | CHROME_PATH = parameters["CHROME_PATH"] 32 | CHROMEDRIVER_PATH = parameters["CHROMEDRIVER_PATH"] 33 | QUERIES = parameters["JOB_QUERIES"] 34 | LINUSERNAME = credentials["LINUSERNAME"] 35 | LINPWD = credentials["LINPWD"] 36 | MONGOUSER = credentials["MONGOUSER"] 37 | MONGOPWD = credentials["MONGOPWD"] 38 | HOST = parameters["HOST"] 39 | client = connect_mongo(HOST, MONGOUSER, MONGOPWD) 40 | db = client["linkedin"] 41 | jobs = db["jobs"] 42 | driver = init_driver(CHROME_PATH, CHROMEDRIVER_PATH) 43 | driver.get("https://www.linkedin.com") 44 | login(driver, LINUSERNAME, LINPWD) 45 | JOB_SEARCH_URL = "https://www.linkedin.com/jobs/search/?keywords=" 46 | for query in QUERIES: 47 | driver.get(JOB_SEARCH_URL + query) 48 | sleep(0.5) 49 | scroll_job_panel(driver) 50 | soup = BeautifulSoup(driver.page_source, 'html.parser') 51 | n_results_element = soup.find(class_="t-12 t-black--light t-normal") 52 | n_results_string = n_results_element.get_text() 53 | n_results = int(n_results_string.split()[0].replace(',', '')) 54 | job_urls = get_job_urls(soup) 55 | start = 25 56 | url = JOB_SEARCH_URL + query + "&start="  # the start offset is appended inside the loop 57 | while start < n_results: 58 | try: 59 | driver.get(url + str(start)) 60 | scroll_job_panel(driver) 61 | soup = BeautifulSoup(driver.page_source, 'html.parser') 62 |
job_urls.extend(get_job_urls(soup)) 63 | start += 25 64 | except TimeoutException: 65 | print( 66 | "\nINFO :: TimeoutException raised while getting " + 67 | "URL\n" + url 68 | ) 69 | if len(job_urls) == 0: 70 | print() 71 | print("WARNING :: Could not get any URLs for the query\n" + 72 | query) 73 | print("Please double-check that LinkedIn is not " + 74 | "blocking the query") 75 | continue 76 | unseen_urls = get_unseen_urls(jobs, job_urls) 77 | if len(unseen_urls) != 0: 78 | print("INFO :: Resuming from URL", unseen_urls[0]) 79 | else: 80 | print("INFO :: All job URLs for the query " + query + 81 | " have already been scraped. " + 82 | "Moving onto the next query if any.") 83 | continue 84 | for url in unseen_urls: 85 | driver.get(url) 86 | soup = BeautifulSoup(driver.page_source, 'html.parser') 87 | js = JobScraper(soup, url, query) 88 | job_data = js.get_job_data() 89 | if job_data and\ 90 | not db["jobs"].count_documents(job_data, limit=1): 91 | print_scraped_data(job_data) 92 | jobs.insert_one(job_data) 93 | driver.quit() 94 | -------------------------------------------------------------------------------- /classes/UserScraper.py: -------------------------------------------------------------------------------- 1 | """ 2 | A class to define the methods to scrape LinkedIn user-profile webpages 3 | 4 | """ 5 | from selenium.webdriver.common.keys import Keys 6 | from selenium.common.exceptions import TimeoutException 7 | from utils import validate_field, scroll_profile_page, is_button_found,\ 8 | validate_user_data, filter_non_printable 9 | from time import sleep 10 | from bs4 import BeautifulSoup as bs 11 | 12 | 13 | class UserScraper(object): 14 | def __init__(self, driver): 15 | """ 16 | Initialize the class 17 | 18 | :param driver: selenium chrome driver object 19 | """ 20 | self.driver = driver 21 | 22 | @staticmethod 23 | def get_name(soup): 24 | """ 25 | Get the name of the user whose profile page is being scraped. 26 | 27 | :param soup: BeautifulSoup object 28 | :return: name: str name of the user 29 | """ 30 | try: 31 | name_tag = soup.find_all(class_="pv-top-card-section__name")[0] 32 | name = name_tag.get_text(strip=True) 33 | return name 34 | except IndexError: 35 | return "" 36 | 37 | @staticmethod 38 | def get_job_title(soup): 39 | """ 40 | Get the job title of the user whose profile 41 | page is being scraped 42 | 43 | :param soup: BeautifulSoup object 44 | :return: job_title: str 45 | """ 46 | try: 47 | job_title_tag = soup.find_all( 48 | class_="pv-top-card-section__headline")[0] 49 | job_title = job_title_tag.get_text(strip=True) 50 | job_title = filter_non_printable(job_title) 51 | return job_title 52 | except IndexError: 53 | return "" 54 | 55 | @staticmethod 56 | def get_location(soup): 57 | """ 58 | Get the location of the user whose profile 59 | page is being scraped. 60 | 61 | :param soup: BeautifulSoup object 62 | :return: location: str 63 | """ 64 | try: 65 | location_tag = soup.find_all( 66 | class_="pv-top-card-section__location")[0] 67 | location = location_tag.get_text(strip=True) 68 | return location 69 | except IndexError: 70 | return "" 71 | 72 | @staticmethod 73 | def get_degree(soup): 74 | """ 75 | Get the last degree of the user whose profile page 76 | is being scraped. 
77 | 78 | :param soup: BeautifulSoup object 79 | :return: degree: str 80 | """ 81 | degree_tags = soup.find_all( 82 | class_="pv-entity__degree-name") 83 | if len(degree_tags) != 0: 84 | degree = degree_tags[0].get_text().split('\n')[2] 85 | degree = validate_field(degree) 86 | else: 87 | degree = '' 88 | return degree 89 | 90 | def get_skills(self): 91 | """ 92 | Get the skills of the user whose profile page is being scraped. 93 | Scroll down the page by sending the PAGE_DOWN button 94 | until either the "show more" button in the skills section 95 | has been found, or the end of the page has been reached 96 | Return a list of skills. 97 | 98 | :return: list: skills 99 | """ 100 | skills = [] 101 | button_found = False 102 | endofpage_reached = False 103 | attempt = 0 104 | max_attempts = 3 105 | delay = 3 # seconds 106 | body = self.driver.find_element_by_tag_name("body") 107 | last_height = self.driver.execute_script( 108 | "return document.body.scrollHeight") 109 | while not button_found: 110 | body.send_keys(Keys.PAGE_DOWN) 111 | sleep(2) 112 | new_height = self.driver.execute_script( 113 | "return document.body.scrollHeight") 114 | button_found, showmore_button = is_button_found( 115 | self.driver, delay) 116 | if button_found: 117 | self.driver.execute_script("arguments[0].click();", 118 | showmore_button) 119 | sleep(2) 120 | soup = bs(self.driver.page_source, 'html.parser') 121 | skills_tags = soup.find_all( 122 | class_="pv-skill-category-entity__name-text") 123 | skills = [item.get_text(strip=True) 124 | for item in skills_tags] 125 | skills = [validate_field(skill) for skill in skills] 126 | if new_height == last_height: 127 | attempt += 1 128 | if attempt == max_attempts: 129 | endofpage_reached = True 130 | else: 131 | last_height = new_height 132 | if button_found or endofpage_reached: 133 | break 134 | return skills 135 | 136 | @staticmethod 137 | def get_languages(soup): 138 | """ 139 | Get the languages in the "Accomplishments" section 140 | of the user whose profile page is being scraped. 141 | Look for the accomplishment tags first, and get all the language 142 | elements from them. 143 | Return a list of languages. 144 | 145 | :param soup: BeautifulSoup object 146 | :return: list: languages list 147 | """ 148 | languages = [] 149 | accomplishment_tags = soup.find_all( 150 | class_="pv-accomplishments-block__list-container") 151 | for a in accomplishment_tags: 152 | try: 153 | if a["id"] == "languages-expandable-content": 154 | languages = [l.get_text() for l in a.find_all("li")] 155 | except KeyError: 156 | pass 157 | return languages 158 | 159 | def scrape_user(self, query, url): 160 | """ 161 | Get the user data for a given query and linkedin URL. 162 | Call get_name() and get_job_title() to get name and 163 | job title, respectively. Scroll down the given URL 164 | to make the skill-section HTML code appear; 165 | call get_skills() and get_degree() to extract the user skills 166 | and their degree, respectively. Scroll down the page until its 167 | end to extract the user languages by calling 168 | get_languages(). 169 | Finally, return a dictionary with the extracted data. 
170 | 171 | :param query: str 172 | :param url: str URL to scrape 173 | :return: 174 | """ 175 | attempt = 0 176 | max_attempts = 3 177 | success = False 178 | user_data = {} 179 | while not success: 180 | try: 181 | attempt += 1 182 | self.driver.get(url) 183 | sleep(2) 184 | self.driver.execute_script( 185 | "document.body.style.zoom='50%'") 186 | sleep(3) 187 | skills = self.get_skills() 188 | scroll_profile_page(self.driver) 189 | soup = bs(self.driver.page_source, 'html.parser') 190 | name = self.get_name(soup) 191 | job_title = self.get_job_title(soup) 192 | location = self.get_location(soup) 193 | degree = self.get_degree(soup) 194 | languages = self.get_languages(soup) 195 | user_data = { 196 | "URL": url, 197 | "name": name, 198 | "query": query, 199 | "job_title": job_title, 200 | "degree": degree, 201 | "location": location, 202 | "languages": languages, 203 | "skills": skills 204 | } 205 | success = True 206 | except TimeoutException: 207 | print("\nINFO :: TimeoutException raised while " + 208 | "getting URL\n" + url) 209 | print("INFO :: Attempt n." + str(attempt) + " of " + 210 | str(max_attempts) + 211 | "\nNext attempt in 60 seconds") 212 | sleep(60) 213 | if success: 214 | break 215 | if attempt == max_attempts and not user_data: 216 | print("INFO :: Max number of attempts reached. " + 217 | "Skipping URL" + 218 | "\nUser data will be empty.") 219 | return validate_user_data(user_data) 220 | -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | from time import sleep 2 | from selenium import webdriver 3 | from selenium.webdriver.common.keys import Keys 4 | from selenium.webdriver.support.ui import WebDriverWait 5 | from selenium.webdriver.support import expected_conditions 6 | from selenium.webdriver.common.by import By 7 | from selenium.common.exceptions import NoSuchElementException,\ 8 | TimeoutException 9 | from pymongo import MongoClient 10 | from validator_collection import checkers 11 | import json 12 | import os 13 | import errno 14 | import unicodedata 15 | 16 | 17 | def load_config(path): 18 | """ 19 | Load configuration file with all the needed parameters 20 | 21 | :param path: str path of the conf file 22 | :return: dict 23 | """ 24 | with open(path, 'r') as conf_file: 25 | conf = json.load(conf_file) 26 | return conf 27 | 28 | 29 | def create_nonexistent_dir(path, exc_raise=False): 30 | """ 31 | Create directory from given path 32 | Return True if created, False if it exists 33 | 34 | :param path: str dir path 35 | :param exc_raise: bool raise exception 36 | :return: str path of the created dir, None otherwise 37 | """ 38 | try: 39 | os.makedirs(path) 40 | print("INFO :: Created directory with path:", str(path)) 41 | return path 42 | except OSError as e: 43 | if e.errno != errno.EEXIST: 44 | print("ERROR :: Could not create directory with path: " + 45 | "%s\n", str(path)) 46 | if exc_raise: 47 | raise 48 | return None 49 | 50 | 51 | def validate_field(field): 52 | """ 53 | Return field if it exists 54 | otherwise empty string 55 | 56 | :param field: string to validate 57 | :return: field: input string if not empty, empty string otherwise 58 | """ 59 | if field: 60 | pass 61 | else: 62 | field = '' 63 | return field 64 | 65 | 66 | def validate_user_data(user_data): 67 | """ 68 | Validate user_data dict by checking that the majority of the keys 69 | have non-empty values. 
70 | Return an empty dictionary if main keys' values are empty, 71 | otherwise the original dictionary. 72 | 73 | :param user_data: dict of scraped user data 74 | :return: dict 75 | """ 76 | try: 77 | if user_data["skills"] == []\ 78 | and user_data["languages"] == []\ 79 | and user_data["name"] == ""\ 80 | and user_data["job_title"] == ""\ 81 | and user_data["degree"] == ""\ 82 | and user_data["location"] == "": 83 | return {} 84 | else: 85 | return user_data 86 | except KeyError: 87 | return {} 88 | 89 | 90 | def init_driver(chrome_path, chromedriver_path): 91 | """ 92 | Initialize Chrome driver 93 | :param chrome_path: str chrome executable path 94 | :param chromedriver_path: str chrome driver path 95 | :return: selenium driver object 96 | """ 97 | chrome_options = webdriver.ChromeOptions() 98 | chrome_options.binary_location = chrome_path 99 | chrome_options.add_argument("--normal") 100 | chrome_options.add_argument("--start-maximized") 101 | chrome_options.add_argument("--disable-extensions") 102 | chrome_options.add_argument("--disable-infobars") 103 | driver = webdriver.Chrome(executable_path=chromedriver_path, 104 | chrome_options=chrome_options) 105 | return driver 106 | 107 | 108 | def get_job_urls(soup): 109 | """ 110 | Return a list of job URLs taken from the 111 | results of a query on LinkedIn. 112 | 113 | :param soup: BeautifulSoup instance 114 | :return: list of linkedin-job URLs 115 | """ 116 | base_url = "http://www.linkedin.com" 117 | job_urls = [base_url + url['href'].split('/?')[0] 118 | for url in soup.find_all( 119 | class_="job-card-search__link-wrapper", 120 | href=True)] 121 | return list(dict.fromkeys(job_urls)) 122 | 123 | 124 | def get_profile_urls(driver, n_pages=1): 125 | """ 126 | Return a list without repetitions of alphabetically sorted URLs 127 | taken from the results of a given query on Google search. 128 | 129 | :param driver: selenium chrome driver object 130 | :param n_pages: int number of google pages to loop over 131 | :return: list of linkedin-profile URLs 132 | """ 133 | linkedin_urls = [] 134 | for i in range(n_pages): 135 | urls = driver.find_elements_by_class_name('iUh30') 136 | linkedin_urls += [url.text for url in urls 137 | if checkers.is_url(url.text)] 138 | sleep(0.5) 139 | if i < n_pages - 1:  # click "next" on every page but the last one requested 140 | try: 141 | next_button_url = driver.find_element_by_css_selector( 142 | '#pnnext').get_attribute('href') 143 | driver.get(next_button_url) 144 | except NoSuchElementException: 145 | break 146 | linkedin_urls_no_rep = sorted( 147 | list(dict.fromkeys([url for url in linkedin_urls]))) 148 | return linkedin_urls_no_rep 149 | 150 | 151 | def login(driver, user, pwd): 152 | """ 153 | Type user email and password in the relevant fields and 154 | perform log in on linkedin.com by using the given credentials.
155 | 156 | :param driver: selenium chrome driver object 157 | :param user: str username, email 158 | :param pwd: str password 159 | :return: None 160 | """ 161 | username = driver.find_element_by_class_name('login-email') 162 | username.send_keys(user) 163 | sleep(0.5) 164 | password = driver.find_element_by_class_name('login-password') 165 | password.send_keys(pwd) 166 | sleep(0.5) 167 | sign_in_button = driver.find_element_by_xpath('//*[@type="submit"]') 168 | sign_in_button.click() 169 | 170 | 171 | def scroll_job_panel(driver): 172 | """ 173 | Scroll the left panel containing the job offers by sending PAGE_DOWN 174 | key until the very end has been reached 175 | 176 | :param driver: selenium chrome driver object 177 | :return: None 178 | """ 179 | panel = driver.find_element_by_class_name("jobs-search-results") 180 | last_height = driver.execute_script( 181 | "return document.getElementsByClassName(" + 182 | "'jobs-search-results')[0].scrollHeight") 183 | while True: 184 | panel.send_keys(Keys.PAGE_DOWN) 185 | sleep(0.2) 186 | new_height = driver.execute_script( 187 | "return document.getElementsByClassName(" + 188 | "'jobs-search-results')[0].scrollHeight") 189 | if new_height == last_height: 190 | break 191 | else: 192 | last_height = new_height 193 | javascript = ( 194 | "var x = document.getElementsByClassName(" + 195 | "'jobs-search-results')[0]; x.scrollTo(0, x.scrollHeight)" 196 | ) 197 | driver.execute_script(javascript) 198 | 199 | 200 | def scroll_profile_page(driver): 201 | """ 202 | Scroll a profile page by sending the keys PAGE_DOWN 203 | until the end of the page has been reached. 204 | 205 | :param driver: selenium chrome driver object 206 | :return: 207 | """ 208 | body = driver.find_element_by_tag_name("body") 209 | last_height = driver.execute_script( 210 | "return document.body.scrollHeight") 211 | while True: 212 | body.send_keys(Keys.PAGE_DOWN) 213 | sleep(3) 214 | new_height = driver.execute_script( 215 | "return document.body.scrollHeight") 216 | if new_height == last_height: 217 | break 218 | else: 219 | last_height = new_height 220 | 221 | 222 | def is_button_found(driver, delay): 223 | """ 224 | Try to find the "show more" button in the "skills" section. 225 | Return a boolean and the button element. 226 | 227 | :param driver: selenium chrome driver object 228 | :param delay: float delay in seconds 229 | :return: 230 | """ 231 | button_found = False 232 | button_element = None 233 | try: 234 | condition_is_met = expected_conditions.presence_of_element_located( 235 | (By.XPATH, "//button[@class=" + 236 | "'pv-profile-section__card-action-bar " + 237 | "pv-skills-section__additional-skills " + 238 | "artdeco-container-card-action-bar']")) 239 | button_element = WebDriverWait(driver, delay).until(condition_is_met) 240 | button_found = True 241 | except TimeoutException: 242 | pass 243 | return button_found, button_element 244 | 245 | 246 | def print_scraped_data(data): 247 | """ 248 | Print the user data returned by scrape_url(). 249 | 250 | """ 251 | print() 252 | for key in data: 253 | print(key + ": " + str(data[key])) 254 | 255 | 256 | def get_unseen_urls(collection, urls): 257 | """ 258 | Get a list of URLs that have not already been scraped. 259 | Loop over all the db entries and create a list with the 260 | URLs already scraped. 261 | Get the difference of such list and the list of all the URLs 262 | for a given query. 263 | Return a list of URLs which have not already been scraped. 
264 | 265 | :param collection: Mongo DB collection 266 | :param urls: list of URLs to check 267 | :return: list of unseen URLs 268 | """ 269 | scraped_urls = [entry["URL"] for entry in collection.find()] 270 | unseen_urls = list(set(urls) - set(scraped_urls)) 271 | return unseen_urls 272 | 273 | 274 | def connect_mongo(host, user, pwd): 275 | """ 276 | Connect Mongo client 277 | 278 | :param host: str cluster host, including the leading '@' (see conf.json) 279 | :param user: str MongoDB username 280 | :param pwd: str MongoDB password 281 | :return: client: Mongo client object 282 | """ 283 | client = MongoClient("mongodb+srv://" + user + ":" + pwd + host) 284 | return client 285 | 286 | 287 | def filter_non_printable(string_to_filter): 288 | """ 289 | Filter the given string by removing non-printable (Unicode category 'Cf') chars 290 | 291 | :param string_to_filter: str string to filter 292 | :return: str filtered string 293 | """ 294 | output_string = ''.join( 295 | c for c in string_to_filter 296 | if unicodedata.category(c) != 'Cf' 297 | ) 298 | return output_string 299 | --------------------------------------------------------------------------------
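The `JobScraper` and `UserScraper` classes only need a BeautifulSoup object or a driver, so the parsing side can be exercised offline against a saved HTML page without logging in. A minimal sketch, assuming a LinkedIn job page saved locally as `saved_job_page.html` (hypothetical file) and that the page still uses the 2018/2019 class names the scraper targets:

```python
from bs4 import BeautifulSoup

from classes.JobScraper import JobScraper

# "saved_job_page.html" is a hypothetical local copy of a LinkedIn job page;
# the CSS classes parsed by JobScraper only match the 2018/2019 markup.
with open("saved_job_page.html", encoding="utf-8") as f:
    soup = BeautifulSoup(f.read(), "html.parser")

# The URL below is an illustrative placeholder; the query string is one of
# the JOB_QUERIES from conf.json.
js = JobScraper(soup, "https://www.linkedin.com/jobs/view/0000000",
                "laurea archeologia")
print(js.get_job_data())  # {} if no skills were found, a dict otherwise
```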
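`connect_mongo()` builds the connection string by plain concatenation, `"mongodb+srv://" + user + ":" + pwd + host`, which is why the `HOST` value in `conf.json` starts with `@`. A minimal sketch of the deduplication flow, assuming a hypothetical Atlas cluster address and credentials:

```python
from utils import connect_mongo, get_unseen_urls

# The host string must carry the leading "@" because connect_mongo()
# concatenates it directly after the password; the cluster address,
# user and password below are hypothetical placeholders.
HOST = "@cluster0.example.mongodb.net/test?retryWrites=true&w=majority"
client = connect_mongo(HOST, "mongouser", "mongopwd")
users = client["linkedin"]["users"]

candidate_urls = [
    "https://it.linkedin.com/in/some-profile",     # hypothetical
    "https://it.linkedin.com/in/another-profile",  # hypothetical
]
# Only the URLs whose "URL" field is not already stored are returned.
print(get_unseen_urls(users, candidate_urls))
```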
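`captcha_resolver.py` shells out to ImageMagick's `convert` to resample the image before running OCR, so both the ImageMagick and Tesseract binaries must be installed on the system in addition to the `pytesseract` package. It can be used from the command line or imported; a short sketch, with `captcha.png` as a hypothetical local image:

```python
from captcha_resolver import resolve

# Equivalent to running: python captcha_resolver.py captcha.png
# "captcha.png" is a hypothetical local file; ImageMagick's `convert`
# and the Tesseract OCR engine must be available on the PATH.
print(resolve("captcha.png"))
```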