├── classes ├── __init__.py ├── JobScraper.py └── UserScraper.py ├── _config.yml ├── .gitignore ├── captcha_resolver.py ├── conf.json ├── requirements.txt ├── README.md ├── scrape_users.py ├── scrape_jobs.py └── utils.py /classes/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /_config.yml: -------------------------------------------------------------------------------- 1 | theme: jekyll-theme-modernist -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # binary files 2 | *.pyc 3 | *~ 4 | *.ipynb 5 | *myconf*.json 6 | *.log 7 | *.json 8 | 9 | # dirs 10 | *.ipynb* 11 | .venv/ 12 | *log* 13 | -------------------------------------------------------------------------------- /captcha_resolver.py: -------------------------------------------------------------------------------- 1 | import pytesseract 2 | import argparse 3 | from PIL import Image 4 | from subprocess import check_output 5 | 6 | 7 | def resolve(image_path): 8 | print("Resampling the Image") 9 | check_output( 10 | ['convert', image_path, '-resample', '600', image_path]) 11 | return pytesseract.image_to_string(Image.open(image_path)) 12 | 13 | 14 | if __name__ == "__main__": 15 | argparser = argparse.ArgumentParser() 16 | argparser.add_argument('path', help='Captcha file path') 17 | args = argparser.parse_args() 18 | path = args.path 19 | print('Resolving Captcha') 20 | captcha_text = resolve(path) 21 | print('Extracted Text', captcha_text) 22 | -------------------------------------------------------------------------------- /conf.json: -------------------------------------------------------------------------------- 1 | { 2 | "parameters": { 3 | "CHROMEDRIVER_PATH": "/path/to/chromedriver", 4 | "CHROME_PATH": "/path/to/chrome-executable", 5 | "LOG_DIRECTORY": "./logdir/", 6 | "N_PAGES": 1, 7 | "USER_QUERIES": [ 8 | "site:it.linkedin.com/in/ AND \"giurisprudenza\"", 9 | "site:it.linkedin.com/in/ AND \"archeologia\"", 10 | "site:it.linkedin.com/in/ AND \"biotecnologia\"" 11 | ], 12 | "JOB_QUERIES": [ 13 | "laurea giurisprudenza", 14 | "laurea archeologia", 15 | "laurea biotecnologia" 16 | ], 17 | "HOST": "@mongo_host" 18 | }, 19 | "credentials": { 20 | "LINUSERNAME": "user@email.com", 21 | "LINPWD": "linkedinpwd", 22 | "MONGOUSER": "mongouser", 23 | "MONGOPWD": "mongopwd" 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | attrs==19.1.0 2 | backcall==0.1.0 3 | beautifulsoup4==4.7.1 4 | bs4==0.0.1 5 | certifi==2019.3.9 6 | chardet==3.0.4 7 | cssselect==1.0.3 8 | decorator==4.4.0 9 | dnspython==1.16.0 10 | idna==2.8 11 | ipykernel==5.1.0 12 | ipython==7.3.0 13 | ipython-genutils==0.2.0 14 | jedi==0.13.3 15 | jsonschema==3.0.1 16 | jupyter-client==5.2.4 17 | jupyter-core==4.4.0 18 | lxml==4.6.2 19 | numpy==1.16.2 20 | pandas==0.24.2 21 | parsel==1.5.1 22 | parso==0.5.0 23 | pexpect==4.6.0 24 | pickleshare==0.7.5 25 | pillow>=6.2.2 26 | prompt-toolkit==2.0.9 27 | ptyprocess==0.6.0 28 | Pygments==2.3.1 29 | pymongo==3.7.2 30 | pyrsistent==0.14.11 31 | pytesseract==0.2.6 32 | python-dateutil==2.8.0 33 | pytz==2018.9 34 | pyzmq==18.0.1 35 | requests==2.21.0 36 | selenium==3.141.0 37 | six==1.12.0 38 | soupsieve==1.8 39 | tesseract==0.1.3 40 
| tornado==6.0.1 41 | traitlets==4.3.2 42 | urllib3==1.24.2 43 | validator-collection==1.3.3 44 | w3lib==1.20.0 45 | wcwidth==0.1.7 46 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # LinkedIn Scraper 2 | 3 | ## Disclaimer 4 | Scraping data off of LinkedIn is against their User Agreement. This is purely intended for educational purposes. 5 | 6 | ## Acknowledgements 7 | Thanks to David Craven, from whom I took inspiration ([link here](https://www.linkedin.com/pulse/how-easy-scraping-data-from-linkedin-profiles-david-craven/)). 8 | 9 | ## What is this? 10 | This was a tool capable of scraping LinkedIn profiles in 2018/2019. As of today, this repository can only serve as a starting point, and it will most likely not work as expected. 11 | 12 | ## Dependencies 13 | It is based on Selenium and BeautifulSoup. 14 | 15 | ## How to use 16 | Back in the day, you would first download ChromeDriver from [here](http://chromedriver.chromium.org/) and extract it to your favourite location. 17 | Create a python3 virtual environment following [this](https://docs.python.org/3/tutorial/venv.html). 18 | Within the virtual environment, run 19 | ```pip install -r requirements.txt``` 20 | 21 | Edit the `conf.json` config file accordingly, specifying the Chrome binary path, e.g. found by typing 22 | ```which google-chrome``` in a UNIX shell, the ChromeDriver path, the desired queries 23 | and so forth. 24 | 25 | Ultimately, to scrape users, you would have run 26 | ```python scrape_users.py --conf conf.json``` 27 | or jobs 28 | ```python scrape_jobs.py --conf conf.json``` 29 | -------------------------------------------------------------------------------- /classes/JobScraper.py: -------------------------------------------------------------------------------- 1 | """ 2 | A class to define the methods to scrape LinkedIn job web pages 3 | """ 4 | 5 | 6 | class JobScraper(object): 7 | def __init__(self, soup, url, query): 8 | """ 9 | Initialize the class 10 | 11 | :param soup: BeautifulSoup instance 12 | :param url: str job URL to scrape 13 | :param query: str query to perform 14 | """ 15 | self.soup = soup 16 | self.url = url 17 | self.query = query 18 | 19 | def get_job_skills(self): 20 | """ 21 | Get the skills required by the job offer being scraped. 22 | 23 | :return: list of skills 24 | """ 25 | requested_skills = [rq.get_text() for rq in self.soup.find_all( 26 | class_="jobs-ppc-criteria__value")] 27 | return requested_skills 28 | 29 | def get_job_title(self): 30 | """ 31 | Get the job title of the job page being scraped. 32 | Return a string containing the job title. 33 | 34 | :return: str job title 35 | """ 36 | try: 37 | job_title = self.soup.find_all( 38 | class_="jobs-top-card__job-title")[0].get_text() 39 | except IndexError: 40 | job_title = "" 41 | return job_title 42 | 43 | def get_job_location(self): 44 | """ 45 | Get the location of the job offer being scraped. 46 | Return a string containing the location.
47 | 48 | """ 49 | def validate_location(loc): 50 | """ 51 | Validate the location by checking that the string extracted 52 | by the preferred "jobs-top-card__exact-location" HTML class 53 | is not empty, otherwise get the location string from the 54 | "jobs-top-card__bullet" HTML class 55 | 56 | :param loc: str of the location 57 | :return: str location 58 | """ 59 | if loc: 60 | return loc 61 | else: 62 | try: 63 | loc = [l.get_text().strip() 64 | for l in self.soup.find_all( 65 | class_="jobs-top-card__bullet")][0] 66 | except IndexError: 67 | loc = "" 68 | return loc 69 | try: 70 | location = [l.get_text().strip() 71 | for l in self.soup.find_all( 72 | class_="jobs-top-card__exact-location")][0] 73 | except IndexError: 74 | location = "" 75 | return validate_location(location) 76 | 77 | def get_job_data(self): 78 | """ 79 | Get the job data by using the get* methods of the class. 80 | Return a dictionary 81 | 82 | :return: dict job data 83 | """ 84 | skills = self.get_job_skills() 85 | if len(skills) == 0: 86 | return {} 87 | else: 88 | job_data = { 89 | "URL": self.url, 90 | "query": self.query, 91 | "job_title": self.get_job_title(), 92 | "location": self.get_job_location(), 93 | "skills": skills 94 | } 95 | return job_data 96 | -------------------------------------------------------------------------------- /scrape_users.py: -------------------------------------------------------------------------------- 1 | """ 2 | Scrape linkedin URLs by using selenium, to simulate the navigation 3 | (click, scroll) and BeautifulSoup to parse the HTML code of the page 4 | Perform a number of queries and log a number of files 5 | for each scraped user. 6 | Write dataset to mongoDB with the scraped data 7 | 8 | """ 9 | from selenium.webdriver.common.keys import Keys 10 | from selenium.common.exceptions import ElementNotInteractableException 11 | from utils import init_driver, get_profile_urls, login,\ 12 | print_scraped_data, load_config,\ 13 | get_unseen_urls, connect_mongo 14 | from time import sleep 15 | from classes.UserScraper import UserScraper 16 | import argparse 17 | import sys 18 | 19 | 20 | parser = argparse.ArgumentParser( 21 | description=("Scrape linkedin profiles based on the " + 22 | "queries specified in the conf file") 23 | ) 24 | parser.add_argument( 25 | '-c', '--conf', 26 | type=str, 27 | metavar='', 28 | required=True, 29 | help='Specify the path of the configuration file' 30 | ) 31 | args = parser.parse_args() 32 | conf = load_config(args.conf) 33 | parameters = conf["parameters"] 34 | credentials = conf["credentials"] 35 | CHROME_PATH = parameters["CHROME_PATH"] 36 | CHROMEDRIVER_PATH = parameters["CHROMEDRIVER_PATH"] 37 | QUERIES = parameters["USER_QUERIES"] 38 | N_PAGES = parameters["N_PAGES"] 39 | LINUSERNAME = credentials["LINUSERNAME"] 40 | LINPWD = credentials["LINPWD"] 41 | MONGOUSER = credentials["MONGOUSER"] 42 | MONGOPWD = credentials["MONGOPWD"] 43 | HOST = parameters["HOST"] 44 | client = connect_mongo(HOST, MONGOUSER, MONGOPWD) 45 | db = client["linkedin"] 46 | users = db["users"] 47 | driver = init_driver(CHROME_PATH, CHROMEDRIVER_PATH) 48 | driver.get("https://www.linkedin.com") 49 | login(driver, LINUSERNAME, LINPWD) 50 | us = UserScraper(driver) 51 | for query in QUERIES: 52 | driver.get("https://www.google.com") 53 | sleep(2) 54 | search_query = driver.find_element_by_name('q') 55 | try: 56 | search_query.send_keys(query) 57 | except ElementNotInteractableException: 58 | print("ERROR :: Cannot send query. 
Google might be blocking") 59 | sys.exit(1) 60 | sleep(0.5) 61 | search_query.send_keys(Keys.RETURN) 62 | profile_urls = get_profile_urls(driver, N_PAGES) 63 | if len(profile_urls) == 0: 64 | print() 65 | print("WARNING :: " + 66 | "Could not get any URLs for the query\n" + query) 67 | print("Please double-check that Google is not " + 68 | "blocking the query") 69 | continue 70 | unseen_urls = get_unseen_urls(users, profile_urls) 71 | if len(unseen_urls) != 0: 72 | print("INFO :: Resuming from URL", unseen_urls[0]) 73 | else: 74 | print("INFO :: All URLs from " + str(N_PAGES) + 75 | " Google-search page(s) for the query " + query + 76 | " have already been scraped. " + 77 | "Moving onto the next query if any.") 78 | continue 79 | for url in unseen_urls: 80 | user_data = us.scrape_user(query, url) 81 | if user_data and\ 82 | not db["users"].count_documents(user_data, limit=1): 83 | print_scraped_data(user_data) 84 | users.insert_one(user_data) 85 | driver.quit() 86 | -------------------------------------------------------------------------------- /scrape_jobs.py: -------------------------------------------------------------------------------- 1 | """ 2 | Scrape linkedin jobs by using selenium, to simulate the navigation 3 | (click, scroll) and BeautifulSoup to parse the HTML code of the page 4 | Perform a number of queries and log a number of files 5 | for each scraped job offer. 6 | Write dataset to mongoDB with the scraped data 7 | 8 | """ 9 | from selenium.common.exceptions import TimeoutException 10 | from utils import init_driver, get_job_urls, login, print_scraped_data,\ 11 | load_config, get_unseen_urls, scroll_job_panel, connect_mongo 12 | from time import sleep 13 | from bs4 import BeautifulSoup 14 | from classes.JobScraper import JobScraper 15 | import argparse 16 | 17 | 18 | parser = argparse.ArgumentParser( 19 | description=("Scrape linkedin job offers based on the " + 20 | "queries specified in the conf file") 21 | ) 22 | parser.add_argument('-c', '--conf', 23 | type=str, 24 | metavar='', 25 | required=True, 26 | help='Specify the path of the configuration file') 27 | args = parser.parse_args() 28 | conf = load_config(args.conf) 29 | parameters = conf["parameters"] 30 | credentials = conf["credentials"] 31 | CHROME_PATH = parameters["CHROME_PATH"] 32 | CHROMEDRIVER_PATH = parameters["CHROMEDRIVER_PATH"] 33 | QUERIES = parameters["JOB_QUERIES"] 34 | LINUSERNAME = credentials["LINUSERNAME"] 35 | LINPWD = credentials["LINPWD"] 36 | MONGOUSER = credentials["MONGOUSER"] 37 | MONGOPWD = credentials["MONGOPWD"] 38 | HOST = parameters["HOST"] 39 | client = connect_mongo(HOST, MONGOUSER, MONGOPWD) 40 | db = client["linkedin"] 41 | jobs = db["jobs"] 42 | driver = init_driver(CHROME_PATH, CHROMEDRIVER_PATH) 43 | driver.get("https://www.linkedin.com") 44 | login(driver, LINUSERNAME, LINPWD) 45 | JOB_SEARCH_URL = "https://www.linkedin.com/jobs/search/?keywords=" 46 | for query in QUERIES: 47 | driver.get(JOB_SEARCH_URL + query) 48 | sleep(0.5) 49 | scroll_job_panel(driver) 50 | soup = BeautifulSoup(driver.page_source, 'html.parser') 51 | n_results_element = soup.find(class_="t-12 t-black--light t-normal") 52 | n_results_string = n_results_element.get_text() 53 | n_results = int(n_results_string.split()[0].replace(',', '')) 54 | job_urls = get_job_urls(soup) 55 | start = 25 56 | url = JOB_SEARCH_URL + query + "&start="  # the start offset is appended inside the loop 57 | while start < n_results: 58 | try: 59 | driver.get(url + str(start)) 60 | scroll_job_panel(driver) 61 | soup = BeautifulSoup(driver.page_source, 'html.parser') 62 |
job_urls.extend(get_job_urls(soup)) 63 | start += 25 64 | except TimeoutException: 65 | print( 66 | "\nINFO :: TimeoutException raised while getting " + 67 | "URL\n" + url 68 | ) 69 | if len(job_urls) == 0: 70 | print() 71 | print("WARNING :: Could not get any URLs for the query\n" + 72 | query) 73 | print("Please double-check that LinkedIn is not " + 74 | "blocking the query") 75 | continue 76 | unseen_urls = get_unseen_urls(jobs, job_urls) 77 | if len(unseen_urls) != 0: 78 | print("INFO :: Resuming from URL", unseen_urls[0]) 79 | else: 80 | print("INFO :: All job URLs for the query " + query + 81 | " have already been scraped. " + 82 | "Moving onto the next query if any.") 83 | continue 84 | for url in unseen_urls: 85 | driver.get(url) 86 | soup = BeautifulSoup(driver.page_source, 'html.parser') 87 | js = JobScraper(soup, url, query) 88 | job_data = js.get_job_data() 89 | if job_data and\ 90 | not db["jobs"].count_documents(job_data, limit=1): 91 | print_scraped_data(job_data) 92 | jobs.insert_one(job_data) 93 | driver.quit() 94 | -------------------------------------------------------------------------------- /classes/UserScraper.py: -------------------------------------------------------------------------------- 1 | """ 2 | A class to define the methods to scrape LinkedIn user-profile webpages 3 | 4 | """ 5 | from selenium.webdriver.common.keys import Keys 6 | from selenium.common.exceptions import TimeoutException 7 | from utils import validate_field, scroll_profile_page, is_button_found,\ 8 | validate_user_data, filter_non_printable 9 | from time import sleep 10 | from bs4 import BeautifulSoup as bs 11 | 12 | 13 | class UserScraper(object): 14 | def __init__(self, driver): 15 | """ 16 | Initialize the class 17 | 18 | :param driver: selenium chrome driver object 19 | """ 20 | self.driver = driver 21 | 22 | @staticmethod 23 | def get_name(soup): 24 | """ 25 | Get the name of the user whose profile page is being scraped. 26 | 27 | :param soup: BeautifulSoup object 28 | :return: name: str name of the user 29 | """ 30 | try: 31 | name_tag = soup.find_all(class_="pv-top-card-section__name")[0] 32 | name = name_tag.get_text(strip=True) 33 | return name 34 | except IndexError: 35 | return "" 36 | 37 | @staticmethod 38 | def get_job_title(soup): 39 | """ 40 | Get the job title of the user whose profile 41 | page is being scraped 42 | 43 | :param soup: BeautifulSoup object 44 | :return: job_title: str 45 | """ 46 | try: 47 | job_title_tag = soup.find_all( 48 | class_="pv-top-card-section__headline")[0] 49 | job_title = job_title_tag.get_text(strip=True) 50 | job_title = filter_non_printable(job_title) 51 | return job_title 52 | except IndexError: 53 | return "" 54 | 55 | @staticmethod 56 | def get_location(soup): 57 | """ 58 | Get the location of the user whose profile 59 | page is being scraped. 60 | 61 | :param soup: BeautifulSoup object 62 | :return: location: str 63 | """ 64 | try: 65 | location_tag = soup.find_all( 66 | class_="pv-top-card-section__location")[0] 67 | location = location_tag.get_text(strip=True) 68 | return location 69 | except IndexError: 70 | return "" 71 | 72 | @staticmethod 73 | def get_degree(soup): 74 | """ 75 | Get the last degree of the user whose profile page 76 | is being scraped. 
77 | 78 | :param soup: BeautifulSoup object 79 | :return: degree: str 80 | """ 81 | degree_tags = soup.find_all( 82 | class_="pv-entity__degree-name") 83 | if len(degree_tags) != 0: 84 | degree = degree_tags[0].get_text().split('\n')[2] 85 | degree = validate_field(degree) 86 | else: 87 | degree = '' 88 | return degree 89 | 90 | def get_skills(self): 91 | """ 92 | Get the skills of the user whose profile page is being scraped. 93 | Scroll down the page by sending the PAGE_DOWN button 94 | until either the "show more" button in the skills section 95 | has been found, or the end of the page has been reached 96 | Return a list of skills. 97 | 98 | :return: list: skills 99 | """ 100 | skills = [] 101 | button_found = False 102 | endofpage_reached = False 103 | attempt = 0 104 | max_attempts = 3 105 | delay = 3 # seconds 106 | body = self.driver.find_element_by_tag_name("body") 107 | last_height = self.driver.execute_script( 108 | "return document.body.scrollHeight") 109 | while not button_found: 110 | body.send_keys(Keys.PAGE_DOWN) 111 | sleep(2) 112 | new_height = self.driver.execute_script( 113 | "return document.body.scrollHeight") 114 | button_found, showmore_button = is_button_found( 115 | self.driver, delay) 116 | if button_found: 117 | self.driver.execute_script("arguments[0].click();", 118 | showmore_button) 119 | sleep(2) 120 | soup = bs(self.driver.page_source, 'html.parser') 121 | skills_tags = soup.find_all( 122 | class_="pv-skill-category-entity__name-text") 123 | skills = [item.get_text(strip=True) 124 | for item in skills_tags] 125 | skills = [validate_field(skill) for skill in skills] 126 | if new_height == last_height: 127 | attempt += 1 128 | if attempt == max_attempts: 129 | endofpage_reached = True 130 | else: 131 | last_height = new_height 132 | if button_found or endofpage_reached: 133 | break 134 | return skills 135 | 136 | @staticmethod 137 | def get_languages(soup): 138 | """ 139 | Get the languages in the "Accomplishments" section 140 | of the user whose profile page is being scraped. 141 | Look for the accomplishment tags first, and get all the language 142 | elements from them. 143 | Return a list of languages. 144 | 145 | :param soup: BeautifulSoup object 146 | :return: list: languages list 147 | """ 148 | languages = [] 149 | accomplishment_tags = soup.find_all( 150 | class_="pv-accomplishments-block__list-container") 151 | for a in accomplishment_tags: 152 | try: 153 | if a["id"] == "languages-expandable-content": 154 | languages = [l.get_text() for l in a.find_all("li")] 155 | except KeyError: 156 | pass 157 | return languages 158 | 159 | def scrape_user(self, query, url): 160 | """ 161 | Get the user data for a given query and linkedin URL. 162 | Call get_name() and get_job_title() to get name and 163 | job title, respectively. Scroll down the given URL 164 | to make the skill-section HTML code appear; 165 | call get_skills() and get_degree() to extract the user skills 166 | and their degree, respectively. Scroll down the page until its 167 | end to extract the user languages by calling 168 | get_languages(). 169 | Finally, return a dictionary with the extracted data. 
170 | 171 | :param query: str 172 | :param url: str URL to scrape 173 | :return: 174 | """ 175 | attempt = 0 176 | max_attempts = 3 177 | success = False 178 | user_data = {} 179 | while not success: 180 | try: 181 | attempt += 1 182 | self.driver.get(url) 183 | sleep(2) 184 | self.driver.execute_script( 185 | "document.body.style.zoom='50%'") 186 | sleep(3) 187 | skills = self.get_skills() 188 | scroll_profile_page(self.driver) 189 | soup = bs(self.driver.page_source, 'html.parser') 190 | name = self.get_name(soup) 191 | job_title = self.get_job_title(soup) 192 | location = self.get_location(soup) 193 | degree = self.get_degree(soup) 194 | languages = self.get_languages(soup) 195 | user_data = { 196 | "URL": url, 197 | "name": name, 198 | "query": query, 199 | "job_title": job_title, 200 | "degree": degree, 201 | "location": location, 202 | "languages": languages, 203 | "skills": skills 204 | } 205 | success = True 206 | except TimeoutException: 207 | print("\nINFO :: TimeoutException raised while " + 208 | "getting URL\n" + url) 209 | print("INFO :: Attempt n." + str(attempt) + " of " + 210 | str(max_attempts) + 211 | "\nNext attempt in 60 seconds") 212 | sleep(60) 213 | if success: 214 | break 215 | if attempt == max_attempts and not user_data: 216 | print("INFO :: Max number of attempts reached. " + 217 | "Skipping URL" + 218 | "\nUser data will be empty.") 219 | return validate_user_data(user_data) 220 | -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | from time import sleep 2 | from selenium import webdriver 3 | from selenium.webdriver.common.keys import Keys 4 | from selenium.webdriver.support.ui import WebDriverWait 5 | from selenium.webdriver.support import expected_conditions 6 | from selenium.webdriver.common.by import By 7 | from selenium.common.exceptions import NoSuchElementException,\ 8 | TimeoutException 9 | from pymongo import MongoClient 10 | from validator_collection import checkers 11 | import json 12 | import os 13 | import errno 14 | import unicodedata 15 | 16 | 17 | def load_config(path): 18 | """ 19 | Load configuration file with all the needed parameters 20 | 21 | :param path: str path of the conf file 22 | :return: dict 23 | """ 24 | with open(path, 'r') as conf_file: 25 | conf = json.load(conf_file) 26 | return conf 27 | 28 | 29 | def create_nonexistent_dir(path, exc_raise=False): 30 | """ 31 | Create directory from given path 32 | Return True if created, False if it exists 33 | 34 | :param path: str dir path 35 | :param exc_raise: bool raise exception 36 | :return: str path of the created dir, None otherwise 37 | """ 38 | try: 39 | os.makedirs(path) 40 | print("INFO :: Created directory with path:", str(path)) 41 | return path 42 | except OSError as e: 43 | if e.errno != errno.EEXIST: 44 | print("ERROR :: Could not create directory with path: " + 45 | "%s\n", str(path)) 46 | if exc_raise: 47 | raise 48 | return None 49 | 50 | 51 | def validate_field(field): 52 | """ 53 | Return field if it exists 54 | otherwise empty string 55 | 56 | :param field: string to validate 57 | :return: field: input string if not empty, empty string otherwise 58 | """ 59 | if field: 60 | pass 61 | else: 62 | field = '' 63 | return field 64 | 65 | 66 | def validate_user_data(user_data): 67 | """ 68 | Validate user_data dict by checking that the majority of the keys 69 | have non-empty values. 
70 | Return an empty dictionary if main keys' values are empty, 71 | otherwise the original dictionary. 72 | 73 | :param user_data: dict of scraped user data 74 | :return: dict 75 | """ 76 | try: 77 | if user_data["skills"] == []\ 78 | and user_data["languages"] == []\ 79 | and user_data["name"] == ""\ 80 | and user_data["job_title"] == ""\ 81 | and user_data["degree"] == ""\ 82 | and user_data["location"] == "": 83 | return {} 84 | else: 85 | return user_data 86 | except KeyError: 87 | return {} 88 | 89 | 90 | def init_driver(chrome_path, chromedriver_path): 91 | """ 92 | Initialize Chrome driver 93 | :param chrome_path: str chrome executable path 94 | :param chromedriver_path: str chrome driver path 95 | :return: selenium driver object 96 | """ 97 | chrome_options = webdriver.ChromeOptions() 98 | chrome_options.binary_location = chrome_path 99 | chrome_options.add_argument("--normal") 100 | chrome_options.add_argument("--start-maximized") 101 | chrome_options.add_argument("--disable-extensions") 102 | chrome_options.add_argument("--disable-infobars") 103 | driver = webdriver.Chrome(executable_path=chromedriver_path, 104 | chrome_options=chrome_options) 105 | return driver 106 | 107 | 108 | def get_job_urls(soup): 109 | """ 110 | Return a list of job URLs taken from the 111 | results of a query on LinkedIn. 112 | 113 | :param soup: BeautifulSoup instance 114 | :return: list of linkedin-job URLs 115 | """ 116 | base_url = "http://www.linkedin.com" 117 | job_urls = [base_url + url['href'].split('/?')[0] 118 | for url in soup.find_all( 119 | class_="job-card-search__link-wrapper", 120 | href=True)] 121 | return list(dict.fromkeys(job_urls)) 122 | 123 | 124 | def get_profile_urls(driver, n_pages=1): 125 | """ 126 | Return a list without repetitions of alphabetically sorted URLs 127 | taken from the results of a given query on Google search. 128 | 129 | :param driver: selenium chrome driver object 130 | :param n_pages: int number of google pages to loop over 131 | :return: list of linkedin-profile URLs 132 | """ 133 | linkedin_urls = [] 134 | for i in range(n_pages): 135 | urls = driver.find_elements_by_class_name('iUh30') 136 | linkedin_urls += [url.text for url in urls 137 | if checkers.is_url(url.text)] 138 | sleep(0.5) 139 | if i < n_pages - 1:  # click "next" on every page but the last one requested 140 | try: 141 | next_button_url = driver.find_element_by_css_selector( 142 | '#pnnext').get_attribute('href') 143 | driver.get(next_button_url) 144 | except NoSuchElementException: 145 | break 146 | linkedin_urls_no_rep = sorted( 147 | list(dict.fromkeys([url for url in linkedin_urls]))) 148 | return linkedin_urls_no_rep 149 | 150 | 151 | def login(driver, user, pwd): 152 | """ 153 | Type user email and password in the relevant fields and 154 | perform log in on linkedin.com by using the given credentials.
155 | 156 | :param driver: selenium chrome driver object 157 | :param user: str username, email 158 | :param pwd: str password 159 | :return: None 160 | """ 161 | username = driver.find_element_by_class_name('login-email') 162 | username.send_keys(user) 163 | sleep(0.5) 164 | password = driver.find_element_by_class_name('login-password') 165 | password.send_keys(pwd) 166 | sleep(0.5) 167 | sign_in_button = driver.find_element_by_xpath('//*[@type="submit"]') 168 | sign_in_button.click() 169 | 170 | 171 | def scroll_job_panel(driver): 172 | """ 173 | Scroll the left panel containing the job offers by sending PAGE_DOWN 174 | key until the very end has been reached 175 | 176 | :param driver: selenium chrome driver object 177 | :return: None 178 | """ 179 | panel = driver.find_element_by_class_name("jobs-search-results") 180 | last_height = driver.execute_script( 181 | "return document.getElementsByClassName(" + 182 | "'jobs-search-results')[0].scrollHeight") 183 | while True: 184 | panel.send_keys(Keys.PAGE_DOWN) 185 | sleep(0.2) 186 | new_height = driver.execute_script( 187 | "return document.getElementsByClassName(" + 188 | "'jobs-search-results')[0].scrollHeight") 189 | if new_height == last_height: 190 | break 191 | else: 192 | last_height = new_height 193 | javascript = ( 194 | "var x = document.getElementsByClassName(" + 195 | "'jobs-search-results')[0]; x.scrollTo(0, x.scrollHeight)" 196 | ) 197 | driver.execute_script(javascript) 198 | 199 | 200 | def scroll_profile_page(driver): 201 | """ 202 | Scroll a profile page by sending the keys PAGE_DOWN 203 | until the end of the page has been reached. 204 | 205 | :param driver: selenium chrome driver object 206 | :return: 207 | """ 208 | body = driver.find_element_by_tag_name("body") 209 | last_height = driver.execute_script( 210 | "return document.body.scrollHeight") 211 | while True: 212 | body.send_keys(Keys.PAGE_DOWN) 213 | sleep(3) 214 | new_height = driver.execute_script( 215 | "return document.body.scrollHeight") 216 | if new_height == last_height: 217 | break 218 | else: 219 | last_height = new_height 220 | 221 | 222 | def is_button_found(driver, delay): 223 | """ 224 | Try to find the "show more" button in the "skills" section. 225 | Return a boolean and the button element. 226 | 227 | :param driver: selenium chrome driver object 228 | :param delay: float delay in seconds 229 | :return: 230 | """ 231 | button_found = False 232 | button_element = None 233 | try: 234 | condition_is_met = expected_conditions.presence_of_element_located( 235 | (By.XPATH, "//button[@class=" + 236 | "'pv-profile-section__card-action-bar " + 237 | "pv-skills-section__additional-skills " + 238 | "artdeco-container-card-action-bar']")) 239 | button_element = WebDriverWait(driver, delay).until(condition_is_met) 240 | button_found = True 241 | except TimeoutException: 242 | pass 243 | return button_found, button_element 244 | 245 | 246 | def print_scraped_data(data): 247 | """ 248 | Print the user data returned by scrape_url(). 249 | 250 | """ 251 | print() 252 | for key in data: 253 | print(key + ": " + str(data[key])) 254 | 255 | 256 | def get_unseen_urls(collection, urls): 257 | """ 258 | Get a list of URLs that have not already been scraped. 259 | Loop over all the db entries and create a list with the 260 | URLs already scraped. 261 | Get the difference of such list and the list of all the URLs 262 | for a given query. 263 | Return a list of URLs which have not already been scraped. 
264 | 265 | :param collection: Mongo DB collection 266 | :param urls: list of URLs to check 267 | :return: list of unseen URLs 268 | """ 269 | scraped_urls = [entry["URL"] for entry in collection.find()] 270 | unseen_urls = list(set(urls) - set(scraped_urls)) 271 | return unseen_urls 272 | 273 | 274 | def connect_mongo(host, user, pwd): 275 | """ 276 | Connect Mongo client 277 | 278 | :param host: str cluster host, including the leading '@' (see conf.json) 279 | :param user: str MongoDB username 280 | :param pwd: str MongoDB password 281 | :return: client: Mongo client object 282 | """ 283 | client = MongoClient("mongodb+srv://" + user + ":" + pwd + host) 284 | return client 285 | 286 | 287 | def filter_non_printable(string_to_filter): 288 | """ 289 | Filter the given string by removing non-printable (Unicode category 'Cf') chars 290 | 291 | :param string_to_filter: str string to filter 292 | :return: str filtered string 293 | """ 294 | output_string = ''.join( 295 | c for c in string_to_filter 296 | if unicodedata.category(c) != 'Cf' 297 | ) 298 | return output_string 299 | --------------------------------------------------------------------------------
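The `JobScraper` and `UserScraper` classes only need a BeautifulSoup object or a driver, so the parsing side can be exercised offline against a saved HTML page without logging in. A minimal sketch, assuming a LinkedIn job page saved locally as `saved_job_page.html` (hypothetical file) and that the page still uses the 2018/2019 class names the scraper targets:

```python
from bs4 import BeautifulSoup

from classes.JobScraper import JobScraper

# "saved_job_page.html" is a hypothetical local copy of a LinkedIn job page;
# the CSS classes parsed by JobScraper only match the 2018/2019 markup.
with open("saved_job_page.html", encoding="utf-8") as f:
    soup = BeautifulSoup(f.read(), "html.parser")

# The URL below is an illustrative placeholder; the query string is one of
# the JOB_QUERIES from conf.json.
js = JobScraper(soup, "https://www.linkedin.com/jobs/view/0000000",
                "laurea archeologia")
print(js.get_job_data())  # {} if no skills were found, a dict otherwise
```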
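`connect_mongo()` builds the connection string by plain concatenation, `"mongodb+srv://" + user + ":" + pwd + host`, which is why the `HOST` value in `conf.json` starts with `@`. A minimal sketch of the deduplication flow, assuming a hypothetical Atlas cluster address and credentials:

```python
from utils import connect_mongo, get_unseen_urls

# The host string must carry the leading "@" because connect_mongo()
# concatenates it directly after the password; the cluster address,
# user and password below are hypothetical placeholders.
HOST = "@cluster0.example.mongodb.net/test?retryWrites=true&w=majority"
client = connect_mongo(HOST, "mongouser", "mongopwd")
users = client["linkedin"]["users"]

candidate_urls = [
    "https://it.linkedin.com/in/some-profile",     # hypothetical
    "https://it.linkedin.com/in/another-profile",  # hypothetical
]
# Only the URLs whose "URL" field is not already stored are returned.
print(get_unseen_urls(users, candidate_urls))
```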
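`captcha_resolver.py` shells out to ImageMagick's `convert` to resample the image before running OCR, so both the ImageMagick and Tesseract binaries must be installed on the system in addition to the `pytesseract` package. It can be used from the command line or imported; a short sketch, with `captcha.png` as a hypothetical local image:

```python
from captcha_resolver import resolve

# Equivalent to running: python captcha_resolver.py captcha.png
# "captcha.png" is a hypothetical local file; ImageMagick's `convert`
# and the Tesseract OCR engine must be available on the PATH.
print(resolve("captcha.png"))
```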