├── urls.py
├── README.md
└── extract.py


/urls.py:
--------------------------------------------------------------------------------
from selenium import webdriver
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
from tqdm import tqdm
import os
import time


query_keyword = ""   # keyword to match in profile titles, e.g. "student"
no_of_pages = 1      # number of search-result pages to scrape
email = ""           # LinkedIn login credentials
password = ""

driver = webdriver.Chrome()
driver.get('https://www.linkedin.com/')

# Log in with the credentials above.
email_box = driver.find_element(By.ID, 'login-email')
email_box.send_keys(email)
pass_box = driver.find_element(By.ID, 'login-password')
pass_box.send_keys(password)
submit_button = driver.find_element(By.ID, 'login-submit')
submit_button.click()

time.sleep(1)

# Collect profile URLs from the people-search results, one page at a time.
urls = []
for i in tqdm(range(no_of_pages)):
    try:
        driver.get(
            'https://www.linkedin.com/search/results/people/?'
            'origin=FACETED_SEARCH&page=' + str(i + 1) +   # result pages are 1-indexed
            '&title=' + query_keyword
        )
        soup = BeautifulSoup(driver.page_source, "lxml")
        results = soup.find_all(class_="search-result__result-link")
        for s in results:
            urls.append('https://www.linkedin.com' + s['href'])
    except KeyboardInterrupt:
        break

# Deduplicate the URLs and save them for extract.py to consume.
urls = list(set(urls))
os.makedirs("URL", exist_ok=True)
with open("URL/" + query_keyword + "Urls.txt", "a") as f:
    for url in urls:
        f.write(url + "\n")
--------------------------------------------------------------------------------

/README.md:
--------------------------------------------------------------------------------
# Linkedin-Scraper

Scraping LinkedIn profiles is not fun (at all), for two reasons:

1. You need to sign in with your account; without signing in you can hardly access more than 15 profiles.
2. LinkedIn profile pages are dynamic and don't load completely unless you scroll the entire page.

This program lets you scrape profiles based on a keyword in the user's title. (You can edit the search URL to look for profiles of users working at a specific company or studying at some school.)
It generates a CSV file of these profiles with the columns:
1. Months of Experience
2. Skills (separated by ':')
3. Recommendations received
4. No. of Projects
5. No. of Publications
6. No. of Followers
----------
### Requirements

- Python3
- ChromeDriver
- Selenium
- BeautifulSoup


----------
### Setup
https://chromedriver.storage.googleapis.com/2.33/chromedriver_linux64.zip
1. Download and unzip the ChromeDriver build above
2. chmod +x chromedriver
3. sudo mv -f chromedriver /usr/local/share/chromedriver
4. sudo ln -s /usr/local/share/chromedriver /usr/local/bin/chromedriver
5. sudo ln -s /usr/local/share/chromedriver /usr/bin/chromedriver
6. pip3 install selenium (try sudo -H pip3 install selenium if this fails)
7. pip3 install beautifulsoup4
8. pip3 install tqdm


----------
### Usage
1. Edit ***urls.py***: set ***query_keyword*** to the keyword to search for in profile titles (e.g. student, professor, or founder), set ***no_of_pages*** to the number of search-result pages you'd like to scrape (each page has up to 10 profiles), and enter your ***LinkedIn credentials*** (see the example below).
2. python3 urls.py
3. Edit ***extract.py*** by setting ***query_keyword*** to the same value again.
4. python3 extract.py
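
For example, after step 1 the top of ***urls.py*** might look like this (the keyword, page count, and credentials below are placeholders, not real values):

```python
query_keyword = "student"        # scrape profiles whose title contains this keyword
no_of_pages = 5                  # each search-result page yields up to 10 profiles
email = "you@example.com"        # your LinkedIn sign-in email
password = "your-password"       # your LinkedIn password
```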

----------
P.S. With great power comes great responsibility. Scrape too much too fast, and your account might get blocked. Scrape safe.

--------------------------------------------------------------------------------

/extract.py:
--------------------------------------------------------------------------------
from selenium import webdriver
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
import os
import time
from tqdm import tqdm

query_keyword = ""   # must match the keyword used in urls.py
email = ""
password = ""

driver = webdriver.Chrome()
driver.get('https://www.linkedin.com/')

# Log in with the credentials above.
email_box = driver.find_element(By.ID, 'login-email')
email_box.send_keys(email)
pass_box = driver.find_element(By.ID, 'login-password')
pass_box.send_keys(password)
submit_button = driver.find_element(By.ID, 'login-submit')
submit_button.click()


def getMonths(page):
    """Total months of experience listed on the profile."""
    months = 0
    soup = page.find_all(class_="pv-entity__bullet-item")
    for s in soup:
        s = s.string
        if "Cause" in s:
            continue
        if "less" not in s:
            exp = [int(x) for x in s.split() if x.isdigit()]
            if len(exp) == 2:
                months += 12 * exp[0] + exp[1]
            else:
                if "yr" not in s:
                    months += exp[0]
                else:
                    months += 12 * exp[0]
    return str(months)


def getSkills(page):
    """Skill names, joined by ':'."""
    skills = ''
    soup = page.find_all("span", class_="pv-skill-entity__skill-name")
    for s in soup:
        skills += s.string + ':'
    return skills


def getRecommendations(page):
    """Number of recommendations received."""
    soup = page.find("div", class_="recommendations-inlining")
    soup = soup.find("artdeco-tab")
    soup = soup.string
    # keep every digit so counts of 10 or more aren't truncated
    return ''.join(x for x in soup if x.isdigit())


def getProjects(page):
    soup = page.find("section", class_="projects")
    soup = soup.find_all("span")
    return soup[1].string


def getPublications(page):
    soup = page.find("section", class_="publications")
    soup = soup.find_all("span")
    return soup[1].string


def getFollowers(page):
    soup = page.find("h3", class_="pv-top-card-section__connections")
    soup = soup.find("span")
    return soup.string


with open("URL/" + query_keyword + "Urls.txt", "r") as f:
    urls = f.read().splitlines()

os.makedirs("CSV", exist_ok=True)
with open("CSV/" + query_keyword + ".csv", "a") as file:
    file.write(
        "Months of Experience, Skills, Recommendations received, "
        "No. of Projects, No. of Publications, No. of Followers \n"
    )

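# For each saved profile URL: open the page, scroll through it in small steps so
# the lazily-loaded sections render, click the "see more" and "additional skills"
# expanders where present, then parse the final HTML with BeautifulSoup and
# append one comma-separated row per profile to the CSV.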
for i, url in enumerate(tqdm(urls)):
    driver.get(url)

    # Scroll down the page in small increments so the dynamic sections load.
    scheight = .1
    while scheight < 20:
        driver.execute_script(
            "window.scrollTo(0, document.body.scrollHeight/%s);"
            % scheight
        )
        scheight += .01

    # Expand the "see more" section and the additional-skills list, if present.
    try:
        arrow = driver.find_element(
            By.CSS_SELECTOR,
            'button.pv-profile-section__see-more-inline'
        )
        arrow.click()
    except Exception as e:
        print(e)
    try:
        arrow = driver.find_element(
            By.CSS_SELECTOR,
            'button.pv-skills-section__additional-skills'
        )
        arrow.click()
        time.sleep(1)
    except Exception as e:
        print(e)

    page = BeautifulSoup(driver.page_source, 'lxml')

    row = ''
    try:
        # Experience
        months = getMonths(page)
        print("Experience: ", months)
        row += months + ','
    except Exception as e:
        row += '0,'
        print("Experience: ", e)

    try:
        # Skills
        skills = getSkills(page)
        print("Skills: ", skills)
        row += skills + ','
    except Exception as e:
        row += ','
        print("Skills: ", e)

    try:
        # Recommendations received
        rec = getRecommendations(page)
        print("Recommendations: ", rec)
        row += rec + ','
    except Exception as e:
        row += '0,'
        print("Recommendations: ", e)

    try:
        # Projects
        proj = getProjects(page)
        print("Projects: ", proj)
        row += proj + ','
    except Exception as e:
        row += '0,'
        print("Projects: ", e)

    try:
        # Publications
        pub = getPublications(page)
        print("Publications: ", pub)
        row += pub + ','
    except Exception as e:
        row += '0,'
        print("Publications: ", e)

    try:
        # Followers
        followers = getFollowers(page)
        print("Followers: ", followers)
        row += followers
    except Exception as e:
        row += '0'
        print("Followers: ", e)

    print()
    print()
    with open("CSV/" + query_keyword + ".csv", "a") as file:
        file.write(row + '\n')
--------------------------------------------------------------------------------