├── urls.py
├── README.md
└── extract.py


/urls.py:
--------------------------------------------------------------------------------
from selenium import webdriver
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
from tqdm import tqdm
import os
import time


query_keyword = ""   # keyword to match in profile titles, e.g. "student"
no_of_pages = 1      # number of search-result pages to scrape
email = ""           # LinkedIn login credentials
password = ""

driver = webdriver.Chrome()
driver.get('https://www.linkedin.com/')

# Log in with the credentials above.
email_box = driver.find_element(By.ID, 'login-email')
email_box.send_keys(email)
pass_box = driver.find_element(By.ID, 'login-password')
pass_box.send_keys(password)
submit_button = driver.find_element(By.ID, 'login-submit')
submit_button.click()

time.sleep(1)

# Collect profile URLs from the people-search results, one page at a time.
urls = []
for i in tqdm(range(no_of_pages)):
    try:
        driver.get(
            'https://www.linkedin.com/search/results/people/?'
            'origin=FACETED_SEARCH&page=' + str(i + 1) +   # result pages are 1-indexed
            '&title=' + query_keyword
        )
        soup = BeautifulSoup(driver.page_source, "lxml")
        results = soup.find_all(class_="search-result__result-link")
        for s in results:
            urls.append('https://www.linkedin.com' + s['href'])
    except KeyboardInterrupt:
        break

# Deduplicate the URLs and save them for extract.py to consume.
urls = list(set(urls))
os.makedirs("URL", exist_ok=True)
with open("URL/" + query_keyword + "Urls.txt", "a") as f:
    for url in urls:
        f.write(url + "\n")
--------------------------------------------------------------------------------

/README.md:
--------------------------------------------------------------------------------
# Linkedin-Scraper

Scraping LinkedIn profiles is not fun (at all), for two reasons:

1. You need to sign in with your account; without signing in you can hardly access more than 15 profiles.
2. LinkedIn profile pages are dynamic and don't load completely unless you scroll the entire page.

This program lets you scrape profiles based on a keyword in the user's title. (You can edit the search URL to look for profiles of users working at a specific company or studying at some school.)
It generates a CSV file of these profiles with the columns:
1. Months of Experience
2. Skills (separated by ':')
3. Recommendations received
4. No. of Projects
5. No. of Publications
6. No. of Followers
----------
### Requirements

- Python3
- ChromeDriver
- Selenium
- BeautifulSoup


----------
### Setup
https://chromedriver.storage.googleapis.com/2.33/chromedriver_linux64.zip
1. Download and unzip the ChromeDriver build above
2. chmod +x chromedriver
3. sudo mv -f chromedriver /usr/local/share/chromedriver
4. sudo ln -s /usr/local/share/chromedriver /usr/local/bin/chromedriver
5. sudo ln -s /usr/local/share/chromedriver /usr/bin/chromedriver
6. pip3 install selenium (try sudo -H pip3 install selenium if this fails)
7. pip3 install beautifulsoup4
8. pip3 install tqdm


----------
### Usage
1. Edit ***urls.py***: set ***query_keyword*** to the keyword to search for in profile titles (e.g. student, professor, or founder), set ***no_of_pages*** to the number of search-result pages you'd like to scrape (each page has up to 10 profiles), and enter your ***LinkedIn credentials*** (see the example below).
2. python3 urls.py
3. Edit ***extract.py*** by setting ***query_keyword*** to the same value again.
4. python3 extract.py
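
For example, after step 1 the top of ***urls.py*** might look like this (the keyword, page count, and credentials below are placeholders, not real values):

```python
query_keyword = "student"        # scrape profiles whose title contains this keyword
no_of_pages = 5                  # each search-result page yields up to 10 profiles
email = "you@example.com"        # your LinkedIn sign-in email
password = "your-password"       # your LinkedIn password
```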

----------
P.S. With great power comes great responsibility. Scrape too much too fast, and your account might get blocked. Scrape safe.

--------------------------------------------------------------------------------

/extract.py:
--------------------------------------------------------------------------------
from selenium import webdriver
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
import os
import time
from tqdm import tqdm

query_keyword = ""   # must match the keyword used in urls.py
email = ""
password = ""

driver = webdriver.Chrome()
driver.get('https://www.linkedin.com/')

# Log in with the credentials above.
email_box = driver.find_element(By.ID, 'login-email')
email_box.send_keys(email)
pass_box = driver.find_element(By.ID, 'login-password')
pass_box.send_keys(password)
submit_button = driver.find_element(By.ID, 'login-submit')
submit_button.click()


def getMonths(page):
    """Total months of experience listed on the profile."""
    months = 0
    soup = page.find_all(class_="pv-entity__bullet-item")
    for s in soup:
        s = s.string
        if "Cause" in s:
            continue
        if "less" not in s:
            exp = [int(x) for x in s.split() if x.isdigit()]
            if len(exp) == 2:
                months += 12 * exp[0] + exp[1]
            else:
                if "yr" not in s:
                    months += exp[0]
                else:
                    months += 12 * exp[0]
    return str(months)


def getSkills(page):
    """Skill names, joined by ':'."""
    skills = ''
    soup = page.find_all("span", class_="pv-skill-entity__skill-name")
    for s in soup:
        skills += s.string + ':'
    return skills


def getRecommendations(page):
    """Number of recommendations received."""
    soup = page.find("div", class_="recommendations-inlining")
    soup = soup.find("artdeco-tab")
    soup = soup.string
    # keep every digit so counts of 10 or more aren't truncated
    return ''.join(x for x in soup if x.isdigit())


def getProjects(page):
    soup = page.find("section", class_="projects")
    soup = soup.find_all("span")
    return soup[1].string


def getPublications(page):
    soup = page.find("section", class_="publications")
    soup = soup.find_all("span")
    return soup[1].string


def getFollowers(page):
    soup = page.find("h3", class_="pv-top-card-section__connections")
    soup = soup.find("span")
    return soup.string


with open("URL/" + query_keyword + "Urls.txt", "r") as f:
    urls = f.read().splitlines()

os.makedirs("CSV", exist_ok=True)
with open("CSV/" + query_keyword + ".csv", "a") as file:
    file.write(
        "Months of Experience, Skills, Recommendations received, "
        "No. of Projects, No. of Publications, No. of Followers \n"
    )

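# For each saved profile URL: open the page, scroll through it in small steps so
# the lazily-loaded sections render, click the "see more" and "additional skills"
# expanders where present, then parse the final HTML with BeautifulSoup and
# append one comma-separated row per profile to the CSV.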
for i, url in enumerate(tqdm(urls)):
    driver.get(url)

    # Scroll down the page in small increments so the dynamic sections load.
    scheight = .1
    while scheight < 20:
        driver.execute_script(
            "window.scrollTo(0, document.body.scrollHeight/%s);"
            % scheight
        )
        scheight += .01

    # Expand the "see more" section and the additional-skills list, if present.
    try:
        arrow = driver.find_element(
            By.CSS_SELECTOR,
            'button.pv-profile-section__see-more-inline'
        )
        arrow.click()
    except Exception as e:
        print(e)
    try:
        arrow = driver.find_element(
            By.CSS_SELECTOR,
            'button.pv-skills-section__additional-skills'
        )
        arrow.click()
        time.sleep(1)
    except Exception as e:
        print(e)

    page = BeautifulSoup(driver.page_source, 'lxml')

    row = ''
    try:
        # Experience
        months = getMonths(page)
        print("Experience: ", months)
        row += months + ','
    except Exception as e:
        row += '0,'
        print("Experience: ", e)

    try:
        # Skills
        skills = getSkills(page)
        print("Skills: ", skills)
        row += skills + ','
    except Exception as e:
        row += ','
        print("Skills: ", e)

    try:
        # Recommendations received
        rec = getRecommendations(page)
        print("Recommendations: ", rec)
        row += rec + ','
    except Exception as e:
        row += '0,'
        print("Recommendations: ", e)

    try:
        # Projects
        proj = getProjects(page)
        print("Projects: ", proj)
        row += proj + ','
    except Exception as e:
        row += '0,'
        print("Projects: ", e)

    try:
        # Publications
        pub = getPublications(page)
        print("Publications: ", pub)
        row += pub + ','
    except Exception as e:
        row += '0,'
        print("Publications: ", e)

    try:
        # Followers
        followers = getFollowers(page)
        print("Followers: ", followers)
        row += followers
    except Exception as e:
        row += '0'
        print("Followers: ", e)

    print()
    print()
    with open("CSV/" + query_keyword + ".csv", "a") as file:
        file.write(row + '\n')
--------------------------------------------------------------------------------