├── README.md
└── linkScrape.py

/README.md:
--------------------------------------------------------------------------------
# linkScrape

___ __ ____
/\_ \ __ /\ \ /\ _`\
\//\ \ /\_\ ___\ \ \/'\\ \,\L\_\ ___ _ __ __ _____ __
\ \ \ \/\ \ /' _ `\ \ , < \/_\__ \ /'___\/\`'__\/'__`\ /\ '__`\ /'__`\
\_\ \_\ \ \/\ \/\ \ \ \\`\ /\ \L\ \/\ \__/\ \ \//\ \L\.\_\ \ \L\ \/\ __/
/\____\\ \_\ \_\ \_\ \_\ \_\ `\____\ \____\\ \_\\ \__/.\_\\ \ ,__/\ \____\
\/____/ \/_/\/_/\/_/\/_/\/_/\/_____/\/____/ \/_/ \/__/\/_/ \ \ \/ \/____/
\ \_\
\/_/

Description: Enumerates employee names from LinkedIn.com
Created by: Nick Sanzotta/@beamr
***
Installation:

git clone https://github.com/NickSanzotta/linkScrape.git
cd linkScrape
python linkScrape.py --help

Third-party Python libraries may be required:
pip install beautifulsoup4
pip install lxml

***
Caveats:

Does not utilize LinkedIn's API (this is a pure web scraper).
Your LinkedIn account may be flagged or banned.
Your LinkedIn.com account will need 10+ connections/profile strength to perform searches.
(This is a rough estimate based on current feedback.)
Company search results have a monthly cap.
The script still has some minor bugs when scraping some character sets.
***
TIPS:

1. When searching for companies whose names contain "&", such as T&T, use the following syntax (make sure to use quotes):
python linkScrape.py -e LinkedInUser@email.com -c "T and T"

2. Searching for companies with common or shared names will produce incorrect company info results.
(This is a known issue I plan on addressing.)

3. Searching for companies whose names contain whitespace currently produces no company info.
(This is a known issue I plan on addressing.)

***
Default Values:

If a parameter is not defined, its default value is chosen.
The default values are listed below.

formatValue = 7
pageResults = 5
timeout = 5

***
Usage (CLI):

Usage: python linkScrape.py
Example: python linkScrape.py -e LinkedInUser@email.com -c acme -r 1 -t 3 -m 7 -d acme.com
Example: python linkScrape.py -m 7 -i ~/Company/names.txt
Raw output saved to: linkedIn/linkScrape-data/Company_time.txt
Formatted output saved to: linkedIn/linkScrape-data/Company-mangle[x]_time.txt

Login options:
-e Your LinkedIn.com email address.
-p Your LinkedIn.com password.

Search options:
-c Company you want to enumerate. (Prepended to the filename if used with -i.)
-r Searches x LinkedIn.com pages (default is 5).
-t Sets the timeout value in seconds (default is 5).
***
Mangle Options:

-m
1)FirstLast ex:nicksanzotta
2)LastFirst ex:sanzottanick
3)First.Last ex:nick.sanzotta
4)Last.First ex:sanzotta.nick
5)First_Last ex:nick_sanzotta
6)Last_First ex:sanzotta_nick
7)FLast ex:nsanzotta
8)LFirst ex:snick
9)FirstL ex:nicks
10)F.Last ex:n.sanzotta
11)L.First ex:s.nick
12)FirLa ex:nicsa
13)Lastfir ex:sanznic

(A short sketch of these formats appears after the Misc section below.)

-d Append @domain.com to the enumerated user list.
-i Use a local file instead of LinkedIn.com to perform the name mangle against.

Misc:

-h Prints this help menu.
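***
Mangle Formats at a Glance:

Each -m option is a simple string operation on the parsed first and last name.
A minimal standalone sketch (illustrative Python, not part of the script; the
name below is an example only):

first, last = "nick", "sanzotta"
print(first + last)            # 1) FirstLast -> nicksanzotta
print(first + "." + last)      # 3) First.Last -> nick.sanzotta
print(first[0] + last)         # 7) FLast -> nsanzotta
print(first[0] + "." + last)   # 10) F.Last -> n.sanzotta
print(first[0:3] + last[0:2])  # 12) FirLa -> nicsa
print(last[0:4] + first[0:3])  # 13) Lastfir -> sanznic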
***
Usage (Wizard):

[*]You did not specify a parameter; the wizard has launched:
[*]Example: python linkScrape.py -e user@email.com -c acme
[*]For help & command-line options please use: python linkScrape.py --help

Enter LinkedIn Email account[user@email.com]:
ENTERED: "user@email.com"

Enter LinkedIn Password:
Enter Company[ex:acme]:
ENTERED: "acme"


Mangle options:

-m
1)FirstLast ex:nicksanzotta
2)LastFirst ex:sanzottanick
3)First.Last ex:nick.sanzotta
4)Last.First ex:sanzotta.nick
5)First_Last ex:nick_sanzotta
6)Last_First ex:sanzotta_nick
7)FLast ex:nsanzotta
8)LFirst ex:snick
9)FirstL ex:nicks
10)F.Last ex:n.sanzotta
11)L.First ex:s.nick
12)FirLa ex:nicsa
13)Lastfir ex:sanznic

Enter name Mangle choice[ex:7]:
ENTERED: "7"

[*]TIP: This value determines how many pages of results will be returned.
Enter number of page results[ex:5]:
ENTERED: "5"

[*]TIP: This value determines the delay (in seconds) between each page scrape.
Enter timeout value[ex:5]:
ENTERED: "5"

[*]TIP: This value will be appended to each mangled result[ex:jsmith@acme.com].
Enter Domain suffix[ex:acme.com]:
ENTERED: ""


***
Output Sample:

Employee/Title list Saved to: linkScrape-data/acme_employee-title_20160920-1523.txt
Robert Dukes : Security Lead
Chang Xiu : President
Danny Glover : Alliances Manager
Rob Becker : SQA Engineer

Raw Employee list Saved to: linkScrape-data/acme_20160920-1523.txt
Robert Dukes
Chang Xiu
Danny Glover
Rob Becker

Mangled option chosen: 7
Mangled list Saved to: linkScrape-data/acme-mangle-7_20160920-1523.txt
rdukes
cxiu
dglover
rbecker

Completed in: 21.9s
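***
Local File Example (-i):

If you already have a list of names, the mangle logic can be applied without
touching LinkedIn.com via -i. A hypothetical run (file contents and output are
illustrative only; format 7 = FLast):

cat names.txt
Robert Dukes
Chang Xiu

python linkScrape.py -m 7 -i names.txt
rdukes
cxiu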
--------------------------------------------------------------------------------
/linkScrape.py:
--------------------------------------------------------------------------------
#!/usr/bin/python
# Nick Sanzotta
# Description: Enumerates employee names from LinkedIn.com based on company search results.
# Version: v1.9222016
import os, sys, getopt, getpass, re, requests, time
from sys import argv
from bs4 import BeautifulSoup

# Timestamp used in every output filename, e.g. 20160920-1523.
timestr = time.strftime("%Y%m%d-%H%M")
curr_time = time.time()

class colors:
    white = "\033[1;37m"
    normal = "\033[0;00m"
    red = "\033[1;31m"
    blue = "\033[1;34m"
    green = "\033[1;32m"
    lightblue = "\033[0;34m"

banner = colors.lightblue + r"""
___ __ ____
/\_ \ __ /\ \ /\ _`\
\//\ \ /\_\ ___\ \ \/'\\ \,\L\_\ ___ _ __ __ _____ __
\ \ \ \/\ \ /' _ `\ \ , < \/_\__ \ /'___\/\`'__\/'__`\ /\ '__`\ /'__`\
\_\ \_\ \ \/\ \/\ \ \ \\`\ /\ \L\ \/\ \__/\ \ \//\ \L\.\_\ \ \L\ \/\ __/
/\____\\ \_\ \_\ \_\ \_\ \_\ `\____\ \____\\ \_\\ \__/.\_\\ \ ,__/\ \____\
\/____/ \/_/\/_/\/_/\/_/\/_/\/_____/\/____/ \/_/ \/__/\/_/ \ \ \/ \/____/
\ \_\
\/_/
""" + '\n' \
    + colors.lightblue + '\n linkScrape.py v1.9222016' \
    + colors.normal + '\n Description: Enumerates employee names from LinkedIn.com' \
    + colors.normal + '\n Created by: Nick Sanzotta/@beamr' + '\n' \
    + colors.normal + ' ' + '*' * 95 + '\n' + colors.normal

def cls():
    # Clear the terminal on both Windows and POSIX systems.
    os.system('cls' if os.name == 'nt' else 'clear')

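# connection() logs in to LinkedIn.com with a CSRF token scraped from the
# homepage login form, prints public company info when it can be parsed, then
# walks the people-search result pages and pulls name/title pairs out of the
# embedded JSON with regexes.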
def connection(email, password, companyName, pageResults, timeout, output):
    outputTitle = 'linkScrape-data/'+companyName+'_employee-title_'+timestr+'.txt'
    client = requests.Session()
    homepage = 'https://www.linkedin.com'
    login = 'https://www.linkedin.com/uas/login-submit'
    # Fetch the homepage to harvest the CSRF token the login form expects.
    html = client.get(homepage).content
    soup = BeautifulSoup(html, 'lxml')
    csrf = soup.find(id="loginCsrfParam-login")['value']
    login_information = {
        'session_key': email,
        'session_password': password,
        'loginCsrfParam': csrf,
    }
    client.post(login, data=login_information)
    companyInfo = ''

    # Company info: pull fields out of the public company page with regexes.
    request1 = client.get('https://www.linkedin.com/company/'+companyName)
    m3 = re.findall(r"companyName\"\:\"[\w]*.[\w]", request1.text)
    m4 = re.findall(r"industry\"\:\"[\w]*.[\w][\W]*.[\w]*", request1.text)
    m5 = re.findall(r"size\"\:\"[\w]*.[\w][\W]*.[\w]*", request1.text)
    m6 = re.findall(r"employeeCount\"\:[\w]*.[\w]", request1.text)
    try:
        company = re.sub('companyName":"', '', m3[0])
        industry = re.sub('industry":"', '', m4[0])
        size = re.sub('size":"', '', m5[0])
        employeeCount = re.sub('employeeCount":', '', m6[0])
    except IndexError:
        print('[*]Company information not available.')
    else:
        info = """
Company Name: {0}
Industry: {1}
Size: {2} employees
Employees on LinkedIn: {3}
"""
        companyInfo = info.format(company, industry, size, employeeCount)
        print(companyInfo)

    # Walk the people-search result pages, sleeping between requests.
    r1 = client.get('https://www.linkedin.com/vsearch/p?type=people&keywords='+companyName)
    for z in range(1, pageResults):
        time.sleep(timeout)
        r1 = client.get('https://www.linkedin.com/vsearch/p?type=people&keywords='+companyName+'&page_num='+str(z))
        m1 = re.findall(r"formatted_name\"\:\"[\w]*.[\w][\W]*.[\w]*", r1.text)
        m2 = re.findall(r"fmt_heading\"\:\"[\w]*.[\w][\W]*.[\w]*", r1.text)
        for i, j in zip(m1, m2):
            employee = re.sub('formatted_name":"', '', i)
            title = re.sub('fmt_heading":"', '', j)
            print(employee + ' : ' + title)
            with open(output, 'a') as f:
                f.write(employee.encode("utf-8") + "\n")
            with open(outputTitle, 'a') as f:
                f.write(employee.encode("utf-8") + ' : ' + title.encode("utf-8") + "\n")
    cls()
    print(banner)
    print(companyInfo)
    print("\nEmployee/Title list Saved to: " + outputTitle)
    with open(outputTitle, 'r') as f:
        print(f.read())
    print("Raw Employee list Saved to: " + output)
    with open(output, 'r') as f:
        print(f.read())

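# name() applies the chosen mangle format (-m) to each line of the scraped or
# user-supplied (-i) name list, optionally appending an @domain suffix (-d).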
def name(companyName, output, formatValue, domain):
    filename = "linkScrape-data/"+companyName+"-"+"mangle-"+str(formatValue)+"_"+timestr+".txt"
    print('Mangled option chosen: ' + str(formatValue))
    print('Mangled list Saved to: ' + filename)
    for x in open(output, 'r'):
        # Keep only letters and spaces, then split into lowercase name parts.
        full_name = ''.join([c for c in x if c == " " or c.isalpha()])
        full_name = full_name.lower().split()
        if not full_name:
            continue
        first_name = full_name[0]
        last_name = full_name[-1]

        # Every mangle format is a simple combination of first/last name.
        formats = {
            1: first_name + last_name,             # FirstLast
            2: last_name + first_name,             # LastFirst
            3: first_name + "." + last_name,       # First.Last
            4: last_name + "." + first_name,       # Last.First
            5: first_name + "_" + last_name,       # First_Last
            6: last_name + "_" + first_name,       # Last_First
            7: first_name[0] + last_name,          # FLast
            8: last_name[0] + first_name,          # LFirst
            9: first_name + last_name[0],          # FirstL
            10: first_name[0] + "." + last_name,   # F.Last
            11: last_name[0] + "." + first_name,   # L.First
            12: first_name[0:3] + last_name[0:2],  # FirLa
            13: last_name[0:4] + first_name[0:3],  # Lastfir
        }
        if formatValue not in formats:
            sys.exit(2)
        newname = formats[formatValue]
        if domain != '':
            newname = newname + "@" + domain
        write(companyName, formatValue, newname)
        print(newname)

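# write() appends a single mangled name to this run's timestamped output file;
# the filename matches the one announced by name().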
def write(companyName, formatValue, newname):
    filename = "linkScrape-data/"+companyName+"-"+"mangle-"+str(formatValue)+"_"+timestr+".txt"
    with open(filename, 'a') as f:
        f.write(newname+"\n")


def help():
    print banner
    print " Usage: python linkScrape.py \n"
    print " Example: python linkScrape.py -e LinkedInUser@email.com -c acme -r 1 -t 3 -m 7 -d acme.com\n"
    print " Example: python linkScrape.py -m 7 -i ~/Company/names.txt\n"
    print " Raw output saved to: linkedIn/linkScrape-data/Company_time.txt"
    print " Formatted output saved to: linkedIn/linkScrape-data/Company-mangle[x]_time.txt\n"
    print colors.lightblue + " Login options:\n" + colors.normal
    print "\t -e \t\tYour LinkedIn.com email address."
    print "\t -p \t\tYour LinkedIn.com password."
    print colors.lightblue + "\n Search options:\n" + colors.normal
    print "\t -c \t\tCompany you want to enumerate. (Prepended to the filename if used with -i)"
    print "\t -r \t\tSearches x LinkedIn.com pages (default is 5)."
    print "\t -t \t\tSets the timeout value in seconds (default is 5)."
    print colors.lightblue + "\n Mangle options:\n" + colors.normal
    print """\t -m \t\t
1)FirstLast ex:nicksanzotta
2)LastFirst ex:sanzottanick
3)First.Last ex:nick.sanzotta
4)Last.First ex:sanzotta.nick
5)First_Last ex:nick_sanzotta
6)Last_First ex:sanzotta_nick
7)FLast ex:nsanzotta
8)LFirst ex:snick
9)FirstL ex:nicks
10)F.Last ex:n.sanzotta
11)L.First ex:s.nick
12)FirLa ex:nicsa
13)Lastfir ex:sanznic
"""
    print "\t -d \t\tAppend @domain.com to the enumerated user list."
    print "\t -i \t\tUse a local file instead of LinkedIn.com to perform the name mangle against."
    print colors.lightblue + "\n Misc:\n" + colors.normal
    print "\t -h \t\tPrints this help menu."
    sys.exit(2)

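# main() runs the interactive wizard when no arguments are given; otherwise it
# parses getopt-style command-line flags (see help()).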
def main(argv):
    print(banner)
    email = ''
    password = ''
    companyName = ''
    formatValue = 7
    pageResults = 5
    timeout = 5
    domain = ''
    outputTitle = ''

    if not os.path.exists("linkScrape-data/"):
        os.mkdir("linkScrape-data/")

    if len(argv) < 1:
        # WIZARD menu: launched when no arguments are defined.
        print('\n')
        print('['+colors.red+'*'+colors.normal+']You did not specify a parameter; the wizard has launched:')
        print('['+colors.lightblue+'*'+colors.normal+']Example: python linkScrape.py -e user@email.com -c acme')
        print('[*]For help & command-line options please use: python linkScrape.py --help\n')
        email = raw_input('Enter LinkedIn Email account[user@email.com]: ') or email
        print('ENTERED: "%s"' % email + '\n')

        password = getpass.getpass('Enter LinkedIn Password: ') or password

        companyName = raw_input('Enter Company[ex:acme]: ') or companyName
        output = 'linkScrape-data/'+companyName+'_'+timestr+'.txt'
        print('ENTERED: "%s"' % companyName + '\n')

        print colors.lightblue + "\n Mangle options:\n" + colors.normal
        print """\t -m \t\t
1)FirstLast ex:nicksanzotta
2)LastFirst ex:sanzottanick
3)First.Last ex:nick.sanzotta
4)Last.First ex:sanzotta.nick
5)First_Last ex:nick_sanzotta
6)Last_First ex:sanzotta_nick
7)FLast ex:nsanzotta
8)LFirst ex:snick
9)FirstL ex:nicks
10)F.Last ex:n.sanzotta
11)L.First ex:s.nick
12)FirLa ex:nicsa
13)Lastfir ex:sanznic
"""
        formatValue = int(raw_input('Enter name Mangle choice[ex:7]: ') or formatValue)
        print('ENTERED: "%s"' % formatValue + '\n')

        print('[*]TIP: This value determines how many pages of results will be returned.')
        pageResults = int(raw_input('Enter number of page results[ex:5]: ') or pageResults)
        print('ENTERED: "%s"' % pageResults + '\n')
        pageResults += 1  # +1 so that range(1, pageResults) covers the requested number of pages

        print('[*]TIP: This value determines the delay (in seconds) between each page scrape.')
        timeout = int(raw_input('Enter timeout value[ex:5]: ') or timeout)
        print('ENTERED: "%s"' % timeout + '\n')

        print('[*]TIP: This value will be appended to each mangled result[ex:jsmith@acme.com].')
        domain = raw_input('Enter Domain suffix[ex:acme.com]: ') or domain
        print('ENTERED: "%s"' % domain + '\n')
        cls()
        print(banner)
        connection(email, password, companyName, pageResults, timeout, output)
        name(companyName, output, formatValue, domain)
        print "\nCompleted in: %.1fs\n" % (time.time() - curr_time)

    else:
        try:
            opts, args = getopt.getopt(argv, 'e:c:r:t:o:m:d:i:h',
                                       ['email=', 'company=', 'results=', 'timeout=', 'output=', 'mangle=', 'domain=', 'input=', 'help'])
            # GETOPT menu:
            for opt, arg in opts:
                if opt in ('-h', '--help'):
                    help()
                    sys.exit(2)
                elif opt in ('-e', '--email'):
                    email = arg
                    password = getpass.getpass(r'Enter password: ')
                elif opt in ('-c', '--company'):
                    companyName = arg
                    output = 'linkScrape-data/'+companyName+'_'+timestr+'.txt'
                elif opt in ('-r', '--results'):
                    pageResults = int(arg)
                    pageResults += 1
                elif opt in ('-o', '--output'):
                    output = arg
                elif opt in ('-t', '--timeout'):
                    timeout = int(arg)
                elif opt in ('-m', '--mangle'):
                    formatValue = int(arg)
                elif opt in ('-d', '--domain'):
                    domain = arg
                elif opt in ('-i', '--input'):
                    # Mangle an existing local name list and exit.
                    inputfile = arg
                    output = inputfile
                    name(companyName, output, formatValue, domain)
                    sys.exit(2)
                else:
                    help()
                    sys.exit(2)
            connection(email, password, companyName, pageResults, timeout, output)
            name(companyName, output, formatValue, domain)
            print "\nCompleted in: %.1fs\n" % (time.time() - curr_time)

        except getopt.GetoptError:
            help()
            sys.exit(2)


if __name__ == "__main__":
    main(argv[1:])
--------------------------------------------------------------------------------