--------------------------------------------------------------------------------
/Linkedin Basic Profile Fields.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/4iji/Linkedin-Scraper/cc3de25ed332bd6873243e82cf1aa2c4cc0d0895/Linkedin Basic Profile Fields.xlsx
--------------------------------------------------------------------------------
/Linkedin search input.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/4iji/Linkedin-Scraper/cc3de25ed332bd6873243e82cf1aa2c4cc0d0895/Linkedin search input.xlsx
--------------------------------------------------------------------------------
/Linkedin search output.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/4iji/Linkedin-Scraper/cc3de25ed332bd6873243e82cf1aa2c4cc0d0895/Linkedin search output.xlsx
--------------------------------------------------------------------------------
/Linkedin.py:
--------------------------------------------------------------------------------
import pandas
from bs4 import BeautifulSoup
import re
import requests
from googleapiclient.discovery import build
from openpyxl import load_workbook  # VERSION 1.8.5 ONLY
import xlsxwriter
import fuzzy


# Rows of the input sheet to process (see Readme.txt).
Start_Index = 0
Last_Index = 201

username = raw_input('Login:')
password = raw_input('Password:')

# Open the output workbook for appending so rows written earlier are kept.
writer = pandas.ExcelWriter("Linkedin search output.xlsx", engine='openpyxl')
book = load_workbook("Linkedin search output.xlsx")
writer.book = book
writer.sheets = dict((ws.title, ws) for ws in book.worksheets)

input_file = pandas.ExcelFile("Linkedin search input.xlsx")
sheet = input_file.parse(0)

def writeDataFrame(index, conm, namedes, midname, id, firstname, lastname, profile_link, maidenname='NA',
                   formattedname='NA', phoneticfirstname='NA', phoneticlastname='NA', fphoneticname='NA',
                   headline='NA', location='NA', industry='NA', currentshare='NA', numconnection='NA',
                   summary='NA', specialities='NA', positions='NA', picurl='NA'):
    global writer
    df = pandas.DataFrame({'A_ID': [index+1], 'B_Company': [conm], 'C_name': [namedes], 'D_firstname': [firstname],
                           'E_midName': [midname], 'F_lastName': [lastname], 'G_Linkedin Profile Link': [profile_link],
                           'H_Linkedin-Id': [id], 'I_maidenname': [maidenname], 'J_formatted-name': [formattedname],
                           'K_phonetic-firstname': [phoneticfirstname], 'L_phonetic-lastname': [phoneticlastname],
                           'M_formatted-phonetic-name': [fphoneticname], 'N_Headline': [headline],
                           'O_Location': [location], 'P_Industry': [industry], 'Q_CurrentShare': [currentshare],
                           'R_Connections': [numconnection], 'S_Summary': [summary], 'T_Specialities': [specialities],
                           'U_Positions': [positions], 'V_Picture-Url': [picurl]})
    if index == 0:
        # First record: also write the header row.
        df.to_excel(writer, sheet_name="Sheet1", startrow=index, index=False)
    else:
        df.to_excel(writer, sheet_name="Sheet1", index=False, startrow=index+1, header=False)
    writer.save()
    return

def readExcel(index):
    global sheet
    row = sheet.irow(index).real  # old pandas row access; the repo targets pandas of the Python 2.7 era
    if str(row[4]) == 'nan':  # missing middle name
        row[4] = ""
    # Returns: full name, company, middle name, "first last", name/description, company.
    return row[3]+" "+row[4]+" "+row[5], row[1], row[4], row[3]+" "+row[5], row[2], row[1]

session = requests.Session()
LINKEDIN_URL = 'https://www.linkedin.com'
LOGIN_URL = 'https://www.linkedin.com/uas/login-submit'
html = session.get(LINKEDIN_URL).content
soup = BeautifulSoup(html, 'html5lib')
csrf = soup.find(id="loginCsrfParam-login")['value']  # hidden anti-CSRF token from the login form

login_information = {
    'session_key': username,
    'session_password': password,
    'loginCsrfParam': csrf,
}

headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:31.0) Gecko/20100101 Firefox/31.0'}
session.post(LOGIN_URL, headers=headers, data=login_information)
device = build("customsearch", "v1", developerKey="AIzaSyCVlwAUM9QvlrClxQXOl51WcfOFKxITsh8")  # API Keys: AIzaSyCz56VeNPTb7BNbAwUXJvrLcBNssHa0WDU , AIzaSyDY37hOk_lEmIUg_T-goXFYzL1qYm7zIaA

for j in range(Start_Index, Last_Index):
    print j
    r, c, m, o, nd, com = readExcel(j)

    try:
        # One Custom Search query: "<full name> <company>", restricted to LinkedIn by the cx engine.
        results = device.cse().list(q=r+" "+c, num=1, cx='009259153963367271982:fkxfnawlcn4').execute()  # cx: 018365894794576624310:ubawqf09zme , 005187750176123224094:h3ylihk-rmi
        link = results['items'][0]['formattedUrl']
        if not link.startswith('http'):  # some results come back without a scheme
            link = "http://"+link
        html = session.get(link).content
    except Exception as e:
        print 'Error:',
        print e
        writeDataFrame(j, com, nd, m, "NA", "NA", "NA", "NA")
        continue

    soup = BeautifulSoup(html, 'html5lib')

    # Full name: marked with class="full-name" on the profile page.
    names = soup.find_all('span')
    fullnames = ''
    for i in names:
        s = re.findall('.*class="full-name".*>(.*)', str(i))
        if s != []:
            fullnames = s[0]
    if len(fullnames) == 0:
        writeDataFrame(j, com, nd, m, "NA", "NA", "NA", "NA")
        continue
    fullnames = fullnames.split()

    connections = "NA"
    con1 = soup.find_all('div')
    for i in con1:
        s = re.findall('class="member-connections">(.*)', str(i))
        if s != []:
            connections = s[0]
            break

    summary = "NA"
    sum1 = soup.find_all('p')
    for i in sum1:
        # NOTE: the HTML literal in this pattern was lost when the source was
        # extracted to text; the tag and class below are an assumption.
        s = re.findall('<p class="description">(.*)</p>', str(i))
        if s != []:
            summary = s[0]
            break

    headline = ""
    for i in sum1:
        # NOTE: the closing '</p>' was lost in extraction and is restored here.
        s = re.findall('class="title" dir="ltr">(.*)</p>', str(i))
        if s != []:
            headline = s[0]
            break

    loc1 = soup.find_all('a')
    location = "NA"
    industry = "NA"
    for i in loc1:
        s = re.findall('name="location".*>(.*)', str(i))
        if s != []:
            location = s[0]
            break
    for i in loc1:
        s = re.findall('name="industry".*>(.*)', str(i))
        if s != []:
            industry = s[0]
            break

    img1 = soup.find_all('img')
    pic_url = "NA"
    for i in img1:
        # NOTE: the original <img ...> pattern was lost in extraction; this
        # reconstruction (class and attribute order are assumptions) captures
        # the profile photo's src.
        s = re.findall('<img[^>]*class="photo"[^>]*src="([^"]*)"', str(i))
        if s != []:
            pic_url = s[0]
            break

    positions = "NA"
    for i in loc1:
        # NOTE: pattern lost in extraction; the anchor markup is an assumption.
        s = re.findall('<a[^>]*name="title"[^>]*>(.*)</a>', str(i))
        if s != []:
            positions = s[0]
            break

    skills = "NA"
    for i in loc1:
        # NOTE: pattern lost in extraction; the class name is an assumption.
        s = re.findall('<a[^>]*class="skill"[^>]*>(.*)</a>', str(i))
        if s != []:
            skills = s[0]
            break

    id1 = "NA"
    nom = soup.find_all('div')
    for i in nom:
        # NOTE: pattern lost in extraction; the attribute layout is an
        # assumption, capturing the numeric member id.
        s = re.findall('<div[^>]*id="member-([0-9]+)"', str(i))
        if s != []:
            id1 = s[0]
            break


    # Phonetic encodings of the name (Double Metaphone and NYSIIS, from the fuzzy package).
    phonetic_first_name = fuzzy.DMetaphone()(fullnames[0])[0]
    phonetic_last_name = fuzzy.nysiis(fullnames[1])
    formatted_phonetic = fuzzy.nysiis(r)
    writeDataFrame(j, com, nd, m, id1, fullnames[0], fullnames[1], link, formattedname=o,
                   phoneticfirstname=phonetic_first_name, phoneticlastname=phonetic_last_name,
                   fphoneticname=formatted_phonetic, numconnection=connections, summary=summary,
                   headline=headline, location=location, industry=industry, positions=positions,
                   specialities=skills, picurl=pic_url)
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Linkedin-Scraper

This is a Python program that scrapes LinkedIn profile information with up to 98% accuracy, using the Google Custom Search API to find profiles. It uses pandas to read the search parameters from an Excel sheet, scrapes each profile, and writes the results back into another Excel sheet.

The packages needed are:

1. pandas
2. re (standard library)
3. Beautiful Soup 4
4. requests
5. googleapiclient
6. openpyxl
7. fuzzy
8. xlsxwriter
9. html5lib (the parser Beautiful Soup is told to use)

See Readme.txt for setup and usage instructions.
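A typical setup and run might look like this (the pip package names below are assumptions based on the imports; the repo itself does not pin them, and the program targets Python 2.7):

```
pip install pandas beautifulsoup4 requests google-api-python-client openpyxl==1.8.5 xlsxwriter html5lib Fuzzy
python Linkedin.py
```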
--------------------------------------------------------------------------------
/Readme.txt:
--------------------------------------------------------------------------------
The program can be executed by running Linkedin.py


The program runs on Python 2.7 (64-bit)

The required packages are:
1. requests
2. bs4
3. re (regular expressions, standard library)
4. pandas
5. googleapiclient
6. openpyxl (version 1.8.5)
7. fuzzy, link: https://pypi.python.org/pypi/Fuzzy
8. xlsxwriter
9. html5lib


->The program can be run from any IDE or from the command line (see the example just below).
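
   For example, from the command line (assuming Python 2.7 is the default
   "python" on your PATH):

       python Linkedin.py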

->Start_Index and Last_Index can be set in the program; based on that range,
the input rows are read from the input Excel file and the results are written
to the output file (see the example just below).
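
   For example, to process only the first ten rows of the input sheet, set the
   two variables near the top of Linkedin.py to:

       Start_Index=0
       Last_Index=10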

->The program needs a LinkedIn user-id and password. For testing purposes, the
following account can be used:

username : jamesmurray7702213959@gmail.com
password : james123

->The program uses the Google API client with Custom Search. An API key is
needed to use it; one is already provided in the source.
->A cx value for the Google Custom Search engine is also needed; this custom
search engine searches the linkedin.com domain for the query (see the sketch
just below).
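
   The search call in Linkedin.py boils down to the following sketch, where
   API_KEY and CX are placeholders for your own credentials and "name" and
   "company" come from the input sheet:

       from googleapiclient.discovery import build

       service = build("customsearch", "v1", developerKey=API_KEY)
       results = service.cse().list(q=name+" "+company, num=1, cx=CX).execute()
       link = results['items'][0]['formattedUrl']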

Execute the program and wait for it to finish.

Warning: the free tier of the Google Custom Search API allows only 100 requests
per day, so a full run over 201 rows (the default Start_Index=0, Last_Index=201)
needs three days or multiple API keys.
--------------------------------------------------------------------------------