├── README.md
├── spider-cityAllCompany.py
├── 手工绕开企查查的登录验证.py
├── 指定公司全称获取信息02282320.py
└── 根据指定网址提取信息02282320.py


/README.md:
--------------------------------------------------------------------------------
1 | # python_spider
2 | Get companies' information from the web using Python and Selenium.
3 | 


--------------------------------------------------------------------------------
/spider-cityAllCompany.py:
--------------------------------------------------------------------------------
  1 | #coding:utf-8
  2 | #获取指定城市所有类型的公司信息，每种类型的公司保存为一个excel表格 
  3 | #默认不显示浏览器图形界面
  4 | 
  5 | #安装chromedriver、selenium、xlwt
  6 | #如果except 语法报错，请使用python2.7 或修改代码适应python3.*版本
  7 | 
  8 | from selenium import webdriver
  9 | from selenium.common.exceptions import NoSuchElementException
 10 | from selenium.webdriver.chrome.options import Options
 11 | import time
 12 | import xlwt
 13 | import sys
 14 | import re
 15 | 
 16 | def search(driver,city,isdebug,web_order):
 17 |     try:#浏览器打开网页
 18 |         driver.get("http://m.54114.cn/"+city+'/'+web_order +'/')#-------------------网址修改之一
 19 |     except Exception,e:#如果网页打开有误，则直接退出
 20 |         print('0'+str(Exception)+' '+str(e)+' '+repr(e)+' '+e.message)
 21 |         return 
 22 |     s=(driver.find_element_by_xpath("/html/body/div[4]").text)#.encode('utf-8')  #获取公司总数目
 23 |     try:
 24 |         s=s.split(' ')[11]
 25 |         s=s.replace(u'下','').replace(u'一','').replace(u'页','').replace('>','')
 26 |     except IndexError:#有些行业可能只有一个页面、甚至只有一个公司
 27 |         pass
 28 |     try:
 29 |         page_num=int(s)
 30 |     except:#有些行业甚至一个公司都没有
 31 |         return
 32 | 
 33 |     workbook = xlwt.Workbook(encoding="utf-8")
 34 |     worksheet = workbook.add_sheet("my worksheet")#,cell_overwrite_ok=True)#解决重写报错
 35 |     worksheet.write(0,0,label=u"序号")
 36 |     worksheet.write(0,1,label=u"公司名称")
 37 |     worksheet.write(0,2,label=u"电话")
 38 |     worksheet.write(0,3,label=u"邮箱")
 39 |     worksheet.write(0,4,label=u"网址")
 40 |     worksheet.write(0,5,label=u"地址")
 41 |     worksheet.write(0,6,label=u"经营范围")
 42 | 
 43 |     url_num=page_num*20
 44 |     #url_num = (int)(driver.find_element_by_xpath("/html/body/div[4]/span[1]").text.replace(u'共','').replace(u'纪录',''))
 45 | 
 46 |     page_url = []  #获取每个页面的链接，存放到page_url
 47 |     for i in range(1,page_num+1):
 48 |         page_url.append("http://m.54114.cn/"+city+"/"+web_order+"_p"+str(i)+"/")#-------------------网址修改之二
 49 |     
 50 |     url_list = []# 存放所有页面中所有公司的链接
 51 | 
 52 |     for i in range(page_num):#对每个页面，尝试获取每个公司的链接
 53 |         current_page = page_url[i]
 54 |         driver.get(current_page)
 55 |         try:
 56 |             for j in range(1,21):
 57 |                 xpath = "/html/body/div[3]/div[3]/ul/li["+str(j)+"]/a"
 58 |                 url_list.append(driver.find_element_by_xpath(xpath).get_attribute("href"))
 59 |         except NoSuchElementException:
 60 |             pass
 61 |         except Exception,e:
 62 |             print('1'+str(Exception)+' '+str(e)+' '+repr(e)+' '+e.message)
 63 |             print(current_page)
 64 |             print('order:'+str(i+1))
 65 |             return
 66 |     
 67 |     if isdebug==1:#如果是调试模式，则把所有公司信息页面的网址都写入到文件中
 68 |         fp = open(str("incURL_"+city+".txt"),"w+")
 69 |         try:
 70 |             for i in range(1000):
 71 |                 fp.write(str(url_list[i])+'\n')
 72 |         except IndexError:#如果某页面列出的公司数量没有20个,忽略即可
 73 |             pass
 74 |         except Exception,e:
 75 |             print('2'+str(Exception)+' '+str(e)+' '+repr(e)+' '+e.message)
 76 |             print('order:'+str(i+1))
 77 |             return
 78 |             #print(len(url_list ))
 79 |         fp.close()
 80 |     
 81 |     for j in range(url_num):#对每个公司的链接，进入该网址，获取信息
 82 |         try:
 83 |             url = url_list[j]
 84 |         except IndexError:#默认的是每页有20个公司。如果没有这么多，则直接退出for循环
 85 |             break
 86 |         try:
 87 |             driver.get(url)
 88 |         #//如果打开有误，说明实际的网页数量并没有url_num这么多，退出即可
 89 |         except Exception,e:#“电话：暂无联系方式” 这种形势虽然有
 90 |             workbook.save(city+'_'+web_order+'.xls')
 91 |             print('3'+str(Exception)+' '+str(e)+' '+repr(e)+' '+e.message)
 92 |             print('order:'+str(j))
 93 |             print(url)
 94 |             return
 95 |         company = driver.find_element_by_xpath("/html/body/div[3]/div[1]/strong").text
 96 |         if(isdebug==1):
 97 |             print(j+1)
 98 |             print(company)
 99 |         worksheet.write(j+1,0,label=str(j+1))
100 |         worksheet.write(j+1,1,label=company)
101 |         
102 |         #正则表达式提取电话号码，电话号码有多种形式，因此下面用了4种表达式，例外 电话：(0571);
103 |         try:
104 |             phone = driver.find_element_by_xpath("/html/body/div[3]/div[4]/ul/li[2]/span/font/a").text
105 |             o = re.findall(r'\d{3,4}[-]?\d{3}[-]?\d{4}',phone)#400-123-4567 或 400-1234567
106 |             m = re.findall(r'\(?0\d{2,3}[) -]?\d{7,8}',phone)#座机
107 |             l = re.findall(r'(\d{8,9})',phone)#座机纯8位 或纯9位的号码     
108 |             n = re.findall(r'(86)?(1\d{10})',phone)#手机
109 |             
110 |             #优先向表格写入座机，因为座机具备一定的信息，大公司的号码网上页能搜得到
111 |             have_phone = 0#标记号码是否已经输出到表格中
112 |             if (m):
113 |                 have_phone = 1
114 |                 if(isdebug==1):
115 |                     print(m[0])
116 |                 worksheet.write(j+1,2,label=m[0])
117 |             elif (o and (have_phone == 0) ):
118 |                 have_phone = 1
119 |                 if(isdebug==1):
120 |                     print(o[0])
121 |                 worksheet.write(j+1,2,label=o[0])
122 |             elif (l and (have_phone == 0) ):
123 |                 have_phone = 1
124 |                 if(isdebug==1):
125 |                     print(l[0])
126 |                 worksheet.write(j+1,2,label=l[0])
127 |             elif (n and (have_phone == 0) ):
128 |                 have_phone = 1
129 |                 if(isdebug==1):
130 |                     print(n[0])
131 |                 worksheet.write(j+1,2,label=n[0])
132 |             else:#处理例外情况 电话：(0571);
133 |                 worksheet.write(j+1,2,label='null')
134 |         except NoSuchElementException:#“电话：暂无联系方式”
135 |             worksheet.write(j+1,2,label='null')
136 |         except Exception,e: 
137 |             print('4'+str(Exception)+' '+str(e)+' '+repr(e)+' '+e.message)
138 |             print('order:'+str(j))
139 |             print(url)
140 |             workbook.save(city+'_'+web_order+'.xls')
141 |             return
142 |         have_mail = 0#标记各个属性是否已经成功提取
143 |         have_url  = 0
144 |         have_addr = 0
145 |         have_sales= 0
146 | 
147 |         #邮箱、网址这2项可能会有1-2项缺失，且仅根据网页标签无法区分，只能每次获取都进行三次匹配
148 |         #分情况讨论：
149 |         #1 假如邮箱和网址都缺失，只有两次获取成功，分别是地址、经营范围
150 |         #2 假如仅邮箱缺失，3次分别获取到的是网址、地址、经营范围
151 |         #3 假如仅网址缺失，3次分别获取到的是邮箱、地址、经营范围
152 | 
153 |         try:#第1次获取
154 |             mail_or_url = driver.find_element_by_xpath("/html/body/div[3]/div[4]/ul/li[3]/span").text
155 |             if(mail_or_url.find(u'邮箱')==0 ):
156 |                 have_mail=1
157 |                 mails = re.findall(r"[a-zA-Z0-9\.\-+_]+@?[a-zA-Z0-9\.\-+_]+[\.]?[a-zA-Z]+", mail_or_url)#邮箱不规范，@后没写点，因此这里的.设置为可选项
158 |                 if(isdebug==1):
159 |                     print(mails[0])
160 |                 worksheet.write(j+1,3,label=mails[0])
161 |             else:
162 |                 pass
163 |             if(mail_or_url.find(u'网址')==0):
164 |                 if(have_mail==0):
165 |                     worksheet.write(j+1,3,label='null')#mail is null
166 |                 have_url  = 1
167 |                 url = driver.find_element_by_xpath('/html/body/div[3]/div[4]/ul/li[3]/span/a[1]').text
168 |                 if(isdebug==1):
169 |                     print(url)
170 |                 worksheet.write(j+1,4,label=url)
171 |             else:
172 |                 pass
173 |             if(mail_or_url.find(u'地址：')==0 ):
174 |                 if(have_mail==0):
175 |                     worksheet.write(j+1,3,label='null')#mail is null
176 |                 if(have_url==0):
177 |                     worksheet.write(j+1,4,label='null')#url is null
178 |                 have_addr = 1
179 |                 if(isdebug==1):
180 |                     print(mail_or_url.replace('地址：',''))
181 |                 worksheet.write(j+1,5,label=mail_or_url.replace('地址：',''))
182 |             else:
183 |                 pass
184 |         except IndexError:
185 |             worksheet.write(j+1,3,label=(mail_or_url.replace('邮箱：','').replace('。','.')))#有些邮箱不规范,把点写作了句号
186 |         except Exception,e:
187 |             print('5'+str(Exception)+' '+str(e)+' '+repr(e)+' '+e.message)
188 |             print('order:'+str(j))
189 |             print(url)
190 |             workbook.save(city+'_'+web_order+'.xls')
191 |             return
192 | 
193 |         try:#第2次获取
194 |             url_or_addr = driver.find_element_by_xpath('/html/body/div[3]/div[4]/ul/li[4]/span').text
195 |             if(have_url  == 0):
196 |                 if(url_or_addr.find(u'网址：')==0):     #/html/body/div[3]/div[4]/ul/li[4]/span/a[1]
197 |                     have_url  = 1
198 |                     inc_url = driver.find_element_by_xpath('/html/body/div[3]/div[4]/ul/li[4]/span/a').text
199 |                     if(isdebug==1):
200 |                         print(inc_url)
201 |                     worksheet.write(j+1,4,label=inc_url)
202 |             else:
203 |                 pass
204 |             if(have_addr == 0):
205 |                 if(url_or_addr.find(u'网址：')==-1 and url_or_addr.find(u'地址：')==0 ):#有些情况下，网址的那一行里有“下载地址”。 首字符匹配到“地址”并且没有出现“网址”才能算作地址
206 |                     if(have_url==0):
207 |                         worksheet.write(j+1,4,label='null')#url is null
208 |                     have_addr = 1
209 |                     if(isdebug==1):
210 |                         print(url_or_addr.replace('地址：',''))
211 |                     worksheet.write(j+1,5,label=url_or_addr.replace('地址：',''))
212 |             else:
213 |                 pass 
214 |             if(url_or_addr.find(u'经营范围：')==0 ):
215 |                 have_sales= 1
216 |                 if(have_addr==0):
217 |                     worksheet.write(j+1,5,label='null')
218 |                 if(isdebug==1):
219 |                     print(url_or_addr.replace('经营范围：','').replace('...',''))
220 |                 worksheet.write(j+1,6,label=url_or_addr.replace('经营范围：','').replace('...',''))
221 |             else:
222 |                 pass 
223 |         except Exception,e:
224 |             print('6'+str(Exception)+' '+str(e)+' '+repr(e)+' '+e.message)
225 |             print('order:'+str(j))
226 |             print(url)
227 |             workbook.save(city+'_'+web_order+'.xls')
228 |             return
229 | 
230 |         try:#第3次获取
231 |             if(have_sales== 1):#如果前面已经出现过经营范围了，后面就没必要判断了。因为经营范围是最后一个项目
232 |                 pass
233 |             else:
234 |                 addr_or_sales = driver.find_element_by_xpath("/html/body/div[3]/div[4]/ul/li[5]/span").text
235 |             
236 |                 if(addr_or_sales.find(u'地址：')==0 ):#有的在经营范围里出现了 停车地址...  and have_addr==0
237 |                     have_addr=1
238 |                     if(have_url==0):
239 |                         worksheet.write(j+1,4,label='null')#url is null
240 |                     if(isdebug==1):
241 |                         print(addr_or_sales.replace('地址：',''))
242 |                     worksheet.write(j+1,5,label=addr_or_sales.replace('地址：',''))
243 |                 if(addr_or_sales.find(u'经营范围：')==0 ):
244 |                     have_sales= 1
245 |                     if(have_addr==0):
246 |                         worksheet.write(j+1,5,label='null')
247 |                     if(isdebug==1):
248 |                         print(addr_or_sales.replace('经营范围：','').replace('...',''))
249 |                     worksheet.write(j+1,6,label=addr_or_sales.replace('经营范围：','').replace('...',''))
250 |         except Exception,e:
251 |             print('7'+str(Exception)+' '+str(e)+' '+repr(e)+' '+e.message)
252 |             print('order:'+str(j))
253 |             print(url)
254 |             workbook.save(city+'_'+web_order+'.xls')
255 |             return
256 | 
257 |         try:
258 |             if(have_sales== 1):#如果前面已经出现过经营范围了，后面就没必要判断了。因为经营范围是最后一个项目
259 |                 pass
260 |             else:
261 |                 sales = driver.find_element_by_xpath("/html/body/div[3]/div[4]/ul/li[6]/span").text
262 |                 if(sales.find(u'经营范围：')==0 ):
263 |                     #这里曾经重写报错Attempt to overwrite cell: sheetname=u'my worksheet' rowx=1 colx=5 Exception
264 |                     #报错的原因:前面写入地址后，没有更新have_addr的值，导致这里重复写入地址null
265 |                     if(have_addr==0):
266 |                         worksheet.write(j+1,5,label='null')
267 |                     if(isdebug==1):
268 |                         print(sales.replace('经营范围：','').replace('...',''))
269 |                     worksheet.write(j+1,6,label=sales.replace('经营范围：','').replace('...',''))
270 |         except Exception,e:
271 |             print('8'+str(Exception)+' '+str(e)+' '+repr(e)+' '+e.message)
272 |             print('order:'+str(j))
273 |             print(url)
274 |             workbook.save(city+'_'+web_order+'.xls')
275 |             return
276 |         if(isdebug==1):        
277 |             print(" ")
278 |     workbook.save(city+'_'+web_order+'.xls')
279 |     print(city+' '+web_order+' is done.')
280 | 
if __name__=='__main__':
    if sys.version_info[0] == 2:
        # Python 2 only: make implicit str<->unicode conversions use UTF-8.
        reload(sys)
        sys.setdefaultencoding('utf-8')

    isdebug = 0  # 1: print scraped values and show the browser; 0: stay quiet

    if isdebug == 0:
        # Default: run Chrome without a visible window.
        chrome_options = Options()
        chrome_options.add_argument('--headless')
        chrome_options.add_argument('--disable-gpu')
        driver = webdriver.Chrome(chrome_options=chrome_options)
    else:
        # Debug mode: show the browser window.
        driver = webdriver.Chrome()

    city = ['beijing', 'shanghai', 'guangzhou', 'shenzhen', 'hangzhou']

    time_start = time.time()
    try:
        for city_name in city:
            for j in range(1, 21):  # industry pages hangye1 .. hangye20
                web_order = 'hangye' + str(j)
                search(driver, city_name, isdebug, web_order)
    finally:
        # Always shut the browser down, even when a search raises.
        driver.quit()
    time_end = time.time()
    # Format explicitly: print() with several arguments prints a tuple on Python 2.
    print('time cost: %s s' % (time_end - time_start))
308 |     
309 | #后期考虑，TimeoutException() 超时类型的错误可以通过读取网址列表的形式，增加一个断点重启的功能
310 | #后期考虑，在excel添加一栏 城市 字段，如 xx市。需要正则提取xx省xx市、xx市两种情况
311 | 


--------------------------------------------------------------------------------
/手工绕开企查查的登录验证.py:
--------------------------------------------------------------------------------
 1 | #默认使用python 2.*，如果使用的是python3.*则需要对两个语句进行替换。
 2 | from selenium import webdriver
 3 | from selenium.common.exceptions import NoSuchElementException
 4 | import time
 5 | import xlwt
 6 | import sys
 7 | 
 8 | reload(sys)                    #if python 2.*
 9 | sys.setdefaultencoding('utf-8')#if python 2.*
10 | 
11 | #import importlib      #if python 3.*
12 | #importlib.reload(sys) #if python 3.*
13 | 
14 | 
15 | #伪装成浏览器，防止被识破
16 | option = webdriver.ChromeOptions()
17 | option.add_argument('--user-agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36"')
18 | driver = webdriver.Chrome(chrome_options=option)
19 | 
20 | #打开登录页面
21 | driver.get('https://www.qichacha.com/user_login')
22 | #单击用户名密码登录的标签
23 | tag = driver.find_element_by_xpath('//*[@id="normalLogin"]')
24 | tag.click()
25 | #将用户名、密码注入
26 | driver.find_element_by_id('nameNormal').send_keys('username')
27 | driver.find_element_by_id('pwdNormal').send_keys('password')
28 | time.sleep(10)#休眠，人工完成验证步骤，等待程序单击“登录”
29 | #单击登录按钮
30 | btn = driver.find_element_by_xpath('//*[@id="user_login_normal"]/button')
31 | btn.click()
32 | 
33 | inc_list = ['阿里巴巴','腾讯','今日头条','滴滴','美团']
34 | inc_len = len(inc_list)
35 | 
36 | for i in range(inc_len):
37 |     txt = inc_list[i]
38 |     time.sleep(1)
39 |     
40 |     if (i==0):
41 |         #向搜索框注入文字
42 |         txt=txt.decode('utf-8')
43 |         driver.find_element_by_id('searchkey').send_keys(txt)
44 |         #单击搜索按钮
45 |         srh_btn = driver.find_element_by_xpath('//*[@id="V3_Search_bt"]')
46 |         srh_btn.click()
47 |     else:
48 |         #向搜索框注入下一个公司地址
49 |         txt=txt.decode('utf-8')
50 |         driver.find_element_by_id('headerKey').send_keys(txt)
51 |         #搜索按钮 
52 |         srh_btn = driver.find_element_by_xpath('/html/body/header/div/form/div/div/span/button')
53 |         srh_btn.click()
54 | 
55 |     #获取首个企业文本
56 |     print(i+1)
57 |     inc_full = driver.find_element_by_xpath('//*[@id="search-result"]/tr[1]/td[3]/a').text                                
58 |     print(inc_full)
59 |     money = driver.find_element_by_xpath('//*[@id="search-result"]/tr[1]/td[3]/p[1]/span[1]').text
60 |     print(money)
61 |     date = driver.find_element_by_xpath('//*[@id="search-result"]/tr[1]/td[3]/p[1]/span[2]').text
62 |     print(date)
63 |     mail_phone = driver.find_element_by_xpath('//*[@id="search-result"]/tr[1]/td[3]/p[2]').text
64 |     print(mail_phone)
65 |     addr = driver.find_element_by_xpath('//*[@id="search-result"]/tr[1]/td[3]/p[3]').text
66 |     print(addr)
67 |     try:
68 |         stock_or_others = driver.find_element_by_xpath('//*[@id="search-result"]/tr[1]/td[3]/p[4]').text
69 |         print(stock_or_others)
70 |     except:
71 |         pass
72 | 
73 |     #获取网页地址，进入
74 |     inner = driver.find_element_by_xpath('//*[@id="search-result"]/tr[1]/td[3]/a').get_attribute("href")
75 |     driver.get(inner)
76 | 
77 |     #单击进入后 官网 通过href属性获得：
78 |     inc_web = driver.find_element_by_xpath('//*[@id="company-top"]/div[2]/div[2]/div[3]/div[1]/span[3]/a').get_attribute("href")
79 |     print("官网："+inc_web)
80 |     print(' ')
81 | 
82 | driver.close()
83 | 
84 | #bug list:
85 | #UnicodeDecodeError: 'utf8' codec can't decode byte 0xe9 in position 0: unexpected end of data
86 | #原因：向搜索栏注入中文字符串时，必须先采用如下方式转换成utf-8编码
87 | #解决：send_keys("阿里巴巴".decode('utf-8'))
88 | 


--------------------------------------------------------------------------------
/指定公司全称获取信息02282320.py:
--------------------------------------------------------------------------------
  1 | #coding:utf-8
  2 | #指定公司全称的列表文件，该程序每次从中读取一行，去指定网页检索该公司的信息，然后存储为excel表格
  3 | 
  4 | from selenium import webdriver
  5 | import time
  6 | import xlwt
  7 | import sys
  8 | import re
  9 | 
 10 | def search(driver,i,worksheet,mytxt):
 11 |     driver.get("http://m.54114.cn/hangzhou/")#打开网页
 12 |     #mytxt = u"杭州海康威视数字技术股份有限公司"
 13 |     driver.find_element_by_xpath("/html/body/div[2]/div/form/div/div/input[2]").send_keys(mytxt)#向输入框注入待搜索字符串
 14 |     driver.find_element_by_xpath("//*[@id=\"qixc\"]").click()#单击搜索按钮
 15 | 
 16 |     try:
 17 |         incname=driver.find_element_by_xpath("/html/body/div[3]/div[3]/ul/li[1]/a").text #获取搜索到的第一个公司名称
 18 |     except Exception,e:#如果报错，说明网页中没有搜索结果，则在表格中该公司的一行全部填写null，然后退出
 19 |         print('1'+str(Exception)+' '+str(e)+' '+repr(e)+' '+e.message)
 20 |         worksheet.write(i,1,label=mytxt)
 21 |         worksheet.write(i,2,label='null')
 22 |         worksheet.write(i,3,label='null')
 23 |         worksheet.write(i,4,label='null')
 24 |         worksheet.write(i,5,label='null')
 25 |         worksheet.write(i,6,label='null')
 26 |         print(mytxt)
 27 |         print('未检索到该公司的信息\n')
 28 |         return
 29 |     
 30 |     if(incname==mytxt):#如果搜索到的公司名称与输入的相等，则
 31 |         print(incname)
 32 |         worksheet.write(i,1,label=incname)# 将信息输入表格
 33 |     else:#如果搜索到的公司名称与输入的不等，则说明该网站目前未收录该公司的信息，表格里写入null后退出
 34 |         print(mytxt)
 35 |         worksheet.write(i,1,label=mytxt)
 36 |         worksheet.write(i,2,label='null')
 37 |         worksheet.write(i,3,label='null')
 38 |         worksheet.write(i,4,label='null')
 39 |         worksheet.write(i,5,label='null')
 40 |         worksheet.write(i,6,label='null')
 41 |         print('未检索到该公司的信息\n')
 42 |         return
 43 | 
 44 |     try:#获取详情页的地址，并单击进入该页面
 45 |         realweb = driver.find_element_by_xpath("/html/body/div[3]/div[3]/ul/li/a").get_attribute("href")
 46 |         driver.get(realweb)
 47 |     except Exception,e:#如果获取详情页的网址有误，则直接退出
 48 |         print('2'+str(Exception)+' '+str(e)+' '+repr(e)+' '+e.message)
 49 |         return 
 50 | 
 51 |     
 52 |     #正则表达式提取电话号码，电话号码有多种形式，因此下面用了4种表达式，目前尚未遇到例外
 53 |     try:
 54 |         phone = driver.find_element_by_xpath("/html/body/div[3]/div[4]/ul/li[2]/span/font/a").text
 55 |         o = re.findall(r'\d{3,4}[-]?\d{3}[-]?\d{4}',phone)#400-123-4567 或 400-1234567
 56 |         m = re.findall(r'\(?0\d{2,3}[) -]?\d{7,8}',phone)#座机
 57 |         l = re.findall(r'(\d{8,9})',phone)#座机纯8位 或纯9位的号码
 58 |         n = re.findall(r'(86)?(1\d{10})',phone)#手机
 59 |         
 60 |         #优先向表格写入座机，因为座机具备一定的信息，大公司的号码网上页能搜得到
 61 |         have_phone = 0#标记号码是否已经输出到表格中
 62 |         if (m):
 63 |             have_phone = 1
 64 |             print(m[0])
 65 |             worksheet.write(i,2,label=m[0])
 66 |         if (o and (have_phone == 0) ):
 67 |             have_phone = 1
 68 |             print(o[0])
 69 |             worksheet.write(i,2,label=o[0])
 70 |         if (l and (have_phone == 0) ):
 71 |             have_phone = 1
 72 |             print(l[0])
 73 |             worksheet.write(i,2,label=l[0])
 74 |         if (n and (have_phone == 0) ):
 75 |             have_phone = 1
 76 |             print(n[0])
 77 |             worksheet.write(i,2,label=n[0])
 78 |     except Exception,e:#“电话：暂无联系方式” 这种形势虽然有
 79 |         worksheet.write(i,2,label='null')
 80 |         print('3'+str(Exception)+' '+str(e)+' '+repr(e)+' '+e.message)
 81 | 
 82 |     have_mail = 0#标记各个属性是否已经成功提取
 83 |     have_url  = 0
 84 |     have_addr = 0
 85 |     have_sales= 0
 86 | 
 87 | 
 88 |     #邮箱、网址这2项可能会有1-2项缺失，且仅根据网页标签无法区分，只能每次获取都进行三次匹配
 89 |     #分情况讨论：
 90 |     #1 假如邮箱和网址都缺失，只有两次获取成功，分别是地址、经营范围
 91 |     #2 假如仅邮箱缺失，3次分别获取到的是网址、地址、经营范围
 92 |     #3 假如仅网址缺失，3次分别获取到的是邮箱、地址、经营范围
 93 | 
 94 |     try:#第1次获取
 95 |         mail_or_url = driver.find_element_by_xpath("/html/body/div[3]/div[4]/ul/li[3]/span").text
 96 |         if(mail_or_url.find(u'邮箱')!=-1 ):
 97 |             have_mail=1
 98 |             mails = re.findall(r"[a-zA-Z0-9\.\-+_]+@[a-zA-Z0-9\.\-+_]+[\.]?[a-zA-Z]+", mail_or_url)#有些邮箱不规范，因此这里的'.'是可选项
 99 |             print(mails[0])
100 |             worksheet.write(i,3,label=mails[0])
101 |         else:
102 |             pass
103 |         if(mail_or_url.find(u'网址')!=-1):
104 |             if(have_mail==0):
105 |                 worksheet.write(i,3,label='null')#mail is null
106 |             have_url  = 1
107 |             url = driver.find_element_by_xpath('/html/body/div[3]/div[4]/ul/li[3]/span/a[1]').text
108 |             print(url)
109 |             worksheet.write(i,4,label=url)
110 |         else:
111 |             pass
112 |         if(mail_or_url.find(u'地址：')!=-1 ):
113 |             if(have_mail==0):
114 |                 worksheet.write(i,3,label='null')#mail is null
115 |             if(have_url==0):
116 |                 worksheet.write(i,4,label='null')#url is null
117 |             have_addr = 1
118 |             print(mail_or_url.replace('地址：',''))
119 |             worksheet.write(i,5,label=mail_or_url.replace('地址：',''))
120 |         else:
121 |             pass
122 |     except IndexError:
123 |             worksheet.write(j+1,3,label=(mail_or_url.replace('邮箱：','').replace('。','.')))#有些邮箱不规范,把点写作了句号
124 |     except Exception,e:
125 |         print('4'+str(Exception)+' '+str(e)+' '+repr(e)+' '+e.message)
126 | 
127 |     try:#第2次获取
128 |         url_or_addr = driver.find_element_by_xpath('/html/body/div[3]/div[4]/ul/li[4]/span').text
129 |         if(have_url  == 0):
130 |             if(url_or_addr.find(u'网址：')!=-1):     #/html/body/div[3]/div[4]/ul/li[4]/span/a[1]
131 |                 have_url  = 1
132 |                 url = driver.find_element_by_xpath('/html/body/div[3]/div[4]/ul/li[4]/span/a').text
133 |                 print(url)
134 |                 worksheet.write(i,4,label=url)
135 |         else:
136 |             pass
137 |         if(have_addr  == 0):
138 |             if(url_or_addr.find(u'地址：')!=-1 ):
139 |                 if(have_url==0):
140 |                     worksheet.write(i,4,label='null')#url is null
141 |                 have_addr = 1
142 |                 print(url_or_addr.replace('地址：',''))
143 |                 worksheet.write(i,5,label=url_or_addr.replace('地址：',''))
144 |         else:
145 |             pass 
146 |         if(url_or_addr.find(u'经营范围：')!=-1 ):
147 |             have_sales= 1
148 |             print(url_or_addr.replace('经营范围：','').replace('...',''))
149 |             worksheet.write(i,6,label=url_or_addr.replace('经营范围：','').replace('...',''))
150 |         else:
151 |             pass
152 |         
153 |     except Exception,e:
154 |         print('5'+str(Exception)+' '+str(e)+' '+repr(e)+' '+e.message)
155 | 
156 | 
157 |     try:#第3次获取
158 |         if(have_sales== 1):#如果前面已经出现过经营范围了，后面就没必要判断了。因为经营范围是最后一个项目
159 |             pass
160 |         else:
161 |             addr_or_sales = driver.find_element_by_xpath("/html/body/div[3]/div[4]/ul/li[5]/span").text
162 |            
163 |             if(addr_or_sales.find(u'地址：')!=-1 ):
164 |                 if(have_url==0):
165 |                     worksheet.write(i,4,label='null')#url is null
166 |                 print(addr_or_sales.replace('地址：',''))
167 |                 worksheet.write(i,5,label=addr_or_sales.replace('地址：',''))
168 |             if(addr_or_sales.find(u'经营范围：')!=-1 ):
169 |                 have_sales= 1
170 |                 print(addr_or_sales.replace('经营范围：','').replace('...',''))
171 |                 worksheet.write(i,6,label=addr_or_sales.replace('经营范围：','').replace('...',''))
172 |     except Exception,e:
173 |         print('6'+str(Exception)+' '+str(e)+' '+repr(e)+' '+e.message)
174 | 
175 |     try:
176 |         if(have_sales== 1):#如果前面已经出现过经营范围了，后面就没必要判断了。因为经营范围是最后一个项目
177 |             pass
178 |         else:
179 |             sales = driver.find_element_by_xpath("/html/body/div[3]/div[4]/ul/li[6]/span").text
180 |             if(sales.find(u'经营范围：')!=-1 ):
181 |                 print(sales.replace('经营范围：','').replace('...',''))
182 |                 worksheet.write(i,6,label=sales.replace('经营范围：','').replace('...',''))
183 |     except Exception,e:
184 |         print('7'+str(Exception)+' '+str(e)+' '+repr(e)+' '+e.message)
185 |     print(" ")
186 | 
187 | 
if __name__=='__main__':
    import io  # io.open reads text the same way on Python 2 and Python 3

    if sys.version_info[0] == 2:
        # Python 2 only: make implicit str<->unicode conversions use UTF-8.
        reload(sys)
        sys.setdefaultencoding('utf-8')

    # One company name per line. 'utf-8-sig' also accepts a file that starts
    # with a byte-order mark; blank lines are ignored.
    with io.open("inc.txt", encoding="utf-8-sig") as f:
        names = [line.strip() for line in f]

    workbook = xlwt.Workbook(encoding="utf-8")
    worksheet = workbook.add_sheet("my worksheet")
    headers = (u"序号", u"公司名称", u"电话", u"邮箱", u"网址", u"地址", u"经营范围")
    for col, title in enumerate(headers):
        worksheet.write(0, col, label=title)

    driver = webdriver.Chrome()
    i = 0
    try:
        for name in names:
            if not name:
                continue
            i = i + 1
            print(i)
            worksheet.write(i, 0, label=i)
            # The site writes parentheses in full-width form, so convert
            # half-width ones first; otherwise the exact-name match fails.
            search(driver, i, worksheet, name.replace(u')', u'）').replace(u'(', u'（'))
    finally:
        # Save whatever was scraped, even after a timeout or other error,
        # and always shut the browser down.
        try:
            workbook.save('excel.xls')
        finally:
            driver.quit()
217 | 
218 | #待添加的功能 timeout 的情况下，要workbook.save一下，把已经爬到的数据写到文件里
219 | 


--------------------------------------------------------------------------------
/根据指定网址提取信息02282320.py:
--------------------------------------------------------------------------------
  1 | #coding:utf-8
  2 | #按照指定网址获取信息
  3 | 
  4 | from selenium import webdriver
  5 | import time
  6 | import xlwt
  7 | import sys
  8 | import re
  9 | 
 10 | def search(driver,city,isdebug):
 11 | 
 12 |     workbook = xlwt.Workbook(encoding="utf-8")
 13 |     worksheet = workbook.add_sheet("my worksheet")
 14 |     worksheet.write(0,0,label=u"序号")
 15 |     worksheet.write(0,1,label=u"公司名称")
 16 |     worksheet.write(0,2,label=u"电话")
 17 |     worksheet.write(0,3,label=u"邮箱")
 18 |     worksheet.write(0,4,label=u"网址")
 19 |     worksheet.write(0,5,label=u"地址")
 20 |     worksheet.write(0,6,label=u"经营范围")
 21 | 
 22 | 
 23 |     url_list = []# 存放所有页面中所有公司的链接
 24 |     with open(str('incURL_'+city+'.txt'),'r') as f:
 25 |         for line in f:
 26 |             url_list.append(line)
 27 | 
 28 |     
 29 |     
 30 |     #对每个公司的链接，进入该网址，获取信息
 31 |     for j in range(len(url_list)):
 32 |         url = url_list[j]
 33 |         try:
 34 |             driver.get(url)
 35 |         except Exception,e:#“电话：暂无联系方式” 这种形势虽然有
 36 |             workbook.save(city+'.xls')
 37 |             print('3'+str(Exception)+' '+str(e)+' '+repr(e)+' '+e.message)
 38 |             print('order:'+str(j))
 39 |             print(url)
 40 |             return
 41 | 
 42 |         #
 43 |         company = driver.find_element_by_xpath("/html/body/div[3]/div[1]/strong").text
 44 |         if(isdebug==1):
 45 |             print(j)
 46 |             print(company)
 47 |         worksheet.write(j+1,0,label=str(j))
 48 |         worksheet.write(j+1,1,label=company)
 49 |         
 50 | 
 51 |         #正则表达式提取电话号码，电话号码有多种形式，因此下面用了4种表达式，目前尚未遇到例外
 52 |         try:
 53 |             phone = driver.find_element_by_xpath("/html/body/div[3]/div[4]/ul/li[2]/span/font/a").text
 54 |             o = re.findall(r'\d{3,4}[-]?\d{3}[-]?\d{4}',phone)#400-123-4567 或 400-1234567
 55 |             m = re.findall(r'\(?0\d{2,3}[) -]?\d{7,8}',phone)#座机
 56 |             l = re.findall(r'(\d{8,9})',phone)#座机纯8位 或纯9位的号码
 57 |             n = re.findall(r'(86)?(1\d{10})',phone)#手机
 58 |             
 59 |             #优先向表格写入座机，因为座机具备一定的信息，大公司的号码网上页能搜得到
 60 |             have_phone = 0#标记号码是否已经输出到表格中
 61 |             if (m):
 62 |                 have_phone = 1
 63 |                 if(isdebug==1):
 64 |                     print(m[0])
 65 |                 worksheet.write(j+1,2,label=m[0])
 66 |             if (o and (have_phone == 0) ):
 67 |                 have_phone = 1
 68 |                 if(isdebug==1):
 69 |                     print(o[0])
 70 |                 worksheet.write(j+1,2,label=o[0])
 71 |             if (l and (have_phone == 0) ):
 72 |                 have_phone = 1
 73 |                 if(isdebug==1):
 74 |                     print(l[0])
 75 |                 worksheet.write(j+1,2,label=l[0])
 76 |             if (n and (have_phone == 0) ):
 77 |                 have_phone = 1
 78 |                 if(isdebug==1):
 79 |                     print(n[0])
 80 |                 worksheet.write(j+1,2,label=n[0])
 81 |         except Exception,e:#“电话：暂无联系方式” 
 82 |             worksheet.write(j+1,2,label='null')
 83 |             print('4'+str(Exception)+' '+str(e)+' '+repr(e)+' '+e.message)
 84 |             print('order:'+str(j))
 85 |             print(url)
 86 |             workbook.save(city+'.xls')
 87 |             return
 88 | 
 89 |         
 90 |         have_mail = 0#标记各个属性是否已经成功提取
 91 |         have_url  = 0
 92 |         have_addr = 0
 93 |         have_sales= 0
 94 | 
 95 | 
 96 |         #邮箱、网址这2项可能会有1-2项缺失，且仅根据网页标签无法区分，只能每次获取都进行三次匹配
 97 |         #分情况讨论：
 98 |         #1 假如邮箱和网址都缺失，只有两次获取成功，分别是地址、经营范围
 99 |         #2 假如仅邮箱缺失，3次分别获取到的是网址、地址、经营范围
100 |         #3 假如仅网址缺失，3次分别获取到的是邮箱、地址、经营范围
101 | 
102 |         try:#第1次获取
103 |             mail_or_url = driver.find_element_by_xpath("/html/body/div[3]/div[4]/ul/li[3]/span").text
104 |             if(mail_or_url.find(u'邮箱')!=-1 ):
105 |                 have_mail=1
106 |                 
107 |                 mails = re.findall(r"[a-zA-Z0-9\.\-+_]+@[a-zA-Z0-9\.\-+_]+[\.]?[a-zA-Z]+", mail_or_url)
108 |                 
109 |                 if(isdebug==1):
110 |                     print(mails[0])
111 |                 
112 |                 worksheet.write(j+1,3,label=mails[0])
113 |                 
114 |             else:
115 |                 pass
116 |             if(mail_or_url.find(u'网址')!=-1):
117 |                 if(have_mail==0):
118 |                     worksheet.write(j+1,3,label='null')#mail is null
119 |                 have_url  = 1
120 |                 url = driver.find_element_by_xpath('/html/body/div[3]/div[4]/ul/li[3]/span/a[1]').text
121 |                 if(isdebug==1):
122 |                     print(url)
123 |                 worksheet.write(j+1,4,label=url)
124 |             else:
125 |                 pass
126 |             if(mail_or_url.find(u'地址：')!=-1 ):
127 |                 if(have_mail==0):
128 |                     worksheet.write(j+1,3,label='null')#mail is null
129 |                 if(have_url==0):
130 |                     worksheet.write(j+1,4,label='null')#url is null
131 |                 have_addr = 1
132 |                 if(isdebug==1):
133 |                     print(mail_or_url.replace('地址：',''))
134 |                 worksheet.write(j+1,5,label=mail_or_url.replace('地址：',''))
135 |             else:
136 |                 pass
137 |         except IndexError:
138 |             worksheet.write(j+1,3,label=(mail_or_url.replace('邮箱：','').replace('。','.')))
139 |         except Exception,e:
140 |             print('5'+str(Exception)+' '+str(e)+' '+repr(e)+' '+e.message)
141 |             print('order:'+str(j))
142 |             print(url)
143 |             workbook.save(city+'.xls')
144 |             return
145 | 
146 |         try:#第2次获取
147 |             url_or_addr = driver.find_element_by_xpath('/html/body/div[3]/div[4]/ul/li[4]/span').text
148 |             if(have_url  == 0):
149 |                 if(url_or_addr.find(u'网址：')!=-1):     #/html/body/div[3]/div[4]/ul/li[4]/span/a[1]
150 |                     have_url  = 1
151 |                     url = driver.find_element_by_xpath('/html/body/div[3]/div[4]/ul/li[4]/span/a').text
152 |                     if(isdebug==1):
153 |                         print(url)
154 |                     worksheet.write(j+1,4,label=url)
155 |             else:
156 |                 pass
157 |             if(have_addr  == 0):
158 |                 if(url_or_addr.find(u'地址：')!=-1 ):
159 |                     if(have_url==0):
160 |                         worksheet.write(j+1,4,label='null')#url is null
161 |                     have_addr = 1
162 |                     if(isdebug==1):
163 |                         print(url_or_addr.replace('地址：',''))
164 |                     worksheet.write(j,5,label=url_or_addr.replace('地址：',''))
165 |             else:
166 |                 pass 
167 |             if(url_or_addr.find(u'经营范围：')!=-1 ):
168 |                 have_sales= 1
169 |                 if(isdebug==1):
170 |                     print(url_or_addr.replace('经营范围：','').replace('...',''))
171 |                 worksheet.write(j+1,6,label=url_or_addr.replace('经营范围：','').replace('...',''))
172 |             else:
173 |                 pass
174 |             
175 |         except Exception,e:
176 |             print('6'+str(Exception)+' '+str(e)+' '+repr(e)+' '+e.message)
177 |             print('order:'+str(j))
178 |             print(url)
179 |             workbook.save(city+'.xls')
180 |             return
181 | 
182 | 
183 |         try:#第3次获取
184 |             if(have_sales== 1):#如果前面已经出现过经营范围了，后面就没必要判断了。因为经营范围是最后一个项目
185 |                 pass
186 |             else:
187 |                 addr_or_sales = driver.find_element_by_xpath("/html/body/div[3]/div[4]/ul/li[5]/span").text
188 |             
189 |                 if(addr_or_sales.find(u'地址：')!=-1 ):
190 |                     if(have_url==0):
191 |                         worksheet.write(j+1,4,label='null')#url is null
192 |                     if(isdebug==1):
193 |                         print(addr_or_sales.replace('地址：',''))
194 |                     worksheet.write(j+1,5,label=addr_or_sales.replace('地址：',''))
195 |                 if(addr_or_sales.find(u'经营范围：')!=-1 ):
196 |                     have_sales= 1
197 |                     if(isdebug==1):
198 |                         print(addr_or_sales.replace('经营范围：','').replace('...',''))
199 |                     worksheet.write(j+1,6,label=addr_or_sales.replace('经营范围：','').replace('...',''))
200 |         except Exception,e:
201 |             print('7'+str(Exception)+' '+str(e)+' '+repr(e)+' '+e.message)
202 |             print('order:'+str(j))
203 |             print(url)
204 |             workbook.save(city+'.xls')
205 |             return
206 | 
207 |         try:
208 |             if(have_sales== 1):#如果前面已经出现过经营范围了，后面就没必要判断了。因为经营范围是最后一个项目
209 |                 pass
210 |             else:
211 |                 sales = driver.find_element_by_xpath("/html/body/div[3]/div[4]/ul/li[6]/span").text
212 |                 if(sales.find(u'经营范围：')!=-1 ):
213 |                     if(isdebug==1):
214 |                         print(sales.replace('经营范围：','').replace('...',''))
215 |                     worksheet.write(j+1,6,label=sales.replace('经营范围：','').replace('...',''))
216 |         except Exception,e:
217 |             print('8'+str(Exception)+' '+str(e)+' '+repr(e)+' '+e.message)
218 |             print('order:'+str(j))
219 |             print(url)
220 |             workbook.save(city+'.xls')
221 |             return
222 |         if(isdebug==1):        
223 |             print(" ")
224 |     workbook.save(city+'.xls')
225 |     print(city+' is done.')
226 |     
227 | 
228 | if __name__=='__main__':
229 |     reload(sys) 
230 |     sys.setdefaultencoding('utf-8')
231 |     driver = webdriver.Chrome()
232 |     isdebug = 1
233 | 
234 |     city =['hangzhou']#,'shenzhen','hangzhou']
235 |     for i in range(len(city)):
236 |         search(driver,city[i],isdebug)
237 |     
238 |     time.sleep(5)
239 |     driver.close()
240 |     
241 | #上海 bug网址 http://m.54114.cn/hangye90/8f3678a2d1.html
242 | 
243 | #写一个函数，读文件，文件里每一行都是一个公司信息页面。如果中断，还可以通过这个方式继续进行下去。
244 | 
245 | #TimeoutException() 超时类型的错误可以考虑加一个断点重启的功能
246 | 
247 | 
248 | #excel添加 城市 字段，如 xx市
249 | 
250 | 


--------------------------------------------------------------------------------