# --- spider-cityAllCompany.py: crawl every industry category of a city -------
# Runs on Python 2.7 and Python 3; relies on the file-level imports
# (webdriver, Options, time, xlwt, sys, re).

_SITE_ROOT = "http://m.54114.cn/"  # the only place the target site is named
_LINKS_PER_PAGE = 20  # an index page lists at most this many companies
_SHEET_HEADERS = (u"序号", u"公司名称", u"电话", u"邮箱", u"网址", u"地址", u"经营范围")
_FIELD_KEYS = ('mail', 'url', 'addr', 'sales')  # sheet columns 3..6, in order

_PAGER_XPATH = "/html/body/div[4]"
_LIST_LINK_XPATH = "/html/body/div[3]/div[3]/ul/li[%d]/a"
_COMPANY_XPATH = "/html/body/div[3]/div[1]/strong"
_PHONE_XPATH = "/html/body/div[3]/div[4]/ul/li[2]/span/font/a"
_DETAIL_ITEM_XPATH = "/html/body/div[3]/div[4]/ul/li[%d]/span"

# Tried in order; the first pattern that matches supplies the phone number.
_PHONE_RES = (
    re.compile(r'\(?0\d{2,3}[) -]?\d{7,8}'),   # landline with area code (preferred)
    re.compile(r'\d{3,4}[-]?\d{3}[-]?\d{4}'),   # 400-123-4567 or 400-1234567
    re.compile(r'\d{8,9}'),                     # bare 8- or 9-digit landline
    re.compile(r'(?:86)?1\d{10}'),              # mobile, optional country code
)
# '@' is optional on purpose: some listed addresses are malformed.
_MAIL_RE = re.compile(r"[a-zA-Z0-9\.\-+_]+@?[a-zA-Z0-9\.\-+_]+[\.]?[a-zA-Z]+")
_LABELS = ((u'邮箱', 'mail'), (u'网址', 'url'), (u'地址', 'addr'), (u'经营范围', 'sales'))
_COLONS = (u':', u'\uff1a')  # ASCII and full-width colon after a label


def _first(driver, xpath):
    """Return the first element matching *xpath*, or None when there is none."""
    # find_elements(by, value) exists in every Selenium release, unlike the
    # find_element_by_* helpers, and needs no exception handling for "absent".
    found = driver.find_elements("xpath", xpath)
    return found[0] if found else None


def _text_at(driver, xpath):
    """Return the text of the first element matching *xpath*, or None."""
    element = _first(driver, xpath)
    return None if element is None else element.text


def _report(tag, exc, *context):
    """Print a tagged error line followed by optional context lines."""
    print(tag + str(Exception) + ' ' + str(exc) + ' ' + repr(exc))
    for line in context:
        print(line)


def _page_count(pager_text):
    """Parse the number of index pages from the pager text, or return None."""
    if pager_text is None:
        return None
    token = pager_text
    try:
        token = pager_text.split(' ')[11]
    except IndexError:
        pass  # short pager: fall through and try the raw text
    else:
        for junk in (u'下', u'一', u'页', u'>'):
            token = token.replace(junk, u'')
    try:
        return int(token)
    except ValueError:
        # NOTE(review): categories whose pager is not numeric (no companies,
        # possibly also single-page ones) are skipped - confirm on the site.
        return None


def _extract_phone(text):
    """Return the first phone number recognised in *text*, or None."""
    for pattern in _PHONE_RES:
        match = pattern.search(text)
        if match:
            return match.group(0)
    return None


def _read_phone(driver):
    """Return the phone number on the current detail page, or None."""
    raw = _text_at(driver, _PHONE_XPATH)
    return None if raw is None else _extract_phone(raw)


def _split_label(text):
    """Split a detail line into (field key, value); key is None if unlabelled."""
    for label, key in _LABELS:
        if text.startswith(label):
            value = text[len(label):]
            if value.startswith(_COLONS):
                value = value[1:]
            return key, value
    return None, text


def _read_detail_fields(driver):
    """Collect mail / url / addr / sales from list items 3..6 of a detail page.

    Mail and website are optional on the site, so the position of each field
    varies; every item is therefore classified by its leading label. Scanning
    stops at the business scope, which is always the last item, or at the
    first missing item. Only the first value seen for a field is kept.
    """
    fields = {}
    for position in range(3, 7):
        xpath = _DETAIL_ITEM_XPATH % position
        text = _text_at(driver, xpath)
        if text is None:
            break
        key, value = _split_label(text)
        if key == 'mail':
            hits = _MAIL_RE.findall(text)
            # Some addresses use the CJK full stop instead of a dot.
            value = hits[0] if hits else value.replace(u'。', u'.')
        elif key == 'url':
            link_text = _text_at(driver, xpath + '/a[1]')
            if link_text is not None:
                value = link_text
        elif key == 'sales':
            value = value.replace(u'...', u'')
        if key is not None:
            fields.setdefault(key, value)
        if key == 'sales':
            break
    return fields


def search(driver, city, isdebug, web_order):
    """Scrape one industry category of one city into ``<city>_<web_order>.xls``.

    Args:
        driver: Selenium WebDriver used for all page loads.
        city: city path segment on the site, e.g. ``'hangzhou'``.
        isdebug: 1 to print scraped values and dump the company URLs to
            ``incURL_<city>.txt``; any other value stays quiet.
        web_order: category path segment, e.g. ``'hangye3'``.

    Returns:
        None. On a page-load or scraping error the rows gathered so far are
        saved and the function returns early after printing the error.
    """
    listing = _SITE_ROOT + city + '/' + web_order
    try:
        driver.get(listing + '/')
    except Exception as e:
        _report('0', e)
        return

    page_num = _page_count(_text_at(driver, _PAGER_XPATH))
    if page_num is None:
        return

    # Gather the detail-page link of every company on every index page.
    url_list = []
    for page in range(1, page_num + 1):
        current_page = listing + "_p" + str(page) + "/"
        try:
            driver.get(current_page)
            for slot in range(1, _LINKS_PER_PAGE + 1):
                anchor = _first(driver, _LIST_LINK_XPATH % slot)
                if anchor is None:
                    break  # the last page may hold fewer companies
                href = anchor.get_attribute("href")
                if href:
                    url_list.append(href)
        except Exception as e:
            _report('1', e, current_page, 'order:' + str(page))
            return

    if isdebug == 1:
        # Keep the full URL list so an interrupted crawl can be resumed.
        with open(str("incURL_" + city + ".txt"), "w+") as fp:
            for link in url_list:
                fp.write(str(link) + '\n')

    workbook = xlwt.Workbook(encoding="utf-8")
    worksheet = workbook.add_sheet("my worksheet")
    for col, title in enumerate(_SHEET_HEADERS):
        worksheet.write(0, col, label=title)

    out_path = city + '_' + web_order + '.xls'
    for row, url in enumerate(url_list, 1):
        try:
            driver.get(url)
        except Exception as e:
            workbook.save(out_path)
            _report('3', e, 'order:' + str(row - 1), url)
            return
        try:
            company = _text_at(driver, _COMPANY_XPATH)
            phone = _read_phone(driver)
            fields = _read_detail_fields(driver)
        except Exception as e:
            workbook.save(out_path)
            _report('4', e, 'order:' + str(row - 1), url)
            return

        # Each cell is written exactly once: xlwt raises on an overwrite.
        record = [str(row), company or 'null', phone or 'null']
        record.extend(fields.get(key, 'null') for key in _FIELD_KEYS)
        for col, value in enumerate(record):
            worksheet.write(row, col, label=value)
        if isdebug == 1:
            for value in record:
                print(value)
            print(" ")

    workbook.save(out_path)
    print(city + ' ' + web_order + ' is done.')


if __name__ == '__main__':
    if sys.version_info[0] == 2:
        # Python 2 only: make implicit str/unicode conversions use UTF-8.
        reload(sys)
        sys.setdefaultencoding('utf-8')

    isdebug = 0  # 1 prints scraped values and shows the browser window

    if isdebug == 0:  # headless by default
        chrome_options = Options()
        chrome_options.add_argument('--headless')
        chrome_options.add_argument('--disable-gpu')
        driver = webdriver.Chrome(options=chrome_options)
    else:
        driver = webdriver.Chrome()

    city = ['beijing', 'shanghai', 'guangzhou', 'shenzhen', 'hangzhou']

    time_start = time.time()
    try:
        for city_name in city:
            for j in range(1, 21):
                search(driver, city_name, isdebug, 'hangye' + str(j))
    finally:
        driver.quit()  # always release the browser, even after a crash
    time_end = time.time()
    print('time cost: ' + str(time_end - time_start) + ' s')

# Ideas for later:
# - resume after a TimeoutException by re-reading the saved URL list
# - add a "city" column, extracted from the address (xx省xx市 / xx市)
# --- 手工绕开企查查的登录验证.py ----------------------------------------------
# Log in to qichacha.com (the captcha is solved by hand during the pause),
# then search a list of company names and print the first hit of each.
# Relies on the file-level imports (webdriver, NoSuchElementException, time).

# Present a regular desktop-browser user agent.
option = webdriver.ChromeOptions()
option.add_argument('--user-agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36"')
driver = webdriver.Chrome(options=option)

_FIRST_HIT = '//*[@id="search-result"]/tr[1]/td[3]'
_OFFICIAL_SITE = '//*[@id="company-top"]/div[2]/div[2]/div[3]/div[1]/span[3]/a'


def _find(xpath):
    """Locate one element by XPath (works on old and new Selenium releases)."""
    return driver.find_element("xpath", xpath)


try:
    # Open the login page and switch to the username/password tab.
    driver.get('https://www.qichacha.com/user_login')
    _find('//*[@id="normalLogin"]').click()
    driver.find_element("id", 'nameNormal').send_keys('username')
    driver.find_element("id", 'pwdNormal').send_keys('password')
    time.sleep(10)  # pause so the captcha can be completed manually
    _find('//*[@id="user_login_normal"]/button').click()

    # Unicode literals: send_keys needs text, and this avoids the
    # Python-2-only ``.decode('utf-8')`` call.
    inc_list = [u'阿里巴巴', u'腾讯', u'今日头条', u'滴滴', u'美团']

    for i, txt in enumerate(inc_list):
        time.sleep(1)

        # The landing page and the result pages use different search widgets.
        if i == 0:
            box_id = 'searchkey'
            button_xpath = '//*[@id="V3_Search_bt"]'
        else:
            box_id = 'headerKey'
            button_xpath = '/html/body/header/div/form/div/div/span/button'
        driver.find_element("id", box_id).send_keys(txt)
        _find(button_xpath).click()

        # Print the first hit: full name, capital, founding date,
        # mail/phone line and address.
        print(i + 1)
        for suffix in ('/a', '/p[1]/span[1]', '/p[1]/span[2]', '/p[2]', '/p[3]'):
            print(_find(_FIRST_HIT + suffix).text)
        try:
            # Optional fourth paragraph (stock code or other remarks).
            print(_find(_FIRST_HIT + '/p[4]').text)
        except NoSuchElementException:
            pass

        # Follow the hit to its detail page and read the official website.
        inner = _find(_FIRST_HIT + '/a').get_attribute("href")
        driver.get(inner)
        inc_web = _find(_OFFICIAL_SITE).get_attribute("href")
        print(u"官网:" + (inc_web or u''))
        print(' ')
finally:
    driver.quit()  # always release the browser, even after an error

# Known pitfall (Python 2): passing a byte string with Chinese characters to
# send_keys raises UnicodeDecodeError; always pass unicode text.
# --- 指定公司全称获取信息: look up companies by their exact full name ---------
# Runs on Python 2.7 and Python 3; relies on the file-level imports
# (webdriver, xlwt, sys, re).

_SHEET_HEADERS = (u"序号", u"公司名称", u"电话", u"邮箱", u"网址", u"地址", u"经营范围")
_FIELD_KEYS = ('mail', 'url', 'addr', 'sales')  # sheet columns 3..6, in order

_RESULT_LINK_XPATH = "/html/body/div[3]/div[3]/ul/li[1]/a"
_PHONE_XPATH = "/html/body/div[3]/div[4]/ul/li[2]/span/font/a"
_DETAIL_ITEM_XPATH = "/html/body/div[3]/div[4]/ul/li[%d]/span"

# Tried in order; the first pattern that matches supplies the phone number.
_PHONE_RES = (
    re.compile(r'\(?0\d{2,3}[) -]?\d{7,8}'),   # landline with area code (preferred)
    re.compile(r'\d{3,4}[-]?\d{3}[-]?\d{4}'),   # 400-123-4567 or 400-1234567
    re.compile(r'\d{8,9}'),                     # bare 8- or 9-digit landline
    re.compile(r'(?:86)?1\d{10}'),              # mobile, optional country code
)
# The dot before the TLD is optional: some listed addresses are malformed.
_MAIL_RE = re.compile(r"[a-zA-Z0-9\.\-+_]+@[a-zA-Z0-9\.\-+_]+[\.]?[a-zA-Z]+")
_LABELS = ((u'邮箱', 'mail'), (u'网址', 'url'), (u'地址', 'addr'), (u'经营范围', 'sales'))
_COLONS = (u':', u'\uff1a')  # ASCII and full-width colon after a label


def _first(driver, xpath):
    """Return the first element matching *xpath*, or None when there is none."""
    found = driver.find_elements("xpath", xpath)
    return found[0] if found else None


def _text_at(driver, xpath):
    """Return the text of the first element matching *xpath*, or None."""
    element = _first(driver, xpath)
    return None if element is None else element.text


def _report(tag, exc):
    """Print a tagged error line."""
    print(tag + str(Exception) + ' ' + str(exc) + ' ' + repr(exc))


def _read_phone(driver):
    """Return the phone number on the current detail page, or None."""
    raw = _text_at(driver, _PHONE_XPATH)
    if raw is None:
        return None
    for pattern in _PHONE_RES:
        match = pattern.search(raw)
        if match:
            return match.group(0)
    return None


def _split_label(text):
    """Split a detail line into (field key, value); key is None if unlabelled."""
    # Labels must start the line: a business scope may mention an address.
    for label, key in _LABELS:
        if text.startswith(label):
            value = text[len(label):]
            if value.startswith(_COLONS):
                value = value[1:]
            return key, value
    return None, text


def _read_detail_fields(driver):
    """Collect mail / url / addr / sales from list items 3..6 of a detail page.

    Mail and website are optional on the site, so each item is classified by
    its leading label. Scanning stops at the business scope (always the last
    item) or at the first missing item.
    """
    fields = {}
    for position in range(3, 7):
        xpath = _DETAIL_ITEM_XPATH % position
        text = _text_at(driver, xpath)
        if text is None:
            break
        key, value = _split_label(text)
        if key == 'mail':
            hits = _MAIL_RE.findall(text)
            # Some addresses use the CJK full stop instead of a dot.
            value = hits[0] if hits else value.replace(u'。', u'.')
        elif key == 'url':
            link_text = _text_at(driver, xpath + '/a[1]')
            if link_text is not None:
                value = link_text
        elif key == 'sales':
            value = value.replace(u'...', u'')
        if key is not None:
            fields.setdefault(key, value)
        if key == 'sales':
            break
    return fields


def _write_not_found(worksheet, row, name):
    """Fill *row* with the requested name and 'null' in every data column."""
    worksheet.write(row, 1, label=name)
    for col in range(2, 7):
        worksheet.write(row, col, label='null')
    print(name)
    print('未检索到该公司的信息\n')


def search(driver, i, worksheet, mytxt):
    """Search the site for company *mytxt* and write its details to row *i*.

    Args:
        driver: Selenium WebDriver used for all page loads.
        i: worksheet row to fill; column 0 is written by the caller.
        worksheet: xlwt worksheet receiving columns 1..6.
        mytxt: exact full company name (unicode text).

    Returns:
        None. When the first search hit is missing or its name differs from
        *mytxt*, the row is filled with 'null'. Fields that cannot be read
        from the detail page are written as 'null' as well.
    """
    driver.get("http://m.54114.cn/hangzhou/")
    driver.find_element("xpath", "/html/body/div[2]/div/form/div/div/input[2]").send_keys(mytxt)
    driver.find_element("xpath", "//*[@id=\"qixc\"]").click()

    incname = None
    realweb = None
    try:
        hit = _first(driver, _RESULT_LINK_XPATH)
        if hit is not None:
            incname = hit.text
            realweb = hit.get_attribute("href")
    except Exception as e:
        _report('1', e)
    if incname != mytxt:
        # No result, or the site does not list this exact company.
        _write_not_found(worksheet, i, mytxt)
        return

    print(incname)
    worksheet.write(i, 1, label=incname)

    try:
        driver.get(realweb)
    except Exception as e:
        _report('2', e)
        return

    # Best effort: keep whatever was read before a failure.
    phone = None
    fields = {}
    try:
        phone = _read_phone(driver)
        fields = _read_detail_fields(driver)
    except Exception as e:
        _report('3', e)

    # Each cell is written exactly once: xlwt raises on an overwrite.
    values = [phone or 'null']
    values.extend(fields.get(key, 'null') for key in _FIELD_KEYS)
    for col, value in enumerate(values, 2):
        print(value)
        worksheet.write(i, col, label=value)
    print(" ")


if __name__ == '__main__':
    import io

    if sys.version_info[0] == 2:
        # Python 2 only: make implicit str/unicode conversions use UTF-8.
        reload(sys)
        sys.setdefaultencoding('utf-8')

    workbook = xlwt.Workbook(encoding="utf-8")
    worksheet = workbook.add_sheet("my worksheet")
    for col, title in enumerate(_SHEET_HEADERS):
        worksheet.write(0, col, label=title)

    driver = webdriver.Chrome()
    try:
        # inc.txt holds one full company name per line.
        with io.open("inc.txt", encoding="utf-8") as f:
            i = 0
            for line in f:
                name = line.strip()
                if not name:
                    continue
                # The site uses full-width parentheses in company names, so
                # half-width ones are converted before searching.
                name = name.replace(u')', u'\uff09').replace(u'(', u'\uff08')
                i += 1
                print(i)
                worksheet.write(i, 0, label=i)
                search(driver, i, worksheet, name)
    finally:
        # Save what was gathered even when a page load times out.
        driver.quit()
        workbook.save('excel.xls')
# --- 根据指定网址提取信息: scrape the company pages listed in a URL file ------
# Runs on Python 2.7 and Python 3; relies on the file-level imports
# (webdriver, time, xlwt, sys, re).

_SHEET_HEADERS = (u"序号", u"公司名称", u"电话", u"邮箱", u"网址", u"地址", u"经营范围")
_FIELD_KEYS = ('mail', 'url', 'addr', 'sales')  # sheet columns 3..6, in order

_COMPANY_XPATH = "/html/body/div[3]/div[1]/strong"
_PHONE_XPATH = "/html/body/div[3]/div[4]/ul/li[2]/span/font/a"
_DETAIL_ITEM_XPATH = "/html/body/div[3]/div[4]/ul/li[%d]/span"

# Tried in order; the first pattern that matches supplies the phone number.
_PHONE_RES = (
    re.compile(r'\(?0\d{2,3}[) -]?\d{7,8}'),   # landline with area code (preferred)
    re.compile(r'\d{3,4}[-]?\d{3}[-]?\d{4}'),   # 400-123-4567 or 400-1234567
    re.compile(r'\d{8,9}'),                     # bare 8- or 9-digit landline
    re.compile(r'(?:86)?1\d{10}'),              # mobile, optional country code
)
# The dot before the TLD is optional: some listed addresses are malformed.
_MAIL_RE = re.compile(r"[a-zA-Z0-9\.\-+_]+@[a-zA-Z0-9\.\-+_]+[\.]?[a-zA-Z]+")
_LABELS = ((u'邮箱', 'mail'), (u'网址', 'url'), (u'地址', 'addr'), (u'经营范围', 'sales'))
_COLONS = (u':', u'\uff1a')  # ASCII and full-width colon after a label


def _text_at(driver, xpath):
    """Return the text of the first element matching *xpath*, or None."""
    found = driver.find_elements("xpath", xpath)
    return found[0].text if found else None


def _report(tag, exc, *context):
    """Print a tagged error line followed by optional context lines."""
    print(tag + str(Exception) + ' ' + str(exc) + ' ' + repr(exc))
    for line in context:
        print(line)


def _read_phone(driver):
    """Return the phone number on the current detail page, or None."""
    raw = _text_at(driver, _PHONE_XPATH)
    if raw is None:
        return None  # the page shows "no contact information"
    for pattern in _PHONE_RES:
        match = pattern.search(raw)
        if match:
            return match.group(0)
    return None


def _split_label(text):
    """Split a detail line into (field key, value); key is None if unlabelled."""
    # Labels must start the line: a business scope may mention an address.
    for label, key in _LABELS:
        if text.startswith(label):
            value = text[len(label):]
            if value.startswith(_COLONS):
                value = value[1:]
            return key, value
    return None, text


def _read_detail_fields(driver):
    """Collect mail / url / addr / sales from list items 3..6 of a detail page.

    Mail and website are optional on the site, so each item is classified by
    its leading label. Scanning stops at the business scope (always the last
    item) or at the first missing item.
    """
    fields = {}
    for position in range(3, 7):
        xpath = _DETAIL_ITEM_XPATH % position
        text = _text_at(driver, xpath)
        if text is None:
            break
        key, value = _split_label(text)
        if key == 'mail':
            hits = _MAIL_RE.findall(text)
            # Some addresses use the CJK full stop instead of a dot.
            value = hits[0] if hits else value.replace(u'。', u'.')
        elif key == 'url':
            link_text = _text_at(driver, xpath + '/a[1]')
            if link_text is not None:
                value = link_text
        elif key == 'sales':
            value = value.replace(u'...', u'')
        if key is not None:
            fields.setdefault(key, value)
        if key == 'sales':
            break
    return fields


def search(driver, city, isdebug):
    """Scrape every URL in ``incURL_<city>.txt`` into ``<city>.xls``.

    Args:
        driver: Selenium WebDriver used for all page loads.
        city: name used for both the input URL list and the output workbook.
        isdebug: 1 to print every scraped value; any other value stays quiet.

    Returns:
        None. On a page-load or scraping error the rows gathered so far are
        saved and the function returns early after printing the error.
    """
    workbook = xlwt.Workbook(encoding="utf-8")
    worksheet = workbook.add_sheet("my worksheet")
    for col, title in enumerate(_SHEET_HEADERS):
        worksheet.write(0, col, label=title)

    # One company detail-page URL per line; drop newlines and blank lines.
    with open(str('incURL_' + city + '.txt'), 'r') as f:
        url_list = [line.strip() for line in f if line.strip()]

    out_path = city + '.xls'
    for row, url in enumerate(url_list, 1):
        try:
            driver.get(url)
        except Exception as e:
            workbook.save(out_path)
            _report('3', e, 'order:' + str(row - 1), url)
            return
        try:
            company = _text_at(driver, _COMPANY_XPATH)
            phone = _read_phone(driver)
            fields = _read_detail_fields(driver)
        except Exception as e:
            workbook.save(out_path)
            _report('4', e, 'order:' + str(row - 1), url)
            return

        # Every cell of the row is written exactly once, all on the same row:
        # xlwt raises on an overwrite.
        record = [str(row), company or 'null', phone or 'null']
        record.extend(fields.get(key, 'null') for key in _FIELD_KEYS)
        for col, value in enumerate(record):
            worksheet.write(row, col, label=value)
        if isdebug == 1:
            for value in record:
                print(value)
            print(" ")

    workbook.save(out_path)
    print(city + ' is done.')


if __name__ == '__main__':
    if sys.version_info[0] == 2:
        # Python 2 only: make implicit str/unicode conversions use UTF-8.
        reload(sys)
        sys.setdefaultencoding('utf-8')
    driver = webdriver.Chrome()
    isdebug = 1

    city = ['hangzhou']  # e.g. also 'shenzhen'
    try:
        for city_name in city:
            search(driver, city_name, isdebug)
        time.sleep(5)  # leave the last page visible briefly before closing
    finally:
        driver.quit()  # always release the browser, even after a crash

# Known problem page (Shanghai): http://m.54114.cn/hangye90/8f3678a2d1.html
# Ideas for later:
# - resume after a TimeoutException instead of stopping
# - add a "city" column to the sheet