# -*- coding: utf-8 -*-
"""Tianyancha (www.tianyancha.com) company-search scraper.

Downloads search-result pages for a keyword, extracts per-company fields
(name, status, legal representative, registered capital, founding date,
phone, e-mail) and writes them to an ``.xls`` workbook. According to the
project README this includes the phone / e-mail fields that the site
otherwise reserves for VIP accounts.

Usage notes (from the project README):

* Before running, paste your own logged-in cookie into ``COOKIE`` below.
* Nothing scraped? Pass the site's captcha manually in a browser and retry.
* Only empty data scraped? Search the keyword manually on the site to check
  that it returns any results at all.
* If this is useful, the author asks you to follow / bookmark the project.
  Author's blog: blog.lrvin.com
"""
import sys  # noqa: F401 -- kept from the original import block
import time
import urllib.parse

import lxml  # noqa: F401 -- parser backend requested via BeautifulSoup(..., 'lxml')
import requests
import xlwt
from bs4 import BeautifulSoup

SEARCH_URL = 'https://www.tianyancha.com/search?key={}'
SEARCH_PAGE_URL = 'https://www.tianyancha.com/search/p{}?key={}'

# Seconds to wait for the server; without a timeout requests can block forever.
REQUEST_TIMEOUT = 15

# Placeholder -- replace with the Cookie header of a logged-in browser session.
COOKIE = r'cookiecookiecookiecookiecookiecookiecookiecookiecookiecookiecookiecookiecookiecookiecookiecookiecookiecookiecookiecookiecookiecookiecookiecookiecookiecookiecookiecookiecookiecookiecookiecookiecookiecookiecookiecookiecookiecookiecookiecookie'

# Result columns. craw() appends one entry to each list per scraped company,
# so index i across all seven lists describes the same company.
g_name_list = []
g_state_list = []
r_name_list = []
g_money_list = []
g_date_list = []
r_phone_list = []
r_email_list = []

_COLUMNS = (
    g_name_list,
    g_state_list,
    r_name_list,
    g_money_list,
    g_date_list,
    r_phone_list,
    r_email_list,
)

# Exceptions raised when an expected element is missing from the parsed page:
# IndexError from ``select(...)[i]`` and AttributeError from ``None.select``.
_MISSING_ELEMENT = (IndexError, AttributeError)


def _referer_for_page(x, key_word):
    """Return the Referer for page ``x``: the previous page, or the plain
    search URL when there is no previous page."""
    if x <= 1:
        return SEARCH_URL.format(key_word)
    return SEARCH_PAGE_URL.format(x - 1, key_word)


def _build_headers(referer):
    """Return the browser-like request headers sent with every page fetch."""
    return {
        'Host': 'www.tianyancha.com',
        'Connection': 'keep-alive',
        'Cache-Control': 'max-age=0',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': r'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Referer': referer,
        # NOTE(review): 'br' is advertised here; confirm the installed
        # requests/urllib3 can decode brotli responses.
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Cookie': COOKIE,
    }


def _strip_label(text, label):
    """Remove a leading field label such as '注册资本:' from ``text``.

    ``str.strip(label)`` would treat the label as a *set of characters* and
    could also eat matching characters belonging to the value itself.
    """
    text = text.strip()
    if text.startswith(label):
        text = text[len(label):]
    return text.strip()


def _contact_field(item, index, fallback_selector, fallback_index, missing):
    """Return the ``index``-th contact value (0 = phone, 1 = e-mail) of a result.

    The embedded ``<script>`` text is tried first (its surrounding square
    brackets are removed), then the visible ``<span>``; ``missing`` is
    returned when neither is present.
    """
    try:
        script = item.select('.content .contact.row div script')[index]
        return script.text.strip('[').strip(']')
    except _MISSING_ELEMENT:
        pass
    try:
        cell = item.select('.content .contact.row div')[index]
        return cell.select(fallback_selector)[fallback_index].text
    except _MISSING_ELEMENT:
        return missing


def _parse_details(item):
    """Return ``(state, legal_person, capital, date, phone, email)`` for a result.

    Raises IndexError / AttributeError when a mandatory field is missing.
    """
    state = item.select('.content .header .tag-common.-normal-bg')[0].text
    legal_person = item.select('.content .legalPersonName.link-click')[0].text
    info_cells = item.select('.content .info.row.text-ellipsis div')
    capital = _strip_label(info_cells[1].text, '注册资本:')
    date = _strip_label(info_cells[2].text, '成立日期:')
    phone = _contact_field(item, 0, 'span span', 0, '未找到法人手机号')
    email = _contact_field(item, 1, 'span', 1, '未找到法人邮箱')
    return state, legal_person, capital, date, phone, email


def craw(url, key_word, x, new_num):
    """Fetch one search-result page and append its companies to the result lists.

    Args:
        url: Full URL of the result page to download.
        key_word: URL-quoted search keyword; only used to build the Referer.
        x: Page number of ``url``; only used to build the Referer.
        new_num: Index of the first result on the page to process; earlier
            results are skipped. Negative values are treated as 0.

    Returns:
        None. One entry per scraped company is appended to each module-level
        ``*_list``; progress and failures are reported with ``print``.
    """
    headers = _build_headers(_referer_for_page(x, key_word))
    try:
        response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
    except requests.RequestException:
        print('请求都不让,这天眼查也搞事情吗???')
        return

    # Decode as UTF-8 on every response, not only on error responses.
    response.encoding = 'utf-8'
    if response.status_code != 200:
        print(response.status_code)
        print('ERROR')
        return

    soup = BeautifulSoup(response.text, 'lxml')
    try:
        com_all_info = soup.body.select(
            '.mt74 .container.-top .container-left .search-block.header-block-container'
        )[0]
        com_all_info_array = com_all_info.select('.search-item.sv-search-company')
    except _MISSING_ELEMENT:
        # The result container is absent from the page.
        print('好像被拒绝访问了呢...请稍后再试叭...')
        return
    print('开始爬取数据,请勿打开excel')

    for item in com_all_info_array[max(new_num, 0):]:
        # Bound before parsing so the skip message never hits an unbound or
        # stale name from a previous iteration.
        name = '未知公司'
        try:
            name = item.select('.content .header .name')[0].text
            details = _parse_details(item)
        except _MISSING_ELEMENT:
            print(name + "-信息不完整,>>>>跳过>>下一个")
            continue
        # Append to all columns together so the lists stay aligned.
        for column, value in zip(_COLUMNS, (name,) + details):
            column.append(value)


def _ask_int(prompt, default):
    """Prompt for an integer; return ``default`` on empty or invalid input."""
    try:
        return int(input(prompt))
    except (ValueError, EOFError):
        return default


def _save_results():
    """Write the collected columns to a timestamped .xls workbook."""
    workbook = xlwt.Workbook()
    sheet1 = workbook.add_sheet('天眼查数据', cell_overwrite_ok=True)
    style = xlwt.XFStyle()
    font = xlwt.Font()
    font.name = '仿宋'
    style.font = font

    print('正在存储数据,请勿打开excel')
    name_list = ['公司名字', '公司状态', '法定法人', '注册资本', '成立日期', '法人电话', '法人邮箱']
    for col, title in enumerate(name_list):
        sheet1.write(0, col, title, style)
    for row, record in enumerate(zip(*_COLUMNS), start=1):
        print(record[0])
        for col, value in enumerate(record):
            sheet1.write(row, col, value, style)

    stamp = time.strftime("%Y-%m-%d-%H-%M-%S", time.localtime())
    path = r"D:\wyy-tyc-" + stamp + ".xls"
    try:
        workbook.save(path)
    except OSError:
        # D:\ is missing or not writable: fall back to the working directory
        # rather than losing everything that was scraped.
        path = "wyy-tyc-" + stamp + ".xls"
        workbook.save(path)
        print('无法写入D盘,已保存到当前目录:' + path)
    print('保存完毕~')


def main():
    """Ask for the search parameters, scrape the requested pages, save results."""
    for column in _COLUMNS:
        del column[:]

    key_word = input('请输入您想搜索的关键词:')
    # Defaults match the original script: start at page 1, fetch 5 pages,
    # wait 5 seconds between requests. Values are clamped so that invalid
    # input cannot produce page 0 or a negative sleep.
    first_page = max(_ask_int('请输入您想从第几页检索:', 1), 1)
    page_count = max(_ask_int('请输入您想检索的次数:', 5), 0)
    sleep_time = max(_ask_int('请输入每次检索延时的秒数:', 5), 0)

    key_word = urllib.parse.quote(key_word)

    print('正在搜索,请稍后')

    # The start page selects which pages are fetched; every result on each
    # fetched page is processed (new_num=0).
    for x in range(first_page, first_page + page_count):
        url = SEARCH_PAGE_URL.format(x, key_word)
        craw(url, key_word, x, 0)
        time.sleep(sleep_time)

    _save_results()


if __name__ == '__main__':
    main()