├── README.md
└── 天眼查爬虫.py


/README.md:
--------------------------------------------------------------------------------
 1 | # 基于Python的天眼查爬虫，爬取完整的公司数据（可爬需要VIP才能用的邮箱和电话等）
 2 | PS：运行前请将自己的 cookie 填入 天眼查爬虫.py 中 headers 的 'Cookie' 字段（约第26行）。
 3 | 
 4 | PPS：爬不到数据？自己手动去过一遍验证码再试试看。
 5 | 
 6 | PPPS：爬到的是空数据？自己去手动搜索一下看看这个词搜不搜得到。
 7 | 
 8 | PPPPS：最后一句了…要是觉着有用的话…请关注并收藏此链接…我嘛…刚买了几年服务器….不出意外的话，几年内都在，请关注我…拜托拜托 ~
 9 | 
10 | 博客：<a href="http://blog.lrvin.com">blog.lrvin.com</a>
11 | 


--------------------------------------------------------------------------------
/天眼查爬虫.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | import requests
  3 | import lxml
  4 | import sys
  5 | from bs4 import BeautifulSoup
  6 | import xlwt
  7 | import time
  8 | import urllib
  9 | 
 10 | def craw(url,key_word,x,new_num):
 11 |     User_Agent = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:56.0) Gecko/20100101 Firefox/56.0'
 12 |     if x == 0:
 13 |         re = 'https://www.tianyancha.com/search?key='+key_word
 14 |     else:
 15 |         re = 'https://www.tianyancha.com/search/p{}?key={}'.format(x-1,key_word)
 16 |     headers = {
 17 |             'Host': 'www.tianyancha.com',
 18 |             'Connection': 'keep-alive',
 19 |             'Cache-Control': 'max-age=0',
 20 |             'Upgrade-Insecure-Requests': '1',
 21 |             'User-Agent': r'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36',
 22 |             'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
 23 |             'Referer': re,#'https://www.tianyancha.com/search?key=%E5%B1%B1%E4%B8%9C%20%E7%A7%91%E6%8A%80',
 24 |             'Accept-Encoding': 'gzip, deflate, br',
 25 |             'Accept-Language': 'zh-CN,zh;q=0.9',
 26 |             'Cookie': r'cookiecookiecookiecookiecookiecookiecookiecookiecookiecookiecookiecookiecookiecookiecookiecookiecookiecookiecookiecookiecookiecookiecookiecookiecookiecookiecookiecookiecookiecookiecookiecookiecookiecookiecookiecookiecookiecookiecookiecookie',
 27 |             }
 28 |     try:
 29 |         response = requests.get(url,headers = headers)
 30 |         if response.status_code != 200:
 31 |             response.encoding = 'utf-8'
 32 |             print(response.status_code)
 33 |             print('ERROR')
 34 |         soup = BeautifulSoup(response.text,'lxml')
 35 |     except Exception:
 36 |         print('请求都不让，这天眼查也搞事情吗？？？')
 37 |     try:
 38 |         com_all_info = soup.body.select('.mt74 .container.-top .container-left .search-block.header-block-container')[0]
 39 |         com_all_info_array = com_all_info.select('.search-item.sv-search-company')
 40 |         print('开始爬取数据，请勿打开excel')
 41 | 
 42 |     except Exception:
 43 |         print('好像被拒绝访问了呢...请稍后再试叭...')
 44 |     try:
 45 |         for i in range(new_num,len(com_all_info_array)):
 46 |             try:
 47 |                 temp_g_name = com_all_info_array[i].select('.content .header .name')[0].text    #获取公司名
 48 |                 temp_g_state = com_all_info_array[i].select('.content .header .tag-common.-normal-bg')[0].text  #获取公司状态
 49 |                 temp_r_name = com_all_info_array[i].select('.content .legalPersonName.link-click')[0].text    #获取法人名
 50 |                 temp_g_money = com_all_info_array[i].select('.content .info.row.text-ellipsis div')[1].text.strip('注册资本：')    #获取注册资本
 51 |                 temp_g_date = com_all_info_array[i].select('.content .info.row.text-ellipsis div')[2].text.strip('成立日期：')    #获取公司注册时间
 52 |                 try:
 53 |                     try:
 54 |                         temp_r_phone = com_all_info_array[i].select('.content .contact.row div script')[0].text.strip('[').strip(']')    #获取法人手机号
 55 |                     except Exception:
 56 |                         temp_r_phone = com_all_info_array[i].select('.content .contact.row div')[0].select('span span')[0].text    #获取法人手机号
 57 |                 except Ellipsis:
 58 |                     temp_r_phone = '未找到法人手机号'
 59 |                 try:
 60 |                     try:
 61 |                         temp_r_email = com_all_info_array[i].select('.content .contact.row div script')[1].text.strip('[').strip(']')    #获取法人Email
 62 |                     except Exception:
 63 |                         temp_r_email = com_all_info_array[i].select('.content .contact.row div')[1].select('span')[1].text    #获取法人Email
 64 |                 except Exception:
 65 |                     temp_r_email = '未找到法人邮箱'
 66 |     #            temp_g_desc = com_all_info_array[i].select('.content .match.row.text-ellipsis')[0].text    #获取公司备注
 67 |     
 68 |                 g_name_list.append(temp_g_name)
 69 |                 g_state_list.append(temp_g_state)
 70 |                 r_name_list.append(temp_r_name)
 71 |                 g_money_list.append(temp_g_money)
 72 |                 g_date_list.append(temp_g_date)
 73 |                 r_phone_list.append(temp_r_phone)
 74 |                 r_email_list.append(temp_r_email)
 75 |     #            g_desc_list.append(temp_g_desc)
 76 |             except Exception:
 77 |                 print(temp_g_name+"-信息不完整，>>>>跳过>>下一个")
 78 |     except Exception:
 79 |         print("这页有毒，换下一页")
 80 | 
 81 | if __name__ == '__main__':
 82 |     global g_name_list
 83 |     global g_state_list
 84 |     global r_name_list
 85 |     global g_money_list
 86 |     global g_date_list
 87 |     global r_phone_list
 88 |     global r_email_list
 89 | #    global g_desc_list
 90 | 
 91 |     g_name_list=[]
 92 |     g_state_list=[]
 93 |     r_name_list=[]
 94 |     g_money_list=[]
 95 |     g_date_list=[]
 96 |     r_phone_list=[]
 97 |     r_email_list=[]
 98 | #    g_desc_list=[]
 99 | 
100 | 
101 |     key_word = input('请输入您想搜索的关键词：')
102 |     try:
103 |         new_num = int(input('请输入您想从第几页检索：'))-1
104 |     except Exception:
105 |         new_num = 0
106 |     try:
107 |         num = int(input('请输入您想检索的次数：'))+1
108 |     except Exception:
109 |         num = 6
110 |     try:
111 |         sleep_time = int(input('请输入每次检索延时的秒数：'))
112 |     except Exception:
113 |         sleep_time = 5
114 | 
115 |     key_word = urllib.parse.quote(key_word)
116 | 
117 |     print('正在搜索，请稍后')
118 | 
119 |     for x in range(1,num):
120 |         url = r'https://www.tianyancha.com/search/p{}?key={}'.format(x,key_word)
121 | #        print(r'https://www.tianyancha.com/search/p{}?key={}'.format(x,key_word))
122 | #        url = r'https://www.tianyancha.com/search/p2?key=%E5%B1%B1%E4%B8%9C%20%E7%A7%91%E6%8A%80'
123 |         s1 = craw(url,key_word,x,new_num)
124 |         time.sleep(sleep_time)
125 |     workbook = xlwt.Workbook()
126 |     #创建sheet对象，新建sheet
127 |     sheet1 = workbook.add_sheet('天眼查数据', cell_overwrite_ok=True)
128 |     #---设置excel样式---
129 |     #初始化样式
130 |     style = xlwt.XFStyle()
131 |     #创建字体样式
132 |     font = xlwt.Font()
133 |     font.name = '仿宋'
134 | #    font.bold = True #加粗
135 |     #设置字体
136 |     style.font = font
137 |     #使用样式写入数据
138 |     print('正在存储数据，请勿打开excel')
139 |     #向sheet中写入数据
140 |     name_list = ['公司名字','公司状态','法定法人','注册资本','成立日期','法人电话','法人邮箱']#,'公司备注']
141 |     for cc in range(0,len(name_list)):
142 |         sheet1.write(0,cc,name_list[cc],style)
143 |     for i in range(0,len(g_name_list)):
144 |         print(g_name_list[i])
145 |         sheet1.write(i+1,0,g_name_list[i],style)#公司名字
146 |         sheet1.write(i+1,1,g_state_list[i],style)#公司状态
147 |         sheet1.write(i+1,2,r_name_list[i],style)#法定法人
148 |         sheet1.write(i+1,3,g_money_list[i],style)#注册资本
149 |         sheet1.write(i+1,4,g_date_list[i],style)#成立日期
150 |         sheet1.write(i+1,5,r_phone_list[i],style)#法人电话
151 |         sheet1.write(i+1,6,r_email_list[i],style)#法人邮箱
152 | #        sheet1.write(i+1,7,g_desc_list[i],style)#公司备注
153 |     #保存excel文件，有同名的直接覆盖
154 |     workbook.save(r"D:\wyy-tyc-"+time.strftime("%Y-%m-%d-%H-%M-%S", time.localtime()) +".xls")
155 |     print('保存完毕~')
156 | 


--------------------------------------------------------------------------------