├── .gitignore
├── company_ifo_spider
│   ├── __init__.py
│   ├── __pycache__
│   │   ├── items.cpython-36.pyc
│   │   ├── __init__.cpython-36.pyc
│   │   ├── pipelines.cpython-36.pyc
│   │   └── settings.cpython-36.pyc
│   ├── spiders
│   │   ├── __pycache__
│   │   │   ├── __init__.cpython-36.pyc
│   │   │   └── bafang_spider.cpython-36.pyc
│   │   ├── __init__.py
│   │   └── bafang_spider.py
│   ├── .idea
│   │   ├── modules.xml
│   │   ├── misc.xml
│   │   ├── company_ifo_spider.iml
│   │   └── workspace.xml
│   ├── items.py
│   ├── pipelines.py
│   ├── settings.py
│   └── middlewares.py
├── README.MD
└── scrapy.cfg
/.gitignore:
--------------------------------------------------------------------------------
1 | data.csv
2 | 开发笔记.docx
--------------------------------------------------------------------------------
/company_ifo_spider/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/company_ifo_spider/__pycache__/items.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ChenHuabin321/company_ino_spider/HEAD/company_ifo_spider/__pycache__/items.cpython-36.pyc
--------------------------------------------------------------------------------
/company_ifo_spider/__pycache__/__init__.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ChenHuabin321/company_ino_spider/HEAD/company_ifo_spider/__pycache__/__init__.cpython-36.pyc
--------------------------------------------------------------------------------
/company_ifo_spider/__pycache__/pipelines.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ChenHuabin321/company_ino_spider/HEAD/company_ifo_spider/__pycache__/pipelines.cpython-36.pyc
--------------------------------------------------------------------------------
/company_ifo_spider/__pycache__/settings.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ChenHuabin321/company_ino_spider/HEAD/company_ifo_spider/__pycache__/settings.cpython-36.pyc
--------------------------------------------------------------------------------
/company_ifo_spider/spiders/__pycache__/__init__.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ChenHuabin321/company_ino_spider/HEAD/company_ifo_spider/spiders/__pycache__/__init__.cpython-36.pyc
--------------------------------------------------------------------------------
/company_ifo_spider/spiders/__pycache__/bafang_spider.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ChenHuabin321/company_ino_spider/HEAD/company_ifo_spider/spiders/__pycache__/bafang_spider.cpython-36.pyc
--------------------------------------------------------------------------------
/company_ifo_spider/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 |
--------------------------------------------------------------------------------
/README.MD:
--------------------------------------------------------------------------------
1 | This project is a web crawler for company business-registration information: given an industry keyword such as 铜箔 (copper foil), it crawls the registration details of every related company listed on business-information sites such as b2b168.com (八方资源网).
2 | How to use the b2b168.com company-information spider:
3 | Open a command prompt in the project directory (company_ifo_spider) and enter the following command:
4 | scrapy crawl bafang_spider -a keyword=k
5 | where k is the industry keyword you want to crawl, e.g. 铜箔
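6 |
7 | For reference, bafang_spider.py converts the Chinese keyword to pinyin before building the search URL. A minimal sketch of that conversion (assumes pypinyin is installed; the URL format follows the spider's __init__):
8 |
9 |     from pypinyin import lazy_pinyin
10 |
11 |     keyword = '铜箔'
12 |     start_url = 'https://www.b2b168.com/k-' + ''.join(lazy_pinyin(keyword)) + '/'
13 |     print(start_url)  # https://www.b2b168.com/k-tongbo/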
--------------------------------------------------------------------------------
/scrapy.cfg:
--------------------------------------------------------------------------------
1 | # Automatically created by: scrapy startproject
2 | #
3 | # For more information about the [deploy] section see:
4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html
5 |
6 | [settings]
7 | default = company_ifo_spider.settings
8 |
9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = company_ifo_spider
12 |
--------------------------------------------------------------------------------
/company_ifo_spider/items.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define here the models for your scraped items
4 | #
5 | # See documentation in:
6 | # https://doc.scrapy.org/en/latest/topics/items.html
7 |
8 | import scrapy
9 |
10 |
11 | class BafangItem(scrapy.Item):
12 |     # define the fields for your item here like:
13 |     # name = scrapy.Field()
14 |     company_name = scrapy.Field()  # company name
15 |     company_description = scrapy.Field()  # company description
16 |     economic_nature = scrapy.Field()  # economic nature of the company
17 |     legal_representative = scrapy.Field()  # legal representative or person in charge
18 |     company_type = scrapy.Field()  # company type
19 |     registered_site = scrapy.Field()  # place of registration
20 |     registered_capital = scrapy.Field()  # registered capital
21 |     establish_time = scrapy.Field()  # date of establishment
22 |     employees_number = scrapy.Field()  # number of employees
23 |     monthly_production = scrapy.Field()  # monthly output
24 |     annual_turnover = scrapy.Field()  # annual turnover
25 |     annual_export_volume = scrapy.Field()  # annual export volume
26 |     certification = scrapy.Field()  # management system certification
27 |     main_operating_place = scrapy.Field()  # main place of business
28 |     main_customer = scrapy.Field()  # main customers
29 |     workshop_area = scrapy.Field()  # workshop floor area
30 |     provide_OEM = scrapy.Field()  # whether OEM is provided
31 |     bank = scrapy.Field()  # bank of deposit
32 |     bank_account = scrapy.Field()  # bank account number
33 |     main_market = scrapy.Field()  # main markets
34 |     main_products = scrapy.Field()  # main products or services
35 |     company_web_url = scrapy.Field()  # company website URL
36 |
--------------------------------------------------------------------------------
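Each field above is declared with scrapy.Field(); at runtime a BafangItem behaves like a dict keyed by these field names, which is how both spider callbacks and the pipeline use it. A minimal sketch, run from the project root (the values are illustrative only):

    from company_ifo_spider.items import BafangItem

    item = BafangItem()
    item['company_name'] = '某某铜箔有限公司'          # illustrative value
    item['company_web_url'] = 'https://example.b2b168.com/home.aspx'  # illustrative value
    print(dict(item))  # prints the populated fields as a plain dict
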
/company_ifo_spider/pipelines.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define your item pipelines here
4 | #
5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
6 | # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
7 | import os
8 | import csv
9 |
10 | class CompanyIfoSpiderPipeline(object):
11 |     def __init__(self):
12 |         self.file_path = 'data.csv'
13 |         self.dict_key = ['企业名称', '企业描述', '企业经济性质', '法人代表或负责人', '企业类型', '公司注册地', '注册资金', '成立时间', '员工数量', '月产量', '年营业额', '年出口额', '管理体系认证', '主要营业地点', '主要客户', '厂房面积', '是否提供OEM', '开户银行', '银行帐号', '主要市场', '主营产品或服务', '企业网站链接']
14 |         if not os.path.exists(self.file_path):  # if the output file does not exist yet, the header row has to be written first
15 |             self.file = open(self.file_path, 'a', encoding='utf-8', newline='')
16 |             self.w = csv.writer(self.file)
17 |             self.w.writerow(self.dict_key)  # write the header row
18 |         else:
19 |             self.file = open(self.file_path, 'a', encoding='utf-8', newline='')
20 |             self.w = csv.writer(self.file)
21 |     def process_item(self, item, spider):
22 |         print('**************** pipeline processing item ******************')
23 |         dict_values = [
24 |             item['company_name'],
25 |             item['company_description'],
26 |             item['economic_nature'],
27 |             item['legal_representative'],
28 |             item['company_type'],
29 |             item['registered_site'],
30 |             item['registered_capital'],
31 |             item['establish_time'],
32 |             item['employees_number'],
33 |             item['monthly_production'],
34 |             item['annual_turnover'],
35 |             item['annual_export_volume'],
36 |             item['certification'],
37 |             item['main_operating_place'],
38 |             item['main_customer'],
39 |             item['workshop_area'],
40 |             item['provide_OEM'],
41 |             item['bank'],
42 |             item['bank_account'],
43 |             item['main_market'],
44 |             item['main_products'],
45 |             item['company_web_url']]
46 |
47 |         self.w.writerow(dict_values)  # write one row of data
48 |         return item
49 |
50 |     def close_spider(self, spider):
51 |         print('********************** closing spider **********************')
52 |         self.file.flush()
53 |         self.file.close()
--------------------------------------------------------------------------------
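The pipeline appends one row per item to data.csv, in the column order fixed by dict_key. A quick way to inspect the output after a crawl (a sketch; assumes a crawl has already produced data.csv in the project directory, with the 22-column header defined in the pipeline's __init__):

    import csv

    with open('data.csv', encoding='utf-8', newline='') as f:
        for row in csv.reader(f):
            print(row[0], row[21])  # company name and company website URL columns
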
/company_ifo_spider/settings.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Scrapy settings for company_ifo_spider project
4 | #
5 | # For simplicity, this file contains only settings considered important or
6 | # commonly used. You can find more settings consulting the documentation:
7 | #
8 | # https://doc.scrapy.org/en/latest/topics/settings.html
9 | # https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
10 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html
11 |
12 | BOT_NAME = 'company_ifo_spider'
13 |
14 | SPIDER_MODULES = ['company_ifo_spider.spiders']
15 | NEWSPIDER_MODULE = 'company_ifo_spider.spiders'
16 |
17 |
18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent
19 | #USER_AGENT = 'company_ifo_spider (+http://www.yourdomain.com)'
20 |
21 | # Obey robots.txt rules
22 | ROBOTSTXT_OBEY = False
23 |
24 | # Configure maximum concurrent requests performed by Scrapy (default: 16)
25 | #CONCURRENT_REQUESTS = 32
26 |
27 | # Configure a delay for requests for the same website (default: 0)
28 | # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
29 | # See also autothrottle settings and docs
30 | #DOWNLOAD_DELAY = 3
31 | # The download delay setting will honor only one of:
32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16
33 | #CONCURRENT_REQUESTS_PER_IP = 16
34 |
35 | # Disable cookies (enabled by default)
36 | #COOKIES_ENABLED = False
37 |
38 | # Disable Telnet Console (enabled by default)
39 | #TELNETCONSOLE_ENABLED = False
40 |
41 | # Override the default request headers:
42 | #DEFAULT_REQUEST_HEADERS = {
43 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
44 | # 'Accept-Language': 'en',
45 | #}
46 |
47 | # Enable or disable spider middlewares
48 | # See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
49 | #SPIDER_MIDDLEWARES = {
50 | # 'company_ifo_spider.middlewares.CompanyIfoSpiderSpiderMiddleware': 543,
51 | #}
52 |
53 | # Enable or disable downloader middlewares
54 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
55 | #DOWNLOADER_MIDDLEWARES = {
56 | # 'company_ifo_spider.middlewares.CompanyIfoSpiderDownloaderMiddleware': 543,
57 | #}
58 |
59 | # Enable or disable extensions
60 | # See https://doc.scrapy.org/en/latest/topics/extensions.html
61 | #EXTENSIONS = {
62 | # 'scrapy.extensions.telnet.TelnetConsole': None,
63 | #}
64 |
65 | # Configure item pipelines
66 | # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
67 | ITEM_PIPELINES = {
68 | 'company_ifo_spider.pipelines.CompanyIfoSpiderPipeline': 300,
69 | }
70 | RETRY_ENABLED = True  # retry failed requests
71 | RETRY_TIMES = 3
72 | HTTPERROR_ALLOW_ALL = True  # process the response for every HTTP error code; general advice is not to set this, but no better workaround has been found for this site yet
73 | # Enable and configure the AutoThrottle extension (disabled by default)
74 | # See https://doc.scrapy.org/en/latest/topics/autothrottle.html
75 | #AUTOTHROTTLE_ENABLED = True
76 | # The initial download delay
77 | #AUTOTHROTTLE_START_DELAY = 5
78 | # The maximum download delay to be set in case of high latencies
79 | #AUTOTHROTTLE_MAX_DELAY = 60
80 | # The average number of requests Scrapy should be sending in parallel to
81 | # each remote server
82 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
83 | # Enable showing throttling stats for every response received:
84 | #AUTOTHROTTLE_DEBUG = False
85 |
86 | # Enable and configure HTTP caching (disabled by default)
87 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
88 | #HTTPCACHE_ENABLED = True
89 | #HTTPCACHE_EXPIRATION_SECS = 0
90 | #HTTPCACHE_DIR = 'httpcache'
91 | #HTTPCACHE_IGNORE_HTTP_CODES = []
92 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
93 |
--------------------------------------------------------------------------------
/company_ifo_spider/middlewares.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define here the models for your spider middleware
4 | #
5 | # See documentation in:
6 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html
7 |
8 | from scrapy import signals
9 |
10 |
11 | class CompanyIfoSpiderSpiderMiddleware(object):
12 | # Not all methods need to be defined. If a method is not defined,
13 | # scrapy acts as if the spider middleware does not modify the
14 | # passed objects.
15 |
16 | @classmethod
17 | def from_crawler(cls, crawler):
18 | # This method is used by Scrapy to create your spiders.
19 | s = cls()
20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
21 | return s
22 |
23 | def process_spider_input(self, response, spider):
24 | # Called for each response that goes through the spider
25 | # middleware and into the spider.
26 |
27 | # Should return None or raise an exception.
28 | return None
29 |
30 | def process_spider_output(self, response, result, spider):
31 | # Called with the results returned from the Spider, after
32 | # it has processed the response.
33 |
34 | # Must return an iterable of Request, dict or Item objects.
35 | for i in result:
36 | yield i
37 |
38 | def process_spider_exception(self, response, exception, spider):
39 | # Called when a spider or process_spider_input() method
40 | # (from other spider middleware) raises an exception.
41 |
42 | # Should return either None or an iterable of Response, dict
43 | # or Item objects.
44 | pass
45 |
46 | def process_start_requests(self, start_requests, spider):
47 | # Called with the start requests of the spider, and works
48 | # similarly to the process_spider_output() method, except
49 | # that it doesn’t have a response associated.
50 |
51 | # Must return only requests (not items).
52 | for r in start_requests:
53 | yield r
54 |
55 | def spider_opened(self, spider):
56 | spider.logger.info('Spider opened: %s' % spider.name)
57 |
58 |
59 | class CompanyIfoSpiderDownloaderMiddleware(object):
60 | # Not all methods need to be defined. If a method is not defined,
61 | # scrapy acts as if the downloader middleware does not modify the
62 | # passed objects.
63 |
64 | @classmethod
65 | def from_crawler(cls, crawler):
66 | # This method is used by Scrapy to create your spiders.
67 | s = cls()
68 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
69 | return s
70 |
71 | def process_request(self, request, spider):
72 | # Called for each request that goes through the downloader
73 | # middleware.
74 |
75 | # Must either:
76 | # - return None: continue processing this request
77 | # - or return a Response object
78 | # - or return a Request object
79 | # - or raise IgnoreRequest: process_exception() methods of
80 | # installed downloader middleware will be called
81 | return None
82 |
83 | def process_response(self, request, response, spider):
84 | # Called with the response returned from the downloader.
85 |
86 | # Must either;
87 | # - return a Response object
88 | # - return a Request object
89 | # - or raise IgnoreRequest
90 | return response
91 |
92 | def process_exception(self, request, exception, spider):
93 | # Called when a download handler or a process_request()
94 | # (from other downloader middleware) raises an exception.
95 |
96 | # Must either:
97 | # - return None: continue processing this exception
98 | # - return a Response object: stops process_exception() chain
99 | # - return a Request object: stops process_exception() chain
100 | pass
101 |
102 | def spider_opened(self, spider):
103 | spider.logger.info('Spider opened: %s' % spider.name)
104 |
--------------------------------------------------------------------------------
/company_ifo_spider/spiders/bafang_spider.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import scrapy
3 | from pypinyin import pinyin , lazy_pinyin
4 | from scrapy.selector import Selector
5 | from company_ifo_spider.items import BafangItem
6 | import re
7 |
8 |
9 | class BafangSpiderSpider(scrapy.Spider):
10 |     '''
11 |     Spider for company registration information on b2b168.com (八方资源网).
12 |     Usage: open a command prompt in the project directory (company_ifo_spider) and run:
13 |         scrapy crawl bafang_spider -a keyword=k
14 |     where k is the industry keyword to crawl, e.g. 铜箔 (copper foil).
15 |     '''
16 |     name = 'bafang_spider'
17 |
18 |     start_urls = ['http://www.b2b168.com/']
19 |
20 |     def __init__(self, keyword=None, *args, **kwargs):  # keyword is the Chinese industry keyword passed on the command line
21 |         super(BafangSpiderSpider, self).__init__(*args, **kwargs)
22 |         print('********************* starting spider **************************')
23 |         self.keyword = ''.join(lazy_pinyin(keyword))  # convert the keyword to pinyin, as used in the site's search URLs
24 |         self.start_urls = ['https://www.b2b168.com/k-' + self.keyword + '/']
25 |         print(self.start_urls)
26 |         self.allowed_domains = []
27 |     def parse(self, response):
28 |         '''
29 |         Collect every company link on the current result page as well as the link to the next result page.
30 |         :param response:
31 |         :return:
32 |         '''
33 |
34 |         try:
35 |             data_link = Selector(response).re(r'', string)
38 |             company_url = link + '/home.aspx'
39 |             print('current link: {}'.format(company_url))
40 |             # follow the link to the company detail page
41 |             # b2b168.com serves two kinds of company detail pages with different layouts, so each kind needs its own parser
42 |             if not 'b2b168.com/c168' in link:  # URLs without this substring belong to the first kind of page
43 |                 yield scrapy.Request(url=company_url, callback=self.parse_first_type_company_web)
44 |             else:  # URLs containing it belong to the second kind
45 |                 yield scrapy.Request(url=company_url, callback=self.parse_second_type_company_web)
46 |
47 |         except Exception as e:
48 |             print('exception while collecting company links from the search results: {} --> {}'.format(e, response.url))
49 |
50 |         next_page = response.xpath('//ul[@class="page"]/a[last()]/@href').extract()[0]
51 |         next_page_url = 'https://www.b2b168.com' + next_page
52 |         print('************ next page: ************' + next_page_url)
53 |         yield scrapy.Request(url=next_page_url, callback=self.parse)
54 |
55 |     def parse_first_type_company_web(self, response):
56 |         '''
57 |         Scrape company data from the first kind of company detail page.
58 |         :param response:
59 |         :return:
60 |         '''
61 |         try:
62 |             item = BafangItem()
63 |             company_name = Selector(response).re(r'.*?')
64 |             if company_name:  # a company name was found, so clean it up
65 |                 company_name = company_name[0].strip('').strip('')
66 |                 print('company name ----> {}'.format(company_name))
67 |             else:  # no company name found, give up on this page and return
68 |                 print('no company name --------> {}'.format(response.url))
69 |                 return None
70 |             company_description = Selector(response).re(r'[\s\S]*')
71 |             if company_description:  # a description was found, so clean it up
72 |                 company_description = company_description[0].strip('').strip('').replace('\r\n', '')
73 |             else:  # no description found, leave it empty
74 |                 company_description = ''
75 |             data_attr = Selector(response).re(r'class="table b">.*?')
76 |             data_value = Selector(response).re(r'class="table1">[\s\S]*?')
77 |             if not (data_attr and data_value):  # no company details were extracted, so give up on this page
78 |                 print('no company details ****************')
79 |                 return None
80 |             item['company_web_url'] = response.url  # company website URL
81 |             item['company_description'] = company_description  # company description
82 |             item['company_name'] = company_name  # company name
83 |             item['economic_nature'] = data_value[0][15:-5]  # economic nature
84 |             item['legal_representative'] = data_value[1][15:-5]  # legal representative or person in charge
85 |             item['company_type'] = data_value[2][15:-5]  # company type
86 |             item['registered_site'] = data_value[3][15:-5]  # place of registration
87 |             item['registered_capital'] = data_value[4][15:-5]  # registered capital
88 |             item['establish_time'] = data_value[5][15:-5]  # date of establishment
89 |             item['employees_number'] = data_value[6][15:-5]  # number of employees
90 |             item['monthly_production'] = data_value[7][15:-5]  # monthly output
91 |             item['annual_turnover'] = data_value[8][15:-5]  # annual turnover
92 |             item['annual_export_volume'] = data_value[9][15:-5]  # annual export volume
93 |             item['certification'] = data_value[10][15:-5]  # management system certification
94 |             item['main_operating_place'] = data_value[11][15:-5]  # main place of business
95 |             item['main_customer'] = data_value[12][15:-5]  # main customers
96 |             item['workshop_area'] = data_value[13][15:-5]  # workshop floor area
97 |             item['provide_OEM'] = data_value[14][15:-5]  # whether OEM is provided
98 |             item['bank'] = data_value[15][15:-5]  # bank of deposit
99 |             item['bank_account'] = data_value[16][15:-5]  # bank account number
100 |             item['main_market'] = data_value[17][15:-5]  # main markets
101 |             item['main_products'] = data_value[18][15:-5]  # main products or services
102 |             yield item
103 |         except Exception as e:
104 |             print(e.__context__)
105 |             print('exception while scraping company details: {} --> url: {}'.format(e, response.url))
106 |             return None
107 |
108 |     def parse_second_type_company_web(self, response):
109 |         '''
110 |         Scrape company data from the second kind of company detail page.
111 |         :param response:
112 |         :return:
113 |         '''
114 |         try:
115 |             item = BafangItem()
116 |             company_name = Selector(response).re(r'')
117 |             if company_name:  # a company name was found, so clean it up
118 |                 company_name = company_name[0].strip('')
119 |
120 |             else:  # no company name found, give up on this page and return
121 |                 return None
122 |             '''
123 |             !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
124 |             The two commented-out lines below are an as yet unexplained bug: the content they
125 |             extract contains only the markup, while the actual Chinese text is all missing, even though
126 |             print(response.text) shows that the downloaded page source does contain the Chinese text.
127 |             As a workaround, the data is extracted with re.findall() instead.
128 |             '''
129 |             # company_description = Selector(response).re(r'')
130 |             # company_description = response.xpath('//ul[@class="Cgsjj"]').extract()
131 |             company_description = re.findall(r'', response.text)
132 |             print('************************')
133 |             if company_description:  # a description was found, so clean it up
134 |                 company_description = company_description[0]
135 |                 delete_str = re.findall(r'<[\s\S]*?>', company_description)
136 |                 for string in delete_str:
137 |                     company_description = company_description.replace(string, '')
138 |                 company_description = company_description.replace('\n', '')
139 |             else:  # no description found, leave it empty
140 |                 company_description = ''
141 |             print(company_name)
142 |             print(company_description)
143 |             item['company_description'] = company_description
144 |             item['company_name'] = company_name
145 |             data_attr = re.findall('', response.text)
146 |             data_value = re.findall(' | ', response.text)
147 |             print('**************************{}'.format(len(data_attr)))
148 |             if not (data_attr and data_value):  # no company details were extracted, so give up on this page
149 |                 return None
150 |             item['company_web_url'] = response.url  # company website URL
151 |             item['company_name'] = company_name  # company name
152 |             item['company_description'] = company_description  # company description
153 |             item['economic_nature'] = self.txt_wrap_by('>', '<', data_value[0])  # economic nature
154 |             item['legal_representative'] = self.txt_wrap_by('>', '<', data_value[1])  # legal representative or person in charge
155 |             item['company_type'] = self.txt_wrap_by('>', '<', data_value[2])  # company type
156 |             item['registered_site'] = self.txt_wrap_by('>', '<', data_value[3])  # place of registration
157 |             item['registered_capital'] = self.txt_wrap_by('>', '<', data_value[4])  # registered capital
158 |             item['establish_time'] = self.txt_wrap_by('>', '<', data_value[5])  # date of establishment
159 |             item['employees_number'] = self.txt_wrap_by('>', '<', data_value[6])  # number of employees
160 |             item['monthly_production'] = self.txt_wrap_by('>', '<', data_value[7])  # monthly output
161 |             item['annual_turnover'] = self.txt_wrap_by('>', '<', data_value[8])  # annual turnover
162 |             item['annual_export_volume'] = self.txt_wrap_by('>', '<', data_value[9])  # annual export volume
163 |             item['certification'] = self.txt_wrap_by('>', '<', data_value[10])  # management system certification
164 |             item['main_operating_place'] = self.txt_wrap_by('>', '<', data_value[11])  # main place of business
165 |             item['main_customer'] = self.txt_wrap_by('>', '<', data_value[12])  # main customers
166 |             item['workshop_area'] = self.txt_wrap_by('>', '<', data_value[13])  # workshop floor area
167 |             item['provide_OEM'] = self.txt_wrap_by('>', '<', data_value[14])  # whether OEM is provided
168 |             item['bank'] = self.txt_wrap_by('>', '<', data_value[15])  # bank of deposit
169 |             item['bank_account'] = self.txt_wrap_by('>', '<', data_value[16])  # bank account number
170 |             item['main_market'] = self.txt_wrap_by('>', '<', data_value[17])  # main markets
171 |             item['main_products'] = self.txt_wrap_by('>', '<', data_value[18])  # main products or services
172 |             # Some company detail pages on b2b168.com actually carry more fields, i.e. data_attr and data_value
173 |             # can hold more than 19 entries (excluding name and description), but others have exactly 19, so only the first 19 values are used here; the extra entries mostly duplicate information already covered by the first 19.
174 |             yield item
175 |         except Exception as e:
176 |             print('exception while scraping company details: {} --> url: {}'.format(e, response.url))
177 |             return None
178 |
179 |     def txt_wrap_by(self, start_str, end_str, html):
180 |         '''
181 |         Return the text in html that sits between start_str and end_str.
182 |         :param start_str:
183 |         :param end_str:
184 |         :param html:
185 |         :return:
186 |         '''
187 |         start = html.find(start_str)
188 |         if start >= 0:
189 |             start += len(start_str)
190 |             end = html.find(end_str, start)
191 |             if end >= 0:
192 |                 return html[start:end].strip()
193 |
--------------------------------------------------------------------------------
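txt_wrap_by() in bafang_spider.py simply returns whatever sits between the first start_str and the following end_str, which is how the second page type's table cells are unwrapped with ('>', '<'). A standalone copy for illustration (the HTML fragment is made up):

    def txt_wrap_by(start_str, end_str, html):
        # find the end of the opening delimiter, then read up to the next closing delimiter
        start = html.find(start_str)
        if start >= 0:
            start += len(start_str)
            end = html.find(end_str, start)
            if end >= 0:
                return html[start:end].strip()

    print(txt_wrap_by('>', '<', '<td class="right">私营企业</td>'))  # 私营企业
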