├── .gitignore
├── company_ifo_spider
│   ├── __init__.py
│   ├── __pycache__
│   │   ├── items.cpython-36.pyc
│   │   ├── __init__.cpython-36.pyc
│   │   ├── pipelines.cpython-36.pyc
│   │   └── settings.cpython-36.pyc
│   ├── spiders
│   │   ├── __pycache__
│   │   │   ├── __init__.cpython-36.pyc
│   │   │   └── bafang_spider.cpython-36.pyc
│   │   ├── __init__.py
│   │   └── bafang_spider.py
│   ├── .idea
│   │   ├── modules.xml
│   │   ├── misc.xml
│   │   ├── company_ifo_spider.iml
│   │   └── workspace.xml
│   ├── items.py
│   ├── pipelines.py
│   ├── settings.py
│   └── middlewares.py
├── README.MD
└── scrapy.cfg

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
data.csv
开发笔记.docx

--------------------------------------------------------------------------------
/company_ifo_spider/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/company_ifo_spider/__pycache__/ and /company_ifo_spider/spiders/__pycache__/:
--------------------------------------------------------------------------------
(compiled .pyc bytecode for items, pipelines, settings, __init__ and
bafang_spider; binary content not reproduced here, mirrored under
https://raw.githubusercontent.com/ChenHuabin321/company_ino_spider/HEAD/ at the
paths shown in the tree above)

--------------------------------------------------------------------------------
/company_ifo_spider/spiders/__init__.py:
--------------------------------------------------------------------------------
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
--------------------------------------------------------------------------------
/README.MD:
--------------------------------------------------------------------------------
This project is a web crawler for company business-registration information.
Given an industry keyword such as 铜箔 (copper foil), it crawls the business
information of every related company listed on business-information sites such
as b2b168.com (八方资源网).

How to use the b2b168.com company-information spider:
Open a command prompt in the project directory (company_ifo_spider) and run the
following command:

    scrapy crawl bafang_spider -a keyword=k

where k is the industry keyword you want to crawl, e.g. 铜箔.

--------------------------------------------------------------------------------
/scrapy.cfg:
--------------------------------------------------------------------------------
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.io/en/latest/deploy.html

[settings]
default = company_ifo_spider.settings

[deploy]
#url = http://localhost:6800/
project = company_ifo_spider

--------------------------------------------------------------------------------
/company_ifo_spider/.idea/modules.xml, misc.xml, company_ifo_spider.iml:
--------------------------------------------------------------------------------
(PyCharm project files; their XML content was not preserved in this dump.)

--------------------------------------------------------------------------------
/company_ifo_spider/items.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class BafangItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    company_name = scrapy.Field()          # company name (企业名称)
    company_description = scrapy.Field()   # company description (企业描述)
    economic_nature = scrapy.Field()       # economic nature (企业经济性质)
    legal_representative = scrapy.Field()  # legal representative or person in charge (法人代表或负责人)
    company_type = scrapy.Field()          # company type (企业类型)
    registered_site = scrapy.Field()       # place of registration (公司注册地)
    registered_capital = scrapy.Field()    # registered capital (注册资金)
    establish_time = scrapy.Field()        # date of establishment (成立时间)
    employees_number = scrapy.Field()      # number of employees (员工数量)
    monthly_production = scrapy.Field()    # monthly output (月产量)
    annual_turnover = scrapy.Field()       # annual turnover (年营业额)
    annual_export_volume = scrapy.Field()  # annual export volume (年出口额)
    certification = scrapy.Field()         # management-system certification (管理体系认证)
    main_operating_place = scrapy.Field()  # main place of business (主要营业地点)
    main_customer = scrapy.Field()         # main customers (主要客户)
    workshop_area = scrapy.Field()         # workshop area (厂房面积)
    provide_OEM = scrapy.Field()           # whether OEM is provided (是否提供OEM)
    bank = scrapy.Field()                  # bank of deposit (开户银行)
    bank_account = scrapy.Field()          # bank account number (银行帐号)
    main_market = scrapy.Field()           # main markets (主要市场)
    main_products = scrapy.Field()         # main products or services (主营产品或服务)
    company_web_url = scrapy.Field()       # company page URL (企业网址)

--------------------------------------------------------------------------------
/company_ifo_spider/pipelines.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import os
import csv


class CompanyIfoSpiderPipeline(object):
    def __init__(self):
        self.file_path = 'data.csv'
        # CSV header row: one Chinese column name per item field, in field order.
        self.dict_key = ['企业名称', '企业描述', '企业经济性质', '法人代表或负责人', '企业类型', '公司注册地',
                         '注册资金', '成立时间', '员工数量', '月产量', '年营业额', '年出口额', '管理体系认证',
                         '主要营业地点', '主要客户', '厂房面积', '是否提供OEM', '开户银行', '银行帐号',
                         '主要市场', '主营产品或服务', '企业网站链接']
        if not os.path.exists(self.file_path):  # if the data file does not exist yet, the header row has to be written first
            self.file = open(self.file_path, 'a', encoding='utf-8', newline='')
            self.w = csv.writer(self.file)
            self.w.writerow(self.dict_key)  # write the header row
        else:
            self.file = open(self.file_path, 'a', encoding='utf-8', newline='')
            self.w = csv.writer(self.file)

    def process_item(self, item, spider):
        print('****************管道文件运行******************')
        dict_values = [
            item['company_name'],
            item['company_description'],
            item['economic_nature'],
            item['legal_representative'],
            item['company_type'],
            item['registered_site'],
            item['registered_capital'],
            item['establish_time'],
            item['employees_number'],
            item['monthly_production'],
            item['annual_turnover'],
            item['annual_export_volume'],
            item['certification'],
            item['main_operating_place'],
            item['main_customer'],
            item['workshop_area'],
            item['provide_OEM'],
            item['bank'],
            item['bank_account'],
            item['main_market'],
            item['main_products'],
            item['company_web_url']]

        self.w.writerow(dict_values)  # write one row of data
        return item

    def close_spider(self, spider):
        print('**********************关闭爬虫**********************')
        self.file.flush()
        self.file.close()

--------------------------------------------------------------------------------
/company_ifo_spider/settings.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Scrapy settings for company_ifo_spider project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://doc.scrapy.org/en/latest/topics/settings.html
#     https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://doc.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'company_ifo_spider'

SPIDER_MODULES = ['company_ifo_spider.spiders']
NEWSPIDER_MODULE = 'company_ifo_spider.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'company_ifo_spider (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'company_ifo_spider.middlewares.CompanyIfoSpiderSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'company_ifo_spider.middlewares.CompanyIfoSpiderDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
   'company_ifo_spider.pipelines.CompanyIfoSpiderPipeline': 300,
}
RETRY_ENABLED = True  # retry failed requests
RETRY_TIMES = 3
# Process responses with any HTTP error status code. Online advice generally
# recommends against this, but no better workaround has been found for this site yet.
HTTPERROR_ALLOW_ALL = True

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
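With the pipeline and retry policy configured above, the crawl is normally started with the scrapy crawl command shown in the README. As an alternative, here is a minimal sketch, not part of the repository, of starting the same spider programmatically with Scrapy's CrawlerProcess; the file name run_bafang.py and the example keyword are illustrative, and the script assumes it is run from the project root so that the company_ifo_spider package is importable.

# run_bafang.py -- illustrative runner, not part of the original project
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

from company_ifo_spider.spiders.bafang_spider import BafangSpiderSpider

if __name__ == '__main__':
    # get_project_settings() loads settings.py (pipeline, retry policy, ...);
    # the keyword kwarg is forwarded to the spider's __init__, just like
    # "scrapy crawl bafang_spider -a keyword=铜箔".
    process = CrawlerProcess(get_project_settings())
    process.crawl(BafangSpiderSpider, keyword='铜箔')
    process.start()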
--------------------------------------------------------------------------------
/company_ifo_spider/middlewares.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define here the models for your spider middleware
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html

from scrapy import signals


class CompanyIfoSpiderSpiderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.

        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.

        # Must return an iterable of Request, dict or Item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.

        # Should return either None or an iterable of Response, dict
        # or Item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn't have a response associated.

        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)


class CompanyIfoSpiderDownloaderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        # Called for each request that goes through the downloader
        # middleware.

        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called
        return None

    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.

        # Must either:
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.

        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        pass

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)

--------------------------------------------------------------------------------
/company_ifo_spider/spiders/bafang_spider.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
import scrapy
from pypinyin import pinyin, lazy_pinyin
from scrapy.selector import Selector
from company_ifo_spider.items import BafangItem
import re

# NOTE: several regular expressions and str.strip() arguments in this file contained
# literal HTML tags that were destroyed when the source was extracted. They appear
# below as '<pattern lost>' placeholders and must be restored from the live page
# markup before the spider is run.


class BafangSpiderSpider(scrapy.Spider):
    '''
    Spider for company business information on b2b168.com (八方资源网).
    Usage: open a command prompt in the project directory (company_ifo_spider)
    and run the following command:
        scrapy crawl bafang_spider -a keyword=k
    where k is the industry keyword to crawl, e.g. 铜箔 (copper foil).
    '''
    name = 'bafang_spider'

    start_urls = ['http://www.b2b168.com/']

    def __init__(self, keyword=None, *args, **kwargs):  # keyword is the Chinese keyword given on the command line
        super(BafangSpiderSpider, self).__init__(*args, **kwargs)
        print('*********************启动爬虫**************************')
        self.keyword = ''.join(lazy_pinyin(keyword))
        self.start_urls = ['https://www.b2b168.com/k-' + self.keyword + '/']
        print(self.start_urls)
        self.allowed_domains = []

    def parse(self, response):
        '''
        Collects the link to the next search-results page as well as every
        company link contained in the current page.
        :param response:
        :return:
        '''

        try:
            # The two patterns below were lost in extraction: the first matched one
            # chunk of markup per company in the result list, the second pulled the
            # company's site URL out of each chunk.
            data_link = Selector(response).re(r'<pattern lost>')
            for string in data_link:
                link = re.findall(r'<pattern lost>', string)[0]
                company_url = link + '/home.aspx'
                print('当前链接:{}'.format(company_url))
                # Crawl the company detail page.
                # b2b168.com has two kinds of company detail pages whose data is laid
                # out differently, so each kind needs its own extraction method.
                if not 'b2b168.com/c168' in link:  # URLs without this substring belong to the first kind of page
                    yield scrapy.Request(url=company_url, callback=self.parse_first_type_company_web)
                else:  # URLs containing it belong to the second kind
                    yield scrapy.Request(url=company_url, callback=self.parse_second_type_company_web)

        except Exception as e:
            print('抓取搜索结果中企业链接列表时发生异常:{}-->{}'.format(e, response.url))

        next_page = response.xpath('//ul[@class="page"]/a[last()]/@href').extract()[0]
        next_page_url = 'https://www.b2b168.com' + next_page
        print('************下一页:************' + next_page_url)
        yield scrapy.Request(url=next_page_url, callback=self.parse)

    def parse_first_type_company_web(self, response):
        '''
        Extracts data from the first kind of company detail page.
        :param response:
        :return:
        '''
        try:
            item = BafangItem()
            company_name = Selector(response).re(r'<pattern lost>.*?<pattern lost>')
            if company_name:  # a company name was found, clean it up
                company_name = company_name[0].strip('<pattern lost>').strip('<pattern lost>')
                print('企业名称---->{}'.format(company_name))
            else:  # no company name, give up on this page
                print('没有企业名称-------->{}'.format(response.url))
                return None
            company_description = Selector(response).re(r'<pattern lost>[\s\S]*<pattern lost>')
            if company_description:  # a description was found, clean it up
                company_description = company_description[0].strip('<pattern lost>').strip('<pattern lost>').replace('\r\n', '')
            else:  # no description, leave it empty
                company_description = ''
            # The closing tag of the two patterns below was lost in extraction;
            # '</td>' is restored here to match the [15:-5] slicing further down.
            data_attr = Selector(response).re(r'class="table b">.*?</td>')
            data_value = Selector(response).re(r'class="table1">[\s\S]*?</td>')
            if not (data_attr and data_value):  # no detail table found, give up on this page
                print('没有企业详细信息****************')
                return None
            item['company_web_url'] = response.url               # company page URL
            item['company_description'] = company_description    # company description
            item['company_name'] = company_name                  # company name
            item['economic_nature'] = data_value[0][15:-5]        # economic nature
            item['legal_representative'] = data_value[1][15:-5]   # legal representative or person in charge
            item['company_type'] = data_value[2][15:-5]            # company type
            item['registered_site'] = data_value[3][15:-5]         # place of registration
            item['registered_capital'] = data_value[4][15:-5]      # registered capital
            item['establish_time'] = data_value[5][15:-5]          # date of establishment
            item['employees_number'] = data_value[6][15:-5]        # number of employees
            item['monthly_production'] = data_value[7][15:-5]      # monthly output
            item['annual_turnover'] = data_value[8][15:-5]         # annual turnover
            item['annual_export_volume'] = data_value[9][15:-5]    # annual export volume
            item['certification'] = data_value[10][15:-5]          # management-system certification
            item['main_operating_place'] = data_value[11][15:-5]   # main place of business
            item['main_customer'] = data_value[12][15:-5]          # main customers
            item['workshop_area'] = data_value[13][15:-5]          # workshop area
            item['provide_OEM'] = data_value[14][15:-5]            # whether OEM is provided
            item['bank'] = data_value[15][15:-5]                   # bank of deposit
            item['bank_account'] = data_value[16][15:-5]           # bank account number
            item['main_market'] = data_value[17][15:-5]            # main markets
            item['main_products'] = data_value[18][15:-5]          # main products or services
            yield item
        except Exception as e:
            print(e.__context__)
            print('抓取信息时发生异常:{}-->url:{}'.format(e, response.url))
            return None

    def parse_second_type_company_web(self, response):
        '''
        Extracts data from the second kind of company detail page.
        :param response:
        :return:
        '''
        try:
            item = BafangItem()
            company_name = Selector(response).re(r'<pattern lost>')
            if company_name:  # a company name was found, clean it up
                company_name = company_name[0].strip('<pattern lost>')

            else:  # no company name, give up on this page
                return None
            '''
            !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
            The two commented-out lines below are a bug that has not been explained yet:
            what they extract contains only the surrounding markup while all of the real
            Chinese text is missing, even though print(response.text) shows that the
            downloaded page source does contain the Chinese text. As a workaround the
            data is extracted with re.findall() instead.
            '''
            # company_description = Selector(response).re(r'<pattern lost>')
            # company_description = response.xpath('//ul[@class="Cgsjj"]').extract()
            # The pattern below was lost in extraction; it is restored here from the
            # XPath on the previous line and should be verified against the live markup.
            company_description = re.findall(r'<ul class="Cgsjj">[\s\S]*?</ul>', response.text)
            print('************************')
            if company_description:  # a description was found, strip the markup out of it
                company_description = company_description[0]
                delete_str = re.findall(r'<[\s\S]*?>', company_description)
                for string in delete_str:
                    company_description = company_description.replace(string, '')
                company_description = company_description.replace('\n', '')
            else:  # no description, leave it empty
                company_description = ''
            print(company_name)
            print(company_description)
            item['company_description'] = company_description
            item['company_name'] = company_name
            data_attr = re.findall('<pattern lost>', response.text)
            data_value = re.findall('<pattern lost>', response.text)
            print('**************************{}'.format(len(data_attr)))
            if not (data_attr and data_value):  # no detail table found, give up on this page
                return None
            item['company_web_url'] = response.url                                     # company page URL
            item['company_name'] = company_name                                         # company name
            item['company_description'] = company_description                           # company description
            item['economic_nature'] = self.txt_wrap_by('>', '<', data_value[0])          # economic nature
            item['legal_representative'] = self.txt_wrap_by('>', '<', data_value[1])     # legal representative or person in charge
            item['company_type'] = self.txt_wrap_by('>', '<', data_value[2])              # company type
            item['registered_site'] = self.txt_wrap_by('>', '<', data_value[3])           # place of registration
            item['registered_capital'] = self.txt_wrap_by('>', '<', data_value[4])        # registered capital
            item['establish_time'] = self.txt_wrap_by('>', '<', data_value[5])            # date of establishment
            item['employees_number'] = self.txt_wrap_by('>', '<', data_value[6])          # number of employees
            item['monthly_production'] = self.txt_wrap_by('>', '<', data_value[7])        # monthly output
            item['annual_turnover'] = self.txt_wrap_by('>', '<', data_value[8])           # annual turnover
            item['annual_export_volume'] = self.txt_wrap_by('>', '<', data_value[9])      # annual export volume
            item['certification'] = self.txt_wrap_by('>', '<', data_value[10])            # management-system certification
            item['main_operating_place'] = self.txt_wrap_by('>', '<', data_value[11])     # main place of business
            item['main_customer'] = self.txt_wrap_by('>', '<', data_value[12])            # main customers
            item['workshop_area'] = self.txt_wrap_by('>', '<', data_value[13])            # workshop area
            item['provide_OEM'] = self.txt_wrap_by('>', '<', data_value[14])              # whether OEM is provided
            item['bank'] = self.txt_wrap_by('>', '<', data_value[15])                     # bank of deposit
            item['bank_account'] = self.txt_wrap_by('>', '<', data_value[16])             # bank account number
            item['main_market'] = self.txt_wrap_by('>', '<', data_value[17])              # main markets
            item['main_products'] = self.txt_wrap_by('>', '<', data_value[18])            # main products or services
            # Some detail pages on b2b168.com carry more data, i.e. data_attr and
            # data_value can be longer than 19 entries (not counting name and
            # description), but some pages have only 19, so only the first 19 values
            # are used here; part of the data beyond the 19th duplicates the first 19.
            yield item
        except Exception as e:
            print('抓取信息时发生异常:{}-->url:{}'.format(e, response.url))
            return None

    def txt_wrap_by(self, start_str, end_str, html):
        '''
        Returns the part of html that sits between start_str and end_str.
        :param start_str:
        :param end_str:
        :param html:
        :return:
        '''
        start = html.find(start_str)
        if start >= 0:
            start += len(start_str)
            end = html.find(end_str, start)
            if end >= 0:
                return html[start:end].strip()
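The comment block in parse_second_type_company_web notes that Selector(response).re() returned only markup for the description block, which forced the fallback to re.findall() plus the manual tag-stripping loop. A possible alternative, sketched here but not part of the original code and untested against the live site, is to let XPath's string() function collect the text of the same <ul class="Cgsjj"> node that the commented-out XPath targets:

def extract_description(response):
    # Hypothetical helper: concatenate all text inside <ul class="Cgsjj">,
    # which is what the re.findall() + tag-stripping loop approximates.
    text = response.xpath('string(//ul[@class="Cgsjj"])').extract_first()
    if not text:
        return ''
    # collapse the whitespace left behind where tags used to be
    return ' '.join(text.split())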
--------------------------------------------------------------------------------
/company_ifo_spider/.idea/workspace.xml:
--------------------------------------------------------------------------------
(PyCharm workspace file; its XML content was not preserved in this dump.)
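The pipeline appends scraped rows to data.csv under the Chinese header defined in pipelines.py. As a quick sanity check of the output, here is a small sketch that is not part of the repository; the two column names used below come from that header list.

import csv

# Read the crawler's output back and print name and site URL per company.
with open('data.csv', encoding='utf-8', newline='') as f:
    for row in csv.DictReader(f):
        print(row['企业名称'], row['企业网站链接'])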