├── .gitignore
├── company_ifo_spider
│   ├── __init__.py
│   ├── __pycache__
│   │   ├── items.cpython-36.pyc
│   │   ├── __init__.cpython-36.pyc
│   │   ├── pipelines.cpython-36.pyc
│   │   └── settings.cpython-36.pyc
│   ├── spiders
│   │   ├── __pycache__
│   │   │   ├── __init__.cpython-36.pyc
│   │   │   └── bafang_spider.cpython-36.pyc
│   │   ├── __init__.py
│   │   └── bafang_spider.py
│   ├── .idea
│   │   ├── modules.xml
│   │   ├── misc.xml
│   │   ├── company_ifo_spider.iml
│   │   └── workspace.xml
│   ├── items.py
│   ├── pipelines.py
│   ├── settings.py
│   └── middlewares.py
├── README.MD
└── scrapy.cfg
/.gitignore:
--------------------------------------------------------------------------------
1 | data.csv
2 | 开发笔记.docx
--------------------------------------------------------------------------------
/company_ifo_spider/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/company_ifo_spider/__pycache__/items.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ChenHuabin321/company_ino_spider/HEAD/company_ifo_spider/__pycache__/items.cpython-36.pyc
--------------------------------------------------------------------------------
/company_ifo_spider/__pycache__/__init__.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ChenHuabin321/company_ino_spider/HEAD/company_ifo_spider/__pycache__/__init__.cpython-36.pyc
--------------------------------------------------------------------------------
/company_ifo_spider/__pycache__/pipelines.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ChenHuabin321/company_ino_spider/HEAD/company_ifo_spider/__pycache__/pipelines.cpython-36.pyc
--------------------------------------------------------------------------------
/company_ifo_spider/__pycache__/settings.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ChenHuabin321/company_ino_spider/HEAD/company_ifo_spider/__pycache__/settings.cpython-36.pyc
--------------------------------------------------------------------------------
/company_ifo_spider/spiders/__pycache__/__init__.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ChenHuabin321/company_ino_spider/HEAD/company_ifo_spider/spiders/__pycache__/__init__.cpython-36.pyc
--------------------------------------------------------------------------------
/company_ifo_spider/spiders/__pycache__/bafang_spider.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ChenHuabin321/company_ino_spider/HEAD/company_ifo_spider/spiders/__pycache__/bafang_spider.cpython-36.pyc
--------------------------------------------------------------------------------
/company_ifo_spider/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 |
--------------------------------------------------------------------------------
/README.MD:
--------------------------------------------------------------------------------
1 | This project is a web crawler for company business-registration information: given an industry keyword such as 铜箔 (copper foil), it crawls the registration details of every related company listed on business-information sites such as b2b168.com (八方资源网).
2 | How to use the b2b168.com company-information spider:
3 | Open a command prompt in the project directory (company_ifo_spider) and enter the following command:
4 | scrapy crawl bafang_spider -a keyword=k
5 | where k is the industry keyword you want to crawl, e.g. 铜箔
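6 |
7 | For reference, bafang_spider.py converts the Chinese keyword to pinyin before building the search URL. A minimal sketch of that conversion (assumes pypinyin is installed; the URL format follows the spider's __init__):
8 |
9 |     from pypinyin import lazy_pinyin
10 |
11 |     keyword = '铜箔'
12 |     start_url = 'https://www.b2b168.com/k-' + ''.join(lazy_pinyin(keyword)) + '/'
13 |     print(start_url)  # https://www.b2b168.com/k-tongbo/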
--------------------------------------------------------------------------------
/scrapy.cfg:
--------------------------------------------------------------------------------
1 | # Automatically created by: scrapy startproject
2 | #
3 | # For more information about the [deploy] section see:
4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html
5 |
6 | [settings]
7 | default = company_ifo_spider.settings
8 |
9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = company_ifo_spider
12 |
--------------------------------------------------------------------------------
/company_ifo_spider/items.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define here the models for your scraped items
4 | #
5 | # See documentation in:
6 | # https://doc.scrapy.org/en/latest/topics/items.html
7 |
8 | import scrapy
9 |
10 |
11 | class BafangItem(scrapy.Item):
12 |     # define the fields for your item here like:
13 |     # name = scrapy.Field()
14 |     company_name = scrapy.Field()  # company name
15 |     company_description = scrapy.Field()  # company description
16 |     economic_nature = scrapy.Field()  # economic nature of the company
17 |     legal_representative = scrapy.Field()  # legal representative or person in charge
18 |     company_type = scrapy.Field()  # company type
19 |     registered_site = scrapy.Field()  # place of registration
20 |     registered_capital = scrapy.Field()  # registered capital
21 |     establish_time = scrapy.Field()  # date of establishment
22 |     employees_number = scrapy.Field()  # number of employees
23 |     monthly_production = scrapy.Field()  # monthly output
24 |     annual_turnover = scrapy.Field()  # annual turnover
25 |     annual_export_volume = scrapy.Field()  # annual export volume
26 |     certification = scrapy.Field()  # management system certification
27 |     main_operating_place = scrapy.Field()  # main place of business
28 |     main_customer = scrapy.Field()  # main customers
29 |     workshop_area = scrapy.Field()  # workshop floor area
30 |     provide_OEM = scrapy.Field()  # whether OEM is provided
31 |     bank = scrapy.Field()  # bank of deposit
32 |     bank_account = scrapy.Field()  # bank account number
33 |     main_market = scrapy.Field()  # main markets
34 |     main_products = scrapy.Field()  # main products or services
35 |     company_web_url = scrapy.Field()  # company website URL
36 |
--------------------------------------------------------------------------------
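Each field above is declared with scrapy.Field(); at runtime a BafangItem behaves like a dict keyed by these field names, which is how both spider callbacks and the pipeline use it. A minimal sketch, run from the project root (the values are illustrative only):

    from company_ifo_spider.items import BafangItem

    item = BafangItem()
    item['company_name'] = '某某铜箔有限公司'          # illustrative value
    item['company_web_url'] = 'https://example.b2b168.com/home.aspx'  # illustrative value
    print(dict(item))  # prints the populated fields as a plain dict
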
/company_ifo_spider/pipelines.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define your item pipelines here
4 | #
5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
6 | # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
7 | import os
8 | import csv
9 |
10 | class CompanyIfoSpiderPipeline(object):
11 |     def __init__(self):
12 |         self.file_path = 'data.csv'
13 |         self.dict_key = ['企业名称', '企业描述', '企业经济性质', '法人代表或负责人', '企业类型', '公司注册地', '注册资金', '成立时间', '员工数量', '月产量', '年营业额', '年出口额', '管理体系认证', '主要营业地点', '主要客户', '厂房面积', '是否提供OEM', '开户银行', '银行帐号', '主要市场', '主营产品或服务', '企业网站链接']
14 |         if not os.path.exists(self.file_path):  # if the output file does not exist yet, the header row has to be written first
15 |             self.file = open(self.file_path, 'a', encoding='utf-8', newline='')
16 |             self.w = csv.writer(self.file)
17 |             self.w.writerow(self.dict_key)  # write the header row
18 |         else:
19 |             self.file = open(self.file_path, 'a', encoding='utf-8', newline='')
20 |             self.w = csv.writer(self.file)
21 |     def process_item(self, item, spider):
22 |         print('**************** pipeline processing item ******************')
23 |         dict_values = [
24 |             item['company_name'],
25 |             item['company_description'],
26 |             item['economic_nature'],
27 |             item['legal_representative'],
28 |             item['company_type'],
29 |             item['registered_site'],
30 |             item['registered_capital'],
31 |             item['establish_time'],
32 |             item['employees_number'],
33 |             item['monthly_production'],
34 |             item['annual_turnover'],
35 |             item['annual_export_volume'],
36 |             item['certification'],
37 |             item['main_operating_place'],
38 |             item['main_customer'],
39 |             item['workshop_area'],
40 |             item['provide_OEM'],
41 |             item['bank'],
42 |             item['bank_account'],
43 |             item['main_market'],
44 |             item['main_products'],
45 |             item['company_web_url']]
46 |
47 |         self.w.writerow(dict_values)  # write one row of data
48 |         return item
49 |
50 |     def close_spider(self, spider):
51 |         print('********************** closing spider **********************')
52 |         self.file.flush()
53 |         self.file.close()
--------------------------------------------------------------------------------
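The pipeline appends one row per item to data.csv, in the column order fixed by dict_key. A quick way to inspect the output after a crawl (a sketch; assumes a crawl has already produced data.csv in the project directory, with the 22-column header defined in the pipeline's __init__):

    import csv

    with open('data.csv', encoding='utf-8', newline='') as f:
        for row in csv.reader(f):
            print(row[0], row[21])  # company name and company website URL columns
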
/company_ifo_spider/settings.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Scrapy settings for company_ifo_spider project
4 | #
5 | # For simplicity, this file contains only settings considered important or
6 | # commonly used. You can find more settings consulting the documentation:
7 | #
8 | # https://doc.scrapy.org/en/latest/topics/settings.html
9 | # https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
10 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html
11 |
12 | BOT_NAME = 'company_ifo_spider'
13 |
14 | SPIDER_MODULES = ['company_ifo_spider.spiders']
15 | NEWSPIDER_MODULE = 'company_ifo_spider.spiders'
16 |
17 |
18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent
19 | #USER_AGENT = 'company_ifo_spider (+http://www.yourdomain.com)'
20 |
21 | # Obey robots.txt rules
22 | ROBOTSTXT_OBEY = False
23 |
24 | # Configure maximum concurrent requests performed by Scrapy (default: 16)
25 | #CONCURRENT_REQUESTS = 32
26 |
27 | # Configure a delay for requests for the same website (default: 0)
28 | # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
29 | # See also autothrottle settings and docs
30 | #DOWNLOAD_DELAY = 3
31 | # The download delay setting will honor only one of:
32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16
33 | #CONCURRENT_REQUESTS_PER_IP = 16
34 |
35 | # Disable cookies (enabled by default)
36 | #COOKIES_ENABLED = False
37 |
38 | # Disable Telnet Console (enabled by default)
39 | #TELNETCONSOLE_ENABLED = False
40 |
41 | # Override the default request headers:
42 | #DEFAULT_REQUEST_HEADERS = {
43 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
44 | # 'Accept-Language': 'en',
45 | #}
46 |
47 | # Enable or disable spider middlewares
48 | # See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
49 | #SPIDER_MIDDLEWARES = {
50 | # 'company_ifo_spider.middlewares.CompanyIfoSpiderSpiderMiddleware': 543,
51 | #}
52 |
53 | # Enable or disable downloader middlewares
54 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
55 | #DOWNLOADER_MIDDLEWARES = {
56 | # 'company_ifo_spider.middlewares.CompanyIfoSpiderDownloaderMiddleware': 543,
57 | #}
58 |
59 | # Enable or disable extensions
60 | # See https://doc.scrapy.org/en/latest/topics/extensions.html
61 | #EXTENSIONS = {
62 | # 'scrapy.extensions.telnet.TelnetConsole': None,
63 | #}
64 |
65 | # Configure item pipelines
66 | # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
67 | ITEM_PIPELINES = {
68 | 'company_ifo_spider.pipelines.CompanyIfoSpiderPipeline': 300,
69 | }
70 | RETRY_ENABLED = True  # retry failed requests
71 | RETRY_TIMES = 3
72 | HTTPERROR_ALLOW_ALL = True  # process the response for every HTTP error code; general advice is not to set this, but no better workaround has been found for this site yet
73 | # Enable and configure the AutoThrottle extension (disabled by default)
74 | # See https://doc.scrapy.org/en/latest/topics/autothrottle.html
75 | #AUTOTHROTTLE_ENABLED = True
76 | # The initial download delay
77 | #AUTOTHROTTLE_START_DELAY = 5
78 | # The maximum download delay to be set in case of high latencies
79 | #AUTOTHROTTLE_MAX_DELAY = 60
80 | # The average number of requests Scrapy should be sending in parallel to
81 | # each remote server
82 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
83 | # Enable showing throttling stats for every response received:
84 | #AUTOTHROTTLE_DEBUG = False
85 |
86 | # Enable and configure HTTP caching (disabled by default)
87 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
88 | #HTTPCACHE_ENABLED = True
89 | #HTTPCACHE_EXPIRATION_SECS = 0
90 | #HTTPCACHE_DIR = 'httpcache'
91 | #HTTPCACHE_IGNORE_HTTP_CODES = []
92 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
93 |
--------------------------------------------------------------------------------
/company_ifo_spider/middlewares.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define here the models for your spider middleware
4 | #
5 | # See documentation in:
6 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html
7 |
8 | from scrapy import signals
9 |
10 |
11 | class CompanyIfoSpiderSpiderMiddleware(object):
12 | # Not all methods need to be defined. If a method is not defined,
13 | # scrapy acts as if the spider middleware does not modify the
14 | # passed objects.
15 |
16 | @classmethod
17 | def from_crawler(cls, crawler):
18 | # This method is used by Scrapy to create your spiders.
19 | s = cls()
20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
21 | return s
22 |
23 | def process_spider_input(self, response, spider):
24 | # Called for each response that goes through the spider
25 | # middleware and into the spider.
26 |
27 | # Should return None or raise an exception.
28 | return None
29 |
30 | def process_spider_output(self, response, result, spider):
31 | # Called with the results returned from the Spider, after
32 | # it has processed the response.
33 |
34 | # Must return an iterable of Request, dict or Item objects.
35 | for i in result:
36 | yield i
37 |
38 | def process_spider_exception(self, response, exception, spider):
39 | # Called when a spider or process_spider_input() method
40 | # (from other spider middleware) raises an exception.
41 |
42 | # Should return either None or an iterable of Response, dict
43 | # or Item objects.
44 | pass
45 |
46 | def process_start_requests(self, start_requests, spider):
47 | # Called with the start requests of the spider, and works
48 | # similarly to the process_spider_output() method, except
49 | # that it doesn’t have a response associated.
50 |
51 | # Must return only requests (not items).
52 | for r in start_requests:
53 | yield r
54 |
55 | def spider_opened(self, spider):
56 | spider.logger.info('Spider opened: %s' % spider.name)
57 |
58 |
59 | class CompanyIfoSpiderDownloaderMiddleware(object):
60 | # Not all methods need to be defined. If a method is not defined,
61 | # scrapy acts as if the downloader middleware does not modify the
62 | # passed objects.
63 |
64 | @classmethod
65 | def from_crawler(cls, crawler):
66 | # This method is used by Scrapy to create your spiders.
67 | s = cls()
68 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
69 | return s
70 |
71 | def process_request(self, request, spider):
72 | # Called for each request that goes through the downloader
73 | # middleware.
74 |
75 | # Must either:
76 | # - return None: continue processing this request
77 | # - or return a Response object
78 | # - or return a Request object
79 | # - or raise IgnoreRequest: process_exception() methods of
80 | # installed downloader middleware will be called
81 | return None
82 |
83 | def process_response(self, request, response, spider):
84 | # Called with the response returned from the downloader.
85 |
86 | # Must either;
87 | # - return a Response object
88 | # - return a Request object
89 | # - or raise IgnoreRequest
90 | return response
91 |
92 | def process_exception(self, request, exception, spider):
93 | # Called when a download handler or a process_request()
94 | # (from other downloader middleware) raises an exception.
95 |
96 | # Must either:
97 | # - return None: continue processing this exception
98 | # - return a Response object: stops process_exception() chain
99 | # - return a Request object: stops process_exception() chain
100 | pass
101 |
102 | def spider_opened(self, spider):
103 | spider.logger.info('Spider opened: %s' % spider.name)
104 |
--------------------------------------------------------------------------------
/company_ifo_spider/spiders/bafang_spider.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import scrapy
3 | from pypinyin import pinyin , lazy_pinyin
4 | from scrapy.selector import Selector
5 | from company_ifo_spider.items import BafangItem
6 | import re
7 |
8 |
9 | class BafangSpiderSpider(scrapy.Spider):
10 |     '''
11 |     Spider for company registration information on b2b168.com (八方资源网).
12 |     Usage: open a command prompt in the project directory (company_ifo_spider) and run:
13 |         scrapy crawl bafang_spider -a keyword=k
14 |     where k is the industry keyword to crawl, e.g. 铜箔 (copper foil).
15 |     '''
16 |     name = 'bafang_spider'
17 |
18 |     start_urls = ['http://www.b2b168.com/']
19 |
20 |     def __init__(self, keyword=None, *args, **kwargs):  # keyword is the Chinese industry keyword passed on the command line
21 |         super(BafangSpiderSpider, self).__init__(*args, **kwargs)
22 |         print('********************* starting spider **************************')
23 |         self.keyword = ''.join(lazy_pinyin(keyword))  # convert the keyword to pinyin, as used in the site's search URLs
24 |         self.start_urls = ['https://www.b2b168.com/k-' + self.keyword + '/']
25 |         print(self.start_urls)
26 |         self.allowed_domains = []
27 |     def parse(self, response):
28 |         '''
29 |         Collect every company link on the current result page as well as the link to the next result page.
30 |         :param response:
31 |         :return:
32 |         '''
33 |
34 |         try:
35 |             data_link = Selector(response).re(r'', string)
38 |             company_url = link + '/home.aspx'
39 |             print('current link: {}'.format(company_url))
40 |             # follow the link to the company detail page
41 |             # b2b168.com serves two kinds of company detail pages with different layouts, so each kind needs its own parser
42 |             if not 'b2b168.com/c168' in link:  # URLs without this substring belong to the first kind of page
43 |                 yield scrapy.Request(url=company_url, callback=self.parse_first_type_company_web)
44 |             else:  # URLs containing it belong to the second kind
45 |                 yield scrapy.Request(url=company_url, callback=self.parse_second_type_company_web)
46 |
47 |         except Exception as e:
48 |             print('exception while collecting company links from the search results: {} --> {}'.format(e, response.url))
49 |
50 |         next_page = response.xpath('//ul[@class="page"]/a[last()]/@href').extract()[0]
51 |         next_page_url = 'https://www.b2b168.com' + next_page
52 |         print('************ next page: ************' + next_page_url)
53 |         yield scrapy.Request(url=next_page_url, callback=self.parse)
54 |
55 |     def parse_first_type_company_web(self, response):
56 |         '''
57 |         Scrape company data from the first kind of company detail page.
58 |         :param response:
59 |         :return:
60 |         '''
61 |         try:
62 |             item = BafangItem()
63 |             company_name = Selector(response).re(r'.*?')
64 |             if company_name:  # a company name was found, so clean it up
65 |                 company_name = company_name[0].strip('').strip('')
66 |                 print('company name ----> {}'.format(company_name))
67 |             else:  # no company name found, give up on this page and return
68 |                 print('no company name --------> {}'.format(response.url))
69 |                 return None
70 |             company_description = Selector(response).re(r'[\s\S]*')
71 |             if company_description:  # a description was found, so clean it up
72 |                 company_description = company_description[0].strip('').strip('').replace('\r\n', '')
73 |             else:  # no description found, leave it empty
74 |                 company_description = ''
75 |             data_attr = Selector(response).re(r'class="table b">.*?')
76 |             data_value = Selector(response).re(r'class="table1">[\s\S]*?')
77 |             if not (data_attr and data_value):  # no company details were extracted, so give up on this page
78 |                 print('no company details ****************')
79 |                 return None
80 |             item['company_web_url'] = response.url  # company website URL
81 |             item['company_description'] = company_description  # company description
82 |             item['company_name'] = company_name  # company name
83 |             item['economic_nature'] = data_value[0][15:-5]  # economic nature
84 |             item['legal_representative'] = data_value[1][15:-5]  # legal representative or person in charge
85 |             item['company_type'] = data_value[2][15:-5]  # company type
86 |             item['registered_site'] = data_value[3][15:-5]  # place of registration
87 |             item['registered_capital'] = data_value[4][15:-5]  # registered capital
88 |             item['establish_time'] = data_value[5][15:-5]  # date of establishment
89 |             item['employees_number'] = data_value[6][15:-5]  # number of employees
90 |             item['monthly_production'] = data_value[7][15:-5]  # monthly output
91 |             item['annual_turnover'] = data_value[8][15:-5]  # annual turnover
92 |             item['annual_export_volume'] = data_value[9][15:-5]  # annual export volume
93 |             item['certification'] = data_value[10][15:-5]  # management system certification
94 |             item['main_operating_place'] = data_value[11][15:-5]  # main place of business
95 |             item['main_customer'] = data_value[12][15:-5]  # main customers
96 |             item['workshop_area'] = data_value[13][15:-5]  # workshop floor area
97 |             item['provide_OEM'] = data_value[14][15:-5]  # whether OEM is provided
98 |             item['bank'] = data_value[15][15:-5]  # bank of deposit
99 |             item['bank_account'] = data_value[16][15:-5]  # bank account number
100 |             item['main_market'] = data_value[17][15:-5]  # main markets
101 |             item['main_products'] = data_value[18][15:-5]  # main products or services
102 |             yield item
103 |         except Exception as e:
104 |             print(e.__context__)
105 |             print('exception while scraping company details: {} --> url: {}'.format(e, response.url))
106 |             return None
107 |
108 |     def parse_second_type_company_web(self, response):
109 |         '''
110 |         Scrape company data from the second kind of company detail page.
111 |         :param response:
112 |         :return:
113 |         '''
114 |         try:
115 |             item = BafangItem()
116 |             company_name = Selector(response).re(r'')
117 |             if company_name:  # a company name was found, so clean it up
118 |                 company_name = company_name[0].strip('')
119 |
120 |             else:  # no company name found, give up on this page and return
121 |                 return None
122 |             '''
123 |             !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
124 |             The two commented-out lines below are an as yet unexplained bug: the content they
125 |             extract contains only the markup, while the actual Chinese text is all missing, even though
126 |             print(response.text) shows that the downloaded page source does contain the Chinese text.
127 |             As a workaround, the data is extracted with re.findall() instead.
128 |             '''
129 |             # company_description = Selector(response).re(r'')
130 |             # company_description = response.xpath('//ul[@class="Cgsjj"]').extract()
131 |             company_description = re.findall(r'', response.text)
132 |             print('************************')
133 |             if company_description:  # a description was found, so clean it up
134 |                 company_description = company_description[0]
135 |                 delete_str = re.findall(r'<[\s\S]*?>', company_description)
136 |                 for string in delete_str:
137 |                     company_description = company_description.replace(string, '')
138 |                 company_description = company_description.replace('\n', '')
139 |             else:  # no description found, leave it empty
140 |                 company_description = ''
141 |             print(company_name)
142 |             print(company_description)
143 |             item['company_description'] = company_description
144 |             item['company_name'] = company_name
145 |             data_attr = re.findall('', response.text)
146 |             data_value = re.findall(' | ', response.text)
147 |             print('**************************{}'.format(len(data_attr)))
148 |             if not (data_attr and data_value):  # no company details were extracted, so give up on this page
149 |                 return None
150 |             item['company_web_url'] = response.url  # company website URL
151 |             item['company_name'] = company_name  # company name
152 |             item['company_description'] = company_description  # company description
153 |             item['economic_nature'] = self.txt_wrap_by('>', '<', data_value[0])  # economic nature
154 |             item['legal_representative'] = self.txt_wrap_by('>', '<', data_value[1])  # legal representative or person in charge
155 |             item['company_type'] = self.txt_wrap_by('>', '<', data_value[2])  # company type
156 |             item['registered_site'] = self.txt_wrap_by('>', '<', data_value[3])  # place of registration
157 |             item['registered_capital'] = self.txt_wrap_by('>', '<', data_value[4])  # registered capital
158 |             item['establish_time'] = self.txt_wrap_by('>', '<', data_value[5])  # date of establishment
159 |             item['employees_number'] = self.txt_wrap_by('>', '<', data_value[6])  # number of employees
160 |             item['monthly_production'] = self.txt_wrap_by('>', '<', data_value[7])  # monthly output
161 |             item['annual_turnover'] = self.txt_wrap_by('>', '<', data_value[8])  # annual turnover
162 |             item['annual_export_volume'] = self.txt_wrap_by('>', '<', data_value[9])  # annual export volume
163 |             item['certification'] = self.txt_wrap_by('>', '<', data_value[10])  # management system certification
164 |             item['main_operating_place'] = self.txt_wrap_by('>', '<', data_value[11])  # main place of business
165 |             item['main_customer'] = self.txt_wrap_by('>', '<', data_value[12])  # main customers
166 |             item['workshop_area'] = self.txt_wrap_by('>', '<', data_value[13])  # workshop floor area
167 |             item['provide_OEM'] = self.txt_wrap_by('>', '<', data_value[14])  # whether OEM is provided
168 |             item['bank'] = self.txt_wrap_by('>', '<', data_value[15])  # bank of deposit
169 |             item['bank_account'] = self.txt_wrap_by('>', '<', data_value[16])  # bank account number
170 |             item['main_market'] = self.txt_wrap_by('>', '<', data_value[17])  # main markets
171 |             item['main_products'] = self.txt_wrap_by('>', '<', data_value[18])  # main products or services
172 |             # Some company detail pages on b2b168.com actually carry more fields, i.e. data_attr and data_value
173 |             # can hold more than 19 entries (excluding name and description), but others have exactly 19, so only the first 19 values are used here; the extra entries mostly duplicate information already covered by the first 19.
174 |             yield item
175 |         except Exception as e:
176 |             print('exception while scraping company details: {} --> url: {}'.format(e, response.url))
177 |             return None
178 |
179 |     def txt_wrap_by(self, start_str, end_str, html):
180 |         '''
181 |         Return the text in html that sits between start_str and end_str.
182 |         :param start_str:
183 |         :param end_str:
184 |         :param html:
185 |         :return:
186 |         '''
187 |         start = html.find(start_str)
188 |         if start >= 0:
189 |             start += len(start_str)
190 |             end = html.find(end_str, start)
191 |             if end >= 0:
192 |                 return html[start:end].strip()
193 |
--------------------------------------------------------------------------------
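txt_wrap_by() in bafang_spider.py simply returns whatever sits between the first start_str and the following end_str, which is how the second page type's table cells are unwrapped with ('>', '<'). A standalone copy for illustration (the HTML fragment is made up):

    def txt_wrap_by(start_str, end_str, html):
        # find the end of the opening delimiter, then read up to the next closing delimiter
        start = html.find(start_str)
        if start >= 0:
            start += len(start_str)
            end = html.find(end_str, start)
            if end >= 0:
                return html[start:end].strip()

    print(txt_wrap_by('>', '<', '<td class="right">私营企业</td>'))  # 私营企业
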