├── .project
├── .pydevproject
├── .settings
│   └── org.eclipse.core.resources.prefs
├── README.md
├── finance_qa_spider
│   ├── __init__.py
│   ├── __init__.pyc
│   ├── cmdline.py
│   ├── items.py
│   ├── items.pyc
│   ├── middlewares.py
│   ├── middlewares.pyc
│   ├── pipelines.py
│   ├── pipelines.pyc
│   ├── settings.py
│   ├── settings.pyc
│   └── spiders
│       ├── __init__.py
│       ├── __init__.pyc
│       ├── shse_qa_spider.py
│       └── shse_qa_spider.pyc
└── scrapy.cfg
/.project:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <projectDescription>
3 |     <name>finance_qa_spider</name>
4 |     <comment></comment>
5 |     <projects>
6 |     </projects>
7 |     <buildSpec>
8 |         <buildCommand>
9 |             <name>org.python.pydev.PyDevBuilder</name>
10 |             <arguments>
11 |             </arguments>
12 |         </buildCommand>
13 |     </buildSpec>
14 |     <natures>
15 |         <nature>org.python.pydev.pythonNature</nature>
16 |     </natures>
17 | </projectDescription>
18 |
--------------------------------------------------------------------------------
/.pydevproject:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8" standalone="no"?>
2 | <?eclipse-pydev version="1.0"?><pydev_project>
3 | <pydev_pathproperty name="org.python.pydev.PROJECT_SOURCE_PATH">
4 | <path>/${PROJECT_DIR_NAME}</path>
5 | </pydev_pathproperty>
6 | <pydev_property name="org.python.pydev.PYTHON_PROJECT_VERSION">python 2.7</pydev_property>
7 | <pydev_property name="org.python.pydev.PYTHON_PROJECT_INTERPRETER">Default</pydev_property>
8 | </pydev_project>
9 |
--------------------------------------------------------------------------------
/.settings/org.eclipse.core.resources.prefs:
--------------------------------------------------------------------------------
1 | eclipse.preferences.version=1
2 | encoding//finance_qa_spider/cmdline.py=utf-8
3 | encoding//finance_qa_spider/items.py=utf-8
4 | encoding//finance_qa_spider/middlewares.py=utf-8
5 | encoding//finance_qa_spider/pipelines.py=utf-8
6 | encoding//finance_qa_spider/settings.py=utf-8
7 | encoding//finance_qa_spider/spiders/shse_qa_spider.py=utf-8
8 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # A Financial Q&A Text Database Built on the Scrapy Framework
2 | ---
3 | ## Language
4 | Python
5 | ## Development environment
6 | Eclipse + PyDev
7 | ## Data sources
8 | 1. Q&A platform of the Shanghai Stock Exchange (SSE)
9 | http://sns.sseinfo.com/qa.do
10 |
11 | 2. Q&A platform of the Shenzhen Stock Exchange (SZSE)
12 | http://irm.cninfo.com.cn/szse/index.html
13 |
14 | 3. P5W (Quanjing) investor relations interactive platform
15 | http://rs.p5w.net/index/company/showQuestionPage.shtml
16 |
17 | 4. Sina stock message boards (guba)
18 | http://guba.sina.com.cn/?s=channel&chi
19 |
20 | The code currently includes a spider (`shse`) for source 1, the SSE Q&A platform.
21 | ## Database table shse_qa
22 | ```sql
23 | CREATE TABLE IF NOT EXISTS `shse_qa` (
24 |   `current_time` TIMESTAMP NOT NULL,
25 |   `user_name` VARCHAR(100) NOT NULL,
26 |   `company_name` VARCHAR(100) NOT NULL,
27 |   `company_id` INT(20) NOT NULL,
28 |   `question_time` VARCHAR(100) NOT NULL,
29 |   `question_content` TEXT NOT NULL,
30 |   `answer_time` VARCHAR(100),
31 |   `answer_content` TEXT
32 | ) ENGINE=InnoDB DEFAULT CHARSET=utf8;
33 | ```
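34 |
35 | ## Running the spider
36 | `finance_qa_spider/cmdline.py` starts the SSE spider through Scrapy's command line; running it from Eclipse is equivalent to running, from the project root:
37 | ```
38 | scrapy crawl shse
39 | ```
40 | This assumes MySQL is reachable with the connection settings in `finance_qa_spider/settings.py` (database `finance_qa`) and that the `shse_qa` table above has been created.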
--------------------------------------------------------------------------------
/finance_qa_spider/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lemonhu/finance-qa-spider/8ef26903bdd1606a8aa576804400bf7df724c43f/finance_qa_spider/__init__.py
--------------------------------------------------------------------------------
/finance_qa_spider/__init__.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lemonhu/finance-qa-spider/8ef26903bdd1606a8aa576804400bf7df724c43f/finance_qa_spider/__init__.pyc
--------------------------------------------------------------------------------
/finance_qa_spider/cmdline.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import scrapy.cmdline
3 |
4 | # Convenience entry point: equivalent to "scrapy crawl shse" from the project root.
5 | if __name__ == "__main__":
6 |     scrapy.cmdline.execute(['scrapy', 'crawl', 'shse'])
--------------------------------------------------------------------------------
/finance_qa_spider/items.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-

2 |
3 | # Define here the models for your scraped items
4 | #
5 | # See documentation in:
6 | # http://doc.scrapy.org/en/latest/topics/items.html
7 |
8 | import scrapy
9 |
10 |
11 | class FinanceQaSpiderItem(scrapy.Item):
12 |     # define the fields for your item here like:
13 |     # name = scrapy.Field()
14 |     pass
15 |
16 |
17 | class QAItem(scrapy.Item):
18 |     user_name = scrapy.Field()          # user name of the asker
19 |     company_name = scrapy.Field()       # name of the company being asked
20 |     company_id = scrapy.Field()         # company ID (the code in parentheses)
21 |     question_time = scrapy.Field()      # time the question was posted
22 |     question_content = scrapy.Field()   # question text
23 |     answer_time = scrapy.Field()        # time the answer was posted
24 |     answer_content = scrapy.Field()     # answer text
--------------------------------------------------------------------------------
/finance_qa_spider/items.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lemonhu/finance-qa-spider/8ef26903bdd1606a8aa576804400bf7df724c43f/finance_qa_spider/items.pyc
--------------------------------------------------------------------------------
/finance_qa_spider/middlewares.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define here the models for your spider middleware
4 | #
5 | # See documentation in:
6 | # http://doc.scrapy.org/en/latest/topics/spider-middleware.html
7 |
8 | from scrapy import signals
9 | import random
10 |
11 |
12 | class FinanceQaSpiderSpiderMiddleware(object):
13 |     # Not all methods need to be defined. If a method is not defined,
14 |     # scrapy acts as if the spider middleware does not modify the
15 |     # passed objects.
16 |
17 |     @classmethod
18 |     def from_crawler(cls, crawler):
19 |         # This method is used by Scrapy to create your spiders.
20 |         s = cls()
21 |         crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
22 |         return s
23 |
24 |     def process_spider_input(self, response, spider):
25 |         # Called for each response that goes through the spider
26 |         # middleware and into the spider.
27 |
28 |         # Should return None or raise an exception.
29 |         return None
30 |
31 |     def process_spider_output(self, response, result, spider):
32 |         # Called with the results returned from the Spider, after
33 |         # it has processed the response.
34 |
35 |         # Must return an iterable of Request, dict or Item objects.
36 |         for i in result:
37 |             yield i
38 |
39 |     def process_spider_exception(self, response, exception, spider):
40 |         # Called when a spider or process_spider_input() method
41 |         # (from other spider middleware) raises an exception.
42 |
43 |         # Should return either None or an iterable of Response, dict
44 |         # or Item objects.
45 |         pass
46 |
47 |     def process_start_requests(self, start_requests, spider):
48 |         # Called with the start requests of the spider, and works
49 |         # similarly to the process_spider_output() method, except
50 |         # that it doesn’t have a response associated.
51 |
52 |         # Must return only requests (not items).
53 |         for r in start_requests:
54 |             yield r
55 |
56 |     def spider_opened(self, spider):
57 |         spider.logger.info('Spider opened: %s' % spider.name)
58 |
59 |
60 | # Downloader middleware that sets a random user agent on each request
61 | class RandomUserAgent(object):
62 |     """Randomly rotate user agents based on a list of predefined ones"""
63 |
64 |     def __init__(self, agents):
65 |         self.agents = agents
66 |
67 |     @classmethod
68 |     def from_crawler(cls, crawler):
69 |         return cls(crawler.settings.getlist('USER_AGENTS'))
70 |
71 |     def process_request(self, request, spider):
72 |         # Choose the agent once so the logged value matches the header actually sent
73 |         agent = random.choice(self.agents)
74 |         spider.logger.info('Using User-Agent: %s' % agent)
75 |         request.headers.setdefault('User-Agent', agent)
--------------------------------------------------------------------------------
/finance_qa_spider/middlewares.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lemonhu/finance-qa-spider/8ef26903bdd1606a8aa576804400bf7df724c43f/finance_qa_spider/middlewares.pyc
--------------------------------------------------------------------------------
/finance_qa_spider/pipelines.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import MySQLdb
3 | import MySQLdb.cursors
4 | from twisted.enterprise import adbapi
5 | # Define your item pipelines here
6 | #
7 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
8 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
9 |
10 |
11 | class FinanceQaSpiderPipeline(object):
12 |
13 |     def __init__(self, dbpool):
14 |         self.dbpool = dbpool
15 |
16 |     @classmethod
17 |     def from_settings(cls, settings):
18 |         dbparams = dict(
19 |             host=settings['MYSQL_HOST'],
20 |             port=settings['MYSQL_PORT'],
21 |             user=settings['MYSQL_USER'],
22 |             passwd=settings['MYSQL_PASSWD'],
23 |             db=settings['MYSQL_DB'],
24 |             charset='utf8',
25 |             use_unicode=True,
26 |             cursorclass=MySQLdb.cursors.DictCursor,
27 |         )
28 |         # Twisted's asynchronous connection pool around MySQLdb
29 |         dbpool = adbapi.ConnectionPool('MySQLdb', **dbparams)
30 |         return cls(dbpool)
31 |
32 |     # Called by Scrapy for every item the spider yields
33 |     def process_item(self, item, spider):
34 |         query = self.dbpool.runInteraction(self._conditional_insert, item)  # run the insert in the pool
35 |         query.addErrback(self._handle_error, item, spider)  # attach the error handler
36 |         return item
37 |
38 |     # Write one item to the database
39 |     def _conditional_insert(self, tx, item):
40 |         sql = 'INSERT INTO shse_qa(user_name, company_name, company_id, question_time, question_content, answer_time, answer_content) VALUES(%s, %s, %s, %s, %s, %s, %s)'
41 |         params = (item['user_name'], item['company_name'], item['company_id'], item['question_time'], item['question_content'], item['answer_time'], item['answer_content'])
42 |         tx.execute(sql, params)
43 |
44 |         print('user_name: ' + item['user_name'])  # debug: show the user just inserted
45 |
46 |     # Error handler for failed inserts
47 |     def _handle_error(self, failure, item, spider):
48 |         spider.logger.error(failure)
--------------------------------------------------------------------------------
/finance_qa_spider/pipelines.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lemonhu/finance-qa-spider/8ef26903bdd1606a8aa576804400bf7df724c43f/finance_qa_spider/pipelines.pyc
--------------------------------------------------------------------------------
/finance_qa_spider/settings.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Scrapy settings for finance_qa_spider project
4 | #
5 | # For simplicity, this file contains only settings considered important or
6 | # commonly used. You can find more settings consulting the documentation:
7 | #
8 | # http://doc.scrapy.org/en/latest/topics/settings.html
9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
11 |
12 | BOT_NAME = 'finance_qa_spider'
13 |
14 | SPIDER_MODULES = ['finance_qa_spider.spiders']
15 | NEWSPIDER_MODULE = 'finance_qa_spider.spiders'
16 |
17 | LOG_LEVEL = 'INFO'
18 |
19 | # Crawl responsibly by identifying yourself (and your website) on the user-agent
20 | #USER_AGENT = 'finance_qa_spider (+http://www.yourdomain.com)'
21 | # Pool of user agents for the RandomUserAgent middleware to rotate through
22 | USER_AGENTS = [
23 |     'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36',
24 |     'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.79 Safari/537.36 Edge/14.14393',
25 |     'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:55.0) Gecko/20100101 Firefox/55.0',
26 |     'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36',
27 |     'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36',
28 | ]
29 |
30 |
31 | # Obey robots.txt rules
32 | ROBOTSTXT_OBEY = False  # default is True
33 |
34 | # Configure maximum concurrent requests performed by Scrapy (default: 16)
35 | #CONCURRENT_REQUESTS = 32
36 |
37 | # Configure a delay for requests for the same website (default: 0)
38 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
39 | # See also autothrottle settings and docs
40 | DOWNLOAD_DELAY = 5  # actual wait is DOWNLOAD_DELAY scaled by a random factor between 0.5 and 1.5, in seconds
41 | # The download delay setting will honor only one of:
42 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16
43 | #CONCURRENT_REQUESTS_PER_IP = 16
44 |
45 | # Disable cookies (enabled by default)
46 | COOKIES_ENABLED = False
47 |
48 | # Disable Telnet Console (enabled by default)
49 | #TELNETCONSOLE_ENABLED = False
50 |
51 | # Override the default request headers:
52 | #DEFAULT_REQUEST_HEADERS = {
53 | #    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
54 | #    'Accept-Language': 'en',
55 | #}
56 |
57 | # Enable or disable spider middlewares
58 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
59 | # (RandomUserAgent only implements process_request, so it belongs in
60 | # DOWNLOADER_MIDDLEWARES below, not here.)
61 | #SPIDER_MIDDLEWARES = {
62 | #    'finance_qa_spider.middlewares.FinanceQaSpiderSpiderMiddleware': 543,
63 | #}
64 |
65 | # Enable or disable downloader middlewares
66 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
67 | DOWNLOADER_MIDDLEWARES = {
68 |     #'finance_qa_spider.middlewares.MyCustomDownloaderMiddleware': 543,
69 |     'finance_qa_spider.middlewares.RandomUserAgent': 1,  # rotate user agents (see middlewares.py)
70 | }
71 |
72 | # Enable or disable extensions
73 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
74 | #EXTENSIONS = {
75 | #    'scrapy.extensions.telnet.TelnetConsole': None,
76 | #}
77 |
78 | # MySQL connection settings (used by FinanceQaSpiderPipeline)
79 | MYSQL_HOST = 'localhost'  # host name (i.e. 127.0.0.1)
80 | MYSQL_PORT = 3306         # port
81 | MYSQL_USER = 'root'       # user name
82 | MYSQL_PASSWD = '123456'   # password
83 | MYSQL_DB = 'finance_qa'   # database name
84 |
85 | # Configure item pipelines
86 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
87 | ITEM_PIPELINES = {
88 |     'finance_qa_spider.pipelines.FinanceQaSpiderPipeline': 300,  # save items to MySQL
89 | }
90 |
91 | # Enable and configure the AutoThrottle extension (disabled by default)
92 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html
93 | #AUTOTHROTTLE_ENABLED = True
94 | # The initial download delay
95 | #AUTOTHROTTLE_START_DELAY = 5
96 | # The maximum download delay to be set in case of high latencies
97 | #AUTOTHROTTLE_MAX_DELAY = 60
98 | # The average number of requests Scrapy should be sending in parallel to
99 | # each remote server
100 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
101 | # Enable showing throttling stats for every response received:
102 | #AUTOTHROTTLE_DEBUG = False
103 |
104 | # Enable and configure HTTP caching (disabled by default)
105 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
106 | #HTTPCACHE_ENABLED = True
107 | #HTTPCACHE_EXPIRATION_SECS = 0
108 | #HTTPCACHE_DIR = 'httpcache'
109 | #HTTPCACHE_IGNORE_HTTP_CODES = []
110 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
111 |
--------------------------------------------------------------------------------
/finance_qa_spider/settings.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lemonhu/finance-qa-spider/8ef26903bdd1606a8aa576804400bf7df724c43f/finance_qa_spider/settings.pyc
--------------------------------------------------------------------------------
/finance_qa_spider/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 |
--------------------------------------------------------------------------------
/finance_qa_spider/spiders/__init__.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lemonhu/finance-qa-spider/8ef26903bdd1606a8aa576804400bf7df724c43f/finance_qa_spider/spiders/__init__.pyc
--------------------------------------------------------------------------------
/finance_qa_spider/spiders/shse_qa_spider.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | from scrapy import Spider
3 | from scrapy.http import Request
4 |
5 | from finance_qa_spider.items import QAItem
6 |
7 |
8 | class QASpider(Spider):
9 |     """Crawls the SSE Q&A feed (sns.sseinfo.com) page by page."""
10 |     name = 'shse'
11 |     allowed_domains = ['sseinfo.com']
12 |     start_urls = ['http://sns.sseinfo.com/ajax/feeds.do?type=11&pageSize=10&lastid=-1&show=1&page=1']
13 |     page = 1
14 |
15 |     def parse(self, response):
16 |         # Each Q&A entry sits in a div with class "m_feed_item"
17 |         sel = response.xpath('//*[@class="m_feed_item"]')
18 |         for s in sel:
19 |             item = QAItem()
20 |             item['user_name'] = s.xpath('./div[@class="m_feed_detail m_qa_detail"]/div[@class="m_feed_face"]/p/text()').extract_first().strip()
21 |             # The company link reads "name(code)"; split it into the two fields
22 |             company = s.xpath('./div[@class="m_feed_detail m_qa_detail"]/div[@class="m_feed_cnt "]/div[@class="m_feed_txt"]/a/text()').extract_first().strip()
23 |             company_split = company.split('(')
24 |             item['company_name'] = company_split[0].replace(':', '')
25 |             item['company_id'] = company_split[1].replace(')', '')
26 |             item['question_time'] = s.xpath('./div[@class="m_feed_detail m_qa_detail"]/div[@class="m_feed_cnt "]/div[@class="m_feed_func"]/div[@class="m_feed_from"]/span/text()').extract_first().strip()
27 |             item['question_content'] = s.xpath('./div[@class="m_feed_detail m_qa_detail"]/div[@class="m_feed_cnt "]/div[@class="m_feed_txt"]/text()').extract()[1].strip()
28 |             item['answer_time'] = s.xpath('./div[@class="m_feed_detail m_qa"]/div[@class="m_feed_func top10"]/div[@class="m_feed_from"]/span/text()').extract_first().strip()
29 |             item['answer_content'] = s.xpath('./div[@class="m_feed_detail m_qa"]/div[@class="m_feed_cnt"]/div[@class="m_feed_txt"]/text()').extract_first().strip()
30 |             yield item
31 |
32 |         # Request the next page; stop once a page comes back with no items
33 |         if sel:
34 |             self.page += 1
35 |             next_url = 'http://sns.sseinfo.com/ajax/feeds.do?type=11&pageSize=10&lastid=-1&show=1&page=' + str(self.page)
36 |             yield Request(next_url, callback=self.parse)
--------------------------------------------------------------------------------
/finance_qa_spider/spiders/shse_qa_spider.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lemonhu/finance-qa-spider/8ef26903bdd1606a8aa576804400bf7df724c43f/finance_qa_spider/spiders/shse_qa_spider.pyc
--------------------------------------------------------------------------------
/scrapy.cfg:
--------------------------------------------------------------------------------
1 | # Automatically created by: scrapy startproject
2 | #
3 | # For more information about the [deploy] section see:
4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html
5 |
6 | [settings]
7 | default = finance_qa_spider.settings
8 |
9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = finance_qa_spider
12 |
--------------------------------------------------------------------------------