├── .project
├── .pydevproject
├── .settings
│   └── org.eclipse.core.resources.prefs
├── README.md
├── finance_qa_spider
│   ├── __init__.py
│   ├── __init__.pyc
│   ├── cmdline.py
│   ├── items.py
│   ├── items.pyc
│   ├── middlewares.py
│   ├── middlewares.pyc
│   ├── pipelines.py
│   ├── pipelines.pyc
│   ├── settings.py
│   ├── settings.pyc
│   └── spiders
│       ├── __init__.py
│       ├── __init__.pyc
│       ├── shse_qa_spider.py
│       └── shse_qa_spider.pyc
└── scrapy.cfg

--------------------------------------------------------------------------------
/.project:
--------------------------------------------------------------------------------
<?xml version="1.0" encoding="UTF-8"?>
<projectDescription>
	<name>finance_qa_spider</name>
	<comment></comment>
	<projects>
	</projects>
	<buildSpec>
		<buildCommand>
			<name>org.python.pydev.PyDevBuilder</name>
			<arguments>
			</arguments>
		</buildCommand>
	</buildSpec>
	<natures>
		<nature>org.python.pydev.pythonNature</nature>
	</natures>
</projectDescription>

--------------------------------------------------------------------------------
/.pydevproject:
--------------------------------------------------------------------------------
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<?eclipse-pydev version="1.0"?><pydev_project>
<pydev_pathproperty name="org.python.pydev.PROJECT_SOURCE_PATH">
<path>/${PROJECT_DIR_NAME}</path>
</pydev_pathproperty>
<pydev_property name="org.python.pydev.PYTHON_PROJECT_VERSION">python 2.7</pydev_property>
<pydev_property name="org.python.pydev.PYTHON_PROJECT_INTERPRETER">Default</pydev_property>
</pydev_project>

--------------------------------------------------------------------------------
/.settings/org.eclipse.core.resources.prefs:
--------------------------------------------------------------------------------
eclipse.preferences.version=1
encoding//finance_qa_spider/cmdline.py=utf-8
encoding//finance_qa_spider/items.py=utf-8
encoding//finance_qa_spider/middlewares.py=utf-8
encoding//finance_qa_spider/pipelines.py=utf-8
encoding//finance_qa_spider/settings.py=utf-8
encoding//finance_qa_spider/spiders/shse_qa_spider.py=utf-8

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Building a Financial Q&A Text Database with the Scrapy Framework
---
## Development language
Python
## Development platform
Eclipse + PyDev
## Data sources
1. The Shanghai Stock Exchange's official Q&A platform
http://sns.sseinfo.com/qa.do

2. The Shenzhen Stock Exchange's official Q&A platform
http://irm.cninfo.com.cn/szse/index.html

3. The p5w.net (全景网) investor-relations interaction platform
http://rs.p5w.net/index/company/showQuestionPage.shtml

4. Sina Guba (新浪股吧) stock forum
http://guba.sina.com.cn/?s=channel&chi
## Database table shse_qa
mysql> CREATE TABLE IF NOT EXISTS `shse_qa`(
    ->   `current_time` TIMESTAMP NOT NULL,
    ->   `user_name` VARCHAR(100) NOT NULL,
    ->   `company_name` VARCHAR(100) NOT NULL,
    ->   `company_id` INT(20) NOT NULL,
    ->   `question_time` VARCHAR(100) NOT NULL,
    ->   `question_content` TEXT NOT NULL,
    ->   `answer_time` VARCHAR(100),
    ->   `answer_content` TEXT
    -> ) ENGINE=InnoDB DEFAULT CHARSET=utf8;
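
## Database setup and running the crawl
The connection parameters live in finance_qa_spider/settings.py; the table above is assumed to sit in the `finance_qa` database named there, so that database must exist before the first crawl. A minimal setup, using the names from settings.py:

mysql> CREATE DATABASE IF NOT EXISTS `finance_qa` DEFAULT CHARSET=utf8;
mysql> USE finance_qa;

The crawl can then be started from the project root with `scrapy crawl shse`, or by running finance_qa_spider/cmdline.py.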
--------------------------------------------------------------------------------
/finance_qa_spider/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lemonhu/finance-qa-spider/8ef26903bdd1606a8aa576804400bf7df724c43f/finance_qa_spider/__init__.py

--------------------------------------------------------------------------------
/finance_qa_spider/__init__.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lemonhu/finance-qa-spider/8ef26903bdd1606a8aa576804400bf7df724c43f/finance_qa_spider/__init__.pyc

--------------------------------------------------------------------------------
/finance_qa_spider/cmdline.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
# Convenience entry point: equivalent to running `scrapy crawl shse` on the command line.
import scrapy.cmdline

if __name__ == "__main__":
    scrapy.cmdline.execute(['scrapy', 'crawl', 'shse'])

--------------------------------------------------------------------------------
/finance_qa_spider/items.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class FinanceQaSpiderItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    pass


class QAItem(scrapy.Item):
    user_name = scrapy.Field()          # user name of the questioner
    company_name = scrapy.Field()       # name of the company being asked
    company_id = scrapy.Field()         # company (stock) ID
    question_time = scrapy.Field()      # time the question was asked
    question_content = scrapy.Field()   # question text
    answer_time = scrapy.Field()        # time the question was answered
    answer_content = scrapy.Field()     # answer text

--------------------------------------------------------------------------------
/finance_qa_spider/items.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lemonhu/finance-qa-spider/8ef26903bdd1606a8aa576804400bf7df724c43f/finance_qa_spider/items.pyc
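
Note: QAItem is dict-like, and its field names must match the keys read by the pipeline's INSERT statement. A minimal sketch with hypothetical sample values:

    # -*- coding: utf-8 -*-
    from finance_qa_spider.items import QAItem

    item = QAItem()
    item['user_name'] = u'example_user'  # hypothetical value
    item['company_id'] = u'600000'       # hypothetical value
    print dict(item)                     # shows the populated fields as a plain dict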
--------------------------------------------------------------------------------
/finance_qa_spider/middlewares.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define here the models for your spider middleware
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/spider-middleware.html

from scrapy import signals
import random


class FinanceQaSpiderSpiderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.

        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.

        # Must return an iterable of Request, dict or Item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.

        # Should return either None or an iterable of Response, dict
        # or Item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn't have a response associated.

        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)


# Downloader middleware used to set the User-Agent dynamically per request
class RandomUserAgent(object):
    """Randomly rotate user agents based on a list of predefined ones"""

    def __init__(self, agents):
        self.agents = agents

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler.settings.getlist('USER_AGENTS'))

    def process_request(self, request, spider):
        # Pick one agent and reuse it, so the agent that is printed is
        # also the one actually sent with the request.
        agent = random.choice(self.agents)
        print "**************************" + agent
        request.headers.setdefault('User-Agent', agent)

--------------------------------------------------------------------------------
/finance_qa_spider/middlewares.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lemonhu/finance-qa-spider/8ef26903bdd1606a8aa576804400bf7df724c43f/finance_qa_spider/middlewares.pyc
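
Note: RandomUserAgent can be exercised outside a running crawl, since it only needs an agents list and a request. A minimal sketch (the agent strings are hypothetical; the URL is one of the project's data sources):

    # -*- coding: utf-8 -*-
    from scrapy.http import Request
    from finance_qa_spider.middlewares import RandomUserAgent

    mw = RandomUserAgent(['agent-a', 'agent-b'])  # hypothetical agent strings
    req = Request('http://sns.sseinfo.com/qa.do')
    mw.process_request(req, spider=None)
    print req.headers['User-Agent']               # one of the two agents above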
--------------------------------------------------------------------------------
/finance_qa_spider/pipelines.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html

import MySQLdb
import MySQLdb.cursors
from twisted.enterprise import adbapi


class FinanceQaSpiderPipeline(object):

    def __init__(self, dbpool):
        self.dbpool = dbpool

    @classmethod
    def from_settings(cls, settings):
        dbparams = dict(
            host=settings['MYSQL_HOST'],
            port=settings['MYSQL_PORT'],
            user=settings['MYSQL_USER'],
            passwd=settings['MYSQL_PASSWD'],
            db=settings['MYSQL_DB'],
            charset='utf8',
            use_unicode=True,
            cursorclass=MySQLdb.cursors.DictCursor,
        )

        dbpool = adbapi.ConnectionPool('MySQLdb', **dbparams)
        return cls(dbpool)

    # Called by the pipeline for every item
    def process_item(self, item, spider):
        query = self.dbpool.runInteraction(self._conditional_insert, item)  # run the insert through the connection pool
        query.addErrback(self._handle_error, item, spider)                  # hook up the error handler
        return item

    # Write the item to the database
    def _conditional_insert(self, tx, item):
        sql = 'INSERT INTO shse_qa(user_name, company_name, company_id, question_time, question_content, answer_time, answer_content) VALUES(%s, %s, %s, %s, %s, %s, %s)'
        params = (item['user_name'], item['company_name'], item['company_id'], item['question_time'], item['question_content'], item['answer_time'], item['answer_content'])
        tx.execute(sql, params)

        print 'user_name:' + item['user_name']

    # Error handler for failed inserts
    def _handle_error(self, failure, item, spider):
        print failure

--------------------------------------------------------------------------------
/finance_qa_spider/pipelines.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lemonhu/finance-qa-spider/8ef26903bdd1606a8aa576804400bf7df724c43f/finance_qa_spider/pipelines.pyc
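
Note: the pipeline routes writes through twisted.enterprise.adbapi so they run in a thread pool instead of blocking Scrapy's event loop. For comparison, a sketch of the same insert with a plain blocking MySQLdb connection (all values hypothetical; connection parameters taken from settings.py):

    # -*- coding: utf-8 -*-
    import MySQLdb

    conn = MySQLdb.connect(host='localhost', port=3306, user='root',
                           passwd='123456', db='finance_qa', charset='utf8')
    cur = conn.cursor()
    sql = ('INSERT INTO shse_qa(user_name, company_name, company_id, question_time, '
           'question_content, answer_time, answer_content) '
           'VALUES(%s, %s, %s, %s, %s, %s, %s)')
    params = (u'user', u'company', u'600000', u'2017-08-01', u'question?', u'2017-08-02', u'answer.')
    cur.execute(sql, params)
    conn.commit()
    conn.close()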
--------------------------------------------------------------------------------
/finance_qa_spider/settings.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Scrapy settings for finance_qa_spider project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     http://doc.scrapy.org/en/latest/topics/settings.html
#     http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#     http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'finance_qa_spider'

SPIDER_MODULES = ['finance_qa_spider.spiders']
NEWSPIDER_MODULE = 'finance_qa_spider.spiders'

LOG_LEVEL = 'INFO'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'finance_qa_spider (+http://www.yourdomain.com)'
USER_AGENTS = [
    'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.79 Safari/537.36 Edge/14.14393',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:55.0) Gecko/20100101 Firefox/55.0',
    'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36',
]


# Obey robots.txt rules
ROBOTSTXT_OBEY = False  # the default is True

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 5  # the actual wait is DOWNLOAD_DELAY multiplied by a random value between 0.5 and 1.5, in seconds
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
SPIDER_MIDDLEWARES = {
    #'finance_qa_spider.middlewares.FinanceQaSpiderSpiderMiddleware': 543,
    #'finance_qa_spider.middlewares.RandomUserAgent': 543,  # RandomUserAgent is a downloader middleware; enabling it here has no effect
}

# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
    #'finance_qa_spider.middlewares.MyCustomDownloaderMiddleware': 543,
    'finance_qa_spider.middlewares.RandomUserAgent': 1,  # rotate the User-Agent on every request
}

# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#   'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Database configuration
MYSQL_HOST = 'localhost'  # host (i.e. 127.0.0.1)
MYSQL_PORT = 3306         # port
MYSQL_USER = 'root'       # user name
MYSQL_PASSWD = '123456'   # password
MYSQL_DB = 'finance_qa'   # database name

# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'finance_qa_spider.pipelines.FinanceQaSpiderPipeline': 300,  # store scraped items in MySQL
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

--------------------------------------------------------------------------------
/finance_qa_spider/settings.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lemonhu/finance-qa-spider/8ef26903bdd1606a8aa576804400bf7df724c43f/finance_qa_spider/settings.pyc

--------------------------------------------------------------------------------
/finance_qa_spider/spiders/__init__.py:
--------------------------------------------------------------------------------
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
--------------------------------------------------------------------------------
/finance_qa_spider/spiders/__init__.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lemonhu/finance-qa-spider/8ef26903bdd1606a8aa576804400bf7df724c43f/finance_qa_spider/spiders/__init__.pyc

--------------------------------------------------------------------------------
/finance_qa_spider/spiders/shse_qa_spider.py:
--------------------------------------------------------------------------------
# -*- coding:utf-8 -*-
from scrapy import Spider
from scrapy.http import Request

from finance_qa_spider.items import QAItem


class QASpider(Spider):
    name = 'shse'
    allowed_domains = ['sseinfo.com']
    start_urls = ['http://sns.sseinfo.com/ajax/feeds.do?type=11&pageSize=10&lastid=-1&show=1&page=1']
    page = 1

    def parse(self, response):
        sel = response.xpath('//*[@class="m_feed_item"]')
        for s in sel:
            item = QAItem()
            item['user_name'] = s.xpath('./div[@class="m_feed_detail m_qa_detail"]/div[@class="m_feed_face"]/p/text()').extract_first().strip()
            # The company link text looks like "name(code):"; split it into name and ID
            company = s.xpath('./div[@class="m_feed_detail m_qa_detail"]/div[@class="m_feed_cnt "]/div[@class="m_feed_txt"]/a/text()').extract_first().strip()
            company_split = company.split('(')
            item['company_name'] = company_split[0].replace(':', '')
            item['company_id'] = company_split[1].replace(')', '')
            item['question_time'] = s.xpath('./div[@class="m_feed_detail m_qa_detail"]/div[@class="m_feed_cnt "]/div[@class="m_feed_func"]/div[@class="m_feed_from"]/span/text()').extract_first().strip()
            item['question_content'] = s.xpath('./div[@class="m_feed_detail m_qa_detail"]/div[@class="m_feed_cnt "]/div[@class="m_feed_txt"]/text()').extract()[1].strip()
            # Unanswered questions have no answer nodes, so fall back to an empty string
            item['answer_time'] = s.xpath('./div[@class="m_feed_detail m_qa"]/div[@class="m_feed_func top10"]/div[@class="m_feed_from"]/span/text()').extract_first(default='').strip()
            item['answer_content'] = s.xpath('./div[@class="m_feed_detail m_qa"]/div[@class="m_feed_cnt"]/div[@class="m_feed_txt"]/text()').extract_first(default='').strip()
            yield item

        # Stop once a page comes back with no feed items; otherwise request the next page
        if not sel:
            return
        self.page += 1
        next_url = 'http://sns.sseinfo.com/ajax/feeds.do?type=11&pageSize=10&lastid=-1&show=1&page=' + str(self.page)
        yield Request(next_url, callback=self.parse)

--------------------------------------------------------------------------------
/finance_qa_spider/spiders/shse_qa_spider.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lemonhu/finance-qa-spider/8ef26903bdd1606a8aa576804400bf7df724c43f/finance_qa_spider/spiders/shse_qa_spider.pyc

--------------------------------------------------------------------------------
/scrapy.cfg:
--------------------------------------------------------------------------------
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.org/en/latest/deploy.html

[settings]
default = finance_qa_spider.settings

[deploy]
#url = http://localhost:6800/
project = finance_qa_spider
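
# To deploy to a scrapyd instance (assuming scrapyd and scrapyd-client are
# installed), uncomment the url line above and run:
#
#   scrapyd-deploy -p finance_qa_spider
#
# For a local run, `scrapy crawl shse` or finance_qa_spider/cmdline.py is enough.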
--------------------------------------------------------------------------------