├── .project
├── .pydevproject
├── .settings
│   └── org.eclipse.core.resources.prefs
├── README.md
├── finance_qa_spider
│   ├── __init__.py
│   ├── __init__.pyc
│   ├── cmdline.py
│   ├── items.py
│   ├── items.pyc
│   ├── middlewares.py
│   ├── middlewares.pyc
│   ├── pipelines.py
│   ├── pipelines.pyc
│   ├── settings.py
│   ├── settings.pyc
│   └── spiders
│       ├── __init__.py
│       ├── __init__.pyc
│       ├── shse_qa_spider.py
│       └── shse_qa_spider.pyc
└── scrapy.cfg
/.project:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <projectDescription>
3 |     <name>finance_qa_spider</name>
4 |     <comment></comment>
5 |     <projects>
6 |     </projects>
7 |     <buildSpec>
8 |         <buildCommand>
9 |             <name>org.python.pydev.PyDevBuilder</name>
10 |             <arguments>
11 |             </arguments>
12 |         </buildCommand>
13 |     </buildSpec>
14 |     <natures>
15 |         <nature>org.python.pydev.pythonNature</nature>
16 |     </natures>
17 | </projectDescription>
18 |
--------------------------------------------------------------------------------
/.pydevproject:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8" standalone="no"?>
2 | <?eclipse-pydev version="1.0"?><pydev_project>
3 | <pydev_pathproperty name="org.python.pydev.PROJECT_SOURCE_PATH">
4 | <path>/${PROJECT_DIR_NAME}</path>
5 | </pydev_pathproperty>
6 | <pydev_property name="org.python.pydev.PYTHON_PROJECT_VERSION">python 2.7</pydev_property>
7 | <pydev_property name="org.python.pydev.PYTHON_PROJECT_INTERPRETER">Default</pydev_property>
8 | </pydev_project>
9 |
--------------------------------------------------------------------------------
/.settings/org.eclipse.core.resources.prefs:
--------------------------------------------------------------------------------
1 | eclipse.preferences.version=1
2 | encoding//finance_qa_spider/cmdline.py=utf-8
3 | encoding//finance_qa_spider/items.py=utf-8
4 | encoding//finance_qa_spider/middlewares.py=utf-8
5 | encoding//finance_qa_spider/pipelines.py=utf-8
6 | encoding//finance_qa_spider/settings.py=utf-8
7 | encoding//finance_qa_spider/spiders/shse_qa_spider.py=utf-8
8 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # A Financial Q&A Text Database Built on the Scrapy Framework
2 | ---
3 | ## Language
4 | Python
5 | ## Development environment
6 | Eclipse + PyDev
7 | ## Data sources
8 | 1. Q&A platform of the Shanghai Stock Exchange (SSE)
9 | http://sns.sseinfo.com/qa.do
10 |
11 | 2. Q&A platform of the Shenzhen Stock Exchange (SZSE)
12 | http://irm.cninfo.com.cn/szse/index.html
13 |
14 | 3. P5W (Quanjing) investor relations interactive platform
15 | http://rs.p5w.net/index/company/showQuestionPage.shtml
16 |
17 | 4. Sina stock message boards (guba)
18 | http://guba.sina.com.cn/?s=channel&chi
19 |
20 | The code currently includes a spider (`shse`) for source 1, the SSE Q&A platform.
21 | ## Database table shse_qa
22 | ```sql
23 | CREATE TABLE IF NOT EXISTS `shse_qa` (
24 |   `current_time` TIMESTAMP NOT NULL,
25 |   `user_name` VARCHAR(100) NOT NULL,
26 |   `company_name` VARCHAR(100) NOT NULL,
27 |   `company_id` INT(20) NOT NULL,
28 |   `question_time` VARCHAR(100) NOT NULL,
29 |   `question_content` TEXT NOT NULL,
30 |   `answer_time` VARCHAR(100),
31 |   `answer_content` TEXT
32 | ) ENGINE=InnoDB DEFAULT CHARSET=utf8;
33 | ```
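34 |
35 | ## Running the spider
36 | `finance_qa_spider/cmdline.py` starts the SSE spider through Scrapy's command line; running it from Eclipse is equivalent to running, from the project root:
37 | ```
38 | scrapy crawl shse
39 | ```
40 | This assumes MySQL is reachable with the connection settings in `finance_qa_spider/settings.py` (database `finance_qa`) and that the `shse_qa` table above has been created.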
--------------------------------------------------------------------------------
/finance_qa_spider/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lemonhu/finance-qa-spider/8ef26903bdd1606a8aa576804400bf7df724c43f/finance_qa_spider/__init__.py
--------------------------------------------------------------------------------
/finance_qa_spider/__init__.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lemonhu/finance-qa-spider/8ef26903bdd1606a8aa576804400bf7df724c43f/finance_qa_spider/__init__.pyc
--------------------------------------------------------------------------------
/finance_qa_spider/cmdline.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import scrapy.cmdline
3 |
4 | # Convenience entry point: equivalent to "scrapy crawl shse" from the project root.
5 | if __name__ == "__main__":
6 |     scrapy.cmdline.execute(['scrapy', 'crawl', 'shse'])
--------------------------------------------------------------------------------
/finance_qa_spider/items.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-

2 |
3 | # Define here the models for your scraped items
4 | #
5 | # See documentation in:
6 | # http://doc.scrapy.org/en/latest/topics/items.html
7 |
8 | import scrapy
9 |
10 |
11 | class FinanceQaSpiderItem(scrapy.Item):
12 |     # define the fields for your item here like:
13 |     # name = scrapy.Field()
14 |     pass
15 |
16 |
17 | class QAItem(scrapy.Item):
18 |     user_name = scrapy.Field()          # user name of the asker
19 |     company_name = scrapy.Field()       # name of the company being asked
20 |     company_id = scrapy.Field()         # company ID (the code in parentheses)
21 |     question_time = scrapy.Field()      # time the question was posted
22 |     question_content = scrapy.Field()   # question text
23 |     answer_time = scrapy.Field()        # time the answer was posted
24 |     answer_content = scrapy.Field()     # answer text
--------------------------------------------------------------------------------
/finance_qa_spider/items.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lemonhu/finance-qa-spider/8ef26903bdd1606a8aa576804400bf7df724c43f/finance_qa_spider/items.pyc
--------------------------------------------------------------------------------
/finance_qa_spider/middlewares.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define here the models for your spider middleware
4 | #
5 | # See documentation in:
6 | # http://doc.scrapy.org/en/latest/topics/spider-middleware.html
7 |
8 | from scrapy import signals
9 | import random
10 |
11 |
12 | class FinanceQaSpiderSpiderMiddleware(object):
13 |     # Not all methods need to be defined. If a method is not defined,
14 |     # scrapy acts as if the spider middleware does not modify the
15 |     # passed objects.
16 |
17 |     @classmethod
18 |     def from_crawler(cls, crawler):
19 |         # This method is used by Scrapy to create your spiders.
20 |         s = cls()
21 |         crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
22 |         return s
23 |
24 |     def process_spider_input(self, response, spider):
25 |         # Called for each response that goes through the spider
26 |         # middleware and into the spider.
27 |
28 |         # Should return None or raise an exception.
29 |         return None
30 |
31 |     def process_spider_output(self, response, result, spider):
32 |         # Called with the results returned from the Spider, after
33 |         # it has processed the response.
34 |
35 |         # Must return an iterable of Request, dict or Item objects.
36 |         for i in result:
37 |             yield i
38 |
39 |     def process_spider_exception(self, response, exception, spider):
40 |         # Called when a spider or process_spider_input() method
41 |         # (from other spider middleware) raises an exception.
42 |
43 |         # Should return either None or an iterable of Response, dict
44 |         # or Item objects.
45 |         pass
46 |
47 |     def process_start_requests(self, start_requests, spider):
48 |         # Called with the start requests of the spider, and works
49 |         # similarly to the process_spider_output() method, except
50 |         # that it doesn’t have a response associated.
51 |
52 |         # Must return only requests (not items).
53 |         for r in start_requests:
54 |             yield r
55 |
56 |     def spider_opened(self, spider):
57 |         spider.logger.info('Spider opened: %s' % spider.name)
58 |
59 |
60 | # Downloader middleware that sets a random user agent on each request
61 | class RandomUserAgent(object):
62 |     """Randomly rotate user agents based on a list of predefined ones"""
63 |
64 |     def __init__(self, agents):
65 |         self.agents = agents
66 |
67 |     @classmethod
68 |     def from_crawler(cls, crawler):
69 |         return cls(crawler.settings.getlist('USER_AGENTS'))
70 |
71 |     def process_request(self, request, spider):
72 |         # Choose the agent once so the logged value matches the header actually sent
73 |         agent = random.choice(self.agents)
74 |         spider.logger.info('Using User-Agent: %s' % agent)
75 |         request.headers.setdefault('User-Agent', agent)
--------------------------------------------------------------------------------
/finance_qa_spider/middlewares.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lemonhu/finance-qa-spider/8ef26903bdd1606a8aa576804400bf7df724c43f/finance_qa_spider/middlewares.pyc
--------------------------------------------------------------------------------
/finance_qa_spider/pipelines.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import MySQLdb
3 | import MySQLdb.cursors
4 | from twisted.enterprise import adbapi
5 | # Define your item pipelines here
6 | #
7 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
8 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
9 |
10 |
11 | class FinanceQaSpiderPipeline(object):
12 |
13 |     def __init__(self, dbpool):
14 |         self.dbpool = dbpool
15 |
16 |     @classmethod
17 |     def from_settings(cls, settings):
18 |         dbparams = dict(
19 |             host=settings['MYSQL_HOST'],
20 |             port=settings['MYSQL_PORT'],
21 |             user=settings['MYSQL_USER'],
22 |             passwd=settings['MYSQL_PASSWD'],
23 |             db=settings['MYSQL_DB'],
24 |             charset='utf8',
25 |             use_unicode=True,
26 |             cursorclass=MySQLdb.cursors.DictCursor,
27 |         )
28 |         # Twisted's asynchronous connection pool around MySQLdb
29 |         dbpool = adbapi.ConnectionPool('MySQLdb', **dbparams)
30 |         return cls(dbpool)
31 |
32 |     # Called by Scrapy for every item the spider yields
33 |     def process_item(self, item, spider):
34 |         query = self.dbpool.runInteraction(self._conditional_insert, item)  # run the insert in the pool
35 |         query.addErrback(self._handle_error, item, spider)  # attach the error handler
36 |         return item
37 |
38 |     # Write one item to the database
39 |     def _conditional_insert(self, tx, item):
40 |         sql = 'INSERT INTO shse_qa(user_name, company_name, company_id, question_time, question_content, answer_time, answer_content) VALUES(%s, %s, %s, %s, %s, %s, %s)'
41 |         params = (item['user_name'], item['company_name'], item['company_id'], item['question_time'], item['question_content'], item['answer_time'], item['answer_content'])
42 |         tx.execute(sql, params)
43 |
44 |         print('user_name: ' + item['user_name'])  # debug: show the user just inserted
45 |
46 |     # Error handler for failed inserts
47 |     def _handle_error(self, failure, item, spider):
48 |         spider.logger.error(failure)
--------------------------------------------------------------------------------
/finance_qa_spider/pipelines.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lemonhu/finance-qa-spider/8ef26903bdd1606a8aa576804400bf7df724c43f/finance_qa_spider/pipelines.pyc
--------------------------------------------------------------------------------
/finance_qa_spider/settings.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Scrapy settings for finance_qa_spider project
4 | #
5 | # For simplicity, this file contains only settings considered important or
6 | # commonly used. You can find more settings consulting the documentation:
7 | #
8 | # http://doc.scrapy.org/en/latest/topics/settings.html
9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
11 |
12 | BOT_NAME = 'finance_qa_spider'
13 |
14 | SPIDER_MODULES = ['finance_qa_spider.spiders']
15 | NEWSPIDER_MODULE = 'finance_qa_spider.spiders'
16 |
17 | LOG_LEVEL = 'INFO'
18 |
19 | # Crawl responsibly by identifying yourself (and your website) on the user-agent
20 | #USER_AGENT = 'finance_qa_spider (+http://www.yourdomain.com)'
21 | # Pool of user agents for the RandomUserAgent middleware to rotate through
22 | USER_AGENTS = [
23 |     'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36',
24 |     'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.79 Safari/537.36 Edge/14.14393',
25 |     'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:55.0) Gecko/20100101 Firefox/55.0',
26 |     'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36',
27 |     'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36',
28 | ]
29 |
30 |
31 | # Obey robots.txt rules
32 | ROBOTSTXT_OBEY = False  # default is True
33 |
34 | # Configure maximum concurrent requests performed by Scrapy (default: 16)
35 | #CONCURRENT_REQUESTS = 32
36 |
37 | # Configure a delay for requests for the same website (default: 0)
38 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
39 | # See also autothrottle settings and docs
40 | DOWNLOAD_DELAY = 5  # actual wait is DOWNLOAD_DELAY scaled by a random factor between 0.5 and 1.5, in seconds
41 | # The download delay setting will honor only one of:
42 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16
43 | #CONCURRENT_REQUESTS_PER_IP = 16
44 |
45 | # Disable cookies (enabled by default)
46 | COOKIES_ENABLED = False
47 |
48 | # Disable Telnet Console (enabled by default)
49 | #TELNETCONSOLE_ENABLED = False
50 |
51 | # Override the default request headers:
52 | #DEFAULT_REQUEST_HEADERS = {
53 | #    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
54 | #    'Accept-Language': 'en',
55 | #}
56 |
57 | # Enable or disable spider middlewares
58 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
59 | # (RandomUserAgent only implements process_request, so it belongs in
60 | # DOWNLOADER_MIDDLEWARES below, not here.)
61 | #SPIDER_MIDDLEWARES = {
62 | #    'finance_qa_spider.middlewares.FinanceQaSpiderSpiderMiddleware': 543,
63 | #}
64 |
65 | # Enable or disable downloader middlewares
66 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
67 | DOWNLOADER_MIDDLEWARES = {
68 |     #'finance_qa_spider.middlewares.MyCustomDownloaderMiddleware': 543,
69 |     'finance_qa_spider.middlewares.RandomUserAgent': 1,  # rotate user agents (see middlewares.py)
70 | }
71 |
72 | # Enable or disable extensions
73 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
74 | #EXTENSIONS = {
75 | #    'scrapy.extensions.telnet.TelnetConsole': None,
76 | #}
77 |
78 | # MySQL connection settings (used by FinanceQaSpiderPipeline)
79 | MYSQL_HOST = 'localhost'  # host name (i.e. 127.0.0.1)
80 | MYSQL_PORT = 3306         # port
81 | MYSQL_USER = 'root'       # user name
82 | MYSQL_PASSWD = '123456'   # password
83 | MYSQL_DB = 'finance_qa'   # database name
84 |
85 | # Configure item pipelines
86 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
87 | ITEM_PIPELINES = {
88 |     'finance_qa_spider.pipelines.FinanceQaSpiderPipeline': 300,  # save items to MySQL
89 | }
90 |
91 | # Enable and configure the AutoThrottle extension (disabled by default)
92 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html
93 | #AUTOTHROTTLE_ENABLED = True
94 | # The initial download delay
95 | #AUTOTHROTTLE_START_DELAY = 5
96 | # The maximum download delay to be set in case of high latencies
97 | #AUTOTHROTTLE_MAX_DELAY = 60
98 | # The average number of requests Scrapy should be sending in parallel to
99 | # each remote server
100 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
101 | # Enable showing throttling stats for every response received:
102 | #AUTOTHROTTLE_DEBUG = False
103 |
104 | # Enable and configure HTTP caching (disabled by default)
105 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
106 | #HTTPCACHE_ENABLED = True
107 | #HTTPCACHE_EXPIRATION_SECS = 0
108 | #HTTPCACHE_DIR = 'httpcache'
109 | #HTTPCACHE_IGNORE_HTTP_CODES = []
110 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
111 |
--------------------------------------------------------------------------------
/finance_qa_spider/settings.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lemonhu/finance-qa-spider/8ef26903bdd1606a8aa576804400bf7df724c43f/finance_qa_spider/settings.pyc
--------------------------------------------------------------------------------
/finance_qa_spider/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 |
--------------------------------------------------------------------------------
/finance_qa_spider/spiders/__init__.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lemonhu/finance-qa-spider/8ef26903bdd1606a8aa576804400bf7df724c43f/finance_qa_spider/spiders/__init__.pyc
--------------------------------------------------------------------------------
/finance_qa_spider/spiders/shse_qa_spider.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | from scrapy import Spider
3 | from scrapy.http import Request
4 |
5 | from finance_qa_spider.items import QAItem
6 |
7 |
8 | class QASpider(Spider):
9 |     """Crawls the SSE Q&A feed (sns.sseinfo.com) page by page."""
10 |     name = 'shse'
11 |     allowed_domains = ['sseinfo.com']
12 |     start_urls = ['http://sns.sseinfo.com/ajax/feeds.do?type=11&pageSize=10&lastid=-1&show=1&page=1']
13 |     page = 1
14 |
15 |     def parse(self, response):
16 |         # Each Q&A entry sits in a div with class "m_feed_item"
17 |         sel = response.xpath('//*[@class="m_feed_item"]')
18 |         for s in sel:
19 |             item = QAItem()
20 |             item['user_name'] = s.xpath('./div[@class="m_feed_detail m_qa_detail"]/div[@class="m_feed_face"]/p/text()').extract_first().strip()
21 |             # The company link reads "name(code)"; split it into the two fields
22 |             company = s.xpath('./div[@class="m_feed_detail m_qa_detail"]/div[@class="m_feed_cnt "]/div[@class="m_feed_txt"]/a/text()').extract_first().strip()
23 |             company_split = company.split('(')
24 |             item['company_name'] = company_split[0].replace(':', '')
25 |             item['company_id'] = company_split[1].replace(')', '')
26 |             item['question_time'] = s.xpath('./div[@class="m_feed_detail m_qa_detail"]/div[@class="m_feed_cnt "]/div[@class="m_feed_func"]/div[@class="m_feed_from"]/span/text()').extract_first().strip()
27 |             item['question_content'] = s.xpath('./div[@class="m_feed_detail m_qa_detail"]/div[@class="m_feed_cnt "]/div[@class="m_feed_txt"]/text()').extract()[1].strip()
28 |             item['answer_time'] = s.xpath('./div[@class="m_feed_detail m_qa"]/div[@class="m_feed_func top10"]/div[@class="m_feed_from"]/span/text()').extract_first().strip()
29 |             item['answer_content'] = s.xpath('./div[@class="m_feed_detail m_qa"]/div[@class="m_feed_cnt"]/div[@class="m_feed_txt"]/text()').extract_first().strip()
30 |             yield item
31 |
32 |         # Request the next page; stop once a page comes back with no items
33 |         if sel:
34 |             self.page += 1
35 |             next_url = 'http://sns.sseinfo.com/ajax/feeds.do?type=11&pageSize=10&lastid=-1&show=1&page=' + str(self.page)
36 |             yield Request(next_url, callback=self.parse)
--------------------------------------------------------------------------------
/finance_qa_spider/spiders/shse_qa_spider.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lemonhu/finance-qa-spider/8ef26903bdd1606a8aa576804400bf7df724c43f/finance_qa_spider/spiders/shse_qa_spider.pyc
--------------------------------------------------------------------------------
/scrapy.cfg:
--------------------------------------------------------------------------------
1 | # Automatically created by: scrapy startproject
2 | #
3 | # For more information about the [deploy] section see:
4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html
5 |
6 | [settings]
7 | default = finance_qa_spider.settings
8 |
9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = finance_qa_spider
12 |
--------------------------------------------------------------------------------