├── .idea
│   ├── encodings.xml
│   ├── misc.xml
│   ├── modules.xml
│   ├── research_report_spider.iml
│   └── vcs.xml
├── README.md
├── geckodriver.log
├── research_report_spider
│   ├── __init__.py
│   ├── common
│   │   ├── __init__.py
│   │   └── operation.py
│   ├── items.py
│   ├── middlewares.py
│   ├── pipelines.py
│   ├── settings.py
│   └── spiders
│       ├── __init__.py
│       └── report_spider.py
├── run.py
└── scrapy.cfg
/README.md:
--------------------------------------------------------------------------------
1 | 萝卜投研 (robo.datayes.com): research reports on A-share listed companies
2 |
3 | Official site: https://robo.datayes.com/v2/fastreport/company?subType=%E4%B8%8D%E9%99%90&induName=
4 |
5 | Approach: first use Selenium to simulate a browser and obtain the cookies, then fetch the data by sending requests that carry those cookies.
6 |
7 | Note: make sure the Firefox driver (or Chrome driver) version matches the version of the browser itself.
8 |
--------------------------------------------------------------------------------
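
The approach in the README is implemented by research_report_spider/spiders/report_spider.py further down. For orientation, a minimal standalone sketch of the same flow might look like the following; this is an illustration rather than project code, it assumes the `requests` package (which the project does not use) and a Selenium version that accepts the `options=` keyword, and it reuses the endpoint, headers, and query string from report_spider.py.

# Minimal sketch of the README's approach (not part of the project): grab cookies
# with headless Firefox, then call the report search API carrying those cookies.
import datetime

import requests
from selenium import webdriver
from selenium.webdriver.firefox.options import Options

PORTAL = 'https://robo.datayes.com/v2/fastreport/company?subType=%E4%B8%8D%E9%99%90&induName='
API = ('https://gw.datayes.com/rrp_adventure/web/search?pageNow={0}&authorId=&isOptional=false'
       '&orgName=&reportType=COMPANY&secCodeList=&reportSubType=&industry=&ratingType='
       '&pubTimeStart={1}&pubTimeEnd={1}&type=EXTERNAL_REPORT&pageSize=40&sortOrder=desc'
       '&query=&minPageCount=&maxPageCount=')
HEADERS = {
    'Origin': 'https://robo.datayes.com',
    'Referer': PORTAL,
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36',
}


def fetch_cookies():
    """Load the portal page in headless Firefox and return its cookies as a dict."""
    options = Options()
    options.add_argument('-headless')
    driver = webdriver.Firefox(options=options)
    try:
        driver.get(PORTAL)
        return {c['name']: c['value'] for c in driver.get_cookies()}
    finally:
        driver.quit()


if __name__ == '__main__':
    cookies = fetch_cookies()
    today = datetime.datetime.now().strftime('%Y%m%d')
    resp = requests.get(API.format(1, today), headers=HEADERS, cookies=cookies)
    print(resp.json().get('message'))   # 'success' indicates the cookies were accepted
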
/geckodriver.log:
--------------------------------------------------------------------------------
1 | 1560136088970 geckodriver INFO geckodriver 0.20.1
2 | 1560136088987 geckodriver INFO Listening on 127.0.0.1:63245
3 |
--------------------------------------------------------------------------------
/research_report_spider/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/huanglaoxie0503/research_report_spider/2c1f8b84147bf97519a01aa1749473c5275a93e2/research_report_spider/__init__.py
--------------------------------------------------------------------------------
/research_report_spider/common/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
--------------------------------------------------------------------------------
/research_report_spider/common/operation.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import pymysql
3 |
4 | from research_report_spider.settings import (
5 | mysql_host,
6 | mysql_user,
7 | mysql_password,
8 | mysql_db,
9 | mysql_table,
10 |
11 | )
12 |
13 |
14 | conn = pymysql.connect(
15 | host=mysql_host,
16 | user=mysql_user,
17 | passwd=mysql_password,
18 | db=mysql_db,
19 | charset="utf8",
20 | use_unicode=True,
21 | )
22 | cursor = conn.cursor()
23 |
24 |
25 | def get_article_id(art_id):
26 |     """Check whether this article_id already exists in the database."""
27 | try:
28 | sql = "select * from {0} where report_id=%s;".format(mysql_table)
29 | cursor.execute(sql, (art_id,))
30 | results = cursor.fetchall()
31 | if results:
32 | return results[0][0]
33 | else:
34 | return None
35 | except pymysql.Error as e:
36 | print(e)
37 |
--------------------------------------------------------------------------------
/research_report_spider/items.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define here the models for your scraped items
4 | #
5 | # See documentation in:
6 | # https://doc.scrapy.org/en/latest/topics/items.html
7 |
8 | import scrapy
9 | from research_report_spider.settings import mysql_table
10 |
11 |
12 | class ResearchReportSpiderItem(scrapy.Item):
13 | # define the fields for your item here like:
14 | # name = scrapy.Field()
15 |
16 |     # Unique primary key (report id)
17 |     report_id = scrapy.Field()
18 |     # Stock code
19 |     stock_code = scrapy.Field()
20 |     # Stock name
21 |     stock_name = scrapy.Field()
22 |     # Publish date
23 |     publish_time = scrapy.Field()
24 |     # Author
25 |     author = scrapy.Field()
26 |     # Report title
27 |     title = scrapy.Field()
28 |     # Original rating
29 |     original_rating = scrapy.Field()
30 |     # Rating change
31 |     rating_changes = scrapy.Field()
32 |     # Rating adjustment mark type
33 |     rating_adjust_mark_type = scrapy.Field()
34 |     # Institution (brokerage)
35 |     org_name = scrapy.Field()
36 |     # Abstract content
37 |     content = scrapy.Field()
38 |     # PDF link
39 |     pdf_link = scrapy.Field()
40 |     # Filename
41 |     filename = scrapy.Field()
42 |     # File storage path
43 |     save_path = scrapy.Field()
44 |
45 |     def get_insert_sql(self):
46 |         # SQL insert statement
47 | insert_sql = """
48 | insert into {0}(report_id,stock_code,stock_name,publish_time,author,title,
49 | original_rating,rating_changes,rating_adjust_mark_type,org_name,content,pdf_link,save_path)
50 | VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
51 | """.format(mysql_table)
52 |
53 | params = (
54 | self['report_id'],
55 | self['stock_code'],
56 | self['stock_name'],
57 | self['publish_time'],
58 | self['author'],
59 | self['title'],
60 | self['original_rating'],
61 | self['rating_changes'],
62 | self['rating_adjust_mark_type'],
63 | self['org_name'],
64 | self['content'],
65 | self['pdf_link'],
66 | self['save_path']
67 | )
68 | return insert_sql, params
69 |
--------------------------------------------------------------------------------
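
get_insert_sql() above writes 13 columns into the table named by mysql_table in settings.py (research_report), but the repository ships no schema for that table. A plausible DDL, with report_id as the primary key as the field comment indicates, is sketched below using the project's own pymysql and settings; every column type is an assumption.

# Assumed schema for the `research_report` table used by get_insert_sql();
# the repository does not include a DDL file, so the column types are guesses
# that match the values the spider extracts.
import pymysql

from research_report_spider.settings import (
    mysql_host, mysql_user, mysql_password, mysql_db, mysql_table,
)

CREATE_TABLE = """
CREATE TABLE IF NOT EXISTS {0} (
    report_id               VARCHAR(64)  NOT NULL PRIMARY KEY,
    stock_code              VARCHAR(16),
    stock_name              VARCHAR(64),
    publish_time            DATE,
    author                  VARCHAR(255),
    title                   VARCHAR(512),
    original_rating         VARCHAR(64),
    rating_changes          VARCHAR(64),
    rating_adjust_mark_type VARCHAR(64),
    org_name                VARCHAR(255),
    content                 TEXT,
    pdf_link                VARCHAR(1024),
    save_path               VARCHAR(512)
) DEFAULT CHARSET=utf8;
""".format(mysql_table)

if __name__ == '__main__':
    # Create the table once before the first crawl.
    conn = pymysql.connect(host=mysql_host, user=mysql_user, passwd=mysql_password,
                           db=mysql_db, charset='utf8')
    with conn.cursor() as cursor:
        cursor.execute(CREATE_TABLE)
    conn.commit()
    conn.close()
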
/research_report_spider/middlewares.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define here the models for your spider middleware
4 | #
5 | # See documentation in:
6 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html
7 |
8 | from scrapy import signals
9 |
10 |
11 | class ResearchReportSpiderSpiderMiddleware(object):
12 | # Not all methods need to be defined. If a method is not defined,
13 | # scrapy acts as if the spider middleware does not modify the
14 | # passed objects.
15 |
16 | @classmethod
17 | def from_crawler(cls, crawler):
18 | # This method is used by Scrapy to create your spiders.
19 | s = cls()
20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
21 | return s
22 |
23 | def process_spider_input(self, response, spider):
24 | # Called for each response that goes through the spider
25 | # middleware and into the spider.
26 |
27 | # Should return None or raise an exception.
28 | return None
29 |
30 | def process_spider_output(self, response, result, spider):
31 | # Called with the results returned from the Spider, after
32 | # it has processed the response.
33 |
34 | # Must return an iterable of Request, dict or Item objects.
35 | for i in result:
36 | yield i
37 |
38 | def process_spider_exception(self, response, exception, spider):
39 | # Called when a spider or process_spider_input() method
40 | # (from other spider middleware) raises an exception.
41 |
42 | # Should return either None or an iterable of Response, dict
43 | # or Item objects.
44 | pass
45 |
46 | def process_start_requests(self, start_requests, spider):
47 | # Called with the start requests of the spider, and works
48 | # similarly to the process_spider_output() method, except
49 | # that it doesn’t have a response associated.
50 |
51 | # Must return only requests (not items).
52 | for r in start_requests:
53 | yield r
54 |
55 | def spider_opened(self, spider):
56 | spider.logger.info('Spider opened: %s' % spider.name)
57 |
58 |
59 | class ResearchReportSpiderDownloaderMiddleware(object):
60 | # Not all methods need to be defined. If a method is not defined,
61 | # scrapy acts as if the downloader middleware does not modify the
62 | # passed objects.
63 |
64 | @classmethod
65 | def from_crawler(cls, crawler):
66 | # This method is used by Scrapy to create your spiders.
67 | s = cls()
68 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
69 | return s
70 |
71 | def process_request(self, request, spider):
72 | # Called for each request that goes through the downloader
73 | # middleware.
74 |
75 | # Must either:
76 | # - return None: continue processing this request
77 | # - or return a Response object
78 | # - or return a Request object
79 | # - or raise IgnoreRequest: process_exception() methods of
80 | # installed downloader middleware will be called
81 | return None
82 |
83 | def process_response(self, request, response, spider):
84 | # Called with the response returned from the downloader.
85 |
86 | # Must either;
87 | # - return a Response object
88 | # - return a Request object
89 | # - or raise IgnoreRequest
90 | return response
91 |
92 | def process_exception(self, request, exception, spider):
93 | # Called when a download handler or a process_request()
94 | # (from other downloader middleware) raises an exception.
95 |
96 | # Must either:
97 | # - return None: continue processing this exception
98 | # - return a Response object: stops process_exception() chain
99 | # - return a Request object: stops process_exception() chain
100 | pass
101 |
102 | def spider_opened(self, spider):
103 | spider.logger.info('Spider opened: %s' % spider.name)
104 |
--------------------------------------------------------------------------------
/research_report_spider/pipelines.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define your item pipelines here
4 | #
5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
6 | # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
7 | import scrapy
8 | import logging
9 | import pymysql
10 | from research_report_spider.settings import mysql_host, mysql_user, mysql_password, mysql_db
11 | from research_report_spider.items import ResearchReportSpiderItem
12 | from scrapy.pipelines.files import FilesPipeline
13 |
14 |
15 | class MysqlPipeline(object):
16 |     """Save items to MySQL synchronously (approach 2)."""
17 |
18 | def __init__(self):
19 | self.conn = pymysql.connect(
20 | host=mysql_host,
21 | user=mysql_user,
22 | passwd=mysql_password,
23 | db=mysql_db,
24 | charset="utf8",
25 | use_unicode=True,
26 | )
27 | self.cursor = self.conn.cursor()
28 |
29 | def process_item(self, item, spider):
30 | try:
31 |             # Insert the item
32 | if isinstance(item, ResearchReportSpiderItem):
33 | self.do_insert(item)
34 | else:
35 | logging.info("Error Data")
36 | except pymysql.Error as e:
37 |             logging.error("----------------- insert failed -----------")
38 | logging.error(e)
39 | print(e)
40 |
41 | return item
42 |
43 | def close_spider(self, spider):
44 | try:
45 | self.conn.close()
46 | logging.info("mysql already close")
47 | except Exception as e:
48 | logging.info("--------mysql no close-------")
49 | logging.error(e)
50 |
51 | def do_insert(self, item):
52 | try:
53 | insert_sql, params = item.get_insert_sql()
54 |
55 | self.cursor.execute(insert_sql, params)
56 | self.conn.commit()
57 | logging.info("----------------insert success-----------")
58 | except pymysql.Error as e:
59 | print(e)
60 |
61 |
62 | class MyFilesPipeline(FilesPipeline):
63 | """
64 |     Download the report PDF files.
65 | """
66 | def get_media_requests(self, item, info):
67 | for url in item["pdf_link"]:
68 | if url:
69 | yield scrapy.Request(url, meta={"item": item})
70 |
71 | def file_path(self, request, response=None, info=None):
72 | item = request.meta["item"]
73 | filename = item["filename"]
74 | return filename
75 |
76 | def item_completed(self, results, item, info):
77 | file_paths = [x['path'] for ok, x in results if ok]
78 | if not file_paths:
79 |             return item
80 | item["filename"] = file_paths
81 | return item
82 |
--------------------------------------------------------------------------------
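
Note that only MysqlPipeline is enabled in ITEM_PIPELINES in settings.py; MyFilesPipeline is defined here but never activated, so PDFs are not actually downloaded as configured. If downloading is wanted, the usual Scrapy wiring would be something like the additions below (the FILES_STORE path is an assumption):

# Hypothetical settings.py additions to enable PDF downloads via MyFilesPipeline;
# not present in the repository. FILES_STORE is the directory FilesPipeline saves
# into, and the file_path() override above places files beneath it.
ITEM_PIPELINES = {
    'research_report_spider.pipelines.MyFilesPipeline': 200,
    'research_report_spider.pipelines.MysqlPipeline': 300,
}
FILES_STORE = './research_report_pdfs'   # assumed local download directory
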
/research_report_spider/settings.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Scrapy settings for research_report_spider project
4 | #
5 | # For simplicity, this file contains only settings considered important or
6 | # commonly used. You can find more settings consulting the documentation:
7 | #
8 | # https://doc.scrapy.org/en/latest/topics/settings.html
9 | # https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
10 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html
11 |
12 | BOT_NAME = 'research_report_spider'
13 |
14 | SPIDER_MODULES = ['research_report_spider.spiders']
15 | NEWSPIDER_MODULE = 'research_report_spider.spiders'
16 |
17 |
18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent
19 | #USER_AGENT = 'research_report_spider (+http://www.yourdomain.com)'
20 |
21 | # Obey robots.txt rules
22 | ROBOTSTXT_OBEY = False
23 |
24 | # Configure maximum concurrent requests performed by Scrapy (default: 16)
25 | #CONCURRENT_REQUESTS = 32
26 |
27 | # Configure a delay for requests for the same website (default: 0)
28 | # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
29 | # See also autothrottle settings and docs
30 | DOWNLOAD_DELAY = 3
31 | # The download delay setting will honor only one of:
32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16
33 | #CONCURRENT_REQUESTS_PER_IP = 16
34 |
35 | # Disable cookies (enabled by default)
36 | #COOKIES_ENABLED = False
37 |
38 | # Disable Telnet Console (enabled by default)
39 | #TELNETCONSOLE_ENABLED = False
40 |
41 | # Override the default request headers:
42 | #DEFAULT_REQUEST_HEADERS = {
43 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
44 | # 'Accept-Language': 'en',
45 | #}
46 |
47 | # Enable or disable spider middlewares
48 | # See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
49 | #SPIDER_MIDDLEWARES = {
50 | # 'research_report_spider.middlewares.ResearchReportSpiderSpiderMiddleware': 543,
51 | #}
52 |
53 | # Enable or disable downloader middlewares
54 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
55 | #DOWNLOADER_MIDDLEWARES = {
56 | # 'research_report_spider.middlewares.ResearchReportSpiderDownloaderMiddleware': 543,
57 | #}
58 |
59 | # Enable or disable extensions
60 | # See https://doc.scrapy.org/en/latest/topics/extensions.html
61 | #EXTENSIONS = {
62 | # 'scrapy.extensions.telnet.TelnetConsole': None,
63 | #}
64 |
65 | # Configure item pipelines
66 | # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
67 | ITEM_PIPELINES = {
68 | 'research_report_spider.pipelines.MysqlPipeline': 300,
69 | }
70 |
71 | # Enable and configure the AutoThrottle extension (disabled by default)
72 | # See https://doc.scrapy.org/en/latest/topics/autothrottle.html
73 | #AUTOTHROTTLE_ENABLED = True
74 | # The initial download delay
75 | #AUTOTHROTTLE_START_DELAY = 5
76 | # The maximum download delay to be set in case of high latencies
77 | #AUTOTHROTTLE_MAX_DELAY = 60
78 | # The average number of requests Scrapy should be sending in parallel to
79 | # each remote server
80 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
81 | # Enable showing throttling stats for every response received:
82 | #AUTOTHROTTLE_DEBUG = False
83 |
84 | # Enable and configure HTTP caching (disabled by default)
85 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
86 | #HTTPCACHE_ENABLED = True
87 | #HTTPCACHE_EXPIRATION_SECS = 0
88 | #HTTPCACHE_DIR = 'httpcache'
89 | #HTTPCACHE_IGNORE_HTTP_CODES = []
90 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
91 |
92 | mysql_host = '127.0.0.1'
93 | mysql_user = 'root'
94 | mysql_password = 'root'
95 | mysql_db = 'crawl'
96 | mysql_table = 'research_report'
97 |
--------------------------------------------------------------------------------
/research_report_spider/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 |
--------------------------------------------------------------------------------
/research_report_spider/spiders/report_spider.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import scrapy
3 | import datetime
4 | import logging
5 | import json
6 |
7 | from selenium import webdriver
8 |
9 | from research_report_spider.common import operation
10 | from research_report_spider.items import ResearchReportSpiderItem
11 |
12 |
13 | class ReportSpider(scrapy.Spider):
14 | name = 'report'
15 | allowed_domains = ['gw.datayes.com']
16 | start_urls = ['http://gw.datayes.com/']
17 |
18 | dt = datetime.datetime.now().strftime('%Y-%m-%d')
19 | today = dt.replace('-', '')
20 |
21 | base_url = 'https://gw.datayes.com/rrp_adventure/web/search?'
22 | headers = {
23 | "Origin": "https://robo.datayes.com",
24 | "Referer": "https://robo.datayes.com/v2/fastreport/company?subType=%E4%B8%8D%E9%99%90&induName=",
25 | "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36",
26 | }
27 | url = "https://gw.datayes.com/rrp_adventure/web/search?pageNow={0}&authorId=&isOptional=false&orgName=&reportType=COMPANY&secCodeList=&reportSubType=&industry=&ratingType=&pubTimeStart={1}&pubTimeEnd={1}&type=EXTERNAL_REPORT&pageSize=40&sortOrder=desc&query=&minPageCount=&maxPageCount="
28 |
29 | def start_requests(self):
30 |         # Fetch cookies with a headless browser
31 | cookie = self.get_cookies()
32 |
33 | for page in range(1, 5):
34 | yield scrapy.Request(
35 | self.url.format(page, self.today),
36 | headers=self.headers,
37 | cookies=cookie,
38 | meta={"page": page, "cookie": cookie}
39 | )
40 |
41 | def parse(self, response):
42 | page = response.meta.get('page')
43 |         logging.info('Crawling page {0}'.format(page))
44 | status = response.status
45 | logging.info(status)
46 | result = response.text
47 | result = json.loads(result)
48 |
49 | message = result['message']
50 | if message != 'success':
51 |             logging.info('message is {0}, request failed!'.format(message))
52 | return
53 |
54 | data_all = result['data']['list']
55 | for info in data_all:
56 | data = info['data']
57 | report_id = data['id']
58 | stock_name = data['companyName']
59 | author = data['author']
60 | title = data['title']
61 |             # Skip reports whose id already exists in the database
62 |             is_ar_id = operation.get_article_id(report_id)
63 |             if is_ar_id:
64 |                 logging.info('id {0} already exists'.format(is_ar_id))
65 | continue
66 |
67 | content = data['abstractText']
68 | if content:
69 | content = content.replace('\u3000', '').strip()
72 | stock_code_info = data['stockInfo']
73 | if stock_code_info is None:
74 | stock_code = None
75 | else:
76 | stock_code = stock_code_info['stockId']
77 |
78 | file_name = '{0}-{1}.pdf'.format(stock_code, title)
79 | org_dt = data['publishTime'].split('T')
80 | publish_time = org_dt[0]
81 |
82 | keys = publish_time.split('-')
83 | year = keys[0]
84 |
85 | filename = "/{0}/{1}/{2}".format(year, publish_time, file_name)
86 |
87 | item = ResearchReportSpiderItem()
88 |             # Unique primary key (report id)
89 |             item['report_id'] = report_id
90 |             # Stock code
91 |             item['stock_code'] = stock_code
92 |             # Stock name
93 |             item['stock_name'] = stock_name
94 |             # Publish date
95 |             item['publish_time'] = publish_time
96 |             # Author
97 |             item['author'] = author
98 |             # Report title
99 |             item['title'] = title
100 |             # Original rating
101 |             item['original_rating'] = data['ratingContent']
102 |             # Rating change
103 |             item['rating_changes'] = data['ratingType']
104 |             # Rating adjustment mark type
105 |             item['rating_adjust_mark_type'] = data['ratingAdjustMarkType']
106 |             # Institution (brokerage)
107 |             item['org_name'] = data['orgName']
108 |             # Abstract content
109 |             item['content'] = content
110 |             # PDF link
111 |             item['pdf_link'] = [data['s3Url']]
112 |             # Filename
113 |             item['filename'] = filename
114 |             # File storage path
115 |             item['save_path'] = "H-hezudao/Research_Report{0}".format(filename)
116 |
117 | yield item
118 |
119 | def get_cookies(self):
120 |         # Run Firefox in headless mode
121 |         from selenium.webdriver.firefox.options import Options
122 |         firefox_options = Options()
123 |         firefox_options.set_headless()
124 |         driver = webdriver.Firefox(firefox_options=firefox_options)
125 |
126 |         url = 'https://robo.datayes.com/v2/fastreport/company?subType=%E4%B8%8D%E9%99%90&induName='
127 |         driver.get(url)
128 |         # Collect the cookies set by the page
129 |         cookie_list = driver.get_cookies()
130 |         # Convert the cookie list into a name -> value dict
131 |         cookie_dict = {}
132 |         for cookie in cookie_list:
133 |             cookie_dict[cookie['name']] = cookie['value']
134 |         driver.quit()
135 |         logging.info('Firefox has quit')
136 | return cookie_dict
137 |
--------------------------------------------------------------------------------
/run.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import os
3 | from scrapy import cmdline
4 |
5 | file_log = os.getcwd()+"/info.log"
6 | if os.path.exists(file_log):
7 | os.remove(file_log)
8 |     print("Delete the previous log file before each run; keep only the latest log.")
9 |
10 |
11 | cmdline.execute('scrapy crawl report'.split())
12 |
--------------------------------------------------------------------------------
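
run.py removes info.log before each run, but settings.py never sets Scrapy's LOG_FILE, so the crawl as configured does not write info.log at all. One way to make the script and the setting agree (an assumption about the intended behaviour) is to pass LOG_FILE on the command line:

# Variant of run.py that actually writes info.log, so the delete-then-recreate
# logic has something to rotate; LOG_FILE is a standard Scrapy setting.
import os

from scrapy import cmdline

file_log = os.path.join(os.getcwd(), "info.log")
if os.path.exists(file_log):
    os.remove(file_log)
    print("Delete the previous log file before each run; keep only the latest log.")

cmdline.execute('scrapy crawl report -s LOG_FILE=info.log'.split())
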
/scrapy.cfg:
--------------------------------------------------------------------------------
1 | # Automatically created by: scrapy startproject
2 | #
3 | # For more information about the [deploy] section see:
4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html
5 |
6 | [settings]
7 | default = research_report_spider.settings
8 |
9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = research_report_spider
12 |
--------------------------------------------------------------------------------