├── .idea
│   ├── encodings.xml
│   ├── misc.xml
│   ├── modules.xml
│   ├── research_report_spider.iml
│   └── vcs.xml
├── README.md
├── geckodriver.log
├── research_report_spider
│   ├── __init__.py
│   ├── common
│   │   ├── __init__.py
│   │   └── operation.py
│   ├── items.py
│   ├── middlewares.py
│   ├── pipelines.py
│   ├── settings.py
│   └── spiders
│       ├── __init__.py
│       └── report_spider.py
├── run.py
└── scrapy.cfg

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
萝卜投研 (Robo DataYes): A-share company research reports

Site: https://robo.datayes.com/v2/fastreport/company?subType=%E4%B8%8D%E9%99%90&induName=

Approach: first drive a browser with Selenium to obtain the cookies, then fetch the data by sending requests that carry those cookies.

Note: make sure the Firefox driver (or Chrome driver) version matches the version of the installed browser.
--------------------------------------------------------------------------------
/geckodriver.log:
--------------------------------------------------------------------------------
1560136088970   geckodriver     INFO    geckodriver 0.20.1
1560136088987   geckodriver     INFO    Listening on 127.0.0.1:63245
--------------------------------------------------------------------------------
/research_report_spider/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/huanglaoxie0503/research_report_spider/2c1f8b84147bf97519a01aa1749473c5275a93e2/research_report_spider/__init__.py
--------------------------------------------------------------------------------
/research_report_spider/common/__init__.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
--------------------------------------------------------------------------------
/research_report_spider/common/operation.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
import pymysql

from research_report_spider.settings import (
    mysql_host,
    mysql_user,
    mysql_password,
    mysql_db,
    mysql_table,
)


conn = pymysql.connect(
    host=mysql_host,
    user=mysql_user,
    passwd=mysql_password,
    db=mysql_db,
    charset="utf8",
    use_unicode=True,
)
cursor = conn.cursor()


def get_article_id(art_id):
    """Check whether this report_id already exists in the database."""
    try:
        sql = "select * from {0} where report_id=%s;".format(mysql_table)
        cursor.execute(sql, (art_id,))
        results = cursor.fetchall()
        if results:
            return results[0][0]
        else:
            return None
    except pymysql.Error as e:
        print(e)
--------------------------------------------------------------------------------
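The flow described in the README (Selenium collects the cookies, the data requests then carry them) can also be tried outside Scrapy. Below is a minimal sketch using the requests library; it reuses the endpoint and a few of the query parameters from report_spider.py further down, but the parameter subset is an assumption and the snippet is not part of the project:

    import requests
    from selenium import webdriver

    driver = webdriver.Firefox()
    driver.get('https://robo.datayes.com/v2/fastreport/company?subType=%E4%B8%8D%E9%99%90&induName=')
    cookies = {c['name']: c['value'] for c in driver.get_cookies()}  # cookies set by the site
    driver.quit()

    resp = requests.get(
        'https://gw.datayes.com/rrp_adventure/web/search',
        params={'pageNow': 1, 'reportType': 'COMPANY', 'type': 'EXTERNAL_REPORT', 'pageSize': 40},
        cookies=cookies,
        headers={'Referer': 'https://robo.datayes.com/'},
    )
    print(resp.json().get('message'))  # 'success' when the cookies are accepted

--------------------------------------------------------------------------------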
/research_report_spider/items.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html

import scrapy
from research_report_spider.settings import mysql_table


class ResearchReportSpiderItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()

    # report id, used as the unique primary key
    report_id = scrapy.Field()
    # stock code
    stock_code = scrapy.Field()
    # stock name
    stock_name = scrapy.Field()
    # publish date
    publish_time = scrapy.Field()
    # author
    author = scrapy.Field()
    # report title
    title = scrapy.Field()
    # original rating
    original_rating = scrapy.Field()
    # rating change
    rating_changes = scrapy.Field()
    # rating adjustment mark type
    rating_adjust_mark_type = scrapy.Field()
    # institution (broker) name
    org_name = scrapy.Field()
    # abstract text
    content = scrapy.Field()
    # pdf link
    pdf_link = scrapy.Field()
    # file name
    filename = scrapy.Field()
    # file storage path
    save_path = scrapy.Field()

    def get_insert_sql(self):
        # INSERT statement used by the MySQL pipeline
        insert_sql = """
            insert into {0}(report_id,stock_code,stock_name,publish_time,author,title,
            original_rating,rating_changes,rating_adjust_mark_type,org_name,content,pdf_link,save_path)
            VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
        """.format(mysql_table)

        params = (
            self['report_id'],
            self['stock_code'],
            self['stock_name'],
            self['publish_time'],
            self['author'],
            self['title'],
            self['original_rating'],
            self['rating_changes'],
            self['rating_adjust_mark_type'],
            self['org_name'],
            self['content'],
            self['pdf_link'],
            self['save_path']
        )
        return insert_sql, params
--------------------------------------------------------------------------------
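The insert statement above assumes a research_report table already exists in the crawl database; the project does not ship a schema. The sketch below creates a table that matches the item fields and the report_id lookup in operation.py — the column types and lengths are assumptions:

    # Hypothetical helper that creates the target table; column types are guesses.
    import pymysql

    DDL = """
    CREATE TABLE IF NOT EXISTS research_report (
        report_id               BIGINT NOT NULL PRIMARY KEY,
        stock_code              VARCHAR(16),
        stock_name              VARCHAR(64),
        publish_time            DATE,
        author                  VARCHAR(128),
        title                   VARCHAR(512),
        original_rating         VARCHAR(64),
        rating_changes          VARCHAR(64),
        rating_adjust_mark_type VARCHAR(64),
        org_name                VARCHAR(128),
        content                 TEXT,
        pdf_link                VARCHAR(1024),
        save_path               VARCHAR(512)
    ) DEFAULT CHARSET = utf8
    """

    conn = pymysql.connect(host='127.0.0.1', user='root', passwd='root', db='crawl', charset='utf8')
    with conn.cursor() as cursor:
        cursor.execute(DDL)
    conn.commit()
    conn.close()

--------------------------------------------------------------------------------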
/research_report_spider/middlewares.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define here the models for your spider middleware
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html

from scrapy import signals


class ResearchReportSpiderSpiderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.

        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.

        # Must return an iterable of Request, dict or Item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.

        # Should return either None or an iterable of Response, dict
        # or Item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn't have a response associated.

        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)


class ResearchReportSpiderDownloaderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        # Called for each request that goes through the downloader
        # middleware.

        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called
        return None

    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.

        # Must either:
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.

        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        pass

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)
--------------------------------------------------------------------------------
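Both classes above are the unmodified scrapy startproject templates and are not enabled in settings.py. If the Selenium cookies ever need to be attached centrally rather than per request in start_requests(), process_request() is the hook for it. The sketch below is only an illustration — the spider attribute and the DOWNLOADER_MIDDLEWARES entry it would need are assumptions, not part of this project:

    class CookieInjectionMiddleware(object):
        # Sketch: copy cookies stored on the spider onto every outgoing request.
        # Assumes the spider saved the Selenium cookies as `spider.cookie_dict`
        # and that this class is registered in DOWNLOADER_MIDDLEWARES with a
        # priority below 700 so it runs before Scrapy's built-in CookiesMiddleware.

        def process_request(self, request, spider):
            cookie_dict = getattr(spider, 'cookie_dict', None)
            if cookie_dict:
                request.cookies.update(cookie_dict)
            # returning None lets the request continue through the chain
            return None

--------------------------------------------------------------------------------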
/research_report_spider/pipelines.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import scrapy
import logging
import pymysql
from research_report_spider.settings import mysql_host, mysql_user, mysql_password, mysql_db
from research_report_spider.items import ResearchReportSpiderItem
from scrapy.pipelines.files import FilesPipeline


class MysqlPipeline(object):
    """Save items to the database synchronously (approach two)."""

    def __init__(self):
        self.conn = pymysql.connect(
            host=mysql_host,
            user=mysql_user,
            passwd=mysql_password,
            db=mysql_db,
            charset="utf8",
            use_unicode=True,
        )
        self.cursor = self.conn.cursor()

    def process_item(self, item, spider):
        try:
            # insert the item
            if isinstance(item, ResearchReportSpiderItem):
                self.do_insert(item)
            else:
                logging.info("Error Data")
        except pymysql.Error as e:
            logging.error("-----------------insert failed-----------")
            logging.error(e)
            print(e)

        return item

    def close_spider(self, spider):
        try:
            self.conn.close()
            logging.info("mysql connection already closed")
        except Exception as e:
            logging.info("--------mysql connection not closed-------")
            logging.error(e)

    def do_insert(self, item):
        try:
            insert_sql, params = item.get_insert_sql()

            self.cursor.execute(insert_sql, params)
            self.conn.commit()
            logging.info("----------------insert success-----------")
        except pymysql.Error as e:
            print(e)


class MyFilesPipeline(FilesPipeline):
    """
    Download the PDF files.
    """
    def get_media_requests(self, item, info):
        for url in item["pdf_link"]:
            if url:
                yield scrapy.Request(url, meta={"item": item})

    def file_path(self, request, response=None, info=None):
        item = request.meta["item"]
        filename = item["filename"]
        return filename

    def item_completed(self, results, item, info):
        file_paths = [x['path'] for ok, x in results if ok]
        if not file_paths:
            pass
        item["filename"] = file_paths
        return item
--------------------------------------------------------------------------------
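Note that only MysqlPipeline is registered in ITEM_PIPELINES (see settings.py below), so MyFilesPipeline never runs as shipped. To actually download the PDFs it would have to be enabled, and Scrapy's FilesPipeline also needs a storage root. A sketch of the extra settings — the priority and the path are assumptions:

    # Sketch: additions to settings.py that would turn on the PDF download.
    ITEM_PIPELINES = {
        'research_report_spider.pipelines.MyFilesPipeline': 200,  # lower number -> runs before the MySQL insert
        'research_report_spider.pipelines.MysqlPipeline': 300,
    }
    FILES_STORE = '/data/research_report_pdfs'  # hypothetical root directory for downloaded files

--------------------------------------------------------------------------------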
/research_report_spider/settings.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Scrapy settings for research_report_spider project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://doc.scrapy.org/en/latest/topics/settings.html
#     https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://doc.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'research_report_spider'

SPIDER_MODULES = ['research_report_spider.spiders']
NEWSPIDER_MODULE = 'research_report_spider.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'research_report_spider (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'research_report_spider.middlewares.ResearchReportSpiderSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'research_report_spider.middlewares.ResearchReportSpiderDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'research_report_spider.pipelines.MysqlPipeline': 300,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

mysql_host = '127.0.0.1'
mysql_user = 'root'
mysql_password = 'root'
mysql_db = 'crawl'
mysql_table = 'research_report'
--------------------------------------------------------------------------------
/research_report_spider/spiders/__init__.py:
--------------------------------------------------------------------------------
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
--------------------------------------------------------------------------------
/research_report_spider/spiders/report_spider.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
import scrapy
import datetime
import logging
import json

from selenium import webdriver

from research_report_spider.common import operation
from research_report_spider.items import ResearchReportSpiderItem


class ReportSpider(scrapy.Spider):
    name = 'report'
    allowed_domains = ['gw.datayes.com']
    start_urls = ['http://gw.datayes.com/']

    dt = datetime.datetime.now().strftime('%Y-%m-%d')
    today = dt.replace('-', '')

    base_url = 'https://gw.datayes.com/rrp_adventure/web/search?'
    headers = {
        "Origin": "https://robo.datayes.com",
        "Referer": "https://robo.datayes.com/v2/fastreport/company?subType=%E4%B8%8D%E9%99%90&induName=",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36",
    }
    url = "https://gw.datayes.com/rrp_adventure/web/search?pageNow={0}&authorId=&isOptional=false&orgName=&reportType=COMPANY&secCodeList=&reportSubType=&industry=&ratingType=&pubTimeStart={1}&pubTimeEnd={1}&type=EXTERNAL_REPORT&pageSize=40&sortOrder=desc&query=&minPageCount=&maxPageCount="

    def start_requests(self):
        # grab the cookies with Selenium first
        cookie = self.get_cookies()

        for page in range(1, 5):
            yield scrapy.Request(
                self.url.format(page, self.today),
                headers=self.headers,
                cookies=cookie,
                meta={"page": page, "cookie": cookie}
            )

    def parse(self, response):
        page = response.meta.get('page')
        logging.info('Crawling page {0}'.format(page))
        status = response.status
        logging.info(status)
        result = response.text
        result = json.loads(result)

        message = result['message']
        if message != 'success':
            logging.info('message is {0}, request failed!'.format(message))
            return

        data_all = result['data']['list']
        for info in data_all:
            data = info['data']
            report_id = data['id']
            stock_name = data['companyName']
            author = data['author']
            title = data['title']
            # skip reports whose id already exists in the database
            is_ar_id = operation.get_article_id(report_id)
            if is_ar_id:
                logging.info('id {0} already exists'.format(is_ar_id))
                continue

            content = data['abstractText']
            if content:
                content = content.replace('\u3000', '').strip()
            stock_code_info = data['stockInfo']
            if stock_code_info is None:
                stock_code = None
            else:
                stock_code = stock_code_info['stockId']

            file_name = '{0}-{1}.pdf'.format(stock_code, title)
            org_dt = data['publishTime'].split('T')
            publish_time = org_dt[0]

            keys = publish_time.split('-')
            year = keys[0]

            filename = "/{0}/{1}/{2}".format(year, publish_time, file_name)

            item = ResearchReportSpiderItem()
            # report id, used as the unique primary key
            item['report_id'] = report_id
            # stock code
            item['stock_code'] = stock_code
            # stock name
            item['stock_name'] = stock_name
            # publish date
            item['publish_time'] = publish_time
            # author
            item['author'] = author
            # report title
            item['title'] = title
            # original rating
            item['original_rating'] = data['ratingContent']
            # rating change
            item['rating_changes'] = data['ratingType']
            # rating adjustment mark type
            item['rating_adjust_mark_type'] = data['ratingAdjustMarkType']
            # institution (broker) name
            item['org_name'] = data['orgName']
            # abstract text
            item['content'] = content
            # pdf link
            item['pdf_link'] = [data['s3Url']]
            # file name
            item['filename'] = filename
            # file storage path
            item['save_path'] = "H-hezudao/Research_Report{0}".format(filename)

            yield item

    def get_cookies(self):
        # headless Firefox
        from selenium.webdriver.firefox.options import Options
        firefox_options = Options()
        firefox_options.set_headless()
        driver = webdriver.Firefox(firefox_options=firefox_options)

        url = 'https://robo.datayes.com/v2/fastreport/company?subType=%E4%B8%8D%E9%99%90&induName='
        driver.get(url)
        # fetch the cookie list
        cookie_list = driver.get_cookies()
        # convert the cookies into a {name: value} dict
        cookie_dict = {}
        for cookie in cookie_list:
            cookie_dict[cookie['name']] = cookie['value']
        driver.quit()
        logging.info('Firefox has quit')
        return cookie_dict
--------------------------------------------------------------------------------
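get_cookies() uses the Selenium 3 API: Options.set_headless() and the firefox_options= keyword argument, both of which were deprecated and later removed in Selenium 4. On a newer Selenium the same cookie grab would look roughly like this sketch:

    # Sketch for Selenium 4.x; equivalent to get_cookies() above.
    from selenium import webdriver
    from selenium.webdriver.firefox.options import Options

    options = Options()
    options.add_argument('-headless')             # replaces Options.set_headless()
    driver = webdriver.Firefox(options=options)   # 'firefox_options=' became 'options='
    driver.get('https://robo.datayes.com/v2/fastreport/company?subType=%E4%B8%8D%E9%99%90&induName=')
    cookie_dict = {c['name']: c['value'] for c in driver.get_cookies()}
    driver.quit()

--------------------------------------------------------------------------------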
/run.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
import os
from scrapy import cmdline

file_log = os.getcwd() + "/info.log"
if os.path.exists(file_log):
    os.remove(file_log)
    print("Deleted the previous log file; only the latest run's log is kept")


cmdline.execute('scrapy crawl report'.split())
--------------------------------------------------------------------------------
/scrapy.cfg:
--------------------------------------------------------------------------------
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.io/en/latest/deploy.html

[settings]
default = research_report_spider.settings

[deploy]
#url = http://localhost:6800/
project = research_report_spider
--------------------------------------------------------------------------------
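run.py removes an old info.log before each crawl, but nothing in the project tells Scrapy to write that file. For the log to actually appear, LOG_FILE has to be set somewhere; either line below would do it (both are sketches, not part of the original code):

    # Option 1: add to settings.py
    LOG_FILE = 'info.log'

    # Option 2: pass it on the command line in run.py instead
    cmdline.execute('scrapy crawl report -s LOG_FILE=info.log'.split())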