├── .idea
│   ├── encodings.xml
│   ├── misc.xml
│   ├── modules.xml
│   ├── research_report_spider.iml
│   └── vcs.xml
├── README.md
├── geckodriver.log
├── research_report_spider
│   ├── __init__.py
│   ├── common
│   │   ├── __init__.py
│   │   └── operation.py
│   ├── items.py
│   ├── middlewares.py
│   ├── pipelines.py
│   ├── settings.py
│   └── spiders
│       ├── __init__.py
│       └── report_spider.py
├── run.py
└── scrapy.cfg
/README.md:
--------------------------------------------------------------------------------
1 | 萝卜投研 (robo.datayes.com): research reports on A-share listed companies
2 |
3 | Official site: https://robo.datayes.com/v2/fastreport/company?subType=%E4%B8%8D%E9%99%90&induName=
4 |
5 | Approach: first use Selenium to simulate a browser and obtain the cookies, then fetch the data by sending requests that carry those cookies.
6 |
7 | Note: make sure the Firefox driver (or Chrome driver) version matches the version of the browser itself.
8 |
--------------------------------------------------------------------------------
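
The approach in the README is implemented by research_report_spider/spiders/report_spider.py further down. For orientation, a minimal standalone sketch of the same flow might look like the following; this is an illustration rather than project code, it assumes the `requests` package (which the project does not use) and a Selenium version that accepts the `options=` keyword, and it reuses the endpoint, headers, and query string from report_spider.py.

# Minimal sketch of the README's approach (not part of the project): grab cookies
# with headless Firefox, then call the report search API carrying those cookies.
import datetime

import requests
from selenium import webdriver
from selenium.webdriver.firefox.options import Options

PORTAL = 'https://robo.datayes.com/v2/fastreport/company?subType=%E4%B8%8D%E9%99%90&induName='
API = ('https://gw.datayes.com/rrp_adventure/web/search?pageNow={0}&authorId=&isOptional=false'
       '&orgName=&reportType=COMPANY&secCodeList=&reportSubType=&industry=&ratingType='
       '&pubTimeStart={1}&pubTimeEnd={1}&type=EXTERNAL_REPORT&pageSize=40&sortOrder=desc'
       '&query=&minPageCount=&maxPageCount=')
HEADERS = {
    'Origin': 'https://robo.datayes.com',
    'Referer': PORTAL,
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36',
}


def fetch_cookies():
    """Load the portal page in headless Firefox and return its cookies as a dict."""
    options = Options()
    options.add_argument('-headless')
    driver = webdriver.Firefox(options=options)
    try:
        driver.get(PORTAL)
        return {c['name']: c['value'] for c in driver.get_cookies()}
    finally:
        driver.quit()


if __name__ == '__main__':
    cookies = fetch_cookies()
    today = datetime.datetime.now().strftime('%Y%m%d')
    resp = requests.get(API.format(1, today), headers=HEADERS, cookies=cookies)
    print(resp.json().get('message'))   # 'success' indicates the cookies were accepted
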
/geckodriver.log:
--------------------------------------------------------------------------------
1 | 1560136088970 geckodriver INFO geckodriver 0.20.1
2 | 1560136088987 geckodriver INFO Listening on 127.0.0.1:63245
3 |
--------------------------------------------------------------------------------
/research_report_spider/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/huanglaoxie0503/research_report_spider/2c1f8b84147bf97519a01aa1749473c5275a93e2/research_report_spider/__init__.py
--------------------------------------------------------------------------------
/research_report_spider/common/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
--------------------------------------------------------------------------------
/research_report_spider/common/operation.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import pymysql
3 |
4 | from research_report_spider.settings import (
5 | mysql_host,
6 | mysql_user,
7 | mysql_password,
8 | mysql_db,
9 | mysql_table,
10 |
11 | )
12 |
13 |
14 | conn = pymysql.connect(
15 | host=mysql_host,
16 | user=mysql_user,
17 | passwd=mysql_password,
18 | db=mysql_db,
19 | charset="utf8",
20 | use_unicode=True,
21 | )
22 | cursor = conn.cursor()
23 |
24 |
25 | def get_article_id(art_id):
26 |     """Check whether this article_id already exists in the database."""
27 | try:
28 | sql = "select * from {0} where report_id=%s;".format(mysql_table)
29 | cursor.execute(sql, (art_id,))
30 | results = cursor.fetchall()
31 | if results:
32 | return results[0][0]
33 | else:
34 | return None
35 | except pymysql.Error as e:
36 | print(e)
37 |
--------------------------------------------------------------------------------
/research_report_spider/items.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define here the models for your scraped items
4 | #
5 | # See documentation in:
6 | # https://doc.scrapy.org/en/latest/topics/items.html
7 |
8 | import scrapy
9 | from research_report_spider.settings import mysql_table
10 |
11 |
12 | class ResearchReportSpiderItem(scrapy.Item):
13 | # define the fields for your item here like:
14 | # name = scrapy.Field()
15 |
16 |     # Unique primary key (report id)
17 |     report_id = scrapy.Field()
18 |     # Stock code
19 |     stock_code = scrapy.Field()
20 |     # Stock name
21 |     stock_name = scrapy.Field()
22 |     # Publish date
23 |     publish_time = scrapy.Field()
24 |     # Author
25 |     author = scrapy.Field()
26 |     # Report title
27 |     title = scrapy.Field()
28 |     # Original rating
29 |     original_rating = scrapy.Field()
30 |     # Rating change
31 |     rating_changes = scrapy.Field()
32 |     # Rating adjustment mark type
33 |     rating_adjust_mark_type = scrapy.Field()
34 |     # Institution (brokerage)
35 |     org_name = scrapy.Field()
36 |     # Abstract content
37 |     content = scrapy.Field()
38 |     # PDF link
39 |     pdf_link = scrapy.Field()
40 |     # Filename
41 |     filename = scrapy.Field()
42 |     # File storage path
43 |     save_path = scrapy.Field()
44 |
45 |     def get_insert_sql(self):
46 |         # SQL insert statement
47 | insert_sql = """
48 | insert into {0}(report_id,stock_code,stock_name,publish_time,author,title,
49 | original_rating,rating_changes,rating_adjust_mark_type,org_name,content,pdf_link,save_path)
50 | VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
51 | """.format(mysql_table)
52 |
53 | params = (
54 | self['report_id'],
55 | self['stock_code'],
56 | self['stock_name'],
57 | self['publish_time'],
58 | self['author'],
59 | self['title'],
60 | self['original_rating'],
61 | self['rating_changes'],
62 | self['rating_adjust_mark_type'],
63 | self['org_name'],
64 | self['content'],
65 | self['pdf_link'],
66 | self['save_path']
67 | )
68 | return insert_sql, params
69 |
--------------------------------------------------------------------------------
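
get_insert_sql() above writes 13 columns into the table named by mysql_table in settings.py (research_report), but the repository ships no schema for that table. A plausible DDL, with report_id as the primary key as the field comment indicates, is sketched below using the project's own pymysql and settings; every column type is an assumption.

# Assumed schema for the `research_report` table used by get_insert_sql();
# the repository does not include a DDL file, so the column types are guesses
# that match the values the spider extracts.
import pymysql

from research_report_spider.settings import (
    mysql_host, mysql_user, mysql_password, mysql_db, mysql_table,
)

CREATE_TABLE = """
CREATE TABLE IF NOT EXISTS {0} (
    report_id               VARCHAR(64)  NOT NULL PRIMARY KEY,
    stock_code              VARCHAR(16),
    stock_name              VARCHAR(64),
    publish_time            DATE,
    author                  VARCHAR(255),
    title                   VARCHAR(512),
    original_rating         VARCHAR(64),
    rating_changes          VARCHAR(64),
    rating_adjust_mark_type VARCHAR(64),
    org_name                VARCHAR(255),
    content                 TEXT,
    pdf_link                VARCHAR(1024),
    save_path               VARCHAR(512)
) DEFAULT CHARSET=utf8;
""".format(mysql_table)

if __name__ == '__main__':
    # Create the table once before the first crawl.
    conn = pymysql.connect(host=mysql_host, user=mysql_user, passwd=mysql_password,
                           db=mysql_db, charset='utf8')
    with conn.cursor() as cursor:
        cursor.execute(CREATE_TABLE)
    conn.commit()
    conn.close()
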
/research_report_spider/middlewares.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define here the models for your spider middleware
4 | #
5 | # See documentation in:
6 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html
7 |
8 | from scrapy import signals
9 |
10 |
11 | class ResearchReportSpiderSpiderMiddleware(object):
12 | # Not all methods need to be defined. If a method is not defined,
13 | # scrapy acts as if the spider middleware does not modify the
14 | # passed objects.
15 |
16 | @classmethod
17 | def from_crawler(cls, crawler):
18 | # This method is used by Scrapy to create your spiders.
19 | s = cls()
20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
21 | return s
22 |
23 | def process_spider_input(self, response, spider):
24 | # Called for each response that goes through the spider
25 | # middleware and into the spider.
26 |
27 | # Should return None or raise an exception.
28 | return None
29 |
30 | def process_spider_output(self, response, result, spider):
31 | # Called with the results returned from the Spider, after
32 | # it has processed the response.
33 |
34 | # Must return an iterable of Request, dict or Item objects.
35 | for i in result:
36 | yield i
37 |
38 | def process_spider_exception(self, response, exception, spider):
39 | # Called when a spider or process_spider_input() method
40 | # (from other spider middleware) raises an exception.
41 |
42 | # Should return either None or an iterable of Response, dict
43 | # or Item objects.
44 | pass
45 |
46 | def process_start_requests(self, start_requests, spider):
47 | # Called with the start requests of the spider, and works
48 | # similarly to the process_spider_output() method, except
49 | # that it doesn’t have a response associated.
50 |
51 | # Must return only requests (not items).
52 | for r in start_requests:
53 | yield r
54 |
55 | def spider_opened(self, spider):
56 | spider.logger.info('Spider opened: %s' % spider.name)
57 |
58 |
59 | class ResearchReportSpiderDownloaderMiddleware(object):
60 | # Not all methods need to be defined. If a method is not defined,
61 | # scrapy acts as if the downloader middleware does not modify the
62 | # passed objects.
63 |
64 | @classmethod
65 | def from_crawler(cls, crawler):
66 | # This method is used by Scrapy to create your spiders.
67 | s = cls()
68 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
69 | return s
70 |
71 | def process_request(self, request, spider):
72 | # Called for each request that goes through the downloader
73 | # middleware.
74 |
75 | # Must either:
76 | # - return None: continue processing this request
77 | # - or return a Response object
78 | # - or return a Request object
79 | # - or raise IgnoreRequest: process_exception() methods of
80 | # installed downloader middleware will be called
81 | return None
82 |
83 | def process_response(self, request, response, spider):
84 | # Called with the response returned from the downloader.
85 |
86 | # Must either;
87 | # - return a Response object
88 | # - return a Request object
89 | # - or raise IgnoreRequest
90 | return response
91 |
92 | def process_exception(self, request, exception, spider):
93 | # Called when a download handler or a process_request()
94 | # (from other downloader middleware) raises an exception.
95 |
96 | # Must either:
97 | # - return None: continue processing this exception
98 | # - return a Response object: stops process_exception() chain
99 | # - return a Request object: stops process_exception() chain
100 | pass
101 |
102 | def spider_opened(self, spider):
103 | spider.logger.info('Spider opened: %s' % spider.name)
104 |
--------------------------------------------------------------------------------
/research_report_spider/pipelines.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define your item pipelines here
4 | #
5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
6 | # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
7 | import scrapy
8 | import logging
9 | import pymysql
10 | from research_report_spider.settings import mysql_host, mysql_user, mysql_password, mysql_db
11 | from research_report_spider.items import ResearchReportSpiderItem
12 | from scrapy.pipelines.files import FilesPipeline
13 |
14 |
15 | class MysqlPipeline(object):
16 |     """Save items to MySQL synchronously (approach 2)."""
17 |
18 | def __init__(self):
19 | self.conn = pymysql.connect(
20 | host=mysql_host,
21 | user=mysql_user,
22 | passwd=mysql_password,
23 | db=mysql_db,
24 | charset="utf8",
25 | use_unicode=True,
26 | )
27 | self.cursor = self.conn.cursor()
28 |
29 | def process_item(self, item, spider):
30 | try:
31 |             # Insert the item
32 | if isinstance(item, ResearchReportSpiderItem):
33 | self.do_insert(item)
34 | else:
35 | logging.info("Error Data")
36 | except pymysql.Error as e:
37 |             logging.error("----------------- insert failed -----------")
38 | logging.error(e)
39 | print(e)
40 |
41 | return item
42 |
43 | def close_spider(self, spider):
44 | try:
45 | self.conn.close()
46 | logging.info("mysql already close")
47 | except Exception as e:
48 | logging.info("--------mysql no close-------")
49 | logging.error(e)
50 |
51 | def do_insert(self, item):
52 | try:
53 | insert_sql, params = item.get_insert_sql()
54 |
55 | self.cursor.execute(insert_sql, params)
56 | self.conn.commit()
57 | logging.info("----------------insert success-----------")
58 | except pymysql.Error as e:
59 | print(e)
60 |
61 |
62 | class MyFilesPipeline(FilesPipeline):
63 | """
64 |     Download the report PDF files.
65 | """
66 | def get_media_requests(self, item, info):
67 | for url in item["pdf_link"]:
68 | if url:
69 | yield scrapy.Request(url, meta={"item": item})
70 |
71 | def file_path(self, request, response=None, info=None):
72 | item = request.meta["item"]
73 | filename = item["filename"]
74 | return filename
75 |
76 | def item_completed(self, results, item, info):
77 | file_paths = [x['path'] for ok, x in results if ok]
78 | if not file_paths:
79 |             return item
80 | item["filename"] = file_paths
81 | return item
82 |
--------------------------------------------------------------------------------
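
Note that only MysqlPipeline is enabled in ITEM_PIPELINES in settings.py; MyFilesPipeline is defined here but never activated, so PDFs are not actually downloaded as configured. If downloading is wanted, the usual Scrapy wiring would be something like the additions below (the FILES_STORE path is an assumption):

# Hypothetical settings.py additions to enable PDF downloads via MyFilesPipeline;
# not present in the repository. FILES_STORE is the directory FilesPipeline saves
# into, and the file_path() override above places files beneath it.
ITEM_PIPELINES = {
    'research_report_spider.pipelines.MyFilesPipeline': 200,
    'research_report_spider.pipelines.MysqlPipeline': 300,
}
FILES_STORE = './research_report_pdfs'   # assumed local download directory
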
/research_report_spider/settings.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Scrapy settings for research_report_spider project
4 | #
5 | # For simplicity, this file contains only settings considered important or
6 | # commonly used. You can find more settings consulting the documentation:
7 | #
8 | # https://doc.scrapy.org/en/latest/topics/settings.html
9 | # https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
10 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html
11 |
12 | BOT_NAME = 'research_report_spider'
13 |
14 | SPIDER_MODULES = ['research_report_spider.spiders']
15 | NEWSPIDER_MODULE = 'research_report_spider.spiders'
16 |
17 |
18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent
19 | #USER_AGENT = 'research_report_spider (+http://www.yourdomain.com)'
20 |
21 | # Obey robots.txt rules
22 | ROBOTSTXT_OBEY = False
23 |
24 | # Configure maximum concurrent requests performed by Scrapy (default: 16)
25 | #CONCURRENT_REQUESTS = 32
26 |
27 | # Configure a delay for requests for the same website (default: 0)
28 | # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
29 | # See also autothrottle settings and docs
30 | DOWNLOAD_DELAY = 3
31 | # The download delay setting will honor only one of:
32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16
33 | #CONCURRENT_REQUESTS_PER_IP = 16
34 |
35 | # Disable cookies (enabled by default)
36 | #COOKIES_ENABLED = False
37 |
38 | # Disable Telnet Console (enabled by default)
39 | #TELNETCONSOLE_ENABLED = False
40 |
41 | # Override the default request headers:
42 | #DEFAULT_REQUEST_HEADERS = {
43 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
44 | # 'Accept-Language': 'en',
45 | #}
46 |
47 | # Enable or disable spider middlewares
48 | # See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
49 | #SPIDER_MIDDLEWARES = {
50 | # 'research_report_spider.middlewares.ResearchReportSpiderSpiderMiddleware': 543,
51 | #}
52 |
53 | # Enable or disable downloader middlewares
54 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
55 | #DOWNLOADER_MIDDLEWARES = {
56 | # 'research_report_spider.middlewares.ResearchReportSpiderDownloaderMiddleware': 543,
57 | #}
58 |
59 | # Enable or disable extensions
60 | # See https://doc.scrapy.org/en/latest/topics/extensions.html
61 | #EXTENSIONS = {
62 | # 'scrapy.extensions.telnet.TelnetConsole': None,
63 | #}
64 |
65 | # Configure item pipelines
66 | # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
67 | ITEM_PIPELINES = {
68 | 'research_report_spider.pipelines.MysqlPipeline': 300,
69 | }
70 |
71 | # Enable and configure the AutoThrottle extension (disabled by default)
72 | # See https://doc.scrapy.org/en/latest/topics/autothrottle.html
73 | #AUTOTHROTTLE_ENABLED = True
74 | # The initial download delay
75 | #AUTOTHROTTLE_START_DELAY = 5
76 | # The maximum download delay to be set in case of high latencies
77 | #AUTOTHROTTLE_MAX_DELAY = 60
78 | # The average number of requests Scrapy should be sending in parallel to
79 | # each remote server
80 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
81 | # Enable showing throttling stats for every response received:
82 | #AUTOTHROTTLE_DEBUG = False
83 |
84 | # Enable and configure HTTP caching (disabled by default)
85 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
86 | #HTTPCACHE_ENABLED = True
87 | #HTTPCACHE_EXPIRATION_SECS = 0
88 | #HTTPCACHE_DIR = 'httpcache'
89 | #HTTPCACHE_IGNORE_HTTP_CODES = []
90 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
91 |
92 | mysql_host = '127.0.0.1'
93 | mysql_user = 'root'
94 | mysql_password = 'root'
95 | mysql_db = 'crawl'
96 | mysql_table = 'research_report'
97 |
--------------------------------------------------------------------------------
/research_report_spider/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 |
--------------------------------------------------------------------------------
/research_report_spider/spiders/report_spider.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import scrapy
3 | import datetime
4 | import logging
5 | import json
6 |
7 | from selenium import webdriver
8 |
9 | from research_report_spider.common import operation
10 | from research_report_spider.items import ResearchReportSpiderItem
11 |
12 |
13 | class ReportSpider(scrapy.Spider):
14 | name = 'report'
15 | allowed_domains = ['gw.datayes.com']
16 | start_urls = ['http://gw.datayes.com/']
17 |
18 | dt = datetime.datetime.now().strftime('%Y-%m-%d')
19 | today = dt.replace('-', '')
20 |
21 | base_url = 'https://gw.datayes.com/rrp_adventure/web/search?'
22 | headers = {
23 | "Origin": "https://robo.datayes.com",
24 | "Referer": "https://robo.datayes.com/v2/fastreport/company?subType=%E4%B8%8D%E9%99%90&induName=",
25 | "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36",
26 | }
27 | url = "https://gw.datayes.com/rrp_adventure/web/search?pageNow={0}&authorId=&isOptional=false&orgName=&reportType=COMPANY&secCodeList=&reportSubType=&industry=&ratingType=&pubTimeStart={1}&pubTimeEnd={1}&type=EXTERNAL_REPORT&pageSize=40&sortOrder=desc&query=&minPageCount=&maxPageCount="
28 |
29 | def start_requests(self):
30 |         # Fetch cookies with a headless browser
31 | cookie = self.get_cookies()
32 |
33 | for page in range(1, 5):
34 | yield scrapy.Request(
35 | self.url.format(page, self.today),
36 | headers=self.headers,
37 | cookies=cookie,
38 | meta={"page": page, "cookie": cookie}
39 | )
40 |
41 | def parse(self, response):
42 | page = response.meta.get('page')
43 |         logging.info('Crawling page {0}'.format(page))
44 | status = response.status
45 | logging.info(status)
46 | result = response.text
47 | result = json.loads(result)
48 |
49 | message = result['message']
50 | if message != 'success':
51 |             logging.info('message is {0}, request failed!'.format(message))
52 | return
53 |
54 | data_all = result['data']['list']
55 | for info in data_all:
56 | data = info['data']
57 | report_id = data['id']
58 | stock_name = data['companyName']
59 | author = data['author']
60 | title = data['title']
61 |             # Skip reports whose id already exists in the database
62 |             is_ar_id = operation.get_article_id(report_id)
63 |             if is_ar_id:
64 |                 logging.info('id {0} already exists'.format(is_ar_id))
65 | continue
66 |
67 | content = data['abstractText']
68 | if content:
69 | content = content.replace('\u3000', '').strip()
72 | stock_code_info = data['stockInfo']
73 | if stock_code_info is None:
74 | stock_code = None
75 | else:
76 | stock_code = stock_code_info['stockId']
77 |
78 | file_name = '{0}-{1}.pdf'.format(stock_code, title)
79 | org_dt = data['publishTime'].split('T')
80 | publish_time = org_dt[0]
81 |
82 | keys = publish_time.split('-')
83 | year = keys[0]
84 |
85 | filename = "/{0}/{1}/{2}".format(year, publish_time, file_name)
86 |
87 | item = ResearchReportSpiderItem()
88 |             # Unique primary key (report id)
89 |             item['report_id'] = report_id
90 |             # Stock code
91 |             item['stock_code'] = stock_code
92 |             # Stock name
93 |             item['stock_name'] = stock_name
94 |             # Publish date
95 |             item['publish_time'] = publish_time
96 |             # Author
97 |             item['author'] = author
98 |             # Report title
99 |             item['title'] = title
100 |             # Original rating
101 |             item['original_rating'] = data['ratingContent']
102 |             # Rating change
103 |             item['rating_changes'] = data['ratingType']
104 |             # Rating adjustment mark type
105 |             item['rating_adjust_mark_type'] = data['ratingAdjustMarkType']
106 |             # Institution (brokerage)
107 |             item['org_name'] = data['orgName']
108 |             # Abstract content
109 |             item['content'] = content
110 |             # PDF link
111 |             item['pdf_link'] = [data['s3Url']]
112 |             # Filename
113 |             item['filename'] = filename
114 |             # File storage path
115 |             item['save_path'] = "H-hezudao/Research_Report{0}".format(filename)
116 |
117 | yield item
118 |
119 | def get_cookies(self):
120 |         # Run Firefox in headless mode
121 |         from selenium.webdriver.firefox.options import Options
122 |         firefox_options = Options()
123 |         firefox_options.set_headless()
124 |         driver = webdriver.Firefox(firefox_options=firefox_options)
125 |
126 |         url = 'https://robo.datayes.com/v2/fastreport/company?subType=%E4%B8%8D%E9%99%90&induName='
127 |         driver.get(url)
128 |         # Collect the cookies set by the page
129 |         cookie_list = driver.get_cookies()
130 |         # Convert the cookie list into a name -> value dict
131 |         cookie_dict = {}
132 |         for cookie in cookie_list:
133 |             cookie_dict[cookie['name']] = cookie['value']
134 |         driver.quit()
135 |         logging.info('Firefox has quit')
136 | return cookie_dict
137 |
--------------------------------------------------------------------------------
/run.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import os
3 | from scrapy import cmdline
4 |
5 | file_log = os.getcwd()+"/info.log"
6 | if os.path.exists(file_log):
7 | os.remove(file_log)
8 |     print("Delete the previous log file before each run; keep only the latest log.")
9 |
10 |
11 | cmdline.execute('scrapy crawl report'.split())
12 |
--------------------------------------------------------------------------------
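
run.py removes info.log before each run, but settings.py never sets Scrapy's LOG_FILE, so the crawl as configured does not write info.log at all. One way to make the script and the setting agree (an assumption about the intended behaviour) is to pass LOG_FILE on the command line:

# Variant of run.py that actually writes info.log, so the delete-then-recreate
# logic has something to rotate; LOG_FILE is a standard Scrapy setting.
import os

from scrapy import cmdline

file_log = os.path.join(os.getcwd(), "info.log")
if os.path.exists(file_log):
    os.remove(file_log)
    print("Delete the previous log file before each run; keep only the latest log.")

cmdline.execute('scrapy crawl report -s LOG_FILE=info.log'.split())
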
/scrapy.cfg:
--------------------------------------------------------------------------------
1 | # Automatically created by: scrapy startproject
2 | #
3 | # For more information about the [deploy] section see:
4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html
5 |
6 | [settings]
7 | default = research_report_spider.settings
8 |
9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = research_report_spider
12 |
--------------------------------------------------------------------------------