├── stockbaseinfo ├── __init__.py ├── annualreport │ ├── __init__.py │ └── annualspider.py ├── strategy │ ├── __init__.py │ ├── db_tools.py │ ├── utils.py │ └── strategy_select.py ├── spiders │ ├── __init__.py │ ├── newsspider.py │ └── baseinfo.py ├── main.py ├── utils.py ├── settings.py ├── middlewares.py ├── items.py └── pipelines.py ├── images ├── 十大股东2.png └── 合作加微信.png ├── README.md ├── scrapy.cfg └── data └── data.sql /stockbaseinfo/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /stockbaseinfo/annualreport/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /stockbaseinfo/strategy/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /images/十大股东2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xuezayun/stockbaseinfo/HEAD/images/十大股东2.png -------------------------------------------------------------------------------- /images/合作加微信.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xuezayun/stockbaseinfo/HEAD/images/合作加微信.png -------------------------------------------------------------------------------- /stockbaseinfo/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders.
5 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # stockbaseinfo 2 | Crawls company fundamentals from cninfo.com.cn (巨潮资讯网), including basic stock information, the top ten tradable shareholders, historical dividends, key indicators and the financial statements. 3 | 4 | The program provides the following features: 5 | 1. Crawl basic fundamental data from cninfo.com.cn 6 | 2. Crawl the rolling news feed from Sina News 7 | 3. Download annual reports from cninfo.com.cn 8 | 9 | -------------------------------------------------------- 10 | For crawler collaboration, please add the WeChat account shown in the picture in the images directory. -------------------------------------------------------------------------------- /stockbaseinfo/main.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from scrapy.cmdline import execute 4 | import sys 5 | import os 6 | 7 | sys.path.append(os.path.dirname(os.path.abspath(__file__))) 8 | # execute(['scrapy', 'crawl', 'baseinfo']) 9 | execute(['scrapy', 'crawl', 'news']) -------------------------------------------------------------------------------- /scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html 5 | 6 | [settings] 7 | default = stockbaseinfo.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = stockbaseinfo 12 | -------------------------------------------------------------------------------- /stockbaseinfo/utils.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | 3 | class Utils(object): 4 | # Collect the valid years and their values, padded out to 10 slots (most recent year last) 5 | @staticmethod 6 | def load_validdata(lstyear,item): 7 | lstyear.sort(reverse=True) 8 | arr_year = ["1900" for _ in range(10)] 9 | arr_item = ['0' for _ in range(10)] 10 | for i in range(min(len(lstyear), 10)): # keep at most the 10 most recent years 11 | arr_year[9 - i] = lstyear[i] 12 | arr_item[9 - i] = item[lstyear[i]] 13 | return arr_year, arr_item -------------------------------------------------------------------------------- /stockbaseinfo/strategy/db_tools.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | import pymysql 3 | from stockbaseinfo.Const import * 4 | 5 | # Usage: 6 | # db_tools = DbTools() 7 | 8 | class DbTools: 9 | conn = None 10 | cursor = None 11 | # Open a connection to the database 12 | def connect(self): 13 | self.conn = pymysql.connect(host=Const.DB_SERVER,port=Const.PORT,user=Const.DB_USER, passwd=Const.DB_PWD,db=Const.DB_NAME,charset="utf8") 14 | self.cursor = self.conn.cursor() 15 | 16 | # Execute a batch of INSERT/UPDATE statements 17 | def insertorupdate_data(self,lstsql): 18 | try: 19 | for sql in lstsql: 20 | self.cursor.execute(sql) 21 | except Exception as e: 22 | print("insertorupdate_data failed:", e) 23 | 24 | def fetch_data(self,sql): 25 | self.cursor.execute(sql) 26 | return self.cursor.fetchall() 27 | 28 | # Commit pending statements and close the connection 29 | def commit_data(self): 30 | self.cursor.close() 31 | self.conn.commit() 32 | return self.conn.close() 33 | 34 | def close(self): 35 | self.cursor.close() 36 | return self.conn.close() 37 | -------------------------------------------------------------------------------- /stockbaseinfo/strategy/utils.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | 3 | import datetime,time 4 | import db_tools as dt 5 | import math 6 | from stockbaseinfo.Const import * 7 | import requests 8 | 9 | class Utils(object): 10 | 11 | # Get the trading dates of the most recent daysnum days from hist_data (newest first) 12 | @staticmethod 13 | def get_num_startdate(daysnum = 1): 14 | lstretdate =[] 15 | 
selectdaysago ='''select date from hist_data where code = '000001' order by date desc limit %d;'''%daysnum 16 | lstitem = Utils.fetch_data(selectdaysago) 17 | for item in lstitem: 18 | date = item[0] 19 | lstretdate.append(date.strftime('%Y-%m-%d')) 20 | return lstretdate; 21 | 22 | # 获取某daysnum天数之前的日期 23 | @staticmethod 24 | def get_fourdate(daysnum=4): 25 | selectdaysago = '''select date from hist_data where code = '000001' order by date desc limit %d;''' % daysnum 26 | lstitem = Utils.fetch_data(selectdaysago) 27 | day3date = lstitem[3][0] 28 | day2date = lstitem[2][0] 29 | day1date = lstitem[1][0] 30 | daydate = lstitem[0][0] 31 | day3Date = day3date.strftime('%Y-%m-%d') 32 | day2Date = day2date.strftime('%Y-%m-%d') 33 | day1Date = day1date.strftime('%Y-%m-%d') 34 | dayDate = daydate.strftime('%Y-%m-%d') 35 | return day3Date,day2Date,day1Date,dayDate; 36 | @staticmethod 37 | def get_lastdate(): 38 | selectdaysago = '''select date from hist_data where code = '000001' order by date desc limit 1;''' 39 | lstitem = Utils.fetch_data(selectdaysago) 40 | if len(lstitem)>0 : 41 | startdate = lstitem[0][0] 42 | return startdate.strftime('%Y-%m-%d') 43 | # 执行SQL语句集合 44 | @staticmethod 45 | def execute_data(lstinsertsql): 46 | dbTools = dt.DbTools() 47 | dbTools.connect() 48 | dbTools.insertorupdate_data(lstinsertsql) 49 | dbTools.commit_data() 50 | 51 | # 获取查询数据 52 | @staticmethod 53 | def fetch_data(querysql): 54 | dbTools = dt.DbTools() 55 | dbTools.connect() 56 | lstitem = dbTools.fetch_data(querysql) 57 | for item in lstitem: 58 | temsql = '' 59 | for i in range(len(item)): 60 | temsql += '\'' + str(item[i]) + '\',' 61 | dbTools.close() 62 | return lstitem; 63 | 64 | @staticmethod 65 | def time_cmp(first_time,second_time): 66 | return int(time.strftime("%H%M%S", time.strptime(str(first_time), "%H:%M:%S"))) < int(time.strftime("%H%M%S", second_time)) 67 | 68 | 69 | 70 | 71 | 72 | -------------------------------------------------------------------------------- /stockbaseinfo/strategy/strategy_select.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | #策略选择工具 3 | from stockbaseinfo.strategy.utils import * 4 | 5 | #选择净利润较好的公司 6 | def getBestProfit(): 7 | dictIndexDateIsCow = {} 8 | setCowSql = '''SELECT `code`, `subject_title`,`account1`,`account2`,`account3`,`account4`,`account5`, `account6`, `account7`, `account8`, `account9`, `account10`, 9 | `year1`,`year2`,`year3`,`year4`,`year5`,`year6`,`year7`,`year8`,`year9`,`year10`,`season` FROM `main_index` where subject_title like '%净利润增长率%' order by code,season ;''' 10 | lstitem = Utils.fetch_data(setCowSql) 11 | for item in lstitem: 12 | print(item) 13 | # tempcodename = '' 14 | # tempdateiscow = '' 15 | # code = item[0] 16 | # name = item[1] 17 | # iscow = item[2] 18 | # sdate = item[3] 19 | # tempcodename = code + ',' + name 20 | # tempdateiscow = str(sdate) + ':' + str(iscow) 21 | # if dictIndexDateIsCow.__contains__(tempcodename): 22 | # # codedata = dictIndexDateIsCow[tempcodename] 23 | # dictIndexDateIsCow[tempcodename] += tempdateiscow + ',' 24 | # else: 25 | # dictIndexDateIsCow[tempcodename] = '' 26 | # print('----------------------------判断熊市牛市---------------------------------------------------------') 27 | # for key, value in dictIndexDateIsCow.items(): 28 | # print(key + "---" + value) 29 | # print('****************************判断熊市牛市********************************************************') 30 | 31 | def getUprise(): 32 | lstOri = 
[3,4,5,6,5,4,3,22,1,2,3,4,5,6,7,8,9,7,6,5,4,6,8,9,10] 33 | lastMaxP = 0 34 | lastMinP = 0 35 | lstMax=[] 36 | lstMin = [] 37 | lstMaxData=[] 38 | lstMinData=[] 39 | lstOri.reverse() 40 | for i in range(len(lstOri)): 41 | if i == 0: 42 | pass 43 | else: 44 | minp = min(lstOri[0:i]) 45 | if minp87%就可以买入该股票呢?其实RPS指标只是对强势股的个一个初步筛选,对于A股而言,RPS大于87%的股票就有400多只,都买进也不太现实,具体运用还需结合个股基本面、题材和整体市场情况分析。RPS实际上是欧奈尔在《笑傲股市》中提出的CANSLIM七步选股法的一个技术分析。各字母含义如下所示:C:最近一季度报表显示的盈利(每股收益) 57 | # A:每年度每股盈利的增长幅度 58 | # N:新产品,新服务,股价创新高 59 | # S:该股流通盘大小,市值以及交易量的情况 60 | # L:该股票在行业中的低位,是否为龙头 61 | # I:该股票有无有实力的庄家,机构大流通股东 62 | # M:大盘走势如何,如何判断大盘走向 63 | # 64 | # RPS英文全称Relative Price Strength Rating,即股价相对强度,该指标是欧奈尔CANSLIM选股法则中的趋势分析,具有很强的实战指导意义。RPS指标是指在一段时间内,个股涨幅在全部股票涨幅排名中的位次值。 65 | # 比如A股共有3500只股票,若某只股票的120日涨幅在所有股票中排名第350位,则该股票的RPS值为:(1-350/3500)*100=90。 66 | # 67 | # RPS的值代表该股的120日涨幅超过其他90%的股票的涨幅。通过该指标可以反映个股股价走势在同期市场中的表现相对强弱。RPS的值介于0-100之间,在过去的一年中,所有股票的涨幅排行中,前1%的股票的RPS值为99至100,前2%的股票的RPS值为98至99,以此类推。RPS时间周期可以自己根据需要进行调整,常用的有60日(3个月)、120日(半年)和250日(一年)等。 68 | 69 | def getRPS120(): 70 | print() 71 | if __name__ == "__main__": 72 | getUprise() 73 | # data = "1:23" 74 | # print(data[:data.index(":")]) -------------------------------------------------------------------------------- /stockbaseinfo/spiders/newsspider.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from scrapy import Request 3 | from ..items import * 4 | from ..Const import * 5 | import random 6 | import json 7 | import re 8 | from datetime import datetime 9 | 10 | 11 | class newsSpider(scrapy.Spider): 12 | name = "news" 13 | lstsinanews = [] 14 | base_url = 'https://feed.mix.sina.com.cn/api/roll/get?pageid=153&lid={}&k=&num=50&page={}&r={}' 15 | # "2509": "全部", 16 | # "2510": "国内", 17 | # "2511": "国际", 18 | # "2669": "社会", 19 | # "2512": "体育", 20 | # "2513": "娱乐", 21 | # "2514": "军事", 22 | # "2515": "科技", 23 | # "2516": "财经", 24 | # "2517": "股市", 25 | # "2518": "美股", 26 | # "2968": "国内_国际", 27 | # "2970": "国内_社会", 28 | # "2972": "国际_社会", 29 | # "2974": "国内国际社会" 30 | def start_requests(self): 31 | # 可修改 这里设置爬取100页 32 | self.page_total = 24 33 | # self.page_total = 1 34 | for page in range(1, self.page_total + 1): 35 | # 按上面注释 可修改 这里"2509"代表"全部"类别的新闻 36 | lid = "2516" 37 | r = random.random() 38 | yield Request(self.base_url.format(lid, page, r), callback=lambda response,page=page:self.parse(response,page)) 39 | 40 | def parse(self, response,page): 41 | result = json.loads(response.text) 42 | data_list = result.get('result').get('data') 43 | icount = 0 44 | totalcount = len(data_list) 45 | print("totalcount:"+str(totalcount)) 46 | for data in data_list: 47 | icount += 1 48 | item = sina_newsItem() 49 | ctime = datetime.fromtimestamp(int(data.get('ctime'))) 50 | ctime = datetime.strftime(ctime, '%Y-%m-%d %H:%M') 51 | item['ctime'] = ctime 52 | item['url'] = str(data.get('url')).strip() 53 | item['wapurl'] = str(data.get('wapurl')).strip() 54 | item['title'] = str(data.get('title')).strip() 55 | item['media_name'] = str(data.get('media_name')).strip() 56 | item['keywords'] = str(data.get('keywords')).strip() 57 | yield Request(url=item['url'], callback=lambda response,page=page,isend = icount == totalcount:self.parse_content(response,page,isend) , meta={'item': item}) 58 | 59 | # 进入到详情页面 爬取新闻内容 60 | 61 | def parse_content(self, response,page,isend): 62 | item = response.meta['item'] 63 | content = ''.join(response.xpath('//*[@id="artibody" or @id="article"]//p/text()').extract()) 64 | content = 
re.sub(r'\u3000', '', content) 65 | content = re.sub(r'[ \xa0?]+', ' ', content) 66 | content = re.sub(r'\s*\n\s*', '\n', content) 67 | content = re.sub(r'\s*(\s)', r'\1', content) 68 | content = ''.join([x.strip() for x in content]) 69 | # content_list = response.xpath('//*[@id="artibody" or @id="article"]//p/text()').extract() 70 | # content = r"" 71 | # for part in content_list: 72 | # part = part.strip() 73 | # content += part 74 | item['content'] = content 75 | print("page:"+str(page)+",isend:"+str(isend)) 76 | if self.page_total == page and isend: 77 | stockbaseinfoitme = StockbaseinfoItem() 78 | stockbaseinfoitme['data_type'] = Const.ROLL_NEWS 79 | stockbaseinfoitme['data_content'] = self.lstsinanews 80 | yield stockbaseinfoitme 81 | else: 82 | self.lstsinanews.append(item) 83 | 84 | -------------------------------------------------------------------------------- /stockbaseinfo/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for stockbaseinfo project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. You can find more settings consulting the documentation: 7 | # 8 | # https://docs.scrapy.org/en/latest/topics/settings.html 9 | # https://docs.scrapy.org/en/latest/topics/downloader-middleware.html 10 | # https://docs.scrapy.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'stockbaseinfo' 13 | 14 | SPIDER_MODULES = ['stockbaseinfo.spiders'] 15 | NEWSPIDER_MODULE = 'stockbaseinfo.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | #USER_AGENT = 'stockbaseinfo (+http://www.yourdomain.com)' 20 | 21 | # Obey robots.txt rules 22 | ROBOTSTXT_OBEY = False 23 | 24 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 25 | #CONCURRENT_REQUESTS = 32 26 | 27 | # Configure a delay for requests for the same website (default: 0) 28 | # See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay 29 | # See also autothrottle settings and docs 30 | # DOWNLOAD_DELAY = 3 31 | # The download delay setting will honor only one of: 32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 33 | #CONCURRENT_REQUESTS_PER_IP = 16 34 | 35 | # Disable cookies (enabled by default) 36 | COOKIES_ENABLED = False 37 | 38 | # Disable Telnet Console (enabled by default) 39 | #TELNETCONSOLE_ENABLED = False 40 | 41 | # Override the default request headers: 42 | #DEFAULT_REQUEST_HEADERS = { 43 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 44 | # 'Accept-Language': 'en', 45 | #} 46 | 47 | # Enable or disable spider middlewares 48 | # See https://docs.scrapy.org/en/latest/topics/spider-middleware.html 49 | SPIDER_MIDDLEWARES = { 50 | 'stockbaseinfo.middlewares.StockbaseinfoSpiderMiddleware': 543, 51 | } 52 | HTTPERROR_ALLOWED_CODES= [999] 53 | # Enable or disable downloader middlewares 54 | # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html 55 | DOWNLOADER_MIDDLEWARES = { 56 | 'stockbaseinfo.middlewares.StockbaseinfoDownloaderMiddleware': 543, 57 | } 58 | 59 | # Enable or disable extensions 60 | # Enable or disable extensions 61 | # See https://docs.scrapy.org/en/latest/topics/extensions.html 62 | #EXTENSIONS = { 63 | # 'scrapy.extensions.telnet.TelnetConsole': None, 64 | #} 65 | 66 | # Configure item pipelines 67 | # See https://docs.scrapy.org/en/latest/topics/item-pipeline.html 68 | ITEM_PIPELINES = { 69 | 
'stockbaseinfo.pipelines.StockbaseinfoPipeline': 300, 70 | } 71 | MYSQL_DB_NAME = 'finance_data' 72 | MYSQL_HOST = 'localhost' 73 | MYSQL_PORT = 3306 74 | MYSQL_USER = 'root' 75 | MYSQL_PASSWORD = 'root' 76 | # Enable and configure the AutoThrottle extension (disabled by default) 77 | # See https://docs.scrapy.org/en/latest/topics/autothrottle.html 78 | #AUTOTHROTTLE_ENABLED = True 79 | # The initial download delay 80 | #AUTOTHROTTLE_START_DELAY = 5 81 | # The maximum download delay to be set in case of high latencies 82 | #AUTOTHROTTLE_MAX_DELAY = 60 83 | # The average number of requests Scrapy should be sending in parallel to 84 | # each remote server 85 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 86 | # Enable showing throttling stats for every response received: 87 | #AUTOTHROTTLE_DEBUG = False 88 | 89 | # Enable and configure HTTP caching (disabled by default) 90 | # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 91 | HTTPCACHE_ENABLED = True 92 | HTTPCACHE_EXPIRATION_SECS = 0 93 | HTTPCACHE_DIR = 'httpcache' 94 | HTTPCACHE_IGNORE_HTTP_CODES = [] 95 | HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 96 | -------------------------------------------------------------------------------- /data/data.sql: -------------------------------------------------------------------------------- 1 | CREATE DATABASE `finance_basic_data` /*!40100 DEFAULT CHARACTER SET utf8 */ /*!80016 DEFAULT ENCRYPTION='N' */; 2 | CREATE TABLE `balance_sheet` ( 3 | `code` varchar(45) DEFAULT NULL, 4 | `subject_title` varchar(500) DEFAULT NULL, 5 | `account1` float DEFAULT '0', 6 | `account2` float DEFAULT '0', 7 | `account3` float DEFAULT '0', 8 | `account4` float DEFAULT '0', 9 | `account5` float DEFAULT '0', 10 | `account6` float DEFAULT '0', 11 | `account7` float DEFAULT '0', 12 | `account8` float DEFAULT '0', 13 | `account9` float DEFAULT '0', 14 | `account10` float DEFAULT '0', 15 | `year1` int(11) DEFAULT '0', 16 | `year2` int(11) DEFAULT '0', 17 | `year3` int(11) DEFAULT '0', 18 | `year4` int(11) DEFAULT '0', 19 | `year5` int(11) DEFAULT '0', 20 | `year6` int(11) DEFAULT '0', 21 | `year7` int(11) DEFAULT '0', 22 | `year8` int(11) DEFAULT '0', 23 | `year9` int(11) DEFAULT '0', 24 | `year10` int(11) DEFAULT '0' 25 | ) ENGINE=InnoDB DEFAULT CHARSET=utf8; 26 | 27 | CREATE TABLE `cashflow_sheet` ( 28 | `code` varchar(45) DEFAULT NULL, 29 | `subject_title` varchar(500) DEFAULT NULL, 30 | `account1` float DEFAULT '0', 31 | `account2` float DEFAULT '0', 32 | `account3` float DEFAULT '0', 33 | `account4` float DEFAULT '0', 34 | `account5` float DEFAULT '0', 35 | `account6` float DEFAULT '0', 36 | `account7` float DEFAULT '0', 37 | `account8` float DEFAULT '0', 38 | `account9` float DEFAULT '0', 39 | `account10` float DEFAULT '0', 40 | `year1` int(11) DEFAULT '0', 41 | `year2` int(11) DEFAULT '0', 42 | `year3` int(11) DEFAULT '0', 43 | `year4` int(11) DEFAULT '0', 44 | `year5` int(11) DEFAULT '0', 45 | `year6` int(11) DEFAULT '0', 46 | `year7` int(11) DEFAULT '0', 47 | `year8` int(11) DEFAULT '0', 48 | `year9` int(11) DEFAULT '0', 49 | `year10` int(11) DEFAULT '0' 50 | ) ENGINE=InnoDB DEFAULT CHARSET=utf8; 51 | CREATE TABLE `main_index` ( 52 | `code` varchar(45) DEFAULT NULL, 53 | `subject_title` varchar(500) DEFAULT NULL, 54 | `account1` float DEFAULT '0', 55 | `account2` float DEFAULT '0', 56 | `account3` float DEFAULT '0', 57 | `account4` float DEFAULT '0', 58 | `account5` float DEFAULT '0', 59 | `account6` float DEFAULT '0', 60 | `account7` 
float DEFAULT '0', 61 | `account8` float DEFAULT '0', 62 | `account9` float DEFAULT '0', 63 | `account10` float DEFAULT '0', 64 | `year1` int(11) DEFAULT '0', 65 | `year2` int(11) DEFAULT '0', 66 | `year3` int(11) DEFAULT '0', 67 | `year4` int(11) DEFAULT '0', 68 | `year5` int(11) DEFAULT '0', 69 | `year6` int(11) DEFAULT '0', 70 | `year7` int(11) DEFAULT '0', 71 | `year8` int(11) DEFAULT '0', 72 | `year9` int(11) DEFAULT '0', 73 | `year10` int(11) DEFAULT '0', 74 | `season` int(11) DEFAULT '0' 75 | ) ENGINE=InnoDB DEFAULT CHARSET=utf8; 76 | CREATE TABLE `profit_sheet` ( 77 | `code` varchar(45) DEFAULT NULL, 78 | `subject_title` varchar(500) DEFAULT NULL, 79 | `account1` float DEFAULT '0', 80 | `account2` float DEFAULT '0', 81 | `account3` float DEFAULT '0', 82 | `account4` float DEFAULT '0', 83 | `account5` float DEFAULT '0', 84 | `account6` float DEFAULT '0', 85 | `account7` float DEFAULT '0', 86 | `account8` float DEFAULT '0', 87 | `account9` float DEFAULT '0', 88 | `account10` float DEFAULT '0', 89 | `year1` int(11) DEFAULT '0', 90 | `year2` int(11) DEFAULT '0', 91 | `year3` int(11) DEFAULT '0', 92 | `year4` int(11) DEFAULT '0', 93 | `year5` int(11) DEFAULT '0', 94 | `year6` int(11) DEFAULT '0', 95 | `year7` int(11) DEFAULT '0', 96 | `year8` int(11) DEFAULT '0', 97 | `year9` int(11) DEFAULT '0', 98 | `year10` int(11) DEFAULT '0', 99 | `season` int(11) DEFAULT '0' 100 | ) ENGINE=InnoDB DEFAULT CHARSET=utf8; 101 | CREATE TABLE `sina_news` ( 102 | `idsina_news` int(11) NOT NULL AUTO_INCREMENT, 103 | `title` varchar(500) DEFAULT NULL, 104 | `content` text, 105 | `ctime` datetime DEFAULT NULL, 106 | `media_name` varchar(100) DEFAULT NULL, 107 | `keywords` varchar(200) DEFAULT NULL, 108 | `url` varchar(200) DEFAULT NULL, 109 | `wepurl` varchar(200) DEFAULT NULL, 110 | PRIMARY KEY (`idsina_news`) 111 | ) ENGINE=InnoDB AUTO_INCREMENT=4090 DEFAULT CHARSET=utf8; 112 | CREATE TABLE `stock_bonus` ( 113 | `code` varchar(45) DEFAULT NULL, 114 | `notice_date` datetime DEFAULT NULL, 115 | `rightoff_time` datetime DEFAULT NULL, 116 | `stock_right_registe_date` datetime DEFAULT NULL, 117 | `cash_per_share` float DEFAULT '0', 118 | `send_bonus_share_per_share` float DEFAULT '0', 119 | `increase_shares_per_share` int(11) DEFAULT '0', 120 | `cash_receive_date` datetime DEFAULT NULL, 121 | `share_receive_date` datetime DEFAULT NULL 122 | ) ENGINE=InnoDB DEFAULT CHARSET=utf8; 123 | CREATE TABLE `stock_holders` ( 124 | `code` varchar(45) DEFAULT NULL, 125 | `holder_range` int(11) DEFAULT '0', 126 | `holder_name` varchar(500) DEFAULT NULL, 127 | `stock_count` float DEFAULT '0', 128 | `stock_percent` float DEFAULT '0', 129 | `stock_property` varchar(200) DEFAULT NULL, 130 | `count_date` date DEFAULT NULL 131 | ) ENGINE=InnoDB DEFAULT CHARSET=utf8; 132 | CREATE TABLE `stock_info` ( 133 | `code` varchar(45) DEFAULT NULL, 134 | `name` varchar(45) DEFAULT NULL, 135 | `open` float DEFAULT '0', 136 | `high` float DEFAULT '0', 137 | `close` float DEFAULT '0', 138 | `low` float DEFAULT '0', 139 | `volume` float DEFAULT '0', 140 | `amount` float DEFAULT '0', 141 | `price_change` float DEFAULT '0', 142 | `p_change` float DEFAULT '0', 143 | `yesterday_close` float DEFAULT '0', 144 | `exchange` float DEFAULT '0', 145 | `online_years` int(11) DEFAULT '0', 146 | `pb` float DEFAULT '0', 147 | `pe` float DEFAULT '0', 148 | `date` datetime DEFAULT NULL, 149 | `amplitude` float DEFAULT '0' 150 | ) ENGINE=InnoDB DEFAULT CHARSET=utf8; 151 | -------------------------------------------------------------------------------- 
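The tables defined in data/data.sql have to exist before the pipeline can write anything. Below is a minimal sketch (not part of the project) for loading that file with pymysql, assuming a local MySQL server with the root/root credentials from settings.py; the init_schema name is illustrative. Note that data.sql creates the finance_basic_data database, while settings.py defaults MYSQL_DB_NAME to finance_data and most pipeline INSERT statements reference finance_data tables, so the database names need to be reconciled before a full crawl.

```python
# Illustrative helper (assumption: MySQL on localhost with the root/root
# credentials from settings.py; path relative to the repository root).
import pymysql

def init_schema(sql_path='data/data.sql'):
    conn = pymysql.connect(host='localhost', port=3306, user='root',
                           password='root', charset='utf8')
    try:
        with conn.cursor() as cur, open(sql_path, encoding='utf-8') as f:
            statements = [s.strip() for s in f.read().split(';') if s.strip()]
            cur.execute(statements[0])               # CREATE DATABASE `finance_basic_data`
            cur.execute('USE `finance_basic_data`')  # data.sql itself has no USE statement
            for stmt in statements[1:]:
                cur.execute(stmt)
        conn.commit()
    finally:
        conn.close()

if __name__ == '__main__':
    init_schema()
```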
/stockbaseinfo/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # https://docs.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | import random 10 | 11 | 12 | class StockbaseinfoSpiderMiddleware(object): 13 | # Not all methods need to be defined. If a method is not defined, 14 | # scrapy acts as if the spider middleware does not modify the 15 | # passed objects. 16 | 17 | def __init__(self, user_agent=''): 18 | self.user_agent = user_agent 19 | 20 | @classmethod 21 | def from_crawler(cls, crawler): 22 | # This method is used by Scrapy to create your spiders. 23 | s = cls() 24 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 25 | return s 26 | 27 | def process_spider_input(self, response, spider): 28 | # Called for each response that goes through the spider 29 | # middleware and into the spider. 30 | 31 | # Should return None or raise an exception. 32 | return None 33 | 34 | def process_spider_output(self, response, result, spider): 35 | # Called with the results returned from the Spider, after 36 | # it has processed the response. 37 | 38 | # Must return an iterable of Request, dict or Item objects. 39 | for i in result: 40 | yield i 41 | 42 | def process_spider_exception(self, response, exception, spider): 43 | # Called when a spider or process_spider_input() method 44 | # (from other spider middleware) raises an exception. 45 | 46 | # Should return either None or an iterable of Request, dict 47 | # or Item objects. 48 | pass 49 | 50 | def process_start_requests(self, start_requests, spider): 51 | # Called with the start requests of the spider, and works 52 | # similarly to the process_spider_output() method, except 53 | # that it doesn’t have a response associated. 54 | 55 | # Must return only requests (not items). 56 | for r in start_requests: 57 | yield r 58 | 59 | def spider_opened(self, spider): 60 | spider.logger.info('Spider opened: %s' % spider.name) 61 | 62 | 63 | class StockbaseinfoDownloaderMiddleware(object): 64 | # Not all methods need to be defined. If a method is not defined, 65 | # scrapy acts as if the downloader middleware does not modify the 66 | # passed objects. 67 | 68 | @classmethod 69 | def from_crawler(cls, crawler): 70 | # This method is used by Scrapy to create your spiders. 71 | s = cls() 72 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 73 | return s 74 | 75 | def process_request(self, request, spider): 76 | # 这句话用于随机选择user-agent 77 | ua = self.user_agent_list[random.randint(0,16)] 78 | if ua: 79 | print('User-Agent:' + ua) 80 | request.headers.setdefault('User-Agent', ua) 81 | 82 | # Called for each request that goes through the downloader 83 | # middleware. 84 | 85 | # Must either: 86 | # - return None: continue processing this request 87 | # - or return a Response object 88 | # - or return a Request object 89 | # - or raise IgnoreRequest: process_exception() methods of 90 | # installed downloader middleware will be called 91 | return None 92 | 93 | def process_response(self, request, response, spider): 94 | # Called with the response returned from the downloader. 
95 | 96 | # Must either; 97 | # - return a Response object 98 | # - return a Request object 99 | # - or raise IgnoreRequest 100 | return response 101 | 102 | def process_exception(self, request, exception, spider): 103 | # Called when a download handler or a process_request() 104 | # (from other downloader middleware) raises an exception. 105 | 106 | # Must either: 107 | # - return None: continue processing this exception 108 | # - return a Response object: stops process_exception() chain 109 | # - return a Request object: stops process_exception() chain 110 | pass 111 | 112 | def spider_opened(self, spider): 113 | spider.logger.info('Spider opened: %s' % spider.name) 114 | 115 | user_agent_list = [ \ 116 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1" \ 117 | "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11", \ 118 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6", \ 119 | "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6", \ 120 | "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1", \ 121 | "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5", \ 122 | "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5", \ 123 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3", \ 124 | "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3", \ 125 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3", \ 126 | "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3", \ 127 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3", \ 128 | "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3", \ 129 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3", \ 130 | "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3", \ 131 | "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3", \ 132 | "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24", \ 133 | "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"] 134 | 135 | -------------------------------------------------------------------------------- /stockbaseinfo/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # https://docs.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | from scrapy.item import Item,Field 10 | 11 | 12 | class StockbaseinfoItem(scrapy.Item): 13 | # name = scrapy.Field() 14 | data_type = Field() 15 | data_content = Field(); 16 | #财务报表 17 | class balance_sheetItem(): 18 | def __init__(self, code, subject_title,account1,account2,account3,account4,account5,account6,account7,account8,account9,account10, 19 | 
year1,year2,year3,year4,year5,year6,year7,year8,year9,year10): 20 | self.code = code 21 | self.subject_title = subject_title 22 | self.account1 = account1 23 | self.account2 = account2 24 | self.account3 = account3 25 | self.account4 = account4 26 | self.account5 = account5 27 | self.account6 = account6 28 | self.account7 = account7 29 | self.account8 = account8 30 | self.account9 = account9 31 | self.account10 = account10 32 | self.year1 = year1 33 | self.year2 = year2 34 | self.year3 = year3 35 | self.year4 = year4 36 | self.year5 = year5 37 | self.year6 = year6 38 | self.year7 = year7 39 | self.year8 = year8 40 | self.year9 = year9 41 | self.year10 = year10 42 | #现金流量报表 43 | class cash_flowItem(): 44 | def __init__(self, code, subject_title, account1, account2, account3, account4, account5, account6, account7, 45 | account8, account9, account10, 46 | year1, year2, year3, year4, year5, year6, year7, year8, year9, year10): 47 | self.code = code 48 | self.subject_title = subject_title 49 | self.account1 = account1 50 | self.account2 = account2 51 | self.account3 = account3 52 | self.account4 = account4 53 | self.account5 = account5 54 | self.account6 = account6 55 | self.account7 = account7 56 | self.account8 = account8 57 | self.account9 = account9 58 | self.account10 = account10 59 | self.year1 = year1 60 | self.year2 = year2 61 | self.year3 = year3 62 | self.year4 = year4 63 | self.year5 = year5 64 | self.year6 = year6 65 | self.year7 = year7 66 | self.year8 = year8 67 | self.year9 = year9 68 | self.year10 = year10 69 | #主要指数 70 | class main_indexItem(): 71 | def __init__(self, code, subject_title,season, account1, account2, account3, account4, account5, account6, account7, 72 | account8, account9, account10, 73 | year1, year2, year3, year4, year5, year6, year7, year8, year9, year10): 74 | self.code = code 75 | self.subject_title = subject_title 76 | self.season = season 77 | self.account1 = account1 78 | self.account2 = account2 79 | self.account3 = account3 80 | self.account4 = account4 81 | self.account5 = account5 82 | self.account6 = account6 83 | self.account7 = account7 84 | self.account8 = account8 85 | self.account9 = account9 86 | self.account10 = account10 87 | self.year1 = year1 88 | self.year2 = year2 89 | self.year3 = year3 90 | self.year4 = year4 91 | self.year5 = year5 92 | self.year6 = year6 93 | self.year7 = year7 94 | self.year8 = year8 95 | self.year9 = year9 96 | self.year10 = year10 97 | #利润表 98 | class profit_sheetItem(): 99 | def __init__(self, code, subject_title,season, account1, account2, account3, account4, account5, account6, account7, 100 | account8, account9, account10, 101 | year1, year2, year3, year4, year5, year6, year7, year8, year9, year10): 102 | self.code = code 103 | self.subject_title = subject_title 104 | self.season = season 105 | self.account1 = account1 106 | self.account2 = account2 107 | self.account3 = account3 108 | self.account4 = account4 109 | self.account5 = account5 110 | self.account6 = account6 111 | self.account7 = account7 112 | self.account8 = account8 113 | self.account9 = account9 114 | self.account10 = account10 115 | self.year1 = year1 116 | self.year2 = year2 117 | self.year3 = year3 118 | self.year4 = year4 119 | self.year5 = year5 120 | self.year6 = year6 121 | self.year7 = year7 122 | self.year8 = year8 123 | self.year9 = year9 124 | self.year10 = year10 125 | #分红 126 | class stock_bonusItem(): 127 | def __init__(self, code, notice_date, rightoff_time, stock_right_registe_date, cash_per_share, 128 | 
send_bonus_share_per_share, increase_shares_per_share, cash_receive_date,share_receive_date): 129 | self.code = code 130 | self.notice_date = notice_date 131 | self.rightoff_time = rightoff_time 132 | self.stock_right_registe_date = stock_right_registe_date 133 | self.cash_per_share = cash_per_share 134 | self.send_bonus_share_per_share = send_bonus_share_per_share 135 | self.increase_shares_per_share = increase_shares_per_share 136 | self.cash_receive_date = cash_receive_date 137 | self.share_receive_date = share_receive_date 138 | #十大流通股东 139 | class stock_holdersItem(): 140 | def __init__(self, code, holder_range, holder_name, stock_count, stock_percent, 141 | stock_property,count_date): 142 | self.code = code 143 | self.holder_range = holder_range 144 | self.holder_name = holder_name 145 | self.stock_count = stock_count 146 | self.stock_percent = stock_percent 147 | self.stock_property = stock_property 148 | self.count_date = count_date 149 | #股票基本信息 150 | class stock_infoItem(): 151 | def __init__(self, code, name, open, high, close, 152 | low, volume, amount,price_change, 153 | p_change, yesterday_close, exchange,turnover,online_years, 154 | pb, pe, date,amplitude): 155 | self.code = code 156 | self.name = name 157 | self.open = open 158 | self.high = high 159 | self.close = close 160 | self.low = low 161 | self.volume = volume 162 | self.amount = amount 163 | self.price_change = price_change 164 | self.p_change = p_change 165 | self.yesterday_close = yesterday_close 166 | self.exchange = exchange 167 | self.turnover = turnover 168 | self.online_years = online_years 169 | self.pb = pb 170 | self.pe = pe 171 | self.date = date 172 | self.amplitude = amplitude 173 | 174 | #爬取新浪滚动新闻 175 | class sina_newsItem(scrapy.Item): 176 | collection = 'newsina' 177 | ctime = Field() # 发布时间 178 | url = Field() 179 | wapurl = Field() 180 | title = Field() # 新闻标题 181 | media_name = Field() # 发发布的媒体 182 | keywords = Field() # 关键词 183 | content = Field() # 新闻内容 184 | -------------------------------------------------------------------------------- /stockbaseinfo/annualreport/annualspider.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | """ 3 | 下载年报 4 | downloads: 5 | 公开招股书(招股说明书/招股意向书) 6 | 《年度报告》 16 17 18 7 | """ 8 | import requests 9 | import random 10 | import time 11 | import urllib 12 | from stockbaseinfo.Const import * 13 | 14 | download_path = 'http://static.cninfo.com.cn/' 15 | saving_path = './pdf/' 16 | 17 | User_Agent = [ 18 | "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)", 19 | "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)", 20 | "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)", 21 | "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)", 22 | "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6", 23 | "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1", 24 | "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0" 25 | ] 26 | 27 | 28 | headers = {'Accept': 'application/json, text/javascript, */*; 
q=0.01', 29 | "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8", 30 | "Accept-Encoding": "gzip, deflate", 31 | "Accept-Language": "zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7,zh-HK;q=0.6,zh-TW;q=0.5", 32 | 'Host': 'www.cninfo.com.cn', 33 | 'Origin': 'http://www.cninfo.com.cn', 34 | 'Referer': 'http://www.cninfo.com.cn/new/commonUrl?url=disclosure/list/notice', 35 | 'X-Requested-With': 'XMLHttpRequest' 36 | } 37 | 38 | 39 | 40 | 41 | # 深市 年度报告 42 | def szseAnnual(page, stock): 43 | query_path = 'http://www.cninfo.com.cn/new/hisAnnouncement/query' 44 | headers['User-Agent'] = random.choice(User_Agent) # 定义User_Agent 45 | query = {'pageNum': page, # 页码 46 | 'pageSize': 30, 47 | 'tabName': 'fulltext', 48 | 'column': 'szse', # 深交所 49 | 'stock': stock, 50 | 'searchkey': '', 51 | 'secid': '', 52 | 'plate': 'sz', 53 | 'category': 'category_ndbg_szsh;', # 年度报告 54 | 'trade': '', 55 | 'seDate': '2016-01-01+~+2019-4-26' # 时间区间 56 | } 57 | 58 | namelist = requests.post(query_path, headers=headers, data=query) 59 | return namelist.json()['announcements'] 60 | 61 | 62 | # 沪市 年度报告 63 | def sseAnnual(page, stock): 64 | query_path = 'http://www.cninfo.com.cn/new/hisAnnouncement/query' 65 | headers['User-Agent'] = random.choice(User_Agent) # 定义User_Agent 66 | query = {'pageNum': page, # 页码 67 | 'pageSize': 30, 68 | 'tabName': 'fulltext', 69 | 'column': 'sse', 70 | 'stock': stock, 71 | 'searchkey': '', 72 | 'secid': '', 73 | 'plate': 'sh', 74 | 'category': 'category_ndbg_szsh;', # 年度报告 75 | 'trade': '', 76 | 'seDate': '2016-01-01+~+2019-4-26' # 时间区间 77 | } 78 | 79 | namelist = requests.post(query_path, headers=headers, data=query) 80 | return namelist.json()['announcements'] # json中的年度报告信息 81 | 82 | 83 | # 深市 招股 84 | def szseStock(page, stock): 85 | query_path = 'http://www.cninfo.com.cn/new/hisAnnouncement/query' 86 | headers['User-Agent'] = random.choice(User_Agent) # 定义User_Agent 87 | query = {'pageNum': page, # 页码 88 | 'pageSize': 30, 89 | 'tabName': 'fulltext', 90 | 'column': 'szse', 91 | 'stock': stock, 92 | 'searchkey': '招股', 93 | 'secid': '', 94 | 'plate': 'sz', 95 | 'category': '', 96 | 'trade': '', 97 | 'seDate': '2001-01-01+~+2019-4-26' # 时间区间 98 | } 99 | 100 | namelist = requests.post(query_path, headers=headers, data=query) 101 | return namelist.json()['announcements'] # json中的年度报告信息 102 | 103 | 104 | # 沪市 招股 105 | def sseStock(page, stock): 106 | query_path = 'http://www.cninfo.com.cn/new/hisAnnouncement/query' 107 | headers['User-Agent'] = random.choice(User_Agent) # 定义User_Agent 108 | query = {'pageNum': page, # 页码 109 | 'pageSize': 30, 110 | 'tabName': 'fulltext', 111 | 'column': 'sse', 112 | 'stock': stock, 113 | 'searchkey': '招股', 114 | 'secid': '', 115 | 'plate': 'sh', 116 | 'category': '', 117 | 'trade': '', 118 | 'seDate': '2010-01-01+~+2020-5-08' # 时间区间 119 | } 120 | 121 | namelist = requests.post(query_path, headers=headers, data=query) 122 | return namelist.json()['announcements'] # json中的年度报告信息 123 | 124 | 125 | # download PDF 126 | def Download(single_page): 127 | if single_page is None: 128 | return 129 | 130 | headers = {'Accept': 'application/json, text/javascript, */*; q=0.01', 131 | "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8", 132 | "Accept-Encoding": "gzip, deflate", 133 | "Accept-Language": "zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7,zh-HK;q=0.6,zh-TW;q=0.5", 134 | 'Host': 'www.cninfo.com.cn', 135 | 'Origin': 'http://www.cninfo.com.cn' 136 | } 137 | 138 | for i in single_page: 139 | allowed_list = [ 140 | '2019年年度报告(更新后)', 141 | '2019年年度报告', 142 | # 
'2018年年度报告(更新后)', 143 | # '2018年年度报告', 144 | # '2017年年度报告(更新后)', 145 | # '2017年年度报告', 146 | # '2016年年度报告(更新后)', 147 | # '2016年年度报告', 148 | ] 149 | allowed_list_2 = [ 150 | '招股书', 151 | # '招股说明书', 152 | # '招股意向书', 153 | ] 154 | title = i['announcementTitle'] 155 | allowed = title in allowed_list 156 | if '确认意见' in title: 157 | return 158 | for item in allowed_list_2: 159 | if item in title: 160 | allowed = True 161 | break 162 | if allowed: 163 | download = download_path + i["adjunctUrl"] 164 | name = i["secCode"] + '_' + i['secName'] + '_' + i['announcementTitle'] + '.pdf' 165 | if '*' in name: 166 | name = name.replace('*', '') 167 | file_path = saving_path + name 168 | time.sleep(random.random() * 2) 169 | 170 | headers['User-Agent'] = random.choice(User_Agent) 171 | r = requests.get(download) 172 | 173 | f = open(file_path, "wb") 174 | f.write(r.content) 175 | f.close() 176 | else: 177 | continue 178 | 179 | 180 | # given page_number & stock number 181 | def Run(page_number, stock): 182 | try: 183 | annual_report = szseAnnual(page_number, stock) 184 | stock_report = szseStock(page_number, stock) 185 | annual_report_ = sseAnnual(page_number, stock) 186 | stock_report_ = sseStock(page_number, stock) 187 | except: 188 | print(page_number, 'page error, retrying') 189 | try: 190 | annual_report = szseAnnual(page_number, stock) 191 | except: 192 | print(page_number, 'page error') 193 | Download(annual_report) 194 | Download(stock_report) 195 | Download(annual_report_) 196 | Download(stock_report_) 197 | 198 | 199 | if __name__ == "__main__": 200 | i = 0 201 | for code in Const.LST_CODE_TEST: 202 | Run(1, code) 203 | i=i+1 204 | print(code, "index:"+str(i)+"done") -------------------------------------------------------------------------------- /stockbaseinfo/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html 7 | 8 | import pymysql 9 | # 使用twsited异步IO框架,实现数据的异步写入。 10 | from pymysql import cursors 11 | from twisted.enterprise import adbapi 12 | from stockbaseinfo.settings import * 13 | from stockbaseinfo.Const import * 14 | import traceback 15 | 16 | class StockbaseinfoPipeline(object): 17 | @classmethod 18 | def from_crawler(cls, crawler): 19 | # 从项目的配置文件中读取相应的参数 20 | cls.MYSQL_DB_NAME = crawler.settings.get("MYSQL_DB_NAME", 'finance_data') 21 | cls.HOST = crawler.settings.get("MYSQL_HOST", 'localhost') 22 | cls.PORT = crawler.settings.get("MYSQL_PORT", 3306) 23 | cls.USER = crawler.settings.get("MYSQL_USER", 'root') 24 | cls.PASSWD = crawler.settings.get("MYSQL_PASSWORD", 'root') 25 | return cls() 26 | 27 | def __init__(self): 28 | dbparams = { 29 | 'host':MYSQL_HOST, 30 | 'port': 3306, 31 | 'user': MYSQL_USER, 32 | 'password': MYSQL_PASSWORD, 33 | 'database': MYSQL_DB_NAME, 34 | 'charset': 'utf8', 35 | 'cursorclass': cursors.DictCursor # 指定cursor的类 36 | } 37 | # 初始化数据库连接池,参数1是mysql的驱动,参数2是连接mysql的配置信息 38 | self.db_pool = adbapi.ConnectionPool('pymysql', **dbparams) 39 | # sql语言的空值 40 | self._sql = None 41 | def process_item(self, item, spider): 42 | # 操作数据,将数据写入数据库 43 | # 如果是同步写入的话,使用的是cursor.execute(),commit() 44 | # 异步存储的方式:函数方式pool.map(self.insert_db,[1,2]) 45 | query = self.db_pool.runInteraction(self.insert_db, item) 46 | query.addErrback(self.handle_error, item, spider) 47 | 48 | #依据不同的数据类型进行不同的数据操作 49 | def insert_db(self, 
cursor, item): 50 | data_type = item['data_type'] 51 | if data_type == Const.STOCK_INFO:#股票基本信息 52 | stockinfo = item['data_content'] 53 | values = ( 54 | stockinfo.code, 55 | stockinfo.name, 56 | stockinfo.open, 57 | stockinfo.high, 58 | stockinfo.close, 59 | stockinfo.low, 60 | stockinfo.code, 61 | stockinfo.amount, 62 | stockinfo.price_change, 63 | stockinfo.p_change, 64 | stockinfo.yesterday_close, 65 | stockinfo.exchange, 66 | stockinfo.online_years, 67 | stockinfo.pb, 68 | stockinfo.pe, 69 | str(stockinfo.date), 70 | stockinfo.amplitude 71 | ) 72 | sql = '''INSERT INTO `finance_data`.`stock_basic_info`(`code`,`name`,`open`,`high`,`close`,`low`,`volume`,`amount`,`price_change`,`p_change`,`yesterday_close`,`exchange`,`online_years`,`pb`,`pe`,`date`,`amplitude`)VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s);''' 73 | try: 74 | cursor.execute(sql, values) 75 | except: 76 | traceback.print_exc() 77 | elif data_type == Const.BALANCE_SHEET: #资产负债表 78 | lstblancesheet = item['data_content'] 79 | for blancesheet in lstblancesheet: 80 | values = ( 81 | blancesheet.code, 82 | blancesheet.subject_title, 83 | blancesheet.account1, 84 | blancesheet.account2, 85 | blancesheet.account3, 86 | blancesheet.account4, 87 | blancesheet.account5, 88 | blancesheet.account6, 89 | blancesheet.account7, 90 | blancesheet.account8, 91 | blancesheet.account9, 92 | blancesheet.account10, 93 | blancesheet.year1, 94 | blancesheet.year2, 95 | blancesheet.year3, 96 | blancesheet.year4, 97 | blancesheet.year5, 98 | blancesheet.year6, 99 | blancesheet.year7, 100 | blancesheet.year8, 101 | blancesheet.year9, 102 | blancesheet.year10 103 | ) 104 | sql = '''INSERT INTO `finance_data`.`stock_basic_balance_sheet`(`code`,`subject_title`,`account1`,`account2`,`account3`,`account4`,`account5`,`account6`,`account7`,`account8`,`account9`, 105 | `account10`,`year1`,`year2`,`year3`,`year4`,`year5`,`year6`,`year7`,`year8`,`year9`,`year10`)VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s);''' 106 | try: 107 | cursor.execute(sql, values) 108 | except: 109 | traceback.print_exc() 110 | elif data_type == Const.CASH_FLOW: #资产负债表 111 | lstbcashflow = item['data_content'] 112 | for cashflow in lstbcashflow: 113 | values = ( 114 | cashflow.code, 115 | cashflow.subject_title, 116 | cashflow.account1, 117 | cashflow.account2, 118 | cashflow.account3, 119 | cashflow.account4, 120 | cashflow.account5, 121 | cashflow.account6, 122 | cashflow.account7, 123 | cashflow.account8, 124 | cashflow.account9, 125 | cashflow.account10, 126 | cashflow.year1, 127 | cashflow.year2, 128 | cashflow.year3, 129 | cashflow.year4, 130 | cashflow.year5, 131 | cashflow.year6, 132 | cashflow.year7, 133 | cashflow.year8, 134 | cashflow.year9, 135 | cashflow.year10 136 | ) 137 | sql = '''INSERT INTO `finance_data`.`stock_basic_cashflow_sheet`(`code`,`subject_title`,`account1`,`account2`,`account3`,`account4`,`account5`,`account6`,`account7`,`account8`,`account9`, 138 | `account10`,`year1`,`year2`,`year3`,`year4`,`year5`,`year6`,`year7`,`year8`,`year9`,`year10`)VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s);''' 139 | try: 140 | cursor.execute(sql, values) 141 | except: 142 | traceback.print_exc() 143 | elif data_type == Const.PROFIT_SHEET: # 利润表 144 | lstprofit = item['data_content'] 145 | for profit in lstprofit: 146 | values = ( 147 | profit.code, 148 | profit.subject_title, 149 | profit.season, 150 | profit.account1, 151 | profit.account2, 152 | profit.account3, 153 | profit.account4, 154 | 
profit.account5, 155 | profit.account6, 156 | profit.account7, 157 | profit.account8, 158 | profit.account9, 159 | profit.account10, 160 | profit.year1, 161 | profit.year2, 162 | profit.year3, 163 | profit.year4, 164 | profit.year5, 165 | profit.year6, 166 | profit.year7, 167 | profit.year8, 168 | profit.year9, 169 | profit.year10 170 | ) 171 | sql = '''INSERT INTO `finance_data`.`stock_basic_profit_sheet`(`code`,`subject_title`,`season`,`account1`,`account2`,`account3`,`account4`,`account5`,`account6`,`account7`,`account8`,`account9`, 172 | `account10`,`year1`,`year2`,`year3`,`year4`,`year5`,`year6`,`year7`,`year8`,`year9`,`year10`)VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s);''' 173 | try: 174 | cursor.execute(sql, values) 175 | except: 176 | traceback.print_exc() 177 | elif data_type == Const.STOCK_BONUS: # 历史分红表 178 | lstbonus = item['data_content'] 179 | for bonus in lstbonus: 180 | values = ( 181 | bonus.code, 182 | bonus.notice_date, 183 | bonus.rightoff_time, 184 | bonus.stock_right_registe_date, 185 | bonus.cash_per_share, 186 | bonus.send_bonus_share_per_share, 187 | bonus.increase_shares_per_share, 188 | bonus.cash_receive_date, 189 | bonus.share_receive_date 190 | ) 191 | sql = '''INSERT INTO `finance_data`.`stock_basic_bonus`(`code`,`notice_date`,`rightoff_time`,`stock_right_registe_date`,`cash_per_share`,`send_bonus_share_per_share`,`increase_shares_per_share`,`cash_receive_date`,`share_receive_date`) 192 | VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s);''' 193 | try: 194 | cursor.execute(sql, values) 195 | except: 196 | traceback.print_exc() 197 | elif data_type == Const.MAIN_INDEX: # 历史分红表 198 | lstmainindex = item['data_content'] 199 | for mainindex in lstmainindex: 200 | values = ( 201 | mainindex.code, 202 | mainindex.subject_title, 203 | mainindex.season, 204 | mainindex.account1, 205 | mainindex.account2, 206 | mainindex.account3, 207 | mainindex.account4, 208 | mainindex.account5, 209 | mainindex.account6, 210 | mainindex.account7, 211 | mainindex.account8, 212 | mainindex.account9, 213 | mainindex.account10, 214 | mainindex.year1, 215 | mainindex.year2, 216 | mainindex.year3, 217 | mainindex.year4, 218 | mainindex.year5, 219 | mainindex.year6, 220 | mainindex.year7, 221 | mainindex.year8, 222 | mainindex.year9, 223 | mainindex.year10 224 | ) 225 | sql = '''INSERT INTO `finance_data`.`stock_basic_main_index`(`code`,`subject_title`,`season`,`account1`,`account2`,`account3`,`account4`,`account5`,`account6`,`account7`,`account8`,`account9`, 226 | `account10`,`year1`,`year2`,`year3`,`year4`,`year5`,`year6`,`year7`,`year8`,`year9`,`year10`)VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s);''' 227 | try: 228 | cursor.execute(sql, values) 229 | except: 230 | traceback.print_exc() 231 | elif data_type == Const.STOCK_HOLDERS: # 前十大股东表 232 | lststockholders = item['data_content'] 233 | for stockholder in lststockholders: 234 | values = ( 235 | stockholder.code, 236 | stockholder.holder_range, 237 | stockholder.holder_name, 238 | stockholder.stock_count, 239 | stockholder.stock_percent, 240 | stockholder.stock_property, 241 | stockholder.count_date 242 | ) 243 | sql = '''INSERT INTO `finance_data`.`stock_basic_holders`(`code`,`holder_range`,`holder_name`,`stock_count`,`stock_percent`,`stock_property`,`count_date`)VALUES(%s,%s,%s,%s,%s,%s,%s);''' 244 | try: 245 | cursor.execute(sql, values) 246 | except: 247 | traceback.print_exc() 248 | elif data_type == Const.ROLL_NEWS: # 获取所有新闻信息 249 | lstnews = item['data_content'] 250 | for 
newitem in lstnews: 251 | values = ( 252 | newitem['title'], 253 | newitem['content'], 254 | newitem['ctime'], 255 | newitem['media_name'], 256 | newitem['keywords'], 257 | newitem['url'], 258 | newitem['wapurl'] 259 | ) 260 | sql = '''INSERT INTO `finance_basic_data`.`sina_news`(`title`,`content`,`ctime`,`media_name`,`keywords`,`url`,`wepurl`)VALUES(%s,%s,%s,%s,%s,%s,%s);''' 261 | try: 262 | cursor.execute(sql, values) 263 | except: 264 | traceback.print_exc() 265 | 266 | def handle_error(self, error, item, spider): 267 | print('=' * 10 + "error" + '=' * 10) 268 | -------------------------------------------------------------------------------- /stockbaseinfo/spiders/baseinfo.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import scrapy 3 | from scrapy.selector import Selector 4 | from stockbaseinfo.items import * 5 | from stockbaseinfo.Const import * 6 | from stockbaseinfo.utils import * 7 | import json 8 | import requests 9 | 10 | 11 | #需要处理的几个事情: 12 | #1.需要支持多个连接组成的访问群组,而且需要不同的顺序完成 13 | #2.需要支持一个yeild来封装不同数据类型 14 | #3.参考这个网址 完成剩下的内容:https://www.jianshu.com/p/6740c83e4540 15 | #4.用来获取基本信息 http://api.cninfo.com.cn/v5/hq/dataItem?codelist=sh603158 16 | 17 | #TODO:需要统一处理数据不足10个的情况,需要设计一个方案来接收这些数据 18 | 19 | class BaseinfoSpider(scrapy.Spider): 20 | name = 'baseinfo' 21 | # allowed_domains = ['zyh.com'] 22 | start_urls = ['http://www.cninfo.com.cn/new/disclosure/stock?orgId=9900026564&stockCode=002796'] 23 | 24 | def start_requests(self): 25 | for code in Const.LST_S_CODE: 26 | tsurl = Const.BASEINFO_URL + code 27 | yield scrapy.Request(tsurl,callback=self.info_parse) 28 | for code in Const.LST_CODE: 29 | para = 'scode=' + code 30 | data1 = { 31 | 'mergerMark': 'sysapi1067', 'paramStr': para 32 | } 33 | yield scrapy.FormRequest(Const.DATAINFO_URL,formdata=data1, method='POST',callback=self.parse) 34 | 35 | #解析其他信息 36 | def parse(self, response): 37 | print('--------------profit_parse-----------------------------') 38 | if len(response.text) > 4: 39 | js = json.loads(response.text) 40 | code = js[0]['SECCODE'] 41 | param = js[0]['F002N'] 42 | #---------------------------------------请求资产负债表-------------------------------------------------------------- 43 | para = 'scode=' + code + ';rtype=1;sign=' + str(param) 44 | # 资产负债表 45 | balance_data = { 46 | 'mergerMark': 'sysapi1077', 'paramStr': para 47 | } 48 | yield scrapy.FormRequest(Const.DATAINFO_URL, formdata=balance_data, method='POST', callback=lambda response,code=code:self.balance_parse(response,code)) 49 | # *************************************请求资产负债表************************************************************** 50 | 51 | #---------------------------------------请求利润表-------------------------------------------------------------- 52 | #一季度利润表 53 | para = 'scode=' + code + ';rtype=1;sign=' + str(param) 54 | profit_data = { 55 | 'mergerMark': 'sysapi1075', 'paramStr': para 56 | } 57 | yield scrapy.FormRequest(Const.DATAINFO_URL, formdata=profit_data, method='POST',callback=lambda response,code=code,rtype='1':self.profit_parse(response,code,rtype)) 58 | #半年利润表 59 | para = 'scode=' + code + ';rtype=2;sign=' + str(param) 60 | profit_data = { 61 | 'mergerMark': 'sysapi1075', 'paramStr': para 62 | } 63 | yield scrapy.FormRequest(Const.DATAINFO_URL, formdata=profit_data, method='POST', callback=lambda response,code=code,rtype='2':self.profit_parse(response,code,rtype)) 64 | #三季度利润表 65 | para = 'scode=' + code + ';rtype=3;sign=' + str(param) 66 | profit_data = { 67 | 'mergerMark': 
/stockbaseinfo/spiders/baseinfo.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import scrapy
3 | from scrapy.selector import Selector
4 | from stockbaseinfo.items import *
5 | from stockbaseinfo.Const import *
6 | from stockbaseinfo.utils import *
7 | import json
8 | import time  # needed by info_parse
9 | 
10 | 
11 | # Things that still need to be handled:
12 | # 1. Support request groups made up of several linked requests that must complete in a fixed order
13 | # 2. Support a single yield that wraps different data types
14 | # 3. Finish the remaining work with reference to: https://www.jianshu.com/p/6740c83e4540
15 | # 4. Endpoint used to fetch basic quote info: http://api.cninfo.com.cn/v5/hq/dataItem?codelist=sh603158
16 | 
17 | # TODO: handle series with fewer than 10 data points in a unified way; a scheme is needed to receive that data (a hedged sketch of one option follows this file)
18 | 
19 | class BaseinfoSpider(scrapy.Spider):
20 |     name = 'baseinfo'
21 |     # allowed_domains = ['zyh.com']
22 |     start_urls = ['http://www.cninfo.com.cn/new/disclosure/stock?orgId=9900026564&stockCode=002796']
23 | 
24 |     def start_requests(self):
25 |         for code in Const.LST_S_CODE:
26 |             tsurl = Const.BASEINFO_URL + code
27 |             yield scrapy.Request(tsurl, callback=self.info_parse)
28 |         for code in Const.LST_CODE:
29 |             para = 'scode=' + code
30 |             data1 = {
31 |                 'mergerMark': 'sysapi1067', 'paramStr': para
32 |             }
33 |             yield scrapy.FormRequest(Const.DATAINFO_URL, formdata=data1, method='POST', callback=self.parse)
34 | 
35 |     # parse the remaining data sets (everything except the basic quote info)
36 |     def parse(self, response):
37 |         print('--------------parse-----------------------------')
38 |         if len(response.text) > 4:
39 |             js = json.loads(response.text)
40 |             code = js[0]['SECCODE']
41 |             param = js[0]['F002N']
42 |             #---------------------------------------request balance sheet--------------------------------------------------------------
43 |             para = 'scode=' + code + ';rtype=1;sign=' + str(param)
44 |             # balance sheet
45 |             balance_data = {
46 |                 'mergerMark': 'sysapi1077', 'paramStr': para
47 |             }
48 |             yield scrapy.FormRequest(Const.DATAINFO_URL, formdata=balance_data, method='POST', callback=lambda response, code=code: self.balance_parse(response, code))
49 |             # *************************************request balance sheet**************************************************************
50 | 
51 |             #---------------------------------------request income statement--------------------------------------------------------------
52 |             # Q1 income statement
53 |             para = 'scode=' + code + ';rtype=1;sign=' + str(param)
54 |             profit_data = {
55 |                 'mergerMark': 'sysapi1075', 'paramStr': para
56 |             }
57 |             yield scrapy.FormRequest(Const.DATAINFO_URL, formdata=profit_data, method='POST', callback=lambda response, code=code, rtype='1': self.profit_parse(response, code, rtype))
58 |             # half-year income statement
59 |             para = 'scode=' + code + ';rtype=2;sign=' + str(param)
60 |             profit_data = {
61 |                 'mergerMark': 'sysapi1075', 'paramStr': para
62 |             }
63 |             yield scrapy.FormRequest(Const.DATAINFO_URL, formdata=profit_data, method='POST', callback=lambda response, code=code, rtype='2': self.profit_parse(response, code, rtype))
64 |             # Q3 income statement
65 |             para = 'scode=' + code + ';rtype=3;sign=' + str(param)
66 |             profit_data = {
67 |                 'mergerMark': 'sysapi1075', 'paramStr': para
68 |             }
69 |             yield scrapy.FormRequest(Const.DATAINFO_URL, formdata=profit_data, method='POST', callback=lambda response, code=code, rtype='3': self.profit_parse(response, code, rtype))
70 |             # annual income statement
71 |             para = 'scode=' + code + ';rtype=4;sign=' + str(param)
72 |             profit_data = {
73 |                 'mergerMark': 'sysapi1075', 'paramStr': para
74 |             }
75 |             yield scrapy.FormRequest(Const.DATAINFO_URL, formdata=profit_data, method='POST', callback=lambda response, code=code, rtype='4': self.profit_parse(response, code, rtype))
76 |             # *************************************request income statement**************************************************************
77 |             #---------------------------------------request cash flow statement--------------------------------------------------------------
78 |             para = 'scode=' + code + ';rtype=1;sign=' + str(param)
79 |             cashflow_data = {
80 |                 'mergerMark': 'sysapi1076', 'paramStr': para
81 |             }
82 |             yield scrapy.FormRequest(Const.DATAINFO_URL, formdata=cashflow_data, method='POST', callback=lambda response, code=code: self.cashflow_parse(response, code))
83 |             # *************************************request cash flow statement**************************************************************
84 |             #---------------------------------------request historical dividends--------------------------------------------------------------
85 |             para = 'scode=' + code
86 |             # dividends
87 |             bonus_data = {
88 |                 'mergerMark': 'sysapi1073', 'paramStr': para
89 |             }
90 |             yield scrapy.FormRequest(Const.DATAINFO_URL, formdata=bonus_data, method='POST', callback=lambda response, code=code: self.bonus_parse(response, code))
91 |             # *************************************request historical dividends**************************************************************
92 |             #---------------------------------------request main financial indicators--------------------------------------------------------------
93 |             para = 'scode=' + code + ';rtype=1'
94 |             mainindex_data = {
95 |                 'mergerMark': 'sysapi1074', 'paramStr': para
96 |             }
97 |             yield scrapy.FormRequest(Const.DATAINFO_URL, formdata=mainindex_data, method='POST', callback=lambda response, code=code, rtype='1': self.mainindex_parse(response, code, rtype))
98 |             para = 'scode=' + code + ';rtype=2'
99 |             mainindex_data = {
100 |                 'mergerMark': 'sysapi1074', 'paramStr': para
101 |             }
102 |             yield scrapy.FormRequest(Const.DATAINFO_URL, formdata=mainindex_data, method='POST',
103 |                                      callback=lambda response, code=code, rtype='2': self.mainindex_parse(response, code,
104 |                                                                                                           rtype))
105 |             para = 'scode=' + code + ';rtype=3'
106 |             mainindex_data = {
107 |                 'mergerMark': 'sysapi1074', 'paramStr': para
108 |             }
109 |             yield scrapy.FormRequest(Const.DATAINFO_URL, formdata=mainindex_data, method='POST',
110 |                                      callback=lambda response, code=code, rtype='3': self.mainindex_parse(response, code,
111 |                                                                                                           rtype))
112 |             para = 'scode=' + code + ';rtype=4'
113 |             mainindex_data = {
114 |                 'mergerMark': 'sysapi1074', 'paramStr': para
115 |             }
116 |             yield scrapy.FormRequest(Const.DATAINFO_URL, formdata=mainindex_data, method='POST',
117 |                                      callback=lambda response, code=code, rtype='4': self.mainindex_parse(response, code,
118 |                                                                                                           rtype))
119 |             # *************************************request main financial indicators**************************************************************
120 |             # ---------------------------------------request top-ten tradable shareholders--------------------------------------------------------------
121 |             para = 'scode=' + code
122 |             # top-ten shareholders
123 |             holders_data = {
124 |                 'mergerMark': 'sysapi1071', 'paramStr': para
125 |             }
126 |             yield scrapy.FormRequest(Const.DATAINFO_URL, formdata=holders_data, method='POST', callback=lambda response, code=code: self.holders_parse(response, code))
127 |             # *************************************request top-ten tradable shareholders**************************************************************
128 | 
129 |     def profit_parse(self, response, code, rtype):
130 |         print('--------------profit_parse-----------------------------')
131 |         if len(response.text) > 4:
132 |             data = json.loads(response.text)
133 |             lst_profit = []
134 |             for item in data:
135 |                 subject_title = item['index']
136 |                 lstyear = list(item.keys())
137 |                 lstyear.remove('index')
138 |                 lstyear.sort()
139 |                 if len(lstyear) == 10:
140 |                     profitsheet_item = profit_sheetItem(code, subject_title, rtype, item[lstyear[0]], item[lstyear[1]],
141 |                                                         item[lstyear[2]], item[lstyear[3]], item[lstyear[4]],
142 |                                                         item[lstyear[5]], item[lstyear[6]],
143 |                                                         item[lstyear[7]], item[lstyear[8]], item[lstyear[9]],
144 |                                                         lstyear[0], lstyear[1], lstyear[2], lstyear[3], lstyear[4],
145 |                                                         lstyear[5],
146 |                                                         lstyear[6], lstyear[7], lstyear[8], lstyear[9])
147 |                     lst_profit.append(profitsheet_item)
148 |                 elif len(lstyear) > 0:
149 |                     arr_inityear, arr_initdata = Utils.load_validdata(lstyear, item)
150 |                     if len(arr_inityear) > 0 and len(arr_initdata) > 0:
151 |                         profitsheet_item = profit_sheetItem(code, subject_title, rtype, arr_initdata[0], arr_initdata[1],
152 |                                                             arr_initdata[2], arr_initdata[3], arr_initdata[4],
153 |                                                             arr_initdata[5], arr_initdata[6],
154 |                                                             arr_initdata[7], arr_initdata[8], arr_initdata[9],
155 |                                                             arr_inityear[0], arr_inityear[1], arr_inityear[2],
156 |                                                             arr_inityear[3],
157 |                                                             arr_inityear[4], arr_inityear[5],
158 |                                                             arr_inityear[6], arr_inityear[7], arr_inityear[8],
159 |                                                             arr_inityear[9])
160 |                         lst_profit.append(profitsheet_item)
161 |             stockbaseinfoitme = StockbaseinfoItem()
162 |             stockbaseinfoitme['data_type'] = Const.PROFIT_SHEET
163 |             stockbaseinfoitme['data_content'] = lst_profit
164 |             yield stockbaseinfoitme
165 |         print('-----------end---profit_parse-----------------------------')
166 |     def balance_parse(self, response, code):
167 |         print('--------------balance_parse-----------------------------')
168 |         if len(response.text) > 4:
169 |             data = json.loads(response.text)
170 |             lst_blance_sheet = []
171 |             for item in data:
172 |                 if item['index'].find('科目') != -1:
173 |                     pass
174 |                 else:
175 |                     subject_title = item['index']
176 |                     lstyear = list(item.keys())
177 |                     lstyear.remove('index')
178 |                     lstyear.sort()
179 |                     if len(lstyear) == 10:
180 |                         balance_sheet_item = balance_sheetItem(code, subject_title, item[lstyear[0]], item[lstyear[1]], item[lstyear[2]], item[lstyear[3]], item[lstyear[4]], item[lstyear[5]], item[lstyear[6]],
181 |                                                                item[lstyear[7]], item[lstyear[8]], item[lstyear[9]], lstyear[0], lstyear[1], lstyear[2], lstyear[3], lstyear[4], lstyear[5],
182 |                                                                lstyear[6], lstyear[7], lstyear[8], lstyear[9])
183 |                         lst_blance_sheet.append(balance_sheet_item)
184 |                     elif len(lstyear) > 0:
185 |                         arr_inityear, arr_initdata = Utils.load_validdata(lstyear, item)
186 |                         if len(arr_inityear) > 0 and len(arr_initdata) > 0:
187 |                             balance_sheet_item = balance_sheetItem(code, subject_title, arr_initdata[0], arr_initdata[1],
188 |                                                                    arr_initdata[2], arr_initdata[3], arr_initdata[4],
189 |                                                                    arr_initdata[5], arr_initdata[6],
190 |                                                                    arr_initdata[7], arr_initdata[8], arr_initdata[9],
191 |                                                                    arr_inityear[0], arr_inityear[1], arr_inityear[2], arr_inityear[3],
192 |                                                                    arr_inityear[4], arr_inityear[5],
193 |                                                                    arr_inityear[6], arr_inityear[7], arr_inityear[8], arr_inityear[9])
194 |                             lst_blance_sheet.append(balance_sheet_item)
195 |             stockbaseinfoitme = StockbaseinfoItem()
196 |             stockbaseinfoitme['data_type'] = Const.BALANCE_SHEET
197 |             stockbaseinfoitme['data_content'] = lst_blance_sheet
198 |             yield stockbaseinfoitme
199 |         print('-----------end---balance_parse-----------------------------')
200 |     def mainindex_parse(self, response, code, rtype):
201 |         print('--------------mainindex_parse-----------------------------')
202 |         if len(response.text) > 4:
203 |             data = json.loads(response.text)
204 |             lst_mainindex = []
205 |             for item in data:
206 |                 subject_title = item['index']
207 |                 lstyear = list(item.keys())
208 |                 lstyear.remove('index')
209 |                 lstyear.sort()
210 |                 if len(lstyear) == 10:
211 |                     mainindex_item = main_indexItem(code, subject_title, rtype, item[lstyear[0]], item[lstyear[1]],
212 |                                                     item[lstyear[2]], item[lstyear[3]], item[lstyear[4]],
213 |                                                     item[lstyear[5]], item[lstyear[6]],
214 |                                                     item[lstyear[7]], item[lstyear[8]], item[lstyear[9]],
215 |                                                     lstyear[0], lstyear[1], lstyear[2], lstyear[3], lstyear[4],
216 |                                                     lstyear[5],
217 |                                                     lstyear[6], lstyear[7], lstyear[8], lstyear[9])
218 |                     lst_mainindex.append(mainindex_item)
219 |                 elif len(lstyear) > 0:
220 |                     arr_inityear, arr_initdata = Utils.load_validdata(lstyear, item)  # NOTE: unlike the other parsers there is no length guard here, so short arrays would raise IndexError below
221 |                     mainindex_item = main_indexItem(code, subject_title, rtype, arr_initdata[0], arr_initdata[1],
222 |                                                     arr_initdata[2], arr_initdata[3], arr_initdata[4],
223 |                                                     arr_initdata[5], arr_initdata[6],
224 |                                                     arr_initdata[7], arr_initdata[8], arr_initdata[9],
225 |                                                     arr_inityear[0], arr_inityear[1], arr_inityear[2], arr_inityear[3],
226 |                                                     arr_inityear[4], arr_inityear[5],
227 |                                                     arr_inityear[6], arr_inityear[7], arr_inityear[8], arr_inityear[9])
228 |                     lst_mainindex.append(mainindex_item)
229 |             stockbaseinfoitme = StockbaseinfoItem()
230 |             stockbaseinfoitme['data_type'] = Const.MAIN_INDEX
231 |             stockbaseinfoitme['data_content'] = lst_mainindex
232 |             yield stockbaseinfoitme
233 |         print('-----------end---mainindex_parse-----------------------------')
234 | 
235 |     def cashflow_parse(self, response, code):
236 |         print('--------------cashflow_parse-----------------------------')
237 |         if len(response.text) > 4:
238 |             data = json.loads(response.text)
239 |             lst_cashflow = []
240 |             for item in data:
241 |                 subject_title = item['index']
242 |                 lstyear = list(item.keys())
243 |                 lstyear.remove('index')
244 |                 lstyear.sort()
245 |                 if len(lstyear) == 10:
246 |                     cashflow_item = cash_flowItem(code, subject_title, item[lstyear[0]], item[lstyear[1]],
247 |                                                   item[lstyear[2]], item[lstyear[3]], item[lstyear[4]],
248 |                                                   item[lstyear[5]], item[lstyear[6]],
249 |                                                   item[lstyear[7]], item[lstyear[8]], item[lstyear[9]],
250 |                                                   lstyear[0], lstyear[1], lstyear[2], lstyear[3], lstyear[4],
251 |                                                   lstyear[5],
252 |                                                   lstyear[6], lstyear[7], lstyear[8], lstyear[9])
253 |                     lst_cashflow.append(cashflow_item)
254 |                 elif len(lstyear) > 0:
255 |                     arr_inityear, arr_initdata = Utils.load_validdata(lstyear, item)
256 |                     if len(arr_inityear) > 0 and len(arr_initdata) > 0:
257 |                         cashflow_item = cash_flowItem(code, subject_title, arr_initdata[0], arr_initdata[1],
258 |                                                       arr_initdata[2], arr_initdata[3], arr_initdata[4],
259 |                                                       arr_initdata[5], arr_initdata[6],
260 |                                                       arr_initdata[7], arr_initdata[8], arr_initdata[9],
261 |                                                       arr_inityear[0], arr_inityear[1], arr_inityear[2],
262 |                                                       arr_inityear[3],
263 |                                                       arr_inityear[4], arr_inityear[5],
264 |                                                       arr_inityear[6], arr_inityear[7], arr_inityear[8],
265 |                                                       arr_inityear[9])
266 |                         lst_cashflow.append(cashflow_item)
267 |             stockbaseinfoitme = StockbaseinfoItem()
268 |             stockbaseinfoitme['data_type'] = Const.CASH_FLOW
269 |             stockbaseinfoitme['data_content'] = lst_cashflow
270 |             yield stockbaseinfoitme
271 |         print('-----------end---cashflow_parse-----------------------------')
272 |     def holders_parse(self, response, code):
273 |         print('--------------holders_parse-----------------------------')
274 |         if len(response.text) > 4:
275 |             data = json.loads(response.text)
276 |             lst_stockholders = []
277 |             for item in data:
278 |                 stockholder_item = stock_holdersItem(code, item['F005N'], item['F002V'], item['F003N'], item['F004N'], item['F006V'], item['F001D'])
279 |                 lst_stockholders.append(stockholder_item)
280 |             stockbaseinfoitme = StockbaseinfoItem()
281 |             stockbaseinfoitme['data_type'] = Const.STOCK_HOLDERS
282 |             stockbaseinfoitme['data_content'] = lst_stockholders
283 |             yield stockbaseinfoitme
284 |         print('-----------end---holders_parse-----------------------------')
285 | 
286 |     def bonus_parse(self, response, code):
287 |         print('--------------bonus_parse-----------------------------')
288 |         if len(response.text) > 4:
289 |             data = json.loads(response.text)
290 |             lst_bonus = []
291 |             for item in data:
292 |                 notice_date = item['F013D']
293 |                 rightoff_time = item['F014D']
294 |                 stock_right_registe_date = item['F015D']
295 |                 cash_per_share = item['F010N']
296 |                 send_bonus_share_per_share = item['F012N']
297 |                 increase_shares_per_share = item['F011N']
298 |                 cash_receive_date = item['F016D']
299 |                 share_receive_date = item['F017D']
300 |                 bonus_item = stock_bonusItem(code, notice_date, rightoff_time, stock_right_registe_date, cash_per_share,
301 |                                              send_bonus_share_per_share, increase_shares_per_share, cash_receive_date, share_receive_date)
302 |                 lst_bonus.append(bonus_item)
303 |             stockbaseinfoitme = StockbaseinfoItem()
304 |             stockbaseinfoitme['data_type'] = Const.STOCK_BONUS
305 |             stockbaseinfoitme['data_content'] = lst_bonus
306 |             yield stockbaseinfoitme
307 |         print('-----------end---bonus_parse-----------------------------')
308 | 
309 |     def info_parse(self, response):
310 |         print('--------------info_parse-----------------------------')
311 |         if len(response.text) > 4:
312 |             data = json.loads(response.text)
313 |             dataitem = data[0]
314 |             code = dataitem['5']
315 |             name = dataitem['55']
316 |             price_change = '0.0' if dataitem['264648'] == "" else dataitem['264648']
317 |             p_change = '0.0' if dataitem['199112'] == "" else dataitem['199112']
318 |             open = '0.0' if dataitem['7'] == "" else dataitem['7']
319 |             yesterday_close = '0.0' if dataitem['6'] == "" else dataitem['6']
320 |             high = '0.0' if dataitem['8'] == "" else dataitem['8']
321 |             low = '0.0' if dataitem['9'] == "" else dataitem['9']
322 |             close = '0.0' if dataitem['10'] == "" else dataitem['10']
323 |             volume = '0.0' if dataitem['13'] == "" else dataitem['13']
324 |             amount = '0.0' if dataitem['19'] == "" else dataitem['19']
325 |             stockinfo = stock_infoItem(code, name, open, high, close,
326 |                                        low, volume, amount, price_change,
327 |                                        p_change, yesterday_close, 0.0, 0.0, 0,
328 |                                        0.0, 0.0, time.strftime("%Y-%m-%d", time.localtime(time.time())), 0.0)
329 |             stockbaseinfoitme = StockbaseinfoItem()
330 |             stockbaseinfoitme['data_type'] = Const.STOCK_INFO
331 |             stockbaseinfoitme['data_content'] = stockinfo
332 |             print('-----------end---info_parse-----------------------------')
333 |             yield stockbaseinfoitme
334 |             # sel = Selector(response)
335 |             # print(response.text)
336 |             #
337 |             # #stock_baseinfo = stock_infoItem()
338 |             # page_stockdetail_sublist = sel.xpath('//div[@class="page-stockdetail"]')
339 |             # code = page_stockdetail_sublist.xpath('//div[@class="sub-code"]/text()').extract_first()
340 |             # name = page_stockdetail_sublist.xpath('//div[@class="sub-title"]/text()').extract_first()
341 |             # close = page_stockdetail_sublist.xpath('//div[@class="sub-trend-value"]/text()').extract_first()
342 |             # price_change = page_stockdetail_sublist.xpath('//div[@class="sub-trend-size"]/text()').extract_first()
343 |             # p_change = page_stockdetail_sublist.xpath('//div[@class="sub-trend-trend"]/text()').extract_first()
344 |             # date = page_stockdetail_sublist.xpath('//div[@class="sub-time last-child"]/text()').extract_first()
345 |             # yesterday_close = page_stockdetail_sublist.xpath('//div[@id="pre"]/text()').extract_first()
346 |             # open = page_stockdetail_sublist.xpath('//div[@id="open"]/text()').extract_first()
347 |             # online_years = page_stockdetail_sublist.xpath('//div[@id="sub-value age"]/text()').extract_first()
348 |             # pb = page_stockdetail_sublist.xpath('//div[@id="pb-ratio"]/text()').extract_first()
349 |             # pe = page_stockdetail_sublist.xpath('//div[@id="pe-ratio"]/text()').extract_first()
350 |             # high = page_stockdetail_sublist.xpath('//div[@id="high"]/text()').extract_first()
351 |             # low = page_stockdetail_sublist.xpath('//div[@id="low"]/text()').extract_first()
352 |             # volume = page_stockdetail_sublist.xpath('//div[@id="vol"]/text()').extract_first()
353 |             # amount = page_stockdetail_sublist.xpath('//div[@id="money"]/text()').extract_first()
354 |             # exchange = page_stockdetail_sublist.xpath('//div[@id="amplit"]/text()').extract_first()
355 |             # turnover = page_stockdetail_sublist.xpath('//div[@id="huanshou"]/text()').extract_first()
356 |             # print(code,name,close,open,price_change,p_change,date,yesterday_close,online_years,pb,pe,high,low,volume,amount,exchange,turnover)
357 |             #
358 |             #
359 |             # StockbaseinfoItem['data_type']= Const.BALANCE_SHEET
360 |             # StockbaseinfoItem['data_type']=""
361 |             #yield StockbaseinfoItem
362 | 
--------------------------------------------------------------------------------
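The TODO near the top of baseinfo.py asks for a unified way to handle statements that come back with fewer than ten periods. The financial-statement parsers already route that case through Utils.load_validdata, presumably defined in stockbaseinfo/utils.py, which is not shown in this listing. As a rough sketch of the contract those parsers appear to rely on — two aligned, length-ten sequences of period labels and values — one possible (assumed, not actual) implementation, written as a free function for brevity, is:

def load_validdata(lstyear, item, width=10):
    # Hypothetical padding helper: right-pad both the period labels and their
    # values to `width` entries so the fixed-width Item constructors and the
    # 10-column SQL tables always receive a full set of arguments.
    pad = width - min(len(lstyear), width)
    years = list(lstyear[:width]) + [''] * pad
    values = [item[y] for y in lstyear[:width]] + [''] * pad
    return years, values

The real helper may pad with None, with zeros, or may drop short rows entirely; the sketch only fixes the shape of the return value, not its semantics.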