├── stockbaseinfo ├── __init__.py ├── annualreport │ ├── __init__.py │ └── annualspider.py ├── strategy │ ├── __init__.py │ ├── db_tools.py │ ├── utils.py │ └── strategy_select.py ├── spiders │ ├── __init__.py │ ├── newsspider.py │ └── baseinfo.py ├── main.py ├── utils.py ├── settings.py ├── middlewares.py ├── items.py └── pipelines.py ├── images ├── 十大股东2.png └── 合作加微信.png ├── README.md ├── scrapy.cfg └── data └── data.sql /stockbaseinfo/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /stockbaseinfo/annualreport/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /stockbaseinfo/strategy/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /images/十大股东2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xuezayun/stockbaseinfo/HEAD/images/十大股东2.png -------------------------------------------------------------------------------- /images/合作加微信.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xuezayun/stockbaseinfo/HEAD/images/合作加微信.png -------------------------------------------------------------------------------- /stockbaseinfo/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders.
5 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # stockbaseinfo 2 | Crawls company fundamentals from cninfo.com.cn (巨潮资讯网), including basic stock information, the top ten tradable shareholders, historical dividends, key indicators and the financial statements. 3 | 4 | The program provides the following features: 5 | 1. Crawl basic fundamental data from cninfo.com.cn 6 | 2. Crawl the rolling news feed from Sina News 7 | 3. Download annual reports from cninfo.com.cn 8 | 9 | -------------------------------------------------------- 10 | For crawler collaboration, please add the WeChat account shown in the picture in the images directory. -------------------------------------------------------------------------------- /stockbaseinfo/main.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from scrapy.cmdline import execute 4 | import sys 5 | import os 6 | 7 | sys.path.append(os.path.dirname(os.path.abspath(__file__))) 8 | # execute(['scrapy', 'crawl', 'baseinfo']) 9 | execute(['scrapy', 'crawl', 'news']) -------------------------------------------------------------------------------- /scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html 5 | 6 | [settings] 7 | default = stockbaseinfo.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = stockbaseinfo 12 | -------------------------------------------------------------------------------- /stockbaseinfo/utils.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | 3 | class Utils(object): 4 | # Collect the valid years and their values, padded out to 10 slots (most recent year last) 5 | @staticmethod 6 | def load_validdata(lstyear,item): 7 | lstyear.sort(reverse=True) 8 | arr_year = ["1900" for _ in range(10)] 9 | arr_item = ['0' for _ in range(10)] 10 | for i in range(min(len(lstyear), 10)): # keep at most the 10 most recent years 11 | arr_year[9 - i] = lstyear[i] 12 | arr_item[9 - i] = item[lstyear[i]] 13 | return arr_year, arr_item -------------------------------------------------------------------------------- /stockbaseinfo/strategy/db_tools.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | import pymysql 3 | from stockbaseinfo.Const import * 4 | 5 | # Usage: 6 | # db_tools = DbTools() 7 | 8 | class DbTools: 9 | conn = None 10 | cursor = None 11 | # Open a connection to the database 12 | def connect(self): 13 | self.conn = pymysql.connect(host=Const.DB_SERVER,port=Const.PORT,user=Const.DB_USER, passwd=Const.DB_PWD,db=Const.DB_NAME,charset="utf8") 14 | self.cursor = self.conn.cursor() 15 | 16 | # Execute a batch of INSERT/UPDATE statements 17 | def insertorupdate_data(self,lstsql): 18 | try: 19 | for sql in lstsql: 20 | self.cursor.execute(sql) 21 | except Exception as e: 22 | print("insertorupdate_data failed:", e) 23 | 24 | def fetch_data(self,sql): 25 | self.cursor.execute(sql) 26 | return self.cursor.fetchall() 27 | 28 | # Commit pending statements and close the connection 29 | def commit_data(self): 30 | self.cursor.close() 31 | self.conn.commit() 32 | return self.conn.close() 33 | 34 | def close(self): 35 | self.cursor.close() 36 | return self.conn.close() 37 | -------------------------------------------------------------------------------- /stockbaseinfo/strategy/utils.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | 3 | import datetime,time 4 | import db_tools as dt 5 | import math 6 | from stockbaseinfo.Const import * 7 | import requests 8 | 9 | class Utils(object): 10 | 11 | # Get the trading dates of the most recent daysnum days from hist_data (newest first) 12 | @staticmethod 13 | def get_num_startdate(daysnum = 1): 14 | lstretdate =[] 15 | 
selectdaysago ='''select date from hist_data where code = '000001' order by date desc limit %d;'''%daysnum 16 | lstitem = Utils.fetch_data(selectdaysago) 17 | for item in lstitem: 18 | date = item[0] 19 | lstretdate.append(date.strftime('%Y-%m-%d')) 20 | return lstretdate; 21 | 22 | # 获取某daysnum天数之前的日期 23 | @staticmethod 24 | def get_fourdate(daysnum=4): 25 | selectdaysago = '''select date from hist_data where code = '000001' order by date desc limit %d;''' % daysnum 26 | lstitem = Utils.fetch_data(selectdaysago) 27 | day3date = lstitem[3][0] 28 | day2date = lstitem[2][0] 29 | day1date = lstitem[1][0] 30 | daydate = lstitem[0][0] 31 | day3Date = day3date.strftime('%Y-%m-%d') 32 | day2Date = day2date.strftime('%Y-%m-%d') 33 | day1Date = day1date.strftime('%Y-%m-%d') 34 | dayDate = daydate.strftime('%Y-%m-%d') 35 | return day3Date,day2Date,day1Date,dayDate; 36 | @staticmethod 37 | def get_lastdate(): 38 | selectdaysago = '''select date from hist_data where code = '000001' order by date desc limit 1;''' 39 | lstitem = Utils.fetch_data(selectdaysago) 40 | if len(lstitem)>0 : 41 | startdate = lstitem[0][0] 42 | return startdate.strftime('%Y-%m-%d') 43 | # 执行SQL语句集合 44 | @staticmethod 45 | def execute_data(lstinsertsql): 46 | dbTools = dt.DbTools() 47 | dbTools.connect() 48 | dbTools.insertorupdate_data(lstinsertsql) 49 | dbTools.commit_data() 50 | 51 | # 获取查询数据 52 | @staticmethod 53 | def fetch_data(querysql): 54 | dbTools = dt.DbTools() 55 | dbTools.connect() 56 | lstitem = dbTools.fetch_data(querysql) 57 | for item in lstitem: 58 | temsql = '' 59 | for i in range(len(item)): 60 | temsql += '\'' + str(item[i]) + '\',' 61 | dbTools.close() 62 | return lstitem; 63 | 64 | @staticmethod 65 | def time_cmp(first_time,second_time): 66 | return int(time.strftime("%H%M%S", time.strptime(str(first_time), "%H:%M:%S"))) < int(time.strftime("%H%M%S", second_time)) 67 | 68 | 69 | 70 | 71 | 72 | -------------------------------------------------------------------------------- /stockbaseinfo/strategy/strategy_select.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | #策略选择工具 3 | from stockbaseinfo.strategy.utils import * 4 | 5 | #选择净利润较好的公司 6 | def getBestProfit(): 7 | dictIndexDateIsCow = {} 8 | setCowSql = '''SELECT `code`, `subject_title`,`account1`,`account2`,`account3`,`account4`,`account5`, `account6`, `account7`, `account8`, `account9`, `account10`, 9 | `year1`,`year2`,`year3`,`year4`,`year5`,`year6`,`year7`,`year8`,`year9`,`year10`,`season` FROM `main_index` where subject_title like '%净利润增长率%' order by code,season ;''' 10 | lstitem = Utils.fetch_data(setCowSql) 11 | for item in lstitem: 12 | print(item) 13 | # tempcodename = '' 14 | # tempdateiscow = '' 15 | # code = item[0] 16 | # name = item[1] 17 | # iscow = item[2] 18 | # sdate = item[3] 19 | # tempcodename = code + ',' + name 20 | # tempdateiscow = str(sdate) + ':' + str(iscow) 21 | # if dictIndexDateIsCow.__contains__(tempcodename): 22 | # # codedata = dictIndexDateIsCow[tempcodename] 23 | # dictIndexDateIsCow[tempcodename] += tempdateiscow + ',' 24 | # else: 25 | # dictIndexDateIsCow[tempcodename] = '' 26 | # print('----------------------------判断熊市牛市---------------------------------------------------------') 27 | # for key, value in dictIndexDateIsCow.items(): 28 | # print(key + "---" + value) 29 | # print('****************************判断熊市牛市********************************************************') 30 | 31 | def getUprise(): 32 | lstOri = 
[3,4,5,6,5,4,3,22,1,2,3,4,5,6,7,8,9,7,6,5,4,6,8,9,10] 33 | lastMaxP = 0 34 | lastMinP = 0 35 | lstMax=[] 36 | lstMin = [] 37 | lstMaxData=[] 38 | lstMinData=[] 39 | lstOri.reverse() 40 | for i in range(len(lstOri)): 41 | if i == 0: 42 | pass 43 | else: 44 | minp = min(lstOri[0:i]) 45 | if minp87%就可以买入该股票呢?其实RPS指标只是对强势股的个一个初步筛选,对于A股而言,RPS大于87%的股票就有400多只,都买进也不太现实,具体运用还需结合个股基本面、题材和整体市场情况分析。RPS实际上是欧奈尔在《笑傲股市》中提出的CANSLIM七步选股法的一个技术分析。各字母含义如下所示:C:最近一季度报表显示的盈利(每股收益) 57 | # A:每年度每股盈利的增长幅度 58 | # N:新产品,新服务,股价创新高 59 | # S:该股流通盘大小,市值以及交易量的情况 60 | # L:该股票在行业中的低位,是否为龙头 61 | # I:该股票有无有实力的庄家,机构大流通股东 62 | # M:大盘走势如何,如何判断大盘走向 63 | # 64 | # RPS英文全称Relative Price Strength Rating,即股价相对强度,该指标是欧奈尔CANSLIM选股法则中的趋势分析,具有很强的实战指导意义。RPS指标是指在一段时间内,个股涨幅在全部股票涨幅排名中的位次值。 65 | # 比如A股共有3500只股票,若某只股票的120日涨幅在所有股票中排名第350位,则该股票的RPS值为:(1-350/3500)*100=90。 66 | # 67 | # RPS的值代表该股的120日涨幅超过其他90%的股票的涨幅。通过该指标可以反映个股股价走势在同期市场中的表现相对强弱。RPS的值介于0-100之间,在过去的一年中,所有股票的涨幅排行中,前1%的股票的RPS值为99至100,前2%的股票的RPS值为98至99,以此类推。RPS时间周期可以自己根据需要进行调整,常用的有60日(3个月)、120日(半年)和250日(一年)等。 68 | 69 | def getRPS120(): 70 | print() 71 | if __name__ == "__main__": 72 | getUprise() 73 | # data = "1:23" 74 | # print(data[:data.index(":")]) -------------------------------------------------------------------------------- /stockbaseinfo/spiders/newsspider.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from scrapy import Request 3 | from ..items import * 4 | from ..Const import * 5 | import random 6 | import json 7 | import re 8 | from datetime import datetime 9 | 10 | 11 | class newsSpider(scrapy.Spider): 12 | name = "news" 13 | lstsinanews = [] 14 | base_url = 'https://feed.mix.sina.com.cn/api/roll/get?pageid=153&lid={}&k=&num=50&page={}&r={}' 15 | # "2509": "全部", 16 | # "2510": "国内", 17 | # "2511": "国际", 18 | # "2669": "社会", 19 | # "2512": "体育", 20 | # "2513": "娱乐", 21 | # "2514": "军事", 22 | # "2515": "科技", 23 | # "2516": "财经", 24 | # "2517": "股市", 25 | # "2518": "美股", 26 | # "2968": "国内_国际", 27 | # "2970": "国内_社会", 28 | # "2972": "国际_社会", 29 | # "2974": "国内国际社会" 30 | def start_requests(self): 31 | # 可修改 这里设置爬取100页 32 | self.page_total = 24 33 | # self.page_total = 1 34 | for page in range(1, self.page_total + 1): 35 | # 按上面注释 可修改 这里"2509"代表"全部"类别的新闻 36 | lid = "2516" 37 | r = random.random() 38 | yield Request(self.base_url.format(lid, page, r), callback=lambda response,page=page:self.parse(response,page)) 39 | 40 | def parse(self, response,page): 41 | result = json.loads(response.text) 42 | data_list = result.get('result').get('data') 43 | icount = 0 44 | totalcount = len(data_list) 45 | print("totalcount:"+str(totalcount)) 46 | for data in data_list: 47 | icount += 1 48 | item = sina_newsItem() 49 | ctime = datetime.fromtimestamp(int(data.get('ctime'))) 50 | ctime = datetime.strftime(ctime, '%Y-%m-%d %H:%M') 51 | item['ctime'] = ctime 52 | item['url'] = str(data.get('url')).strip() 53 | item['wapurl'] = str(data.get('wapurl')).strip() 54 | item['title'] = str(data.get('title')).strip() 55 | item['media_name'] = str(data.get('media_name')).strip() 56 | item['keywords'] = str(data.get('keywords')).strip() 57 | yield Request(url=item['url'], callback=lambda response,page=page,isend = icount == totalcount:self.parse_content(response,page,isend) , meta={'item': item}) 58 | 59 | # 进入到详情页面 爬取新闻内容 60 | 61 | def parse_content(self, response,page,isend): 62 | item = response.meta['item'] 63 | content = ''.join(response.xpath('//*[@id="artibody" or @id="article"]//p/text()').extract()) 64 | content = 
re.sub(r'\u3000', '', content) 65 | content = re.sub(r'[ \xa0?]+', ' ', content) 66 | content = re.sub(r'\s*\n\s*', '\n', content) 67 | content = re.sub(r'\s*(\s)', r'\1', content) 68 | content = ''.join([x.strip() for x in content]) 69 | # content_list = response.xpath('//*[@id="artibody" or @id="article"]//p/text()').extract() 70 | # content = r"" 71 | # for part in content_list: 72 | # part = part.strip() 73 | # content += part 74 | item['content'] = content 75 | print("page:"+str(page)+",isend:"+str(isend)) 76 | if self.page_total == page and isend: 77 | stockbaseinfoitme = StockbaseinfoItem() 78 | stockbaseinfoitme['data_type'] = Const.ROLL_NEWS 79 | stockbaseinfoitme['data_content'] = self.lstsinanews 80 | yield stockbaseinfoitme 81 | else: 82 | self.lstsinanews.append(item) 83 | 84 | -------------------------------------------------------------------------------- /stockbaseinfo/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for stockbaseinfo project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. You can find more settings consulting the documentation: 7 | # 8 | # https://docs.scrapy.org/en/latest/topics/settings.html 9 | # https://docs.scrapy.org/en/latest/topics/downloader-middleware.html 10 | # https://docs.scrapy.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'stockbaseinfo' 13 | 14 | SPIDER_MODULES = ['stockbaseinfo.spiders'] 15 | NEWSPIDER_MODULE = 'stockbaseinfo.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | #USER_AGENT = 'stockbaseinfo (+http://www.yourdomain.com)' 20 | 21 | # Obey robots.txt rules 22 | ROBOTSTXT_OBEY = False 23 | 24 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 25 | #CONCURRENT_REQUESTS = 32 26 | 27 | # Configure a delay for requests for the same website (default: 0) 28 | # See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay 29 | # See also autothrottle settings and docs 30 | # DOWNLOAD_DELAY = 3 31 | # The download delay setting will honor only one of: 32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 33 | #CONCURRENT_REQUESTS_PER_IP = 16 34 | 35 | # Disable cookies (enabled by default) 36 | COOKIES_ENABLED = False 37 | 38 | # Disable Telnet Console (enabled by default) 39 | #TELNETCONSOLE_ENABLED = False 40 | 41 | # Override the default request headers: 42 | #DEFAULT_REQUEST_HEADERS = { 43 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 44 | # 'Accept-Language': 'en', 45 | #} 46 | 47 | # Enable or disable spider middlewares 48 | # See https://docs.scrapy.org/en/latest/topics/spider-middleware.html 49 | SPIDER_MIDDLEWARES = { 50 | 'stockbaseinfo.middlewares.StockbaseinfoSpiderMiddleware': 543, 51 | } 52 | HTTPERROR_ALLOWED_CODES= [999] 53 | # Enable or disable downloader middlewares 54 | # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html 55 | DOWNLOADER_MIDDLEWARES = { 56 | 'stockbaseinfo.middlewares.StockbaseinfoDownloaderMiddleware': 543, 57 | } 58 | 59 | # Enable or disable extensions 60 | # Enable or disable extensions 61 | # See https://docs.scrapy.org/en/latest/topics/extensions.html 62 | #EXTENSIONS = { 63 | # 'scrapy.extensions.telnet.TelnetConsole': None, 64 | #} 65 | 66 | # Configure item pipelines 67 | # See https://docs.scrapy.org/en/latest/topics/item-pipeline.html 68 | ITEM_PIPELINES = { 69 | 
'stockbaseinfo.pipelines.StockbaseinfoPipeline': 300, 70 | } 71 | MYSQL_DB_NAME = 'finance_data' 72 | MYSQL_HOST = 'localhost' 73 | MYSQL_PORT = 3306 74 | MYSQL_USER = 'root' 75 | MYSQL_PASSWORD = 'root' 76 | # Enable and configure the AutoThrottle extension (disabled by default) 77 | # See https://docs.scrapy.org/en/latest/topics/autothrottle.html 78 | #AUTOTHROTTLE_ENABLED = True 79 | # The initial download delay 80 | #AUTOTHROTTLE_START_DELAY = 5 81 | # The maximum download delay to be set in case of high latencies 82 | #AUTOTHROTTLE_MAX_DELAY = 60 83 | # The average number of requests Scrapy should be sending in parallel to 84 | # each remote server 85 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 86 | # Enable showing throttling stats for every response received: 87 | #AUTOTHROTTLE_DEBUG = False 88 | 89 | # Enable and configure HTTP caching (disabled by default) 90 | # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 91 | HTTPCACHE_ENABLED = True 92 | HTTPCACHE_EXPIRATION_SECS = 0 93 | HTTPCACHE_DIR = 'httpcache' 94 | HTTPCACHE_IGNORE_HTTP_CODES = [] 95 | HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 96 | -------------------------------------------------------------------------------- /data/data.sql: -------------------------------------------------------------------------------- 1 | CREATE DATABASE `finance_basic_data` /*!40100 DEFAULT CHARACTER SET utf8 */ /*!80016 DEFAULT ENCRYPTION='N' */; 2 | CREATE TABLE `balance_sheet` ( 3 | `code` varchar(45) DEFAULT NULL, 4 | `subject_title` varchar(500) DEFAULT NULL, 5 | `account1` float DEFAULT '0', 6 | `account2` float DEFAULT '0', 7 | `account3` float DEFAULT '0', 8 | `account4` float DEFAULT '0', 9 | `account5` float DEFAULT '0', 10 | `account6` float DEFAULT '0', 11 | `account7` float DEFAULT '0', 12 | `account8` float DEFAULT '0', 13 | `account9` float DEFAULT '0', 14 | `account10` float DEFAULT '0', 15 | `year1` int(11) DEFAULT '0', 16 | `year2` int(11) DEFAULT '0', 17 | `year3` int(11) DEFAULT '0', 18 | `year4` int(11) DEFAULT '0', 19 | `year5` int(11) DEFAULT '0', 20 | `year6` int(11) DEFAULT '0', 21 | `year7` int(11) DEFAULT '0', 22 | `year8` int(11) DEFAULT '0', 23 | `year9` int(11) DEFAULT '0', 24 | `year10` int(11) DEFAULT '0' 25 | ) ENGINE=InnoDB DEFAULT CHARSET=utf8; 26 | 27 | CREATE TABLE `cashflow_sheet` ( 28 | `code` varchar(45) DEFAULT NULL, 29 | `subject_title` varchar(500) DEFAULT NULL, 30 | `account1` float DEFAULT '0', 31 | `account2` float DEFAULT '0', 32 | `account3` float DEFAULT '0', 33 | `account4` float DEFAULT '0', 34 | `account5` float DEFAULT '0', 35 | `account6` float DEFAULT '0', 36 | `account7` float DEFAULT '0', 37 | `account8` float DEFAULT '0', 38 | `account9` float DEFAULT '0', 39 | `account10` float DEFAULT '0', 40 | `year1` int(11) DEFAULT '0', 41 | `year2` int(11) DEFAULT '0', 42 | `year3` int(11) DEFAULT '0', 43 | `year4` int(11) DEFAULT '0', 44 | `year5` int(11) DEFAULT '0', 45 | `year6` int(11) DEFAULT '0', 46 | `year7` int(11) DEFAULT '0', 47 | `year8` int(11) DEFAULT '0', 48 | `year9` int(11) DEFAULT '0', 49 | `year10` int(11) DEFAULT '0' 50 | ) ENGINE=InnoDB DEFAULT CHARSET=utf8; 51 | CREATE TABLE `main_index` ( 52 | `code` varchar(45) DEFAULT NULL, 53 | `subject_title` varchar(500) DEFAULT NULL, 54 | `account1` float DEFAULT '0', 55 | `account2` float DEFAULT '0', 56 | `account3` float DEFAULT '0', 57 | `account4` float DEFAULT '0', 58 | `account5` float DEFAULT '0', 59 | `account6` float DEFAULT '0', 60 | `account7` 
float DEFAULT '0', 61 | `account8` float DEFAULT '0', 62 | `account9` float DEFAULT '0', 63 | `account10` float DEFAULT '0', 64 | `year1` int(11) DEFAULT '0', 65 | `year2` int(11) DEFAULT '0', 66 | `year3` int(11) DEFAULT '0', 67 | `year4` int(11) DEFAULT '0', 68 | `year5` int(11) DEFAULT '0', 69 | `year6` int(11) DEFAULT '0', 70 | `year7` int(11) DEFAULT '0', 71 | `year8` int(11) DEFAULT '0', 72 | `year9` int(11) DEFAULT '0', 73 | `year10` int(11) DEFAULT '0', 74 | `season` int(11) DEFAULT '0' 75 | ) ENGINE=InnoDB DEFAULT CHARSET=utf8; 76 | CREATE TABLE `profit_sheet` ( 77 | `code` varchar(45) DEFAULT NULL, 78 | `subject_title` varchar(500) DEFAULT NULL, 79 | `account1` float DEFAULT '0', 80 | `account2` float DEFAULT '0', 81 | `account3` float DEFAULT '0', 82 | `account4` float DEFAULT '0', 83 | `account5` float DEFAULT '0', 84 | `account6` float DEFAULT '0', 85 | `account7` float DEFAULT '0', 86 | `account8` float DEFAULT '0', 87 | `account9` float DEFAULT '0', 88 | `account10` float DEFAULT '0', 89 | `year1` int(11) DEFAULT '0', 90 | `year2` int(11) DEFAULT '0', 91 | `year3` int(11) DEFAULT '0', 92 | `year4` int(11) DEFAULT '0', 93 | `year5` int(11) DEFAULT '0', 94 | `year6` int(11) DEFAULT '0', 95 | `year7` int(11) DEFAULT '0', 96 | `year8` int(11) DEFAULT '0', 97 | `year9` int(11) DEFAULT '0', 98 | `year10` int(11) DEFAULT '0', 99 | `season` int(11) DEFAULT '0' 100 | ) ENGINE=InnoDB DEFAULT CHARSET=utf8; 101 | CREATE TABLE `sina_news` ( 102 | `idsina_news` int(11) NOT NULL AUTO_INCREMENT, 103 | `title` varchar(500) DEFAULT NULL, 104 | `content` text, 105 | `ctime` datetime DEFAULT NULL, 106 | `media_name` varchar(100) DEFAULT NULL, 107 | `keywords` varchar(200) DEFAULT NULL, 108 | `url` varchar(200) DEFAULT NULL, 109 | `wepurl` varchar(200) DEFAULT NULL, 110 | PRIMARY KEY (`idsina_news`) 111 | ) ENGINE=InnoDB AUTO_INCREMENT=4090 DEFAULT CHARSET=utf8; 112 | CREATE TABLE `stock_bonus` ( 113 | `code` varchar(45) DEFAULT NULL, 114 | `notice_date` datetime DEFAULT NULL, 115 | `rightoff_time` datetime DEFAULT NULL, 116 | `stock_right_registe_date` datetime DEFAULT NULL, 117 | `cash_per_share` float DEFAULT '0', 118 | `send_bonus_share_per_share` float DEFAULT '0', 119 | `increase_shares_per_share` int(11) DEFAULT '0', 120 | `cash_receive_date` datetime DEFAULT NULL, 121 | `share_receive_date` datetime DEFAULT NULL 122 | ) ENGINE=InnoDB DEFAULT CHARSET=utf8; 123 | CREATE TABLE `stock_holders` ( 124 | `code` varchar(45) DEFAULT NULL, 125 | `holder_range` int(11) DEFAULT '0', 126 | `holder_name` varchar(500) DEFAULT NULL, 127 | `stock_count` float DEFAULT '0', 128 | `stock_percent` float DEFAULT '0', 129 | `stock_property` varchar(200) DEFAULT NULL, 130 | `count_date` date DEFAULT NULL 131 | ) ENGINE=InnoDB DEFAULT CHARSET=utf8; 132 | CREATE TABLE `stock_info` ( 133 | `code` varchar(45) DEFAULT NULL, 134 | `name` varchar(45) DEFAULT NULL, 135 | `open` float DEFAULT '0', 136 | `high` float DEFAULT '0', 137 | `close` float DEFAULT '0', 138 | `low` float DEFAULT '0', 139 | `volume` float DEFAULT '0', 140 | `amount` float DEFAULT '0', 141 | `price_change` float DEFAULT '0', 142 | `p_change` float DEFAULT '0', 143 | `yesterday_close` float DEFAULT '0', 144 | `exchange` float DEFAULT '0', 145 | `online_years` int(11) DEFAULT '0', 146 | `pb` float DEFAULT '0', 147 | `pe` float DEFAULT '0', 148 | `date` datetime DEFAULT NULL, 149 | `amplitude` float DEFAULT '0' 150 | ) ENGINE=InnoDB DEFAULT CHARSET=utf8; 151 | -------------------------------------------------------------------------------- 
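The tables defined in data/data.sql have to exist before the pipeline can write anything. Below is a minimal sketch (not part of the project) for loading that file with pymysql, assuming a local MySQL server with the root/root credentials from settings.py; the init_schema name is illustrative. Note that data.sql creates the finance_basic_data database, while settings.py defaults MYSQL_DB_NAME to finance_data and most pipeline INSERT statements reference finance_data tables, so the database names need to be reconciled before a full crawl.

```python
# Illustrative helper (assumption: MySQL on localhost with the root/root
# credentials from settings.py; path relative to the repository root).
import pymysql

def init_schema(sql_path='data/data.sql'):
    conn = pymysql.connect(host='localhost', port=3306, user='root',
                           password='root', charset='utf8')
    try:
        with conn.cursor() as cur, open(sql_path, encoding='utf-8') as f:
            statements = [s.strip() for s in f.read().split(';') if s.strip()]
            cur.execute(statements[0])               # CREATE DATABASE `finance_basic_data`
            cur.execute('USE `finance_basic_data`')  # data.sql itself has no USE statement
            for stmt in statements[1:]:
                cur.execute(stmt)
        conn.commit()
    finally:
        conn.close()

if __name__ == '__main__':
    init_schema()
```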
/stockbaseinfo/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # https://docs.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | import random 10 | 11 | 12 | class StockbaseinfoSpiderMiddleware(object): 13 | # Not all methods need to be defined. If a method is not defined, 14 | # scrapy acts as if the spider middleware does not modify the 15 | # passed objects. 16 | 17 | def __init__(self, user_agent=''): 18 | self.user_agent = user_agent 19 | 20 | @classmethod 21 | def from_crawler(cls, crawler): 22 | # This method is used by Scrapy to create your spiders. 23 | s = cls() 24 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 25 | return s 26 | 27 | def process_spider_input(self, response, spider): 28 | # Called for each response that goes through the spider 29 | # middleware and into the spider. 30 | 31 | # Should return None or raise an exception. 32 | return None 33 | 34 | def process_spider_output(self, response, result, spider): 35 | # Called with the results returned from the Spider, after 36 | # it has processed the response. 37 | 38 | # Must return an iterable of Request, dict or Item objects. 39 | for i in result: 40 | yield i 41 | 42 | def process_spider_exception(self, response, exception, spider): 43 | # Called when a spider or process_spider_input() method 44 | # (from other spider middleware) raises an exception. 45 | 46 | # Should return either None or an iterable of Request, dict 47 | # or Item objects. 48 | pass 49 | 50 | def process_start_requests(self, start_requests, spider): 51 | # Called with the start requests of the spider, and works 52 | # similarly to the process_spider_output() method, except 53 | # that it doesn’t have a response associated. 54 | 55 | # Must return only requests (not items). 56 | for r in start_requests: 57 | yield r 58 | 59 | def spider_opened(self, spider): 60 | spider.logger.info('Spider opened: %s' % spider.name) 61 | 62 | 63 | class StockbaseinfoDownloaderMiddleware(object): 64 | # Not all methods need to be defined. If a method is not defined, 65 | # scrapy acts as if the downloader middleware does not modify the 66 | # passed objects. 67 | 68 | @classmethod 69 | def from_crawler(cls, crawler): 70 | # This method is used by Scrapy to create your spiders. 71 | s = cls() 72 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 73 | return s 74 | 75 | def process_request(self, request, spider): 76 | # 这句话用于随机选择user-agent 77 | ua = self.user_agent_list[random.randint(0,16)] 78 | if ua: 79 | print('User-Agent:' + ua) 80 | request.headers.setdefault('User-Agent', ua) 81 | 82 | # Called for each request that goes through the downloader 83 | # middleware. 84 | 85 | # Must either: 86 | # - return None: continue processing this request 87 | # - or return a Response object 88 | # - or return a Request object 89 | # - or raise IgnoreRequest: process_exception() methods of 90 | # installed downloader middleware will be called 91 | return None 92 | 93 | def process_response(self, request, response, spider): 94 | # Called with the response returned from the downloader. 
95 | 96 | # Must either; 97 | # - return a Response object 98 | # - return a Request object 99 | # - or raise IgnoreRequest 100 | return response 101 | 102 | def process_exception(self, request, exception, spider): 103 | # Called when a download handler or a process_request() 104 | # (from other downloader middleware) raises an exception. 105 | 106 | # Must either: 107 | # - return None: continue processing this exception 108 | # - return a Response object: stops process_exception() chain 109 | # - return a Request object: stops process_exception() chain 110 | pass 111 | 112 | def spider_opened(self, spider): 113 | spider.logger.info('Spider opened: %s' % spider.name) 114 | 115 | user_agent_list = [ \ 116 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1" \ 117 | "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11", \ 118 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6", \ 119 | "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6", \ 120 | "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1", \ 121 | "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5", \ 122 | "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5", \ 123 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3", \ 124 | "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3", \ 125 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3", \ 126 | "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3", \ 127 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3", \ 128 | "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3", \ 129 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3", \ 130 | "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3", \ 131 | "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3", \ 132 | "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24", \ 133 | "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"] 134 | 135 | -------------------------------------------------------------------------------- /stockbaseinfo/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # https://docs.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | from scrapy.item import Item,Field 10 | 11 | 12 | class StockbaseinfoItem(scrapy.Item): 13 | # name = scrapy.Field() 14 | data_type = Field() 15 | data_content = Field(); 16 | #财务报表 17 | class balance_sheetItem(): 18 | def __init__(self, code, subject_title,account1,account2,account3,account4,account5,account6,account7,account8,account9,account10, 19 | 
year1,year2,year3,year4,year5,year6,year7,year8,year9,year10): 20 | self.code = code 21 | self.subject_title = subject_title 22 | self.account1 = account1 23 | self.account2 = account2 24 | self.account3 = account3 25 | self.account4 = account4 26 | self.account5 = account5 27 | self.account6 = account6 28 | self.account7 = account7 29 | self.account8 = account8 30 | self.account9 = account9 31 | self.account10 = account10 32 | self.year1 = year1 33 | self.year2 = year2 34 | self.year3 = year3 35 | self.year4 = year4 36 | self.year5 = year5 37 | self.year6 = year6 38 | self.year7 = year7 39 | self.year8 = year8 40 | self.year9 = year9 41 | self.year10 = year10 42 | #现金流量报表 43 | class cash_flowItem(): 44 | def __init__(self, code, subject_title, account1, account2, account3, account4, account5, account6, account7, 45 | account8, account9, account10, 46 | year1, year2, year3, year4, year5, year6, year7, year8, year9, year10): 47 | self.code = code 48 | self.subject_title = subject_title 49 | self.account1 = account1 50 | self.account2 = account2 51 | self.account3 = account3 52 | self.account4 = account4 53 | self.account5 = account5 54 | self.account6 = account6 55 | self.account7 = account7 56 | self.account8 = account8 57 | self.account9 = account9 58 | self.account10 = account10 59 | self.year1 = year1 60 | self.year2 = year2 61 | self.year3 = year3 62 | self.year4 = year4 63 | self.year5 = year5 64 | self.year6 = year6 65 | self.year7 = year7 66 | self.year8 = year8 67 | self.year9 = year9 68 | self.year10 = year10 69 | #主要指数 70 | class main_indexItem(): 71 | def __init__(self, code, subject_title,season, account1, account2, account3, account4, account5, account6, account7, 72 | account8, account9, account10, 73 | year1, year2, year3, year4, year5, year6, year7, year8, year9, year10): 74 | self.code = code 75 | self.subject_title = subject_title 76 | self.season = season 77 | self.account1 = account1 78 | self.account2 = account2 79 | self.account3 = account3 80 | self.account4 = account4 81 | self.account5 = account5 82 | self.account6 = account6 83 | self.account7 = account7 84 | self.account8 = account8 85 | self.account9 = account9 86 | self.account10 = account10 87 | self.year1 = year1 88 | self.year2 = year2 89 | self.year3 = year3 90 | self.year4 = year4 91 | self.year5 = year5 92 | self.year6 = year6 93 | self.year7 = year7 94 | self.year8 = year8 95 | self.year9 = year9 96 | self.year10 = year10 97 | #利润表 98 | class profit_sheetItem(): 99 | def __init__(self, code, subject_title,season, account1, account2, account3, account4, account5, account6, account7, 100 | account8, account9, account10, 101 | year1, year2, year3, year4, year5, year6, year7, year8, year9, year10): 102 | self.code = code 103 | self.subject_title = subject_title 104 | self.season = season 105 | self.account1 = account1 106 | self.account2 = account2 107 | self.account3 = account3 108 | self.account4 = account4 109 | self.account5 = account5 110 | self.account6 = account6 111 | self.account7 = account7 112 | self.account8 = account8 113 | self.account9 = account9 114 | self.account10 = account10 115 | self.year1 = year1 116 | self.year2 = year2 117 | self.year3 = year3 118 | self.year4 = year4 119 | self.year5 = year5 120 | self.year6 = year6 121 | self.year7 = year7 122 | self.year8 = year8 123 | self.year9 = year9 124 | self.year10 = year10 125 | #分红 126 | class stock_bonusItem(): 127 | def __init__(self, code, notice_date, rightoff_time, stock_right_registe_date, cash_per_share, 128 | 
send_bonus_share_per_share, increase_shares_per_share, cash_receive_date,share_receive_date): 129 | self.code = code 130 | self.notice_date = notice_date 131 | self.rightoff_time = rightoff_time 132 | self.stock_right_registe_date = stock_right_registe_date 133 | self.cash_per_share = cash_per_share 134 | self.send_bonus_share_per_share = send_bonus_share_per_share 135 | self.increase_shares_per_share = increase_shares_per_share 136 | self.cash_receive_date = cash_receive_date 137 | self.share_receive_date = share_receive_date 138 | #十大流通股东 139 | class stock_holdersItem(): 140 | def __init__(self, code, holder_range, holder_name, stock_count, stock_percent, 141 | stock_property,count_date): 142 | self.code = code 143 | self.holder_range = holder_range 144 | self.holder_name = holder_name 145 | self.stock_count = stock_count 146 | self.stock_percent = stock_percent 147 | self.stock_property = stock_property 148 | self.count_date = count_date 149 | #股票基本信息 150 | class stock_infoItem(): 151 | def __init__(self, code, name, open, high, close, 152 | low, volume, amount,price_change, 153 | p_change, yesterday_close, exchange,turnover,online_years, 154 | pb, pe, date,amplitude): 155 | self.code = code 156 | self.name = name 157 | self.open = open 158 | self.high = high 159 | self.close = close 160 | self.low = low 161 | self.volume = volume 162 | self.amount = amount 163 | self.price_change = price_change 164 | self.p_change = p_change 165 | self.yesterday_close = yesterday_close 166 | self.exchange = exchange 167 | self.turnover = turnover 168 | self.online_years = online_years 169 | self.pb = pb 170 | self.pe = pe 171 | self.date = date 172 | self.amplitude = amplitude 173 | 174 | #爬取新浪滚动新闻 175 | class sina_newsItem(scrapy.Item): 176 | collection = 'newsina' 177 | ctime = Field() # 发布时间 178 | url = Field() 179 | wapurl = Field() 180 | title = Field() # 新闻标题 181 | media_name = Field() # 发发布的媒体 182 | keywords = Field() # 关键词 183 | content = Field() # 新闻内容 184 | -------------------------------------------------------------------------------- /stockbaseinfo/annualreport/annualspider.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | """ 3 | 下载年报 4 | downloads: 5 | 公开招股书(招股说明书/招股意向书) 6 | 《年度报告》 16 17 18 7 | """ 8 | import requests 9 | import random 10 | import time 11 | import urllib 12 | from stockbaseinfo.Const import * 13 | 14 | download_path = 'http://static.cninfo.com.cn/' 15 | saving_path = './pdf/' 16 | 17 | User_Agent = [ 18 | "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)", 19 | "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)", 20 | "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)", 21 | "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)", 22 | "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6", 23 | "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1", 24 | "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0" 25 | ] 26 | 27 | 28 | headers = {'Accept': 'application/json, text/javascript, */*; 
q=0.01', 29 | "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8", 30 | "Accept-Encoding": "gzip, deflate", 31 | "Accept-Language": "zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7,zh-HK;q=0.6,zh-TW;q=0.5", 32 | 'Host': 'www.cninfo.com.cn', 33 | 'Origin': 'http://www.cninfo.com.cn', 34 | 'Referer': 'http://www.cninfo.com.cn/new/commonUrl?url=disclosure/list/notice', 35 | 'X-Requested-With': 'XMLHttpRequest' 36 | } 37 | 38 | 39 | 40 | 41 | # 深市 年度报告 42 | def szseAnnual(page, stock): 43 | query_path = 'http://www.cninfo.com.cn/new/hisAnnouncement/query' 44 | headers['User-Agent'] = random.choice(User_Agent) # 定义User_Agent 45 | query = {'pageNum': page, # 页码 46 | 'pageSize': 30, 47 | 'tabName': 'fulltext', 48 | 'column': 'szse', # 深交所 49 | 'stock': stock, 50 | 'searchkey': '', 51 | 'secid': '', 52 | 'plate': 'sz', 53 | 'category': 'category_ndbg_szsh;', # 年度报告 54 | 'trade': '', 55 | 'seDate': '2016-01-01+~+2019-4-26' # 时间区间 56 | } 57 | 58 | namelist = requests.post(query_path, headers=headers, data=query) 59 | return namelist.json()['announcements'] 60 | 61 | 62 | # 沪市 年度报告 63 | def sseAnnual(page, stock): 64 | query_path = 'http://www.cninfo.com.cn/new/hisAnnouncement/query' 65 | headers['User-Agent'] = random.choice(User_Agent) # 定义User_Agent 66 | query = {'pageNum': page, # 页码 67 | 'pageSize': 30, 68 | 'tabName': 'fulltext', 69 | 'column': 'sse', 70 | 'stock': stock, 71 | 'searchkey': '', 72 | 'secid': '', 73 | 'plate': 'sh', 74 | 'category': 'category_ndbg_szsh;', # 年度报告 75 | 'trade': '', 76 | 'seDate': '2016-01-01+~+2019-4-26' # 时间区间 77 | } 78 | 79 | namelist = requests.post(query_path, headers=headers, data=query) 80 | return namelist.json()['announcements'] # json中的年度报告信息 81 | 82 | 83 | # 深市 招股 84 | def szseStock(page, stock): 85 | query_path = 'http://www.cninfo.com.cn/new/hisAnnouncement/query' 86 | headers['User-Agent'] = random.choice(User_Agent) # 定义User_Agent 87 | query = {'pageNum': page, # 页码 88 | 'pageSize': 30, 89 | 'tabName': 'fulltext', 90 | 'column': 'szse', 91 | 'stock': stock, 92 | 'searchkey': '招股', 93 | 'secid': '', 94 | 'plate': 'sz', 95 | 'category': '', 96 | 'trade': '', 97 | 'seDate': '2001-01-01+~+2019-4-26' # 时间区间 98 | } 99 | 100 | namelist = requests.post(query_path, headers=headers, data=query) 101 | return namelist.json()['announcements'] # json中的年度报告信息 102 | 103 | 104 | # 沪市 招股 105 | def sseStock(page, stock): 106 | query_path = 'http://www.cninfo.com.cn/new/hisAnnouncement/query' 107 | headers['User-Agent'] = random.choice(User_Agent) # 定义User_Agent 108 | query = {'pageNum': page, # 页码 109 | 'pageSize': 30, 110 | 'tabName': 'fulltext', 111 | 'column': 'sse', 112 | 'stock': stock, 113 | 'searchkey': '招股', 114 | 'secid': '', 115 | 'plate': 'sh', 116 | 'category': '', 117 | 'trade': '', 118 | 'seDate': '2010-01-01+~+2020-5-08' # 时间区间 119 | } 120 | 121 | namelist = requests.post(query_path, headers=headers, data=query) 122 | return namelist.json()['announcements'] # json中的年度报告信息 123 | 124 | 125 | # download PDF 126 | def Download(single_page): 127 | if single_page is None: 128 | return 129 | 130 | headers = {'Accept': 'application/json, text/javascript, */*; q=0.01', 131 | "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8", 132 | "Accept-Encoding": "gzip, deflate", 133 | "Accept-Language": "zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7,zh-HK;q=0.6,zh-TW;q=0.5", 134 | 'Host': 'www.cninfo.com.cn', 135 | 'Origin': 'http://www.cninfo.com.cn' 136 | } 137 | 138 | for i in single_page: 139 | allowed_list = [ 140 | '2019年年度报告(更新后)', 141 | '2019年年度报告', 142 | # 
'2018年年度报告(更新后)', 143 | # '2018年年度报告', 144 | # '2017年年度报告(更新后)', 145 | # '2017年年度报告', 146 | # '2016年年度报告(更新后)', 147 | # '2016年年度报告', 148 | ] 149 | allowed_list_2 = [ 150 | '招股书', 151 | # '招股说明书', 152 | # '招股意向书', 153 | ] 154 | title = i['announcementTitle'] 155 | allowed = title in allowed_list 156 | if '确认意见' in title: 157 | return 158 | for item in allowed_list_2: 159 | if item in title: 160 | allowed = True 161 | break 162 | if allowed: 163 | download = download_path + i["adjunctUrl"] 164 | name = i["secCode"] + '_' + i['secName'] + '_' + i['announcementTitle'] + '.pdf' 165 | if '*' in name: 166 | name = name.replace('*', '') 167 | file_path = saving_path + name 168 | time.sleep(random.random() * 2) 169 | 170 | headers['User-Agent'] = random.choice(User_Agent) 171 | r = requests.get(download) 172 | 173 | f = open(file_path, "wb") 174 | f.write(r.content) 175 | f.close() 176 | else: 177 | continue 178 | 179 | 180 | # given page_number & stock number 181 | def Run(page_number, stock): 182 | try: 183 | annual_report = szseAnnual(page_number, stock) 184 | stock_report = szseStock(page_number, stock) 185 | annual_report_ = sseAnnual(page_number, stock) 186 | stock_report_ = sseStock(page_number, stock) 187 | except: 188 | print(page_number, 'page error, retrying') 189 | try: 190 | annual_report = szseAnnual(page_number, stock) 191 | except: 192 | print(page_number, 'page error') 193 | Download(annual_report) 194 | Download(stock_report) 195 | Download(annual_report_) 196 | Download(stock_report_) 197 | 198 | 199 | if __name__ == "__main__": 200 | i = 0 201 | for code in Const.LST_CODE_TEST: 202 | Run(1, code) 203 | i=i+1 204 | print(code, "index:"+str(i)+"done") -------------------------------------------------------------------------------- /stockbaseinfo/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html 7 | 8 | import pymysql 9 | # 使用twsited异步IO框架,实现数据的异步写入。 10 | from pymysql import cursors 11 | from twisted.enterprise import adbapi 12 | from stockbaseinfo.settings import * 13 | from stockbaseinfo.Const import * 14 | import traceback 15 | 16 | class StockbaseinfoPipeline(object): 17 | @classmethod 18 | def from_crawler(cls, crawler): 19 | # 从项目的配置文件中读取相应的参数 20 | cls.MYSQL_DB_NAME = crawler.settings.get("MYSQL_DB_NAME", 'finance_data') 21 | cls.HOST = crawler.settings.get("MYSQL_HOST", 'localhost') 22 | cls.PORT = crawler.settings.get("MYSQL_PORT", 3306) 23 | cls.USER = crawler.settings.get("MYSQL_USER", 'root') 24 | cls.PASSWD = crawler.settings.get("MYSQL_PASSWORD", 'root') 25 | return cls() 26 | 27 | def __init__(self): 28 | dbparams = { 29 | 'host':MYSQL_HOST, 30 | 'port': 3306, 31 | 'user': MYSQL_USER, 32 | 'password': MYSQL_PASSWORD, 33 | 'database': MYSQL_DB_NAME, 34 | 'charset': 'utf8', 35 | 'cursorclass': cursors.DictCursor # 指定cursor的类 36 | } 37 | # 初始化数据库连接池,参数1是mysql的驱动,参数2是连接mysql的配置信息 38 | self.db_pool = adbapi.ConnectionPool('pymysql', **dbparams) 39 | # sql语言的空值 40 | self._sql = None 41 | def process_item(self, item, spider): 42 | # 操作数据,将数据写入数据库 43 | # 如果是同步写入的话,使用的是cursor.execute(),commit() 44 | # 异步存储的方式:函数方式pool.map(self.insert_db,[1,2]) 45 | query = self.db_pool.runInteraction(self.insert_db, item) 46 | query.addErrback(self.handle_error, item, spider) 47 | 48 | #依据不同的数据类型进行不同的数据操作 49 | def insert_db(self, 
cursor, item): 50 | data_type = item['data_type'] 51 | if data_type == Const.STOCK_INFO:#股票基本信息 52 | stockinfo = item['data_content'] 53 | values = ( 54 | stockinfo.code, 55 | stockinfo.name, 56 | stockinfo.open, 57 | stockinfo.high, 58 | stockinfo.close, 59 | stockinfo.low, 60 | stockinfo.code, 61 | stockinfo.amount, 62 | stockinfo.price_change, 63 | stockinfo.p_change, 64 | stockinfo.yesterday_close, 65 | stockinfo.exchange, 66 | stockinfo.online_years, 67 | stockinfo.pb, 68 | stockinfo.pe, 69 | str(stockinfo.date), 70 | stockinfo.amplitude 71 | ) 72 | sql = '''INSERT INTO `finance_data`.`stock_basic_info`(`code`,`name`,`open`,`high`,`close`,`low`,`volume`,`amount`,`price_change`,`p_change`,`yesterday_close`,`exchange`,`online_years`,`pb`,`pe`,`date`,`amplitude`)VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s);''' 73 | try: 74 | cursor.execute(sql, values) 75 | except: 76 | traceback.print_exc() 77 | elif data_type == Const.BALANCE_SHEET: #资产负债表 78 | lstblancesheet = item['data_content'] 79 | for blancesheet in lstblancesheet: 80 | values = ( 81 | blancesheet.code, 82 | blancesheet.subject_title, 83 | blancesheet.account1, 84 | blancesheet.account2, 85 | blancesheet.account3, 86 | blancesheet.account4, 87 | blancesheet.account5, 88 | blancesheet.account6, 89 | blancesheet.account7, 90 | blancesheet.account8, 91 | blancesheet.account9, 92 | blancesheet.account10, 93 | blancesheet.year1, 94 | blancesheet.year2, 95 | blancesheet.year3, 96 | blancesheet.year4, 97 | blancesheet.year5, 98 | blancesheet.year6, 99 | blancesheet.year7, 100 | blancesheet.year8, 101 | blancesheet.year9, 102 | blancesheet.year10 103 | ) 104 | sql = '''INSERT INTO `finance_data`.`stock_basic_balance_sheet`(`code`,`subject_title`,`account1`,`account2`,`account3`,`account4`,`account5`,`account6`,`account7`,`account8`,`account9`, 105 | `account10`,`year1`,`year2`,`year3`,`year4`,`year5`,`year6`,`year7`,`year8`,`year9`,`year10`)VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s);''' 106 | try: 107 | cursor.execute(sql, values) 108 | except: 109 | traceback.print_exc() 110 | elif data_type == Const.CASH_FLOW: #资产负债表 111 | lstbcashflow = item['data_content'] 112 | for cashflow in lstbcashflow: 113 | values = ( 114 | cashflow.code, 115 | cashflow.subject_title, 116 | cashflow.account1, 117 | cashflow.account2, 118 | cashflow.account3, 119 | cashflow.account4, 120 | cashflow.account5, 121 | cashflow.account6, 122 | cashflow.account7, 123 | cashflow.account8, 124 | cashflow.account9, 125 | cashflow.account10, 126 | cashflow.year1, 127 | cashflow.year2, 128 | cashflow.year3, 129 | cashflow.year4, 130 | cashflow.year5, 131 | cashflow.year6, 132 | cashflow.year7, 133 | cashflow.year8, 134 | cashflow.year9, 135 | cashflow.year10 136 | ) 137 | sql = '''INSERT INTO `finance_data`.`stock_basic_cashflow_sheet`(`code`,`subject_title`,`account1`,`account2`,`account3`,`account4`,`account5`,`account6`,`account7`,`account8`,`account9`, 138 | `account10`,`year1`,`year2`,`year3`,`year4`,`year5`,`year6`,`year7`,`year8`,`year9`,`year10`)VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s);''' 139 | try: 140 | cursor.execute(sql, values) 141 | except: 142 | traceback.print_exc() 143 | elif data_type == Const.PROFIT_SHEET: # 利润表 144 | lstprofit = item['data_content'] 145 | for profit in lstprofit: 146 | values = ( 147 | profit.code, 148 | profit.subject_title, 149 | profit.season, 150 | profit.account1, 151 | profit.account2, 152 | profit.account3, 153 | profit.account4, 154 | 
profit.account5, 155 | profit.account6, 156 | profit.account7, 157 | profit.account8, 158 | profit.account9, 159 | profit.account10, 160 | profit.year1, 161 | profit.year2, 162 | profit.year3, 163 | profit.year4, 164 | profit.year5, 165 | profit.year6, 166 | profit.year7, 167 | profit.year8, 168 | profit.year9, 169 | profit.year10 170 | ) 171 | sql = '''INSERT INTO `finance_data`.`stock_basic_profit_sheet`(`code`,`subject_title`,`season`,`account1`,`account2`,`account3`,`account4`,`account5`,`account6`,`account7`,`account8`,`account9`, 172 | `account10`,`year1`,`year2`,`year3`,`year4`,`year5`,`year6`,`year7`,`year8`,`year9`,`year10`)VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s);''' 173 | try: 174 | cursor.execute(sql, values) 175 | except: 176 | traceback.print_exc() 177 | elif data_type == Const.STOCK_BONUS: # 历史分红表 178 | lstbonus = item['data_content'] 179 | for bonus in lstbonus: 180 | values = ( 181 | bonus.code, 182 | bonus.notice_date, 183 | bonus.rightoff_time, 184 | bonus.stock_right_registe_date, 185 | bonus.cash_per_share, 186 | bonus.send_bonus_share_per_share, 187 | bonus.increase_shares_per_share, 188 | bonus.cash_receive_date, 189 | bonus.share_receive_date 190 | ) 191 | sql = '''INSERT INTO `finance_data`.`stock_basic_bonus`(`code`,`notice_date`,`rightoff_time`,`stock_right_registe_date`,`cash_per_share`,`send_bonus_share_per_share`,`increase_shares_per_share`,`cash_receive_date`,`share_receive_date`) 192 | VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s);''' 193 | try: 194 | cursor.execute(sql, values) 195 | except: 196 | traceback.print_exc() 197 | elif data_type == Const.MAIN_INDEX: # 历史分红表 198 | lstmainindex = item['data_content'] 199 | for mainindex in lstmainindex: 200 | values = ( 201 | mainindex.code, 202 | mainindex.subject_title, 203 | mainindex.season, 204 | mainindex.account1, 205 | mainindex.account2, 206 | mainindex.account3, 207 | mainindex.account4, 208 | mainindex.account5, 209 | mainindex.account6, 210 | mainindex.account7, 211 | mainindex.account8, 212 | mainindex.account9, 213 | mainindex.account10, 214 | mainindex.year1, 215 | mainindex.year2, 216 | mainindex.year3, 217 | mainindex.year4, 218 | mainindex.year5, 219 | mainindex.year6, 220 | mainindex.year7, 221 | mainindex.year8, 222 | mainindex.year9, 223 | mainindex.year10 224 | ) 225 | sql = '''INSERT INTO `finance_data`.`stock_basic_main_index`(`code`,`subject_title`,`season`,`account1`,`account2`,`account3`,`account4`,`account5`,`account6`,`account7`,`account8`,`account9`, 226 | `account10`,`year1`,`year2`,`year3`,`year4`,`year5`,`year6`,`year7`,`year8`,`year9`,`year10`)VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s);''' 227 | try: 228 | cursor.execute(sql, values) 229 | except: 230 | traceback.print_exc() 231 | elif data_type == Const.STOCK_HOLDERS: # 前十大股东表 232 | lststockholders = item['data_content'] 233 | for stockholder in lststockholders: 234 | values = ( 235 | stockholder.code, 236 | stockholder.holder_range, 237 | stockholder.holder_name, 238 | stockholder.stock_count, 239 | stockholder.stock_percent, 240 | stockholder.stock_property, 241 | stockholder.count_date 242 | ) 243 | sql = '''INSERT INTO `finance_data`.`stock_basic_holders`(`code`,`holder_range`,`holder_name`,`stock_count`,`stock_percent`,`stock_property`,`count_date`)VALUES(%s,%s,%s,%s,%s,%s,%s);''' 244 | try: 245 | cursor.execute(sql, values) 246 | except: 247 | traceback.print_exc() 248 | elif data_type == Const.ROLL_NEWS: # 获取所有新闻信息 249 | lstnews = item['data_content'] 250 | for 
newitem in lstnews: 251 | values = ( 252 | newitem['title'], 253 | newitem['content'], 254 | newitem['ctime'], 255 | newitem['media_name'], 256 | newitem['keywords'], 257 | newitem['url'], 258 | newitem['wapurl'] 259 | ) 260 | sql = '''INSERT INTO `finance_basic_data`.`sina_news`(`title`,`content`,`ctime`,`media_name`,`keywords`,`url`,`wepurl`)VALUES(%s,%s,%s,%s,%s,%s,%s);''' 261 | try: 262 | cursor.execute(sql, values) 263 | except: 264 | traceback.print_exc() 265 | 266 | def handle_error(self, error, item, spider): 267 | print('=' * 10 + "error" + '=' * 10) 268 | -------------------------------------------------------------------------------- /stockbaseinfo/spiders/baseinfo.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import scrapy 3 | from scrapy.selector import Selector 4 | from stockbaseinfo.items import * 5 | from stockbaseinfo.Const import * 6 | from stockbaseinfo.utils import * 7 | import json 8 | import requests 9 | 10 | 11 | #需要处理的几个事情: 12 | #1.需要支持多个连接组成的访问群组,而且需要不同的顺序完成 13 | #2.需要支持一个yeild来封装不同数据类型 14 | #3.参考这个网址 完成剩下的内容:https://www.jianshu.com/p/6740c83e4540 15 | #4.用来获取基本信息 http://api.cninfo.com.cn/v5/hq/dataItem?codelist=sh603158 16 | 17 | #TODO:需要统一处理数据不足10个的情况,需要设计一个方案来接收这些数据 18 | 19 | class BaseinfoSpider(scrapy.Spider): 20 | name = 'baseinfo' 21 | # allowed_domains = ['zyh.com'] 22 | start_urls = ['http://www.cninfo.com.cn/new/disclosure/stock?orgId=9900026564&stockCode=002796'] 23 | 24 | def start_requests(self): 25 | for code in Const.LST_S_CODE: 26 | tsurl = Const.BASEINFO_URL + code 27 | yield scrapy.Request(tsurl,callback=self.info_parse) 28 | for code in Const.LST_CODE: 29 | para = 'scode=' + code 30 | data1 = { 31 | 'mergerMark': 'sysapi1067', 'paramStr': para 32 | } 33 | yield scrapy.FormRequest(Const.DATAINFO_URL,formdata=data1, method='POST',callback=self.parse) 34 | 35 | #解析其他信息 36 | def parse(self, response): 37 | print('--------------profit_parse-----------------------------') 38 | if len(response.text) > 4: 39 | js = json.loads(response.text) 40 | code = js[0]['SECCODE'] 41 | param = js[0]['F002N'] 42 | #---------------------------------------请求资产负债表-------------------------------------------------------------- 43 | para = 'scode=' + code + ';rtype=1;sign=' + str(param) 44 | # 资产负债表 45 | balance_data = { 46 | 'mergerMark': 'sysapi1077', 'paramStr': para 47 | } 48 | yield scrapy.FormRequest(Const.DATAINFO_URL, formdata=balance_data, method='POST', callback=lambda response,code=code:self.balance_parse(response,code)) 49 | # *************************************请求资产负债表************************************************************** 50 | 51 | #---------------------------------------请求利润表-------------------------------------------------------------- 52 | #一季度利润表 53 | para = 'scode=' + code + ';rtype=1;sign=' + str(param) 54 | profit_data = { 55 | 'mergerMark': 'sysapi1075', 'paramStr': para 56 | } 57 | yield scrapy.FormRequest(Const.DATAINFO_URL, formdata=profit_data, method='POST',callback=lambda response,code=code,rtype='1':self.profit_parse(response,code,rtype)) 58 | #半年利润表 59 | para = 'scode=' + code + ';rtype=2;sign=' + str(param) 60 | profit_data = { 61 | 'mergerMark': 'sysapi1075', 'paramStr': para 62 | } 63 | yield scrapy.FormRequest(Const.DATAINFO_URL, formdata=profit_data, method='POST', callback=lambda response,code=code,rtype='2':self.profit_parse(response,code,rtype)) 64 | #三季度利润表 65 | para = 'scode=' + code + ';rtype=3;sign=' + str(param) 66 | profit_data = { 67 | 'mergerMark': 
/stockbaseinfo/spiders/baseinfo.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import scrapy
3 | from scrapy.selector import Selector
4 | from stockbaseinfo.items import *
5 | from stockbaseinfo.Const import *
6 | from stockbaseinfo.utils import *
7 | import json
8 | import time  # needed by info_parse
9 | 
10 | 
11 | # Things that still need to be handled:
12 | # 1. Support request groups made up of several linked requests that must complete in a fixed order
13 | # 2. Support a single yield that wraps different data types
14 | # 3. Finish the remaining work with reference to: https://www.jianshu.com/p/6740c83e4540
15 | # 4. Endpoint used to fetch basic quote info: http://api.cninfo.com.cn/v5/hq/dataItem?codelist=sh603158
16 | 
17 | # TODO: handle series with fewer than 10 data points in a unified way; a scheme is needed to receive that data (a hedged sketch of one option follows this file)
18 | 
19 | class BaseinfoSpider(scrapy.Spider):
20 |     name = 'baseinfo'
21 |     # allowed_domains = ['zyh.com']
22 |     start_urls = ['http://www.cninfo.com.cn/new/disclosure/stock?orgId=9900026564&stockCode=002796']
23 | 
24 |     def start_requests(self):
25 |         for code in Const.LST_S_CODE:
26 |             tsurl = Const.BASEINFO_URL + code
27 |             yield scrapy.Request(tsurl, callback=self.info_parse)
28 |         for code in Const.LST_CODE:
29 |             para = 'scode=' + code
30 |             data1 = {
31 |                 'mergerMark': 'sysapi1067', 'paramStr': para
32 |             }
33 |             yield scrapy.FormRequest(Const.DATAINFO_URL, formdata=data1, method='POST', callback=self.parse)
34 | 
35 |     # parse the remaining data sets (everything except the basic quote info)
36 |     def parse(self, response):
37 |         print('--------------parse-----------------------------')
38 |         if len(response.text) > 4:
39 |             js = json.loads(response.text)
40 |             code = js[0]['SECCODE']
41 |             param = js[0]['F002N']
42 |             #---------------------------------------request balance sheet--------------------------------------------------------------
43 |             para = 'scode=' + code + ';rtype=1;sign=' + str(param)
44 |             # balance sheet
45 |             balance_data = {
46 |                 'mergerMark': 'sysapi1077', 'paramStr': para
47 |             }
48 |             yield scrapy.FormRequest(Const.DATAINFO_URL, formdata=balance_data, method='POST', callback=lambda response, code=code: self.balance_parse(response, code))
49 |             # *************************************request balance sheet**************************************************************
50 | 
51 |             #---------------------------------------request income statement--------------------------------------------------------------
52 |             # Q1 income statement
53 |             para = 'scode=' + code + ';rtype=1;sign=' + str(param)
54 |             profit_data = {
55 |                 'mergerMark': 'sysapi1075', 'paramStr': para
56 |             }
57 |             yield scrapy.FormRequest(Const.DATAINFO_URL, formdata=profit_data, method='POST', callback=lambda response, code=code, rtype='1': self.profit_parse(response, code, rtype))
58 |             # half-year income statement
59 |             para = 'scode=' + code + ';rtype=2;sign=' + str(param)
60 |             profit_data = {
61 |                 'mergerMark': 'sysapi1075', 'paramStr': para
62 |             }
63 |             yield scrapy.FormRequest(Const.DATAINFO_URL, formdata=profit_data, method='POST', callback=lambda response, code=code, rtype='2': self.profit_parse(response, code, rtype))
64 |             # Q3 income statement
65 |             para = 'scode=' + code + ';rtype=3;sign=' + str(param)
66 |             profit_data = {
67 |                 'mergerMark': 'sysapi1075', 'paramStr': para
68 |             }
69 |             yield scrapy.FormRequest(Const.DATAINFO_URL, formdata=profit_data, method='POST', callback=lambda response, code=code, rtype='3': self.profit_parse(response, code, rtype))
70 |             # annual income statement
71 |             para = 'scode=' + code + ';rtype=4;sign=' + str(param)
72 |             profit_data = {
73 |                 'mergerMark': 'sysapi1075', 'paramStr': para
74 |             }
75 |             yield scrapy.FormRequest(Const.DATAINFO_URL, formdata=profit_data, method='POST', callback=lambda response, code=code, rtype='4': self.profit_parse(response, code, rtype))
76 |             # *************************************request income statement**************************************************************
77 |             #---------------------------------------request cash flow statement--------------------------------------------------------------
78 |             para = 'scode=' + code + ';rtype=1;sign=' + str(param)
79 |             cashflow_data = {
80 |                 'mergerMark': 'sysapi1076', 'paramStr': para
81 |             }
82 |             yield scrapy.FormRequest(Const.DATAINFO_URL, formdata=cashflow_data, method='POST', callback=lambda response, code=code: self.cashflow_parse(response, code))
83 |             # *************************************request cash flow statement**************************************************************
84 |             #---------------------------------------request historical dividends--------------------------------------------------------------
85 |             para = 'scode=' + code
86 |             # dividends
87 |             bonus_data = {
88 |                 'mergerMark': 'sysapi1073', 'paramStr': para
89 |             }
90 |             yield scrapy.FormRequest(Const.DATAINFO_URL, formdata=bonus_data, method='POST', callback=lambda response, code=code: self.bonus_parse(response, code))
91 |             # *************************************request historical dividends**************************************************************
92 |             #---------------------------------------request main financial indicators--------------------------------------------------------------
93 |             para = 'scode=' + code + ';rtype=1'
94 |             mainindex_data = {
95 |                 'mergerMark': 'sysapi1074', 'paramStr': para
96 |             }
97 |             yield scrapy.FormRequest(Const.DATAINFO_URL, formdata=mainindex_data, method='POST', callback=lambda response, code=code, rtype='1': self.mainindex_parse(response, code, rtype))
98 |             para = 'scode=' + code + ';rtype=2'
99 |             mainindex_data = {
100 |                 'mergerMark': 'sysapi1074', 'paramStr': para
101 |             }
102 |             yield scrapy.FormRequest(Const.DATAINFO_URL, formdata=mainindex_data, method='POST',
103 |                                      callback=lambda response, code=code, rtype='2': self.mainindex_parse(response, code,
104 |                                                                                                           rtype))
105 |             para = 'scode=' + code + ';rtype=3'
106 |             mainindex_data = {
107 |                 'mergerMark': 'sysapi1074', 'paramStr': para
108 |             }
109 |             yield scrapy.FormRequest(Const.DATAINFO_URL, formdata=mainindex_data, method='POST',
110 |                                      callback=lambda response, code=code, rtype='3': self.mainindex_parse(response, code,
111 |                                                                                                           rtype))
112 |             para = 'scode=' + code + ';rtype=4'
113 |             mainindex_data = {
114 |                 'mergerMark': 'sysapi1074', 'paramStr': para
115 |             }
116 |             yield scrapy.FormRequest(Const.DATAINFO_URL, formdata=mainindex_data, method='POST',
117 |                                      callback=lambda response, code=code, rtype='4': self.mainindex_parse(response, code,
118 |                                                                                                           rtype))
119 |             # *************************************request main financial indicators**************************************************************
120 |             # ---------------------------------------request top-ten tradable shareholders--------------------------------------------------------------
121 |             para = 'scode=' + code
122 |             # top-ten shareholders
123 |             holders_data = {
124 |                 'mergerMark': 'sysapi1071', 'paramStr': para
125 |             }
126 |             yield scrapy.FormRequest(Const.DATAINFO_URL, formdata=holders_data, method='POST', callback=lambda response, code=code: self.holders_parse(response, code))
127 |             # *************************************request top-ten tradable shareholders**************************************************************
128 | 
129 |     def profit_parse(self, response, code, rtype):
130 |         print('--------------profit_parse-----------------------------')
131 |         if len(response.text) > 4:
132 |             data = json.loads(response.text)
133 |             lst_profit = []
134 |             for item in data:
135 |                 subject_title = item['index']
136 |                 lstyear = list(item.keys())
137 |                 lstyear.remove('index')
138 |                 lstyear.sort()
139 |                 if len(lstyear) == 10:
140 |                     profitsheet_item = profit_sheetItem(code, subject_title, rtype, item[lstyear[0]], item[lstyear[1]],
141 |                                                         item[lstyear[2]], item[lstyear[3]], item[lstyear[4]],
142 |                                                         item[lstyear[5]], item[lstyear[6]],
143 |                                                         item[lstyear[7]], item[lstyear[8]], item[lstyear[9]],
144 |                                                         lstyear[0], lstyear[1], lstyear[2], lstyear[3], lstyear[4],
145 |                                                         lstyear[5],
146 |                                                         lstyear[6], lstyear[7], lstyear[8], lstyear[9])
147 |                     lst_profit.append(profitsheet_item)
148 |                 elif len(lstyear) > 0:
149 |                     arr_inityear, arr_initdata = Utils.load_validdata(lstyear, item)
150 |                     if len(arr_inityear) > 0 and len(arr_initdata) > 0:
151 |                         profitsheet_item = profit_sheetItem(code, subject_title, rtype, arr_initdata[0], arr_initdata[1],
152 |                                                             arr_initdata[2], arr_initdata[3], arr_initdata[4],
153 |                                                             arr_initdata[5], arr_initdata[6],
154 |                                                             arr_initdata[7], arr_initdata[8], arr_initdata[9],
155 |                                                             arr_inityear[0], arr_inityear[1], arr_inityear[2],
156 |                                                             arr_inityear[3],
157 |                                                             arr_inityear[4], arr_inityear[5],
158 |                                                             arr_inityear[6], arr_inityear[7], arr_inityear[8],
159 |                                                             arr_inityear[9])
160 |                         lst_profit.append(profitsheet_item)
161 |             stockbaseinfoitme = StockbaseinfoItem()
162 |             stockbaseinfoitme['data_type'] = Const.PROFIT_SHEET
163 |             stockbaseinfoitme['data_content'] = lst_profit
164 |             yield stockbaseinfoitme
165 |         print('-----------end---profit_parse-----------------------------')
166 |     def balance_parse(self, response, code):
167 |         print('--------------balance_parse-----------------------------')
168 |         if len(response.text) > 4:
169 |             data = json.loads(response.text)
170 |             lst_blance_sheet = []
171 |             for item in data:
172 |                 if item['index'].find('科目') != -1:
173 |                     pass
174 |                 else:
175 |                     subject_title = item['index']
176 |                     lstyear = list(item.keys())
177 |                     lstyear.remove('index')
178 |                     lstyear.sort()
179 |                     if len(lstyear) == 10:
180 |                         balance_sheet_item = balance_sheetItem(code, subject_title, item[lstyear[0]], item[lstyear[1]], item[lstyear[2]], item[lstyear[3]], item[lstyear[4]], item[lstyear[5]], item[lstyear[6]],
181 |                                                                item[lstyear[7]], item[lstyear[8]], item[lstyear[9]], lstyear[0], lstyear[1], lstyear[2], lstyear[3], lstyear[4], lstyear[5],
182 |                                                                lstyear[6], lstyear[7], lstyear[8], lstyear[9])
183 |                         lst_blance_sheet.append(balance_sheet_item)
184 |                     elif len(lstyear) > 0:
185 |                         arr_inityear, arr_initdata = Utils.load_validdata(lstyear, item)
186 |                         if len(arr_inityear) > 0 and len(arr_initdata) > 0:
187 |                             balance_sheet_item = balance_sheetItem(code, subject_title, arr_initdata[0], arr_initdata[1],
188 |                                                                    arr_initdata[2], arr_initdata[3], arr_initdata[4],
189 |                                                                    arr_initdata[5], arr_initdata[6],
190 |                                                                    arr_initdata[7], arr_initdata[8], arr_initdata[9],
191 |                                                                    arr_inityear[0], arr_inityear[1], arr_inityear[2], arr_inityear[3],
192 |                                                                    arr_inityear[4], arr_inityear[5],
193 |                                                                    arr_inityear[6], arr_inityear[7], arr_inityear[8], arr_inityear[9])
194 |                             lst_blance_sheet.append(balance_sheet_item)
195 |             stockbaseinfoitme = StockbaseinfoItem()
196 |             stockbaseinfoitme['data_type'] = Const.BALANCE_SHEET
197 |             stockbaseinfoitme['data_content'] = lst_blance_sheet
198 |             yield stockbaseinfoitme
199 |         print('-----------end---balance_parse-----------------------------')
200 |     def mainindex_parse(self, response, code, rtype):
201 |         print('--------------mainindex_parse-----------------------------')
202 |         if len(response.text) > 4:
203 |             data = json.loads(response.text)
204 |             lst_mainindex = []
205 |             for item in data:
206 |                 subject_title = item['index']
207 |                 lstyear = list(item.keys())
208 |                 lstyear.remove('index')
209 |                 lstyear.sort()
210 |                 if len(lstyear) == 10:
211 |                     mainindex_item = main_indexItem(code, subject_title, rtype, item[lstyear[0]], item[lstyear[1]],
212 |                                                     item[lstyear[2]], item[lstyear[3]], item[lstyear[4]],
213 |                                                     item[lstyear[5]], item[lstyear[6]],
214 |                                                     item[lstyear[7]], item[lstyear[8]], item[lstyear[9]],
215 |                                                     lstyear[0], lstyear[1], lstyear[2], lstyear[3], lstyear[4],
216 |                                                     lstyear[5],
217 |                                                     lstyear[6], lstyear[7], lstyear[8], lstyear[9])
218 |                     lst_mainindex.append(mainindex_item)
219 |                 elif len(lstyear) > 0:
220 |                     arr_inityear, arr_initdata = Utils.load_validdata(lstyear, item)  # NOTE: unlike the other parsers there is no length guard here, so short arrays would raise IndexError below
221 |                     mainindex_item = main_indexItem(code, subject_title, rtype, arr_initdata[0], arr_initdata[1],
222 |                                                     arr_initdata[2], arr_initdata[3], arr_initdata[4],
223 |                                                     arr_initdata[5], arr_initdata[6],
224 |                                                     arr_initdata[7], arr_initdata[8], arr_initdata[9],
225 |                                                     arr_inityear[0], arr_inityear[1], arr_inityear[2], arr_inityear[3],
226 |                                                     arr_inityear[4], arr_inityear[5],
227 |                                                     arr_inityear[6], arr_inityear[7], arr_inityear[8], arr_inityear[9])
228 |                     lst_mainindex.append(mainindex_item)
229 |             stockbaseinfoitme = StockbaseinfoItem()
230 |             stockbaseinfoitme['data_type'] = Const.MAIN_INDEX
231 |             stockbaseinfoitme['data_content'] = lst_mainindex
232 |             yield stockbaseinfoitme
233 |         print('-----------end---mainindex_parse-----------------------------')
234 | 
235 |     def cashflow_parse(self, response, code):
236 |         print('--------------cashflow_parse-----------------------------')
237 |         if len(response.text) > 4:
238 |             data = json.loads(response.text)
239 |             lst_cashflow = []
240 |             for item in data:
241 |                 subject_title = item['index']
242 |                 lstyear = list(item.keys())
243 |                 lstyear.remove('index')
244 |                 lstyear.sort()
245 |                 if len(lstyear) == 10:
246 |                     cashflow_item = cash_flowItem(code, subject_title, item[lstyear[0]], item[lstyear[1]],
247 |                                                   item[lstyear[2]], item[lstyear[3]], item[lstyear[4]],
248 |                                                   item[lstyear[5]], item[lstyear[6]],
249 |                                                   item[lstyear[7]], item[lstyear[8]], item[lstyear[9]],
250 |                                                   lstyear[0], lstyear[1], lstyear[2], lstyear[3], lstyear[4],
251 |                                                   lstyear[5],
252 |                                                   lstyear[6], lstyear[7], lstyear[8], lstyear[9])
253 |                     lst_cashflow.append(cashflow_item)
254 |                 elif len(lstyear) > 0:
255 |                     arr_inityear, arr_initdata = Utils.load_validdata(lstyear, item)
256 |                     if len(arr_inityear) > 0 and len(arr_initdata) > 0:
257 |                         cashflow_item = cash_flowItem(code, subject_title, arr_initdata[0], arr_initdata[1],
258 |                                                       arr_initdata[2], arr_initdata[3], arr_initdata[4],
259 |                                                       arr_initdata[5], arr_initdata[6],
260 |                                                       arr_initdata[7], arr_initdata[8], arr_initdata[9],
261 |                                                       arr_inityear[0], arr_inityear[1], arr_inityear[2],
262 |                                                       arr_inityear[3],
263 |                                                       arr_inityear[4], arr_inityear[5],
264 |                                                       arr_inityear[6], arr_inityear[7], arr_inityear[8],
265 |                                                       arr_inityear[9])
266 |                         lst_cashflow.append(cashflow_item)
267 |             stockbaseinfoitme = StockbaseinfoItem()
268 |             stockbaseinfoitme['data_type'] = Const.CASH_FLOW
269 |             stockbaseinfoitme['data_content'] = lst_cashflow
270 |             yield stockbaseinfoitme
271 |         print('-----------end---cashflow_parse-----------------------------')
272 |     def holders_parse(self, response, code):
273 |         print('--------------holders_parse-----------------------------')
274 |         if len(response.text) > 4:
275 |             data = json.loads(response.text)
276 |             lst_stockholders = []
277 |             for item in data:
278 |                 stockholder_item = stock_holdersItem(code, item['F005N'], item['F002V'], item['F003N'], item['F004N'], item['F006V'], item['F001D'])
279 |                 lst_stockholders.append(stockholder_item)
280 |             stockbaseinfoitme = StockbaseinfoItem()
281 |             stockbaseinfoitme['data_type'] = Const.STOCK_HOLDERS
282 |             stockbaseinfoitme['data_content'] = lst_stockholders
283 |             yield stockbaseinfoitme
284 |         print('-----------end---holders_parse-----------------------------')
285 | 
286 |     def bonus_parse(self, response, code):
287 |         print('--------------bonus_parse-----------------------------')
288 |         if len(response.text) > 4:
289 |             data = json.loads(response.text)
290 |             lst_bonus = []
291 |             for item in data:
292 |                 notice_date = item['F013D']
293 |                 rightoff_time = item['F014D']
294 |                 stock_right_registe_date = item['F015D']
295 |                 cash_per_share = item['F010N']
296 |                 send_bonus_share_per_share = item['F012N']
297 |                 increase_shares_per_share = item['F011N']
298 |                 cash_receive_date = item['F016D']
299 |                 share_receive_date = item['F017D']
300 |                 bonus_item = stock_bonusItem(code, notice_date, rightoff_time, stock_right_registe_date, cash_per_share,
301 |                                              send_bonus_share_per_share, increase_shares_per_share, cash_receive_date, share_receive_date)
302 |                 lst_bonus.append(bonus_item)
303 |             stockbaseinfoitme = StockbaseinfoItem()
304 |             stockbaseinfoitme['data_type'] = Const.STOCK_BONUS
305 |             stockbaseinfoitme['data_content'] = lst_bonus
306 |             yield stockbaseinfoitme
307 |         print('-----------end---bonus_parse-----------------------------')
308 | 
309 |     def info_parse(self, response):
310 |         print('--------------info_parse-----------------------------')
311 |         if len(response.text) > 4:
312 |             data = json.loads(response.text)
313 |             dataitem = data[0]
314 |             code = dataitem['5']
315 |             name = dataitem['55']
316 |             price_change = '0.0' if dataitem['264648'] == "" else dataitem['264648']
317 |             p_change = '0.0' if dataitem['199112'] == "" else dataitem['199112']
318 |             open = '0.0' if dataitem['7'] == "" else dataitem['7']
319 |             yesterday_close = '0.0' if dataitem['6'] == "" else dataitem['6']
320 |             high = '0.0' if dataitem['8'] == "" else dataitem['8']
321 |             low = '0.0' if dataitem['9'] == "" else dataitem['9']
322 |             close = '0.0' if dataitem['10'] == "" else dataitem['10']
323 |             volume = '0.0' if dataitem['13'] == "" else dataitem['13']
324 |             amount = '0.0' if dataitem['19'] == "" else dataitem['19']
325 |             stockinfo = stock_infoItem(code, name, open, high, close,
326 |                                        low, volume, amount, price_change,
327 |                                        p_change, yesterday_close, 0.0, 0.0, 0,
328 |                                        0.0, 0.0, time.strftime("%Y-%m-%d", time.localtime(time.time())), 0.0)
329 |             stockbaseinfoitme = StockbaseinfoItem()
330 |             stockbaseinfoitme['data_type'] = Const.STOCK_INFO
331 |             stockbaseinfoitme['data_content'] = stockinfo
332 |             print('-----------end---info_parse-----------------------------')
333 |             yield stockbaseinfoitme
334 |             # sel = Selector(response)
335 |             # print(response.text)
336 |             #
337 |             # #stock_baseinfo = stock_infoItem()
338 |             # page_stockdetail_sublist = sel.xpath('//div[@class="page-stockdetail"]')
339 |             # code = page_stockdetail_sublist.xpath('//div[@class="sub-code"]/text()').extract_first()
340 |             # name = page_stockdetail_sublist.xpath('//div[@class="sub-title"]/text()').extract_first()
341 |             # close = page_stockdetail_sublist.xpath('//div[@class="sub-trend-value"]/text()').extract_first()
342 |             # price_change = page_stockdetail_sublist.xpath('//div[@class="sub-trend-size"]/text()').extract_first()
343 |             # p_change = page_stockdetail_sublist.xpath('//div[@class="sub-trend-trend"]/text()').extract_first()
344 |             # date = page_stockdetail_sublist.xpath('//div[@class="sub-time last-child"]/text()').extract_first()
345 |             # yesterday_close = page_stockdetail_sublist.xpath('//div[@id="pre"]/text()').extract_first()
346 |             # open = page_stockdetail_sublist.xpath('//div[@id="open"]/text()').extract_first()
347 |             # online_years = page_stockdetail_sublist.xpath('//div[@id="sub-value age"]/text()').extract_first()
348 |             # pb = page_stockdetail_sublist.xpath('//div[@id="pb-ratio"]/text()').extract_first()
349 |             # pe = page_stockdetail_sublist.xpath('//div[@id="pe-ratio"]/text()').extract_first()
350 |             # high = page_stockdetail_sublist.xpath('//div[@id="high"]/text()').extract_first()
351 |             # low = page_stockdetail_sublist.xpath('//div[@id="low"]/text()').extract_first()
352 |             # volume = page_stockdetail_sublist.xpath('//div[@id="vol"]/text()').extract_first()
353 |             # amount = page_stockdetail_sublist.xpath('//div[@id="money"]/text()').extract_first()
354 |             # exchange = page_stockdetail_sublist.xpath('//div[@id="amplit"]/text()').extract_first()
355 |             # turnover = page_stockdetail_sublist.xpath('//div[@id="huanshou"]/text()').extract_first()
356 |             # print(code,name,close,open,price_change,p_change,date,yesterday_close,online_years,pb,pe,high,low,volume,amount,exchange,turnover)
357 |             #
358 |             #
359 |             # StockbaseinfoItem['data_type']= Const.BALANCE_SHEET
360 |             # StockbaseinfoItem['data_type']=""
361 |             #yield StockbaseinfoItem
362 | 
--------------------------------------------------------------------------------
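The TODO near the top of baseinfo.py asks for a unified way to handle statements that come back with fewer than ten periods. The financial-statement parsers already route that case through Utils.load_validdata, presumably defined in stockbaseinfo/utils.py, which is not shown in this listing. As a rough sketch of the contract those parsers appear to rely on — two aligned, length-ten sequences of period labels and values — one possible (assumed, not actual) implementation, written as a free function for brevity, is:

def load_validdata(lstyear, item, width=10):
    # Hypothetical padding helper: right-pad both the period labels and their
    # values to `width` entries so the fixed-width Item constructors and the
    # 10-column SQL tables always receive a full set of arguments.
    pad = width - min(len(lstyear), width)
    years = list(lstyear[:width]) + [''] * pad
    values = [item[y] for y in lstyear[:width]] + [''] * pad
    return years, values

The real helper may pad with None, with zeros, or may drop short rows entirely; the sketch only fixes the shape of the return value, not its semantics.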