├── basedata20170116
├── gpinfosort.py
└── readme.txt

/gpinfosort.py:
--------------------------------------------------------------------------------
# _*_ coding:utf-8 _*_

import heapq
import json
import os
import time

import requests
from bs4 import BeautifulSoup


def item_in_a_and_b(a, b):
    # Join rows of a and b on the code stored in column 1, appending b's
    # column 2 to the matching row of a. (Currently unused helper.)
    temp_l = []
    for itema in a:
        for itemb in b:
            if itema[1] == itemb[1]:
                itema.append(itemb[2])
                temp_l.append(itema)
    return temp_l


class GPINFO(object):
    """Fetch basic data for A-share stocks and cache it, one JSON record per line."""

    def __init__(self):
        self.Url = 'http://quote.eastmoney.com/stocklist.html'
        self.BaseData = []
        self.Date = time.strftime('%Y%m%d')
        self.Record = 'basedata' + self.Date
        if os.path.exists(self.Record):
            print('record exists, loading from cache...')
            self.BaseData = self.get_base_data_from_record()
        else:
            print('no record for today, fetching data...')
            self.get_data()

    def write_record(self, text):
        # Append one JSON record per line to today's cache file.
        with open(self.Record, 'ab') as f:
            f.write((text + '\n').encode('utf-8'))

    def get_base_data_from_record(self):
        ll = []
        with open(self.Record, 'rb') as f:
            for line in f.readlines():
                ll.append(json.loads(line.decode('utf-8')))
        return ll

    def get_data(self):
        # Request the stock list page.
        orihtml = requests.get(self.Url).content
        # Build the BeautifulSoup object.
        soup = BeautifulSoup(orihtml, 'lxml')
        # Collect the info of every stock.
        for a in soup.find('div', class_='quotebody').find_all('a', {'target': '_blank'}):
            record_d = {}
            # Stock code; keep only main-board 60*/00* codes.
            num = a.get_text().split('(')[1].strip(')')
            if not (num.startswith('00') or num.startswith('60')):
                continue
            record_d['num'] = num
            # Stock name.
            name = a.get_text().split('(')[0]
            record_d['name'] = name
            # Detail page carrying the financial indicators.
            detail_url = a['href']
            record_d['detail_url'] = detail_url

            # Request the detail page.
            try:
                cwzbhtml = requests.get(detail_url, timeout=30).content
            except Exception as e:
                print('request failed, perhaps timeout:', e)
                continue
            cwzbsoup = BeautifulSoup(cwzbhtml, 'lxml')

            # Financial indicator row, e.g. for 浦发银行 (SPDB):
            # [name, market cap, net assets, net profit, P/E, P/B,
            #  gross margin, net margin, ROE (return on equity)]
            try:
                cwzb_list = cwzbsoup.find('div', class_='cwzb').tbody.tr.get_text().split()
            except Exception as e:
                print('error:', e)
                continue
            # Skip delisted stocks, whose indicators show up as '-'.
            if '-' not in cwzb_list:
                record_d['data'] = cwzb_list
                self.BaseData.append(record_d)
                self.write_record(json.dumps(record_d))
                print(len(self.BaseData))


def main():
    test = GPINFO()
    result = test.BaseData
    # data layout: [name, market cap, net assets, net profit, P/E, P/B,
    # gross margin, net margin, ROE] -> ROE is the last field, index 8.
    top_10 = heapq.nlargest(10, result, key=lambda r: float(r['data'][8].strip('%')))
    for i in top_10:
        print(i['data'])


if __name__ == '__main__':
    main()
--------------------------------------------------------------------------------
/readme.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/weelin-zhang/GP/889b0a4c05cc9a123258e9fb759b0f63c8757fcf/readme.txt
--------------------------------------------------------------------------------
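
A note on the unused item_in_a_and_b helper: it performs an inner join of two row lists on the code in column 1, copying column 2 of the matching b row onto the a row in place. A minimal sketch with made-up rows (assuming the script's dependencies are installed so the module imports cleanly):

from gpinfosort import item_in_a_and_b

# Made-up rows: [exchange, code, name] joined with [exchange, code, ROE].
a = [['sh', '600000', '浦发银行'], ['sz', '000001', '平安银行']]
b = [['sh', '600000', '11.8%']]

# Rows match on the code in column 1; b's column 2 is appended in place.
print(item_in_a_and_b(a, b))
# -> [['sh', '600000', '浦发银行', '11.8%']]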
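
The ranking in main() hinges on turning a percentage string into a float before heapq.nlargest compares records. A standalone sketch of that step, with invented records shaped like the ones GPINFO caches:

import heapq

# Invented records mirroring the cache layout; 'data' follows
# [name, market cap, net assets, net profit, P/E, P/B,
#  gross margin, net margin, ROE].
records = [
    {'num': '600000', 'data': ['甲', '1000亿', '200亿', '50亿', '6.1', '0.9', '30%', '25%', '14.2%']},
    {'num': '000001', 'data': ['乙', '800亿', '100亿', '40亿', '7.3', '1.1', '28%', '22%', '16.5%']},
]

# Strip the trailing '%' so the key is numeric, then keep the largest.
top = heapq.nlargest(1, records, key=lambda r: float(r['data'][8].strip('%')))
print(top[0]['num'])  # -> 000001, the record with the higher ROE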