├── basedata20170116
├── gpinfosort.py
└── readme.txt

/gpinfosort.py:
--------------------------------------------------------------------------------
# _*_ coding:utf-8 _*_

import heapq
import json
import os
import time

import requests
from bs4 import BeautifulSoup


def item_in_a_and_b(a, b):
    # Join rows of a and b on the code stored in column 1, appending b's
    # column 2 to the matching row of a. (Currently unused helper.)
    temp_l = []
    for itema in a:
        for itemb in b:
            if itema[1] == itemb[1]:
                itema.append(itemb[2])
                temp_l.append(itema)
    return temp_l


class GPINFO(object):
    """Fetch basic data for A-share stocks and cache it, one JSON record per line."""

    def __init__(self):
        self.Url = 'http://quote.eastmoney.com/stocklist.html'
        self.BaseData = []
        self.Date = time.strftime('%Y%m%d')
        self.Record = 'basedata' + self.Date
        if os.path.exists(self.Record):
            print('record exists, loading from cache...')
            self.BaseData = self.get_base_data_from_record()
        else:
            print('no record for today, fetching data...')
            self.get_data()

    def write_record(self, text):
        # Append one JSON record per line to today's cache file.
        with open(self.Record, 'ab') as f:
            f.write((text + '\n').encode('utf-8'))

    def get_base_data_from_record(self):
        ll = []
        with open(self.Record, 'rb') as f:
            for line in f.readlines():
                ll.append(json.loads(line.decode('utf-8')))
        return ll

    def get_data(self):
        # Request the stock list page.
        orihtml = requests.get(self.Url).content
        # Build the BeautifulSoup object.
        soup = BeautifulSoup(orihtml, 'lxml')
        # Collect the info of every stock.
        for a in soup.find('div', class_='quotebody').find_all('a', {'target': '_blank'}):
            record_d = {}
            # Stock code; keep only main-board 60*/00* codes.
            num = a.get_text().split('(')[1].strip(')')
            if not (num.startswith('00') or num.startswith('60')):
                continue
            record_d['num'] = num
            # Stock name.
            name = a.get_text().split('(')[0]
            record_d['name'] = name
            # Detail page carrying the financial indicators.
            detail_url = a['href']
            record_d['detail_url'] = detail_url

            # Request the detail page.
            try:
                cwzbhtml = requests.get(detail_url, timeout=30).content
            except Exception as e:
                print('request failed, perhaps timeout:', e)
                continue
            cwzbsoup = BeautifulSoup(cwzbhtml, 'lxml')

            # Financial indicator row, e.g. for 浦发银行 (SPDB):
            # [name, market cap, net assets, net profit, P/E, P/B,
            #  gross margin, net margin, ROE (return on equity)]
            try:
                cwzb_list = cwzbsoup.find('div', class_='cwzb').tbody.tr.get_text().split()
            except Exception as e:
                print('error:', e)
                continue
            # Skip delisted stocks, whose indicators show up as '-'.
            if '-' not in cwzb_list:
                record_d['data'] = cwzb_list
                self.BaseData.append(record_d)
                self.write_record(json.dumps(record_d))
                print(len(self.BaseData))


def main():
    test = GPINFO()
    result = test.BaseData
    # data layout: [name, market cap, net assets, net profit, P/E, P/B,
    # gross margin, net margin, ROE] -> ROE is the last field, index 8.
    top_10 = heapq.nlargest(10, result, key=lambda r: float(r['data'][8].strip('%')))
    for i in top_10:
        print(i['data'])


if __name__ == '__main__':
    main()
--------------------------------------------------------------------------------
/readme.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/weelin-zhang/GP/889b0a4c05cc9a123258e9fb759b0f63c8757fcf/readme.txt
--------------------------------------------------------------------------------
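
A note on the unused item_in_a_and_b helper: it performs an inner join of two row lists on the code in column 1, copying column 2 of the matching b row onto the a row in place. A minimal sketch with made-up rows (assuming the script's dependencies are installed so the module imports cleanly):

from gpinfosort import item_in_a_and_b

# Made-up rows: [exchange, code, name] joined with [exchange, code, ROE].
a = [['sh', '600000', '浦发银行'], ['sz', '000001', '平安银行']]
b = [['sh', '600000', '11.8%']]

# Rows match on the code in column 1; b's column 2 is appended in place.
print(item_in_a_and_b(a, b))
# -> [['sh', '600000', '浦发银行', '11.8%']]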
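
The ranking in main() hinges on turning a percentage string into a float before heapq.nlargest compares records. A standalone sketch of that step, with invented records shaped like the ones GPINFO caches:

import heapq

# Invented records mirroring the cache layout; 'data' follows
# [name, market cap, net assets, net profit, P/E, P/B,
#  gross margin, net margin, ROE].
records = [
    {'num': '600000', 'data': ['甲', '1000亿', '200亿', '50亿', '6.1', '0.9', '30%', '25%', '14.2%']},
    {'num': '000001', 'data': ['乙', '800亿', '100亿', '40亿', '7.3', '1.1', '28%', '22%', '16.5%']},
]

# Strip the trailing '%' so the key is numeric, then keep the largest.
top = heapq.nlargest(1, records, key=lambda r: float(r['data'][8].strip('%')))
print(top[0]['num'])  # -> 000001, the record with the higher ROE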