├── README.md
├── data_filter.py
├── .gitignore
└── main_V3.py
/README.md:
--------------------------------------------------------------------------------
# fund_pool
A crawler for the Tiantian Fund website (fund.eastmoney.com). It screens out a fund pool mainly via the quartile method and a few other related metrics. For reference only.
--------------------------------------------------------------------------------
/data_filter.py:
--------------------------------------------------------------------------------
## Strategy
# 1. Quartile rule: keep funds whose 3-year, 2-year and 1-year performance all rank in the top 1/4 of their peers;
# 2. Moderate fund size (2 to 10 billion CNY);
# 3. Experienced manager (both years in securities-related work and years directly managing funds; ideally more than 3 years).


import numpy as np
import pandas as pd

fileName = '基金数据情况-2018-11-06'
# Let pandas open the path itself so the handle is closed after reading;
# main_V3.py saves its CSVs as gbk.
data = pd.read_csv('./{}.csv'.format(fileName), encoding='gbk')

# Funds younger than 3 years report '--' for 3-year growth; drop them.
data.近3年增幅 = data.近3年增幅.replace('--', np.nan)
data = data.dropna(axis=0, how='any')

# Fund size arrives as text such as '54.52亿元'; pull out the number.
data.基金规模 = data.基金规模.str.extract(r'(\d+\.?\d*)', expand=False)
data['基金规模'] = data['基金规模'].apply(pd.to_numeric, errors='coerce')
# NOTE: looser than the 2-10 billion range stated above; this only drops
# funds below 0.5 billion CNY (5亿).
data = data[data.基金规模 > 5]

# Quartile screening: each rank column holds 'rank|total'; keep funds in the
# top quarter for every listed period.
for i in ['近3月排名', '近1年排名', '近2年排名', '近3年排名']:
    data[i + '系数'] = data[i].str.split('|').map(lambda x: int(x[0]) / int(x[1]))
    data = data[data[i + '系数'] < 1/4]

data.to_csv('./{}{}.csv'.format(fileName, '-筛选后版本'))
--------------------------------------------------------------------------------
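
A worked instance of the quartile rule in data_filter.py, assuming rank strings of the form 'rank|total' as main_V3.py scrapes them (a minimal sketch):

    def quartile_coefficient(rank_str):
        # '12|345' means ranked 12th among 345 peer funds.
        rank, total = rank_str.split('|')
        return int(rank) / int(total)

    assert quartile_coefficient('12|345') < 1/4    # ~0.035: top quartile, kept
    assert quartile_coefficient('200|345') >= 1/4  # ~0.580: dropped
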
/.gitignore:
--------------------------------------------------------------------------------
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
--------------------------------------------------------------------------------
/main_V3.py:
--------------------------------------------------------------------------------
import datetime
import logging
import re
import time
from multiprocessing import Pool

import demjson
import pandas as pd
import requests
from bs4 import BeautifulSoup
from pandas.core.frame import DataFrame


def rank_data_crawl(time_interval='3n', ft='all'):
    # Today's date
    td_str = datetime.datetime.strftime(datetime.datetime.now(), '%Y-%m-%d')
    td_dt = datetime.datetime.strptime(td_str, '%Y-%m-%d')
    # The same day one year ago
    last_dt = td_dt - datetime.timedelta(days=365)
    last_str = datetime.datetime.strftime(last_dt, '%Y-%m-%d')
    rank_url = ('http://fund.eastmoney.com/data/rankhandler.aspx?op=ph&dt=kf'
                '&ft={0}&rs=&gs=0&sc={1}zf&st=desc&sd={2}&ed={3}'
                '&qdii=&tabSubtype=,,,,,&pi=1&pn=10000&dx=1').format(
                    ft, time_interval, last_str, td_str)
    rp = requests.get(rank_url)
    # The endpoint returns a JavaScript assignment rather than plain JSON;
    # keep only the object literal between '= ' and the trailing ';'.
    rank_txt = rp.text[rp.text.find('=') + 2:rp.text.rfind(';')]
    rank_rawdata = demjson.decode(rank_txt)
    # Each entry of 'datas' is one comma-separated record per fund.
    rank_list = []
    for i in rank_rawdata['datas']:
        rank_list.append(i.split(','))
    return rank_list
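

# Illustrative shape of the rankhandler response (the field names 'datas' and
# 'allNum' are taken from this script; the record contents here are made up):
#   var rankData = {datas:["000001,示例基金,XYJJ,2018-11-06,..."],allNum:1};
# After the slicing in rank_data_crawl, demjson.decode is fed
#   {datas:["000001,示例基金,XYJJ,2018-11-06,..."],allNum:1}
# demjson tolerates the unquoted keys that json.loads would reject, which is
# presumably why it is used here instead of the standard library.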

# Scrape one fund's detail page.
def get_allFund_content(single_fund_url):
    try:
        headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36'}
        r = requests.get(single_fund_url, headers=headers)
        r.encoding = r.apparent_encoding  # avoid garbled Chinese text
        soup = BeautifulSoup(r.text, 'lxml')
        # (A check on trading status, skipping terminated funds and funds
        # still in their subscription period, was sketched here but is disabled.)
        # Indicators collected: name, type, unit / accumulated net value, size,
        # inception date, plus growth and peer rank over
        # 1w / 1m / 3m / 6m / YTD / 1y / 2y / 3y.
        # Detail URLs look like 'http://fund.eastmoney.com/XXXXXX.html', so
        # characters 26-31 are the six-digit fund code.
        fund_code = single_fund_url[26:32]
        fund_name = soup.select('.SitePath > a')[2].get_text()
        unit_netValue = soup.select('.dataItem02 > .dataNums > span.ui-font-large')[0].get_text()
        accumulative_netValue = soup.select('.dataItem03 > .dataNums > span.ui-font-large')[0].get_text()
        fund_info = [i for i in soup.select('div.infoOfFund tr > td')]
        fund_type = re.search(r':[DQI\-\u4e00-\u9fffA]+', fund_info[0].get_text()).group()[1:]
        fund_scale = fund_info[1].get_text().split(':')[1].strip()
        fund_establishmentDate = fund_info[3].get_text().split(':')[1].strip()
        # fund_grade = fund_info[5].get_text().split(':')[1].strip()
        # Index funds render one extra row in this table, so growth figures
        # are indexed from the front and ranks from the back.
        fund_Rdata = soup.select('#increaseAmount_stage > .ui-table-hover div.Rdata')
        fund_1weekAmount = fund_Rdata[0].get_text()
        fund_1monthAmount = fund_Rdata[1].get_text()
        fund_3monthAmount = fund_Rdata[2].get_text()
        fund_6monthAmount = fund_Rdata[3].get_text()
        fund_thisYearAmount = fund_Rdata[4].get_text()
        fund_1yearAmount = fund_Rdata[5].get_text()
        fund_2yearAmount = fund_Rdata[6].get_text()
        fund_3yearAmount = fund_Rdata[7].get_text()
        fund_1weekRank = fund_Rdata[-8].get_text()
        fund_1monthRank = fund_Rdata[-7].get_text()
        fund_3monthRank = fund_Rdata[-6].get_text()
        fund_6monthRank = fund_Rdata[-5].get_text()
        fund_thisYearRank = fund_Rdata[-4].get_text()
        fund_1yearRank = fund_Rdata[-3].get_text()
        fund_2yearRank = fund_Rdata[-2].get_text()
        fund_3yearRank = fund_Rdata[-1].get_text()
        Fund_data = [fund_code, fund_name, fund_type, unit_netValue, accumulative_netValue,
                     fund_scale, fund_establishmentDate,
                     fund_1weekAmount, fund_1monthAmount, fund_3monthAmount, fund_6monthAmount,
                     fund_thisYearAmount, fund_1yearAmount, fund_2yearAmount, fund_3yearAmount,
                     fund_1weekRank, fund_1monthRank, fund_3monthRank, fund_6monthRank, fund_thisYearRank,
                     fund_1yearRank, fund_2yearRank, fund_3yearRank]
        print(Fund_data)
        return Fund_data
    except Exception:
        # %s-style lazy formatting; logging.exception records the traceback too.
        logging.exception('Error crawling %s', single_fund_url)
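

# Toy illustration of the negative indexing above (the extra-row layout is
# inferred from the comment about index funds, not a verified DOM spec):
#   ordinary fund:  [g1..g8, r1..r8]         -> 16 Rdata cells
#   index fund:     [g1..g8, extra, r1..r8]  -> 17 cells
# Reading growth as cells[0:8] and ranks as cells[-8:] fits both layouts.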

def main():
    # Column headers for the rank API records; these strings must match the
    # scraped data, so they stay in Chinese ('布吉岛', slang for "no idea",
    # marks fields whose meaning is unknown).
    main1_name = ['基金代码', '基金简称', '缩写', '日期', '单位净值', '累计净值',
                  '日增长率(%)', '近1周增幅', '近1月增幅', '近3月增幅', '近6月增幅', '近1年增幅', '近2年增幅', '近3年增幅',
                  '今年来', '成立来', '成立日期', '购买手续费折扣', '自定义', '手续费原价?', '手续费折后?',
                  '布吉岛', '布吉岛', '布吉岛', '布吉岛']
    # Column headers for the detail-page records.
    main2_name = ['基金代码', '基金简称', '基金类型', '单位净值', '累计净值', '基金规模', '成立日期',
                  '近1周增幅', '近1月增幅', '近3月增幅', '近6月增幅', '今年来增幅', '近1年增幅', '近2年增幅', '近3年增幅',
                  '近1周排名', '近1月排名', '近3月排名', '近6月排名', '今年来排名', '近1年排名', '近2年排名', '近3年排名']

    # ########################## Step 1: rank API ###########################
    rawData = rank_data_crawl()
    # Data cleaning: funds younger than 3 years have an empty 3-year growth
    # field; drop them.
    rawData = DataFrame(rawData, columns=main1_name)
    rawData = rawData.loc[rawData['近3年增幅'] != '']
    # Keep only the useful columns. NOTE: iloc[1:15] also limits the run to
    # 14 funds, which looks like a testing leftover; widen the row slice for
    # a full crawl.
    rawData1 = rawData.iloc[1:15, [0, 1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]]

    # ########################## Step 2: detail pages #######################
    # Build the detail-page URL for every remaining fund code.
    detail_urls_list = ['http://fund.eastmoney.com/{}.html'.format(i) for i in rawData1['基金代码']]
    print('#detail-page crawl# started:', time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))
    middle = datetime.datetime.now()
    # Process pool of 4 workers.
    p = Pool(4)
    all_fund_data = p.map(get_allFund_content, detail_urls_list)
    p.close()
    p.join()
    # Pages that raised an exception return None; filter them out.
    all_fund_data = [d for d in all_fund_data if d is not None]
    end = datetime.datetime.now()
    print('#detail-page crawl# took:', str(end - middle))
    print('finished:', time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))
    all_fund_data = DataFrame(all_fund_data, columns=main2_name)

    # Merge the rank table with the detail table on fund code.
    data_merge = pd.merge(rawData1, all_fund_data, how='left', on='基金代码')

    # File output. The merged table is saved as well (its file name is this
    # script's choice) so the detail columns are available for screening.
    rawData.to_csv('./rawDATA-{}.csv'.format(time.strftime("%Y-%m-%d", time.localtime())), encoding='gbk')
    data_merge.to_csv('./mergedDATA-{}.csv'.format(time.strftime("%Y-%m-%d", time.localtime())), encoding='gbk')


if __name__ == '__main__':
    main()
--------------------------------------------------------------------------------
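
How the two scripts chain, as far as the code shows: main_V3.py crawls and writes the day's CSVs; data_filter.py then screens a CSV whose name is hardcoded in its fileName variable. A minimal driver sketch, assuming fileName is updated to match the crawl output:

    import subprocess

    # Crawl the rankings and detail pages, writing today's CSV files.
    subprocess.run(['python', 'main_V3.py'], check=True)
    # Point data_filter.py at the crawl output: edit its fileName variable
    # (or rename the CSV to match) before this step.
    subprocess.run(['python', 'data_filter.py'], check=True)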