├── README.md
├── data_filter.py
├── .gitignore
└── main_V3.py
/README.md:
--------------------------------------------------------------------------------
# fund_pool
A crawler for the Tiantian Fund website (fund.eastmoney.com). It screens out a fund pool mainly via the quartile method and a few other related metrics. For reference only.
--------------------------------------------------------------------------------
/data_filter.py:
--------------------------------------------------------------------------------
## Strategy
# 1. Quartile rule: keep funds whose 3-year, 2-year and 1-year performance all rank in the top 1/4 of their peers;
# 2. Moderate fund size (2 to 10 billion CNY);
# 3. Experienced manager (both years in securities-related work and years directly managing funds; ideally more than 3 years).


import numpy as np
import pandas as pd

fileName = '基金数据情况-2018-11-06'
# Let pandas open the path itself so the handle is closed after reading;
# main_V3.py saves its CSVs as gbk.
data = pd.read_csv('./{}.csv'.format(fileName), encoding='gbk')

# Funds younger than 3 years report '--' for 3-year growth; drop them.
data.近3年增幅 = data.近3年增幅.replace('--', np.nan)
data = data.dropna(axis=0, how='any')

# Fund size arrives as text such as '54.52亿元'; pull out the number.
data.基金规模 = data.基金规模.str.extract(r'(\d+\.?\d*)', expand=False)
data['基金规模'] = data['基金规模'].apply(pd.to_numeric, errors='coerce')
# NOTE: looser than the 2-10 billion range stated above; this only drops
# funds below 0.5 billion CNY (5亿).
data = data[data.基金规模 > 5]

# Quartile screening: each rank column holds 'rank|total'; keep funds in the
# top quarter for every listed period.
for i in ['近3月排名', '近1年排名', '近2年排名', '近3年排名']:
    data[i + '系数'] = data[i].str.split('|').map(lambda x: int(x[0]) / int(x[1]))
    data = data[data[i + '系数'] < 1/4]

data.to_csv('./{}{}.csv'.format(fileName, '-筛选后版本'))
--------------------------------------------------------------------------------
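
A worked instance of the quartile rule in data_filter.py, assuming rank strings of the form 'rank|total' as main_V3.py scrapes them (a minimal sketch):

    def quartile_coefficient(rank_str):
        # '12|345' means ranked 12th among 345 peer funds.
        rank, total = rank_str.split('|')
        return int(rank) / int(total)

    assert quartile_coefficient('12|345') < 1/4    # ~0.035: top quartile, kept
    assert quartile_coefficient('200|345') >= 1/4  # ~0.580: dropped
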
/.gitignore:
--------------------------------------------------------------------------------
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
--------------------------------------------------------------------------------
/main_V3.py:
--------------------------------------------------------------------------------
import datetime
import logging
import re
import time
from multiprocessing import Pool

import demjson
import pandas as pd
import requests
from bs4 import BeautifulSoup
from pandas.core.frame import DataFrame


def rank_data_crawl(time_interval='3n', ft='all'):
    # Today's date
    td_str = datetime.datetime.strftime(datetime.datetime.now(), '%Y-%m-%d')
    td_dt = datetime.datetime.strptime(td_str, '%Y-%m-%d')
    # The same day one year ago
    last_dt = td_dt - datetime.timedelta(days=365)
    last_str = datetime.datetime.strftime(last_dt, '%Y-%m-%d')
    rank_url = ('http://fund.eastmoney.com/data/rankhandler.aspx?op=ph&dt=kf'
                '&ft={0}&rs=&gs=0&sc={1}zf&st=desc&sd={2}&ed={3}'
                '&qdii=&tabSubtype=,,,,,&pi=1&pn=10000&dx=1').format(
                    ft, time_interval, last_str, td_str)
    rp = requests.get(rank_url)
    # The endpoint returns a JavaScript assignment rather than plain JSON;
    # keep only the object literal between '= ' and the trailing ';'.
    rank_txt = rp.text[rp.text.find('=') + 2:rp.text.rfind(';')]
    rank_rawdata = demjson.decode(rank_txt)
    # Each entry of 'datas' is one comma-separated record per fund.
    rank_list = []
    for i in rank_rawdata['datas']:
        rank_list.append(i.split(','))
    return rank_list
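

# Illustrative shape of the rankhandler response (the field names 'datas' and
# 'allNum' are taken from this script; the record contents here are made up):
#   var rankData = {datas:["000001,示例基金,XYJJ,2018-11-06,..."],allNum:1};
# After the slicing in rank_data_crawl, demjson.decode is fed
#   {datas:["000001,示例基金,XYJJ,2018-11-06,..."],allNum:1}
# demjson tolerates the unquoted keys that json.loads would reject, which is
# presumably why it is used here instead of the standard library.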

# Scrape one fund's detail page.
def get_allFund_content(single_fund_url):
    try:
        headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36'}
        r = requests.get(single_fund_url, headers=headers)
        r.encoding = r.apparent_encoding  # avoid garbled Chinese text
        soup = BeautifulSoup(r.text, 'lxml')
        # (A check on trading status, skipping terminated funds and funds
        # still in their subscription period, was sketched here but is disabled.)
        # Indicators collected: name, type, unit / accumulated net value, size,
        # inception date, plus growth and peer rank over
        # 1w / 1m / 3m / 6m / YTD / 1y / 2y / 3y.
        # Detail URLs look like 'http://fund.eastmoney.com/XXXXXX.html', so
        # characters 26-31 are the six-digit fund code.
        fund_code = single_fund_url[26:32]
        fund_name = soup.select('.SitePath > a')[2].get_text()
        unit_netValue = soup.select('.dataItem02 > .dataNums > span.ui-font-large')[0].get_text()
        accumulative_netValue = soup.select('.dataItem03 > .dataNums > span.ui-font-large')[0].get_text()
        fund_info = [i for i in soup.select('div.infoOfFund tr > td')]
        fund_type = re.search(r':[DQI\-\u4e00-\u9fffA]+', fund_info[0].get_text()).group()[1:]
        fund_scale = fund_info[1].get_text().split(':')[1].strip()
        fund_establishmentDate = fund_info[3].get_text().split(':')[1].strip()
        # fund_grade = fund_info[5].get_text().split(':')[1].strip()
        # Index funds render one extra row in this table, so growth figures
        # are indexed from the front and ranks from the back.
        fund_Rdata = soup.select('#increaseAmount_stage > .ui-table-hover div.Rdata')
        fund_1weekAmount = fund_Rdata[0].get_text()
        fund_1monthAmount = fund_Rdata[1].get_text()
        fund_3monthAmount = fund_Rdata[2].get_text()
        fund_6monthAmount = fund_Rdata[3].get_text()
        fund_thisYearAmount = fund_Rdata[4].get_text()
        fund_1yearAmount = fund_Rdata[5].get_text()
        fund_2yearAmount = fund_Rdata[6].get_text()
        fund_3yearAmount = fund_Rdata[7].get_text()
        fund_1weekRank = fund_Rdata[-8].get_text()
        fund_1monthRank = fund_Rdata[-7].get_text()
        fund_3monthRank = fund_Rdata[-6].get_text()
        fund_6monthRank = fund_Rdata[-5].get_text()
        fund_thisYearRank = fund_Rdata[-4].get_text()
        fund_1yearRank = fund_Rdata[-3].get_text()
        fund_2yearRank = fund_Rdata[-2].get_text()
        fund_3yearRank = fund_Rdata[-1].get_text()
        Fund_data = [fund_code, fund_name, fund_type, unit_netValue, accumulative_netValue,
                     fund_scale, fund_establishmentDate,
                     fund_1weekAmount, fund_1monthAmount, fund_3monthAmount, fund_6monthAmount,
                     fund_thisYearAmount, fund_1yearAmount, fund_2yearAmount, fund_3yearAmount,
                     fund_1weekRank, fund_1monthRank, fund_3monthRank, fund_6monthRank, fund_thisYearRank,
                     fund_1yearRank, fund_2yearRank, fund_3yearRank]
        print(Fund_data)
        return Fund_data
    except Exception:
        # %s-style lazy formatting; logging.exception records the traceback too.
        logging.exception('Error crawling %s', single_fund_url)
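

# Toy illustration of the negative indexing above (the extra-row layout is
# inferred from the comment about index funds, not a verified DOM spec):
#   ordinary fund:  [g1..g8, r1..r8]         -> 16 Rdata cells
#   index fund:     [g1..g8, extra, r1..r8]  -> 17 cells
# Reading growth as cells[0:8] and ranks as cells[-8:] fits both layouts.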

def main():
    # Column headers for the rank API records; these strings must match the
    # scraped data, so they stay in Chinese ('布吉岛', slang for "no idea",
    # marks fields whose meaning is unknown).
    main1_name = ['基金代码', '基金简称', '缩写', '日期', '单位净值', '累计净值',
                  '日增长率(%)', '近1周增幅', '近1月增幅', '近3月增幅', '近6月增幅', '近1年增幅', '近2年增幅', '近3年增幅',
                  '今年来', '成立来', '成立日期', '购买手续费折扣', '自定义', '手续费原价?', '手续费折后?',
                  '布吉岛', '布吉岛', '布吉岛', '布吉岛']
    # Column headers for the detail-page records.
    main2_name = ['基金代码', '基金简称', '基金类型', '单位净值', '累计净值', '基金规模', '成立日期',
                  '近1周增幅', '近1月增幅', '近3月增幅', '近6月增幅', '今年来增幅', '近1年增幅', '近2年增幅', '近3年增幅',
                  '近1周排名', '近1月排名', '近3月排名', '近6月排名', '今年来排名', '近1年排名', '近2年排名', '近3年排名']

    # ########################## Step 1: rank API ###########################
    rawData = rank_data_crawl()
    # Data cleaning: funds younger than 3 years have an empty 3-year growth
    # field; drop them.
    rawData = DataFrame(rawData, columns=main1_name)
    rawData = rawData.loc[rawData['近3年增幅'] != '']
    # Keep only the useful columns. NOTE: iloc[1:15] also limits the run to
    # 14 funds, which looks like a testing leftover; widen the row slice for
    # a full crawl.
    rawData1 = rawData.iloc[1:15, [0, 1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]]

    # ########################## Step 2: detail pages #######################
    # Build the detail-page URL for every remaining fund code.
    detail_urls_list = ['http://fund.eastmoney.com/{}.html'.format(i) for i in rawData1['基金代码']]
    print('#detail-page crawl# started:', time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))
    middle = datetime.datetime.now()
    # Process pool of 4 workers.
    p = Pool(4)
    all_fund_data = p.map(get_allFund_content, detail_urls_list)
    p.close()
    p.join()
    # Pages that raised an exception return None; filter them out.
    all_fund_data = [d for d in all_fund_data if d is not None]
    end = datetime.datetime.now()
    print('#detail-page crawl# took:', str(end - middle))
    print('finished:', time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))
    all_fund_data = DataFrame(all_fund_data, columns=main2_name)

    # Merge the rank table with the detail table on fund code.
    data_merge = pd.merge(rawData1, all_fund_data, how='left', on='基金代码')

    # File output. The merged table is saved as well (its file name is this
    # script's choice) so the detail columns are available for screening.
    rawData.to_csv('./rawDATA-{}.csv'.format(time.strftime("%Y-%m-%d", time.localtime())), encoding='gbk')
    data_merge.to_csv('./mergedDATA-{}.csv'.format(time.strftime("%Y-%m-%d", time.localtime())), encoding='gbk')


if __name__ == '__main__':
    main()
--------------------------------------------------------------------------------
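
How the two scripts chain, as far as the code shows: main_V3.py crawls and writes the day's CSVs; data_filter.py then screens a CSV whose name is hardcoded in its fileName variable. A minimal driver sketch, assuming fileName is updated to match the crawl output:

    import subprocess

    # Crawl the rankings and detail pages, writing today's CSV files.
    subprocess.run(['python', 'main_V3.py'], check=True)
    # Point data_filter.py at the crawl output: edit its fileName variable
    # (or rename the CSV to match) before this step.
    subprocess.run(['python', 'data_filter.py'], check=True)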