├── .gitattributes
├── .gitignore
├── README.md
└── 天天基金网.py
/.gitattributes:
--------------------------------------------------------------------------------
1 | # Auto detect text files and perform LF normalization
2 | * text=auto
3 |
4 | # Custom for Visual Studio
5 | *.cs diff=csharp
6 |
7 | # Standard to msysgit
8 | *.doc diff=astextplain
9 | *.DOC diff=astextplain
10 | *.docx diff=astextplain
11 | *.DOCX diff=astextplain
12 | *.dot diff=astextplain
13 | *.DOT diff=astextplain
14 | *.pdf diff=astextplain
15 | *.PDF diff=astextplain
16 | *.rtf diff=astextplain
17 | *.RTF diff=astextplain
18 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Windows image file caches
2 | Thumbs.db
3 | ehthumbs.db
4 |
5 | # Folder config file
6 | Desktop.ini
7 |
8 | # Recycle Bin used on file shares
9 | $RECYCLE.BIN/
10 |
11 | # Windows Installer files
12 | *.cab
13 | *.msi
14 | *.msm
15 | *.msp
16 |
17 | # Windows shortcuts
18 | *.lnk
19 |
20 | # =========================
21 | # Operating System Files
22 | # =========================
23 |
24 | # OSX
25 | # =========================
26 |
27 | .DS_Store
28 | .AppleDouble
29 | .LSOverride
30 |
31 | # Thumbnails
32 | ._*
33 |
34 | # Files that might appear in the root of a volume
35 | .DocumentRevisions-V100
36 | .fseventsd
37 | .Spotlight-V100
38 | .TemporaryItems
39 | .Trashes
40 | .VolumeIcon.icns
41 |
42 | # Directories potentially created on remote AFP share
43 | .AppleDB
44 | .AppleDesktop
45 | Network Trash Folder
46 | Temporary Items
47 | .apdisk
48 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # 爬取天天基金网的基金信息
2 |
3 | * 爬取基金的基本信息(类型、风险、当前净值、成立时间、规模)
4 | * 爬取基金的当前行业配置
5 | * 爬取基金评级(海通证券,招商证券,上海证券,济安金信)
6 | * 爬取基金阶段涨幅
7 | * 爬取基金经理信息
8 |
9 | ## 使用方法
10 | 需要的第三方库
11 | `requests`
12 | `xlsxwriter`
13 | `prettytable`
14 |
15 | >1. 输入基金代码
16 | >2. 等待爬取
17 | >3. 输入文件名,文件会保存到.py的当前路径
18 |
19 | ## 注意事项
20 | * 请使用python3
21 | * 当前为未完整版,当爬取的基金行业配置无内容时会报错。
22 |
--------------------------------------------------------------------------------
/天天基金网.py:
--------------------------------------------------------------------------------
1 | import datetime
2 | import requests
3 | import re
4 | import os
5 | import xlsxwriter
6 | import prettytable
7 |
8 | # HTTP request headers: a desktop Chrome User-Agent string (not a proxy setting)
9 | HEADERS = {"User-Agent":r"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36"}
10 |
11 | # URL templates; %s is replaced with the fund code entered by the user
12 | MAIN_URL = r"http://fund.eastmoney.com/%s.html" # fund main page
13 | INDUSTRY_URL = r"http://fund.eastmoney.com/f10/F10DataApi.aspx?type=hypz&code=%s&year=2016" # industry allocation; NOTE(review): the year is hard-coded to 2016
14 | fundURL = "" # URL of the fund's main page
15 | industryURL = "" # URL of the industry allocation data
16 | moreManagerURLList = [] # URLs of the individual fund-manager detail pages
17 |
18 | # Fund information
19 | fundCode = "" # fund code
20 | fundName = "" # name
21 | fundType = "" # type
22 | fundRisk = "" # risk level
23 | fundNet = "" # net asset value
24 | fundAge = "" # inception date
25 | fundSize = "" # fund size
26 | rankDays = [] # rating dates (Haitong Securities, China Merchants Securities, Shanghai Securities, Jian Jinxin)
27 | rankList = [] # ratings (Haitong Securities, China Merchants Securities, Shanghai Securities, Jian Jinxin)
28 | incomeList = [] # returns (1 week, 1 month, 3 months, 6 months, year to date, 1 year, 2 years, 3 years)
29 |
30 | # Current industry allocation
31 | industryList = [] # industry categories
32 | industryPercentList = [] # share of net asset value
33 | industryValueList = [] # market value (in units of 10,000 yuan)
34 |
35 | # Fund managers
36 | managerNameList = [] # names of the fund managers
37 | managerTimeList = [] # when each manager started managing the current fund
38 | managerCareer = "" # cumulative tenure of a single manager
39 | managerCareerList = [] # cumulative tenure of every manager
40 | managerCountFund = "" # number of funds a single manager manages at the same time
41 | managerCountFundList = [] # number of concurrently managed funds for every manager
42 | managerCurIncome = "" # a single manager's total return on the current fund
43 | managerCurIncomeList = [] # total return on the current fund for every manager
44 |
45 | # Funds managed concurrently by the fund managers
46 | managerFundCode = [] # fund codes managed by a single manager
47 | managerFundCodeList = [] # fund codes for every manager (one list per manager)
48 | managerFundName = [] # fund names managed by a single manager
49 | managerFundNameList = [] # fund names for every manager (one list per manager)
50 | managerFundType = [] # fund types managed by a single manager
51 | managerFundTypeList = [] # fund types for every manager (one list per manager)
52 | managerFundTime = [] # tenure periods of a single manager
53 | managerFundTimeList = [] # tenure periods for every manager (one list per manager)
54 | managerFundDay = [] # tenure lengths in days for a single manager
55 | managerFundDayList = [] # tenure lengths in days for every manager (one list per manager)
56 | managerFundIncome = [] # tenure returns of a single manager
57 | managerFundIncomeList = [] # tenure returns for every manager (one list per manager)
58 |
59 | # Start timestamp, taken at module load; the end of the script subtracts it to report elapsed time
60 | starttime = datetime.datetime.now()
61 |
62 | if __name__ == "__main__":
63 | # Script entry: ask for a fund code, download the fund's pages and extract every field with regular expressions.
64 | fundCode = input("请输入需要爬取的基金代码:")
65 | fundURL = MAIN_URL % fundCode
66 |
67 | print("********开始爬虫********")
68 | fundHTML = requests.get(fundURL, headers = HEADERS)
69 | fundHTML.encoding = "utf-8"
70 |
71 | # Basic information. NOTE(review): each re.search(...).group(1) raises AttributeError when its pattern does not match; several patterns in this copy end right after the lazy group or are split across lines, so they look truncated -- verify against the upstream file.
72 | fundName = re.search("FundName\">(.*?)", fundHTML.text, re.S).group(1)
73 | fundType = re.search("ft_;pt_\d+\">(.*?)", fundHTML.text, re.S).group(1)
74 | fundRisk = re.search("\| (.*?)
", fundHTML.text, re.S).group(1)
75 | fundNet = re.search("gz_gsz\">(.*?)", fundHTML.text, re.S).group(1)
76 | fundAge = re.search("成 立 日:(.*?) | ", fundHTML.text, re.S).group(1)
77 | fundSize = re.search("基金规模:(.*?)", fundHTML.text, re.S).group(1)
78 | # Cut out the ratings section, then collect the rating dates and the ratings from it
79 | rankContent = re.search("html\">海通证券(.*?)更多评级信息>", fundHTML.text, re.S).group(1)
80 | rankDays = re.findall("alignRight\">(.*?)", rankContent, re.S)
81 | rankList = re.findall("alignRight10\">(.*?)", rankContent, re.S)
82 | # Cut out the period-return section, then collect the return figures from it
83 | increaseContent = re.search("typeName\">同类排名(.*?)四分位排名", fundHTML.text, re.S).group(1)
84 | incomeList = re.findall("Rdata\">(.*?)
", increaseContent, re.S)
85 |
86 | # Industry allocation
87 | industryURL = INDUSTRY_URL % fundCode
88 | industryHTML = requests.get(industryURL) # NOTE(review): unlike the other requests, no HEADERS are sent and no encoding is set here
89 | industryContent = re.search(" (.*?) ", industryHTML.text, re.S).group(1) # cut out the most recent quarter's data
90 | industryList = re.findall("class='tol'>(.*?)", industryContent, re.S)
91 | industryPercentList = re.findall("class='tor'>(.*?)", industryContent, re.S)
92 | industryValueList = re.findall("class='tor'>.*? | (.*?) | ", industryContent, re.S)
93 |
94 | # Find all fund managers of the current fund: follow the first link in the manager-history section of the main page
95 | moreManagerContent = re.search("基金经理变动一览(.*?)更多", fundHTML.text, re.S).group(1)
96 | moreManagerURL = re.search("href=\"(.*?)\"", moreManagerContent, re.S).group(1)
97 | moreManagerHTML = requests.get(moreManagerURL, headers = HEADERS)
98 | allmanagerContent = re.search("现任基金经理简介(.*?)正文部份结束", moreManagerHTML.text, re.S).group(1) # cut out the block describing all current managers
99 | managerNameList = re.findall("姓名:<.*?>(.*?)", allmanagerContent, re.S)
100 | moreManagerURLList = re.findall("text-decoration:none;' href=\"(.*?)\"", allmanagerContent, re.S)
101 | managerTimeList = re.findall("上任日期:(.*?)
", moreManagerHTML.text, re.S)
102 |
103 | # Fetch each current manager's own page; NOTE(review): assumes moreManagerURLList has at least as many entries as managerNameList
104 | for i in range(len(managerNameList)):
105 | sigleManagerHTML = requests.get(moreManagerURLList[i], headers = HEADERS)
106 | sigleManagerHTML.encoding = "utf-8"
107 | # Cumulative tenure
108 | managerCareer = re.search("累计任职时间:(.*?)
", sigleManagerHTML.text, re.S).group(1)
109 | managerCareerList.append(managerCareer)
110 | # Number of funds managed at the same time (counted as the number of name:'...' occurrences on the page)
111 | managerCountFund = len(re.findall("name:'(.*?)'", sigleManagerHTML.text, re.S))
112 | managerCountFundList.append(managerCountFund)
113 | managerFundContent = re.search("任职回报(.*?)", sigleManagerHTML.text, re.S).group(1) # cut out the tenure-return table
114 | # Fund codes
115 | managerFundCode = re.findall(".html\">([0-9].*?)", managerFundContent, re.S)
116 | managerFundCodeList.append(managerFundCode)
117 | # Fund names
118 | managerFundName = re.findall("tdl\">.*?>(.*?)", managerFundContent, re.S)
119 | managerFundNameList.append(managerFundName)
120 | # Fund types
121 | managerFundType = re.findall("档案
(.*?) | ", managerFundContent, re.S)
122 | managerFundTypeList.append(managerFundType)
123 | # Tenure period
124 | managerFundTime = re.findall("档案.*? | .*? | (.*?) | ", managerFundContent, re.S)
125 | managerFundTimeList.append(managerFundTime)
126 | # Tenure length in days
127 | managerFundDay = re.findall("~.*? | (.*?) | ", managerFundContent, re.S)
128 | managerFundDayList.append(managerFundDay)
129 | # Tenure return
130 | managerFundIncome = re.findall("~.*?.*?天 | (.*?) | ", managerFundContent, re.S)
131 | managerFundIncomeList.append(managerFundIncome)
132 | # This manager's return on the current fund. NOTE(review): when no code equals fundCode nothing is appended, so managerCurIncomeList can end up shorter than managerNameList.
133 | for i in range(len(managerFundCode)): # NOTE(review): reuses the outer loop variable i
134 | if managerFundCode[i] == fundCode:
135 | managerCurIncome = managerFundIncome[i]
136 | managerCurIncomeList.append(managerCurIncome)
137 | break
138 |
139 | #输出爬取的信息
140 | print("")
141 | print("基金信息")
142 | print("基金代码:%s" % fundCode)
143 | print("基金名字:%s" % fundName)
144 | print("基金类型:%s" % fundType)
145 | print("基金风险:%s" % fundRisk)
146 | print("基金净值:%s" % fundNet)
147 | print("基金成立时间:%s" % fundAge)
148 | print("基金规模:%s" % fundSize)
149 | print("")
150 | print("行业配置")
151 | pt = prettytable.PrettyTable(["序号", "行业类别", "占净值比例", "市值(万元)"])
152 | pt.padding_width = 5
153 | for i in range(len(industryList)):
154 | pt.add_row([(i + 1), industryList[i], industryPercentList[i], industryValueList[i]])
155 | print(pt)
156 | print("")
157 | print("基金评级")
158 | pt = prettytable.PrettyTable(["评级机构", "评级日期", "评级"])
159 | pt.padding_width = 5
160 | for i in range(len(rankList)):
161 | pt.add_row([rankList[i], rankDays[i], rankList[i]])
162 | print(pt)
163 | print("")
164 | print("基金阶段涨幅")
165 | pt = prettytable.PrettyTable(["近1周", "近1月", "近3月", "近6月", "今年来", "近1年", "近2年", "近3年"])
166 | pt.padding_width = 3
167 | pt.add_row(incomeList)
168 | print(pt)
169 | print("")
170 | print("%d位基金经理" % len(managerNameList))
171 | for i in range(len(managerNameList)):
172 | print("第%d位:%s" % ((i + 1), managerNameList[i]))
173 | print("管理当前基金时间:%s" % managerTimeList[i])
174 | print("累计任职时间:%s" % managerCareerList[i])
175 | print("当前基金总收益:%s" % managerCurIncomeList[i])
176 | print("同时在管理的基金数:%s" % managerCountFundList[i])
177 |
178 | pt = prettytable.PrettyTable(["基金代码", "基金名称", "基金类型", "任职时间", "任职天数", "任职回报"])
179 | pt.padding_width = 5
180 | for j in range(len(managerFundCodeList[i])):
181 | pt.add_row([managerFundCodeList[i][j],
182 | managerFundNameList[i][j],
183 | managerFundTypeList[i][j],
184 | managerFundTimeList[i][j],
185 | managerFundDayList[i][j],
186 | managerFundIncomeList[i][j]])
187 | print(pt)
188 | print("")
189 |
190 | file_name = input("抓取完成,输入文件名保存(不输入则保存到脚本路径):")
191 | if file_name == "":
192 | curTime = str(datetime.datetime.now())
193 | curTime = re.sub(":|\.", "", curTime)
194 | file_name = "%s %s" % (fundName, curTime)
195 | savePath = os.getcwd()
196 | workbook = xlsxwriter.Workbook(savePath + "\\%s.xlsx" % file_name)
197 | print("保存到:" + savePath + "\\%s.xlsx" % file_name)
198 |
199 | #第一页sheet
200 | worksheet1 = workbook.add_worksheet("基金信息")
201 |
202 | titleFormat = workbook.add_format()
203 | titleFormat.set_bold()
204 | titleFormat.set_bg_color("orange")
205 | titleFormat.set_font_size(12)
206 | titleFormat.set_align("center")
207 | titleFormat.set_align("vcenter")
208 | titleFormat.set_border(1)
209 |
210 | contentFormat = workbook.add_format()
211 | contentFormat.set_bg_color("yellow")
212 | contentFormat.set_align("center")
213 | contentFormat.set_align("vcenter")
214 |
215 | headList = ["基金代码",
216 | "基金名字",
217 | "基金类型",
218 | "基金风险",
219 | "基金净值",
220 | "成立时间",
221 | "基金规模",
222 | "海通证券评级",
223 | "招商证券评级",
224 | "上海证券",
225 | "济安金信",
226 | "近1周涨幅",
227 | "近1月涨幅",
228 | "近3月涨幅",
229 | "近6月涨幅",
230 | "今年来涨幅",
231 | "近1年涨幅",
232 | "近2年涨幅",
233 | "近3年涨幅"]
234 | contentList = [fundCode,
235 | fundName,
236 | fundType,
237 | fundRisk,
238 | fundNet,
239 | fundAge,
240 | fundSize,
241 | rankList[0],
242 | rankList[1],
243 | rankList[2],
244 | rankList[3],
245 | incomeList[0],
246 | incomeList[1],
247 | incomeList[2],
248 | incomeList[3],
249 | incomeList[4],
250 | incomeList[5],
251 | incomeList[6],
252 | incomeList[7]]
253 | for i in range(len(headList)):
254 | worksheet1.write(0, i, headList[i], titleFormat)
255 | worksheet1.write(1, i, contentList[i], contentFormat)
256 |
257 | #第二页sheet
258 | worksheet2 = workbook.add_worksheet("行业配置")
259 | worksheet2.write(0, 0, "行业", titleFormat)
260 | worksheet2.write(0, 1, "比例", titleFormat)
261 | worksheet2.write(0, 2, "市值(万元)", titleFormat)
262 | for i in range(len(industryList)):
263 | worksheet2.write(i + 1, 0, industryList[i], contentFormat)
264 | worksheet2.write(i + 1, 1, industryPercentList[i], contentFormat)
265 | worksheet2.write(i + 1, 2, industryValueList[i], contentFormat)
266 |
267 | #后续sheet
268 | for i in range(len(managerNameList)):
269 |
270 | #基金经理基本信息
271 | headList = ["姓名", "管理当前基金起始时间", "当前基金的收益", "累积的任职时间", "同时管理的基金数"]
272 | contentList = [managerNameList[i],
273 | managerTimeList[i],
274 | managerCurIncomeList[i],
275 | managerCareerList[i],
276 | managerCountFundList[i]]
277 | worksheet = workbook.add_worksheet("第%d位基金经理" % (i + 1))
278 | for j in range(len(headList)):
279 | worksheet.write(0, j, headList[j], titleFormat)
280 | worksheet.write(1, j, contentList[j], contentFormat)
281 |
282 | #当前管理的基金信息
283 | headList = ["基金代码", "基金名称", "基金类型", "任职时间", "任职天数", "任职回报"]
284 | worksheet.write_row("A5", headList, titleFormat)
285 | for k in range(len(managerFundCodeList[i])):
286 | contentList = [managerFundCodeList[i][k],
287 | managerFundNameList[i][k],
288 | managerFundTypeList[i][k],
289 | managerFundTimeList[i][k],
290 | managerFundDayList[i][k],
291 | managerFundIncomeList[i][k]]
292 | if managerFundCodeList[i][k] == fundCode:
293 | newContentFormat = workbook.add_format()
294 | newContentFormat.set_bg_color("red")
295 | newContentFormat.set_align("center")
296 | newContentFormat.set_align("vcenter")
297 | worksheet.write_row("A%d" % (6 + k), contentList, newContentFormat)
298 | else:
299 | worksheet.write_row("A%d" % (6 + k), contentList, contentFormat)
300 |
301 | workbook.close()
302 |
303 | #爬虫消耗的时间
304 | endtime = datetime.datetime.now()
305 | time = (endtime - starttime).seconds
306 | print("")
307 | print("********结束爬虫********")
308 | print('总耗时:%ss' % time)
309 |
310 |
--------------------------------------------------------------------------------