├── .gitattributes ├── .gitignore ├── README.md └── 天天基金网.py /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | 4 | # Custom for Visual Studio 5 | *.cs diff=csharp 6 | 7 | # Standard to msysgit 8 | *.doc diff=astextplain 9 | *.DOC diff=astextplain 10 | *.docx diff=astextplain 11 | *.DOCX diff=astextplain 12 | *.dot diff=astextplain 13 | *.DOT diff=astextplain 14 | *.pdf diff=astextplain 15 | *.PDF diff=astextplain 16 | *.rtf diff=astextplain 17 | *.RTF diff=astextplain 18 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Windows image file caches 2 | Thumbs.db 3 | ehthumbs.db 4 | 5 | # Folder config file 6 | Desktop.ini 7 | 8 | # Recycle Bin used on file shares 9 | $RECYCLE.BIN/ 10 | 11 | # Windows Installer files 12 | *.cab 13 | *.msi 14 | *.msm 15 | *.msp 16 | 17 | # Windows shortcuts 18 | *.lnk 19 | 20 | # ========================= 21 | # Operating System Files 22 | # ========================= 23 | 24 | # OSX 25 | # ========================= 26 | 27 | .DS_Store 28 | .AppleDouble 29 | .LSOverride 30 | 31 | # Thumbnails 32 | ._* 33 | 34 | # Files that might appear in the root of a volume 35 | .DocumentRevisions-V100 36 | .fseventsd 37 | .Spotlight-V100 38 | .TemporaryItems 39 | .Trashes 40 | .VolumeIcon.icns 41 | 42 | # Directories potentially created on remote AFP share 43 | .AppleDB 44 | .AppleDesktop 45 | Network Trash Folder 46 | Temporary Items 47 | .apdisk 48 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | #爬取天天基金网的基金信息 2 | 3 | * 爬取基金的基本信息(类型、风险、当前净值、成立时间、规模)
4 | * 爬取基金的当前行业配置
5 | * 爬取基金评级(海通证券,招商证券,上海证券,济安金信)
6 | * 爬取基金阶段涨幅
7 | * 爬取基金经理信息
8 | 9 | ## 使用方法 10 | 需要的第三方库
11 | `requests`
12 | `xlsxwriter`
13 | `prettytable`
14 | 15 | >1. 输入基金代码
16 | >2. 等待爬取
17 | >3. 输入文件名,文件会保存到.py的当前路径
18 | 19 | ## 注意事项 20 | * 请使用 Python 3
21 | * 当前为未完整版,当爬取的基金行业配置无内容时会报错。 22 | -------------------------------------------------------------------------------- /天天基金网.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import requests 3 | import re 4 | import os 5 | import xlsxwriter 6 | import prettytable 7 | 8 | #代理 9 | HEADERS = {"User-Agent":r"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36"} 10 | 11 | #URL 12 | MAIN_URL = r"http://fund.eastmoney.com/%s.html" #主页 13 | INDUSTRY_URL = r"http://fund.eastmoney.com/f10/F10DataApi.aspx?type=hypz&code=%s&year=2016" #行业配置 14 | fundURL = "" #基金的URL 15 | industryURL = "" #行业配置URL 16 | moreManagerURLList = [] #更多基金经理信息URL列表 17 | 18 | #基金信息 19 | fundCode = "" #代码 20 | fundName = "" #名字 21 | fundType = "" #类型 22 | fundRisk = "" #风险 23 | fundNet = "" #净值 24 | fundAge = "" #成立时间 25 | fundSize = "" #规模 26 | rankDays = [] #评级日期列表(海通证券,招商证券,上海证券,济安金信) 27 | rankList = [] #评级列表(海通证券,招商证券,上海证券,济安金信) 28 | incomeList = [] #收益列表(近1周,近1月,近3月,近6月,今年来,近1年,近2年,近3年) 29 | 30 | #当前行业配置 31 | industryList = [] #行业类别列表 32 | industryPercentList = [] #占净值比例列表 33 | industryValueList = [] #市值(万元)列表 34 | 35 | #基金经理 36 | managerNameList = [] #基金经理名字列表 37 | managerTimeList = [] #管理当前基金的时间列表 38 | managerCareer = "" #累积的任职时间 39 | managerCareerList = [] #累积的任职时间列表 40 | managerCountFund = "" #单个经理同时管理的基金数 41 | managerCountFundList = [] #所有经理同时管理的基金数列表 42 | managerCurIncome = "" #当前基金总收益 43 | managerCurIncomeList = [] #所有经理基金总收益 44 | 45 | #基金经理同时管理的基金信息 46 | managerFundCode = [] #单个经理管理的基金代码列表 47 | managerFundCodeList = [] #所有经理管理的基金代码列表 48 | managerFundName = [] #单个经理管理的基金名称列表 49 | managerFundNameList = [] #所有经理管理的基金名称列表 50 | managerFundType = [] #单个经理管理的基金类型列表 51 | managerFundTypeList = [] #所有经理管理的基金类型列表 52 | managerFundTime = [] #单个经理任职时间列表 53 | managerFundTimeList = [] #所有经理任职时间列表 54 | managerFundDay = [] #单个经理任职天数列表 55 | managerFundDayList = [] #所有经理任职天数列表 56 | 
managerFundIncome = []  # tenure returns of a single manager's funds (kept for compatibility)
managerFundIncomeList = []  # tenure returns for every manager (kept for compatibility)

# Reference point for the total run time reported at the end of the run.
starttime = datetime.datetime.now()

# Rating agencies, in the order the ratings are listed on the fund page
# (the same order the spreadsheet header below uses).
RATING_AGENCIES = ("海通证券", "招商证券", "上海证券", "济安金信")

# Column titles of the stage-return table (one value per period).
INCOME_PERIODS = ("近1周", "近1月", "近3月", "近6月", "今年来", "近1年", "近2年", "近3年")

# NOTE(review): every regular expression below is reproduced exactly as it
# appears in this copy of the script.  Several end in a bare lazy group or
# contain line breaks, which suggests HTML tags/entities were stripped when
# the file was exported -- verify them against the upstream script and the
# live pages before relying on the captured values.


def search_group(pattern, text, default=""):
    """Return group 1 of the first DOTALL match of *pattern* in *text*.

    Returns *default* instead of raising when the pattern does not match, so
    a missing page section (for example an empty industry allocation) yields
    empty data rather than an AttributeError.
    """
    match = re.search(pattern, text, re.S)
    if match is None:
        return default
    return match.group(1)


def pad_to(values, size, fill=""):
    """Return *values* as a list of exactly *size* items.

    Extra items are dropped and missing ones are filled with *fill*, which
    keeps parallel columns aligned when a page yields fewer matches than
    expected.
    """
    padded = list(values)[:size]
    padded.extend([fill] * (size - len(padded)))
    return padded


def find_current_income(codes, incomes, fund_code, default=""):
    """Return the tenure return paired with *fund_code*.

    *codes* and *incomes* are parallel sequences.  *default* is returned when
    the code is absent, so every manager always gets exactly one value.
    """
    for code, income in zip(codes, incomes):
        if code == fund_code:
            return income
    return default


def default_file_name(fund_name, now=None):
    """Build the fallback workbook name: fund name plus a timestamp.

    The timestamp is ``str(now)`` with ':' and '.' removed.  *now* defaults
    to the current local time.
    """
    if now is None:
        now = datetime.datetime.now()
    stamp = re.sub(r":|\.", "", str(now))
    return "%s %s" % (fund_name, stamp)


def fetch_html(url, headers=None, encoding=None):
    """Download *url* and return the body as text.

    *encoding*, when given, overrides the encoding requests would otherwise
    pick for decoding the response.
    """
    response = requests.get(url, headers=headers)
    if encoding is not None:
        response.encoding = encoding
    return response.text


def parse_fund_page(fund_html):
    """Extract basic info, ratings and stage returns from the fund main page."""
    fund = {
        "name": search_group('FundName">(.*?)', fund_html),
        "type": search_group(r'ft_;pt_\d+">(.*?)', fund_html),
        "risk": search_group(r"\|  (.*?)", fund_html),
        "net": search_group('gz_gsz">(.*?)', fund_html),
        "age": search_group("成 立 日:(.*?)", fund_html),
        "size": search_group("基金规模:(.*?)", fund_html),
    }
    # Ratings: cut out the rating section first, then pull dates and grades.
    rank_content = search_group('html">海通证券(.*?)更多评级信息>', fund_html)
    fund["rank_days"] = re.findall('alignRight">(.*?)', rank_content, re.S)
    fund["ranks"] = re.findall('alignRight10">(.*?)', rank_content, re.S)
    # Stage returns: cut out the table section, then pull each period value.
    increase_content = search_group('typeName">同类排名(.*?)四分位排名\n', fund_html)
    fund["incomes"] = re.findall('Rdata">(.*?)\n', increase_content, re.S)
    return fund


def scrape_industry(fund_code):
    """Return the latest industry allocation as (category, percent, value) rows.

    An allocation page without content produces an empty list.
    """
    industry_html = fetch_html(INDUSTRY_URL % fund_code)
    latest = search_group("  (.*?)  ", industry_html)  # most recent quarter only
    categories = re.findall("class='tol'>(.*?)", latest, re.S)
    percents = re.findall("class='tor'>(.*?)", latest, re.S)
    values = re.findall("class='tor'>.*?(.*?)", latest, re.S)
    count = len(categories)
    return list(zip(categories, pad_to(percents, count), pad_to(values, count)))


def scrape_manager(url, fund_code):
    """Scrape one manager's page: career length, fund count and fund table."""
    page = fetch_html(url, headers=HEADERS, encoding="utf-8")
    table = search_group("任职回报(.*?)", page)  # the tenure-return table
    codes = re.findall('.html">([0-9].*?)', table, re.S)
    incomes = re.findall("~.*?.*?天(.*?)", table, re.S)
    count = len(codes)
    # One row per fund code; shorter columns are padded so rows stay aligned.
    funds = list(zip(
        codes,
        pad_to(re.findall('tdl">.*?>(.*?)', table, re.S), count),
        pad_to(re.findall("档案(.*?)", table, re.S), count),
        pad_to(re.findall("档案.*?.*?(.*?)", table, re.S), count),
        pad_to(re.findall("~.*?(.*?)", table, re.S), count),
        pad_to(incomes, count),
    ))
    return {
        "career": search_group("累计任职时间:(.*?)\n", page),
        "fund_count": len(re.findall("name:'(.*?)'", page, re.S)),
        "current_income": find_current_income(codes, incomes, fund_code),
        "funds": funds,
    }


def scrape_managers(fund_html, fund_code):
    """Return one dict per current manager of the fund (possibly empty)."""
    more_content = search_group("基金经理变动一览(.*?)更多", fund_html)
    more_url = search_group('href="(.*?)"', more_content)
    if not more_url:
        return []
    list_html = fetch_html(more_url, headers=HEADERS)
    all_content = search_group("现任基金经理简介(.*?)正文部份结束", list_html)
    names = re.findall("姓名:<.*?>(.*?)\n\n", all_content, re.S)
    urls = re.findall("text-decoration:none;' href=\"(.*?)\"", all_content, re.S)
    start_dates = pad_to(re.findall("上任日期:(.*?)\n\n", list_html, re.S), len(names))

    managers = []
    for name, url, start in zip(names, urls, start_dates):
        manager = scrape_manager(url, fund_code)
        manager["name"] = name
        manager["start"] = start
        managers.append(manager)
    return managers


def print_report(fund_code, fund, industries, managers):
    """Print everything that was scraped as console tables."""
    print("")
    print("基金信息")
    print("基金代码:%s" % fund_code)
    print("基金名字:%s" % fund["name"])
    print("基金类型:%s" % fund["type"])
    print("基金风险:%s" % fund["risk"])
    print("基金净值:%s" % fund["net"])
    print("基金成立时间:%s" % fund["age"])
    print("基金规模:%s" % fund["size"])
    print("")
    print("行业配置")
    pt = prettytable.PrettyTable(["序号", "行业类别", "占净值比例", "市值(万元)"])
    pt.padding_width = 5
    for index, (category, percent, value) in enumerate(industries, 1):
        pt.add_row([index, category, percent, value])
    print(pt)
    print("")
    print("基金评级")
    pt = prettytable.PrettyTable(["评级机构", "评级日期", "评级"])
    pt.padding_width = 5
    ranks = fund["ranks"]
    agencies = pad_to(RATING_AGENCIES, len(ranks))
    rank_days = pad_to(fund["rank_days"], len(ranks))
    # First column is the agency name (it used to repeat the rating itself).
    for agency, day, rank in zip(agencies, rank_days, ranks):
        pt.add_row([agency, day, rank])
    print(pt)
    print("")
    print("基金阶段涨幅")
    pt = prettytable.PrettyTable(list(INCOME_PERIODS))
    pt.padding_width = 3
    # add_row needs exactly one value per column.
    pt.add_row(pad_to(fund["incomes"], len(INCOME_PERIODS)))
    print(pt)
    print("")
    print("%d位基金经理" % len(managers))
    for position, manager in enumerate(managers, 1):
        print("第%d位:%s" % (position, manager["name"]))
        print("管理当前基金时间:%s" % manager["start"])
        print("累计任职时间:%s" % manager["career"])
        print("当前基金总收益:%s" % manager["current_income"])
        print("同时在管理的基金数:%s" % manager["fund_count"])

        pt = prettytable.PrettyTable(["基金代码", "基金名称", "基金类型", "任职时间", "任职天数", "任职回报"])
        pt.padding_width = 5
        for row in manager["funds"]:
            pt.add_row(list(row))
        print(pt)
        print("")


def save_workbook(path, fund_code, fund, industries, managers):
    """Write the scraped data to an .xlsx workbook at *path*.

    Sheet 1 holds the fund summary, sheet 2 the industry allocation, and one
    further sheet is added per manager.
    """
    workbook = xlsxwriter.Workbook(path)
    title_format = workbook.add_format({
        "bold": True,
        "bg_color": "orange",
        "font_size": 12,
        "align": "center",
        "valign": "vcenter",
        "border": 1,
    })
    content_format = workbook.add_format({
        "bg_color": "yellow",
        "align": "center",
        "valign": "vcenter",
    })
    # Marks the row of the fund being queried; created once and reused.
    highlight_format = workbook.add_format({
        "bg_color": "red",
        "align": "center",
        "valign": "vcenter",
    })

    # Sheet 1: fund summary.
    summary_sheet = workbook.add_worksheet("基金信息")
    head_list = ["基金代码",
                 "基金名字",
                 "基金类型",
                 "基金风险",
                 "基金净值",
                 "成立时间",
                 "基金规模",
                 "海通证券评级",
                 "招商证券评级",
                 "上海证券",
                 "济安金信",
                 "近1周涨幅",
                 "近1月涨幅",
                 "近3月涨幅",
                 "近6月涨幅",
                 "今年来涨幅",
                 "近1年涨幅",
                 "近2年涨幅",
                 "近3年涨幅"]
    content_list = [fund_code,
                    fund["name"],
                    fund["type"],
                    fund["risk"],
                    fund["net"],
                    fund["age"],
                    fund["size"]]
    content_list += pad_to(fund["ranks"], len(RATING_AGENCIES))
    content_list += pad_to(fund["incomes"], len(INCOME_PERIODS))
    summary_sheet.write_row(0, 0, head_list, title_format)
    summary_sheet.write_row(1, 0, content_list, content_format)

    # Sheet 2: industry allocation.
    industry_sheet = workbook.add_worksheet("行业配置")
    industry_sheet.write_row(0, 0, ["行业", "比例", "市值(万元)"], title_format)
    for row_index, row in enumerate(industries, 1):
        industry_sheet.write_row(row_index, 0, list(row), content_format)

    # Remaining sheets: one per manager.
    for position, manager in enumerate(managers, 1):
        worksheet = workbook.add_worksheet("第%d位基金经理" % position)
        head_list = ["姓名", "管理当前基金起始时间", "当前基金的收益", "累积的任职时间", "同时管理的基金数"]
        content_list = [manager["name"],
                        manager["start"],
                        manager["current_income"],
                        manager["career"],
                        manager["fund_count"]]
        worksheet.write_row(0, 0, head_list, title_format)
        worksheet.write_row(1, 0, content_list, content_format)

        # Funds currently managed, starting at spreadsheet row 5.
        head_list = ["基金代码", "基金名称", "基金类型", "任职时间", "任职天数", "任职回报"]
        worksheet.write_row("A5", head_list, title_format)
        for offset, row in enumerate(manager["funds"]):
            row_format = highlight_format if row[0] == fund_code else content_format
            worksheet.write_row("A%d" % (6 + offset), list(row), row_format)

    workbook.close()


def main():
    """Prompt for a fund code, scrape it, print a report and save a workbook."""
    fund_code = input("请输入需要爬取的基金代码:")

    print("********开始爬虫********")
    fund_html = fetch_html(MAIN_URL % fund_code, headers=HEADERS, encoding="utf-8")
    fund = parse_fund_page(fund_html)
    industries = scrape_industry(fund_code)
    managers = scrape_managers(fund_html, fund_code)

    print_report(fund_code, fund, industries, managers)

    file_name = input("抓取完成,输入文件名保存(不输入则保存到脚本路径):")
    if file_name == "":
        file_name = default_file_name(fund["name"])
    # os.path.join keeps the path valid on every platform, not only Windows.
    save_path = os.path.join(os.getcwd(), "%s.xlsx" % file_name)
    print("保存到:" + save_path)
    save_workbook(save_path, fund_code, fund, industries, managers)

    # Total elapsed time; total_seconds() does not wrap around after a day.
    elapsed = int((datetime.datetime.now() - starttime).total_seconds())
    print("")
    print("********结束爬虫********")
    print('总耗时:%ss' % elapsed)


if __name__ == "__main__":
    main()