├── .gitattributes
├── .gitignore
├── 0-Distance
│   └── blog_ml_distance.py
├── 0-Spider
│   ├── README.md
│   ├── beidaNewsSpider
│   │   ├── .idea
│   │   │   ├── beidaSpider.iml
│   │   │   ├── inspectionProfiles
│   │   │   │   └── profiles_settings.xml
│   │   │   ├── misc.xml
│   │   │   ├── modules.xml
│   │   │   └── workspace.xml
│   │   ├── README.md
│   │   ├── news.sql
│   │   ├── news.txt
│   │   └── spider.py
│   └── tiebaSpider
│       ├── .idea
│       │   ├── misc.xml
│       │   ├── modules.xml
│       │   ├── tiebaSpider.iml
│       │   └── workspace.xml
│       ├── README.md
│       ├── spider1
│       │   ├── README.md
│       │   ├── main.py
│       │   ├── spider.py
│       │   ├── spider.pyc
│       │   └── tiebaname
│       │       └── name.txt
│       └── spider2
│           └── tieba
│               ├── .idea
│               │   ├── misc.xml
│               │   ├── modules.xml
│               │   ├── tieba.iml
│               │   └── workspace.xml
│               ├── data
│               │   └── 20170630_all_href.txt
│               ├── name.txt
│               ├── scrapy.cfg
│               └── tieba
│                   ├── __init__.py
│                   ├── __init__.pyc
│                   ├── items.py
│                   ├── items.pyc
│                   ├── middlewares.py
│                   ├── pipelines.py
│                   ├── pipelines.pyc
│                   ├── settings.py
│                   ├── settings.pyc
│                   └── spiders
│                       ├── __init__.py
│                       ├── __init__.pyc
│                       ├── tieba1.py
│                       ├── tieba1.pyc
│                       ├── tieba2.py
│                       └── tieba2.pyc
├── AdaBoost
│   └── AdaBoost.py
├── Apriori
│   └── Apriori.py
├── Bayes
│   └── bayes.py
├── Decision-Tree
│   └── DecisionTree-ID3.py
├── FP-growth
│   ├── FP_Tree.py
│   ├── newsClickStream.py
│   └── 所用到dat文件下载地址.txt
├── K-means
│   └── kMeans.py
├── Logistic Regession
│   ├── LogisticRegession.py
│   ├── LogisticRegessionExample.py
│   ├── ex1.txt
│   ├── horseColicTest.txt
│   └── horseColicTraining.txt
├── PCA
│   ├── PCA.py
│   ├── secom.data
│   └── testSet.txt
├── README.md
├── Recommend
│   ├── uid_score_bid.dat
│   ├── 基于item的协同过滤推荐BasedItem.py
│   ├── 基于图的推荐PersonalRank.py
│   ├── 基于标签的推荐.py
│   └── 基于用户的协同过滤推荐BasedUserCF.py
├── Regession
│   ├── abalone.txt
│   ├── ex0.txt
│   ├── ex1.txt
│   └── regession.py
└── sklearn
    ├── README.md
    └── line_regression
        ├── Folds5x2_pp.csv
        └── sk_linreg.py
/.gitattributes:
--------------------------------------------------------------------------------
1 | # Auto detect text files and perform LF normalization
2 | * text=auto
3 |
4 | # Custom for Visual Studio
5 | *.cs diff=csharp
6 |
7 | # Standard to msysgit
8 | *.doc diff=astextplain
9 | *.DOC diff=astextplain
10 | *.docx diff=astextplain
11 | *.DOCX diff=astextplain
12 | *.dot diff=astextplain
13 | *.DOT diff=astextplain
14 | *.pdf diff=astextplain
15 | *.PDF diff=astextplain
16 | *.rtf diff=astextplain
17 | *.RTF diff=astextplain
18 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Windows image file caches
2 | Thumbs.db
3 | ehthumbs.db
4 |
5 | # Folder config file
6 | Desktop.ini
7 |
8 | # Recycle Bin used on file shares
9 | $RECYCLE.BIN/
10 |
11 | # Windows Installer files
12 | *.cab
13 | *.msi
14 | *.msm
15 | *.msp
16 |
17 | # Windows shortcuts
18 | *.lnk
19 |
20 | # =========================
21 | # Operating System Files
22 | # =========================
23 |
24 | # OSX
25 | # =========================
26 |
27 | .DS_Store
28 | .AppleDouble
29 | .LSOverride
30 |
31 | # Thumbnails
32 | ._*
33 |
34 | # Files that might appear in the root of a volume
35 | .DocumentRevisions-V100
36 | .fseventsd
37 | .Spotlight-V100
38 | .TemporaryItems
39 | .Trashes
40 | .VolumeIcon.icns
41 |
42 | # Directories potentially created on remote AFP share
43 | .AppleDB
44 | .AppleDesktop
45 | Network Trash Folder
46 | Temporary Items
47 | .apdisk
48 |
--------------------------------------------------------------------------------
/0-Distance/blog_ml_distance.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 |
3 | from numpy import *
4 |
5 | print '[+]------------欧式距离-----------'
6 | def twoPointDistance(a,b):
7 | d = sqrt( (a[0]-b[0])**2 + (a[1]-b[1])**2 )
8 | return d
9 |
10 | print 'a,b 二维距离为:',twoPointDistance((1,1),(2,2))
11 |
12 | def threePointDistance(a,b):
13 | d = sqrt( (a[0]-b[0])**2 + (a[1]-b[1])**2 + (a[2]-b[2])**2 )
14 | return d
15 |
16 | print 'a,b 三维距离为:',threePointDistance((1,1,1),(2,2,2))
17 |
18 | def distance(a,b):
19 | sum = 0
20 | for i in range(len(a)):
21 | sum += (a[i]-b[i])**2
22 | return sqrt(sum)
23 |
24 | print 'a,b 多维距离为:',distance((1,1,2,2),(2,2,4,4))
25 |
26 | print '[+]------------标准欧式距离-----------'
27 |
28 | def moreBZOSdis(a,b):
29 | sumnum = 0
30 | for i in range(len(a)):
31 | # compute si, the standard deviation of component i; the mean must be (a[i]+b[i])/2, not the half-difference
32 | avg = (a[i]+b[i])/2
33 | si = sqrt( (a[i] - avg) ** 2 + (b[i] - avg) ** 2 )
34 | sumnum += ((a[i]-b[i])/si ) ** 2
35 |
36 | return sqrt(sumnum)
37 |
38 | print 'a,b 标准欧式距离:',moreBZOSdis((1,2,1,2),(3,3,3,4))
39 |
40 | print '[+]------------曼哈顿距离-----------'
41 | def twoMHDdis(a,b):
42 | return abs(a[0]-b[0])+abs(a[1]-b[1])
43 |
44 | print 'a,b 二维曼哈顿距离为:', twoMHDdis((1,1),(2,2))
45 |
46 | def threeMHDdis(a,b):
47 | return abs(a[0]-b[0])+abs(a[1]-b[1]) + abs(a[2]-b[2])
48 |
49 | print 'a,b 三维曼哈顿距离为:', threeMHDdis((1,1,1),(2,2,2))
50 |
51 |
52 | def moreMHDdis(a,b):
53 | sum = 0
54 | for i in range(len(a)):
55 | sum += abs(a[i]-b[i])
56 | return sum
57 |
58 | print 'a,b 多维曼哈顿距离为:', moreMHDdis((1,1,1,1),(2,2,2,2))
59 |
60 | print '[+]------------切比雪夫距离-----------'
61 | def twoQBXFdis(a,b):
62 | return max( abs(a[0]-b[0]), abs(a[1]-b[1]))
63 |
64 | print 'a,b二维切比雪夫距离:' , twoQBXFdis((1,2),(3,4))
65 |
66 | def moreQBXFdis(a,b):
67 | maxnum = 0
68 | for i in range(len(a)):
69 | if abs(a[i]-b[i]) > maxnum:
70 | maxnum = abs(a[i]-b[i])
71 | return maxnum
72 |
73 | print 'a,b多维切比雪夫距离:' , moreQBXFdis((1,1,1,1),(3,4,3,4))
74 |
75 |
76 | print '[+]------------夹角余弦-----------'
77 |
78 | def twoCos(a,b):
79 | cos = (a[0]*b[0]+a[1]*b[1]) / (sqrt(a[0]**2 + b[0]**2) * sqrt(a[1]**2 + b[1]**2) )
80 |
81 | return cos
82 | print 'a,b 二维夹角余弦距离:',twoCos((1,1),(2,2))
83 |
84 | def moreCos(a,b):
85 | sum_fenzi = 0.0
86 | sum_fenmu_1,sum_fenmu_2 = 0,0
87 | for i in range(len(a)):
88 | sum_fenzi += a[i]*b[i]
89 | sum_fenmu_1 += a[i]**2
90 | sum_fenmu_2 += b[i]**2
91 |
92 | return sum_fenzi/( sqrt(sum_fenmu_1) * sqrt(sum_fenmu_2) )
93 | print 'a,b 多维夹角余弦距离:',moreCos((1,1,1,1),(2,2,2,2))
94 |
95 | print '[+]------------汉明距离-----------'
96 |
97 | def hanmingDis(a,b):
98 | sumnum = 0
99 | for i in range(len(a)):
100 | if a[i]!=b[i]:
101 | sumnum += 1
102 | return sumnum
103 |
104 | print 'a,b 汉明距离:',hanmingDis((1,1,2,3),(2,2,1,3))
105 |
106 | print '[+]------------杰卡德距离-----------'
107 |
108 | def jiekadeDis(a,b):
109 | set_a = set(a)
110 | set_b = set(b)
111 | dis = float(len( (set_a | set_b) - (set_a & set_b) ) )/ len(set_a | set_b)
112 | return dis
113 |
114 | print 'a,b 杰卡德距离:', jiekadeDis((1,2,3),(2,3,4))
115 |
116 | def jiekadeXSDis(a,b):
117 | set_a = set(a)
118 | set_b = set(b)
119 | dis = float(len(set_a & set_b) )/ len(set_a | set_b)
120 | return dis
121 |
122 | print 'a,b 杰卡德相似系数:', jiekadeXSDis((1,2,3),(2,3,4))
123 |
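# --- Optional cross-check (an editor's addition, not part of the original script) ---
# A minimal sketch, assuming SciPy is available: the hand-rolled metrics above can be
# verified against scipy.spatial.distance. Note that scipy's cosine() returns the cosine
# *distance* (1 - similarity), so the similarity is recovered as 1 - cosine().
from scipy.spatial import distance as spd  # aliased so it does not shadow distance() above

a, b = (1, 1, 2, 2), (2, 2, 4, 4)
print(spd.euclidean(a, b))    # should match distance(a, b) above: sqrt(1+1+4+4)
print(spd.cityblock(a, b))    # Manhattan distance, cf. moreMHDdis
print(spd.chebyshev(a, b))    # Chebyshev distance, cf. moreQBXFdis
print(1 - spd.cosine(a, b))   # cosine similarity, cf. moreCos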
--------------------------------------------------------------------------------
/0-Spider/README.md:
--------------------------------------------------------------------------------
1 | > This folder collects various spider (web crawler) scripts, grouped mostly by target site; some of them have a matching CSDN blog post.
2 |
3 |
4 | 1: Scrapy crawler for the original poster and repliers of a given Baidu Tieba thread
5 | http://blog.csdn.net/gamer_gyt/article/details/75043398
6 |
--------------------------------------------------------------------------------
/0-Spider/beidaNewsSpider/.idea/beidaSpider.iml:
--------------------------------------------------------------------------------
(PyCharm .idea project file; XML content not preserved in this dump)
--------------------------------------------------------------------------------
/0-Spider/beidaNewsSpider/.idea/inspectionProfiles/profiles_settings.xml:
--------------------------------------------------------------------------------
(PyCharm .idea project file; XML content not preserved in this dump)
--------------------------------------------------------------------------------
/0-Spider/beidaNewsSpider/.idea/misc.xml:
--------------------------------------------------------------------------------
(PyCharm .idea project file; XML content not preserved in this dump)
--------------------------------------------------------------------------------
/0-Spider/beidaNewsSpider/.idea/modules.xml:
--------------------------------------------------------------------------------
(PyCharm .idea project file; XML content not preserved in this dump)
--------------------------------------------------------------------------------
/0-Spider/beidaNewsSpider/.idea/workspace.xml:
--------------------------------------------------------------------------------
(PyCharm .idea workspace file; XML content not preserved in this dump)
--------------------------------------------------------------------------------
/0-Spider/beidaNewsSpider/README.md:
--------------------------------------------------------------------------------
1 | Crawls all articles in the Peking University "要闻" (headline news) section.
2 |
3 | url: http://pkunews.pku.edu.cn/xxfz/node_185.htm
4 |
5 | news.sql is a MySQL dump of the scraped data.
6 |
7 | Backing up and restoring the database:
8 |
9 | Backup: /usr/bin/mysqldump -uroot -proot beidaspider --default-character-set=utf8 --opt -Q -R >./news.sql
10 |
11 | Restore: /usr/bin/mysql -uroot -proot beidaspider <./news.sql
12 |
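A minimal sketch (an editor's addition, not part of the original README) for sanity-checking the restored backup with pymysql, assuming the `beidaspider` database and the `news` table created by spider.py:

```python
import pymysql

# connects with the same credentials spider.py uses (root/root on localhost)
conn = pymysql.connect(host="127.0.0.1", port=3306, user="root",
                       passwd="root", db="beidaspider", charset="utf8")
with conn.cursor() as cur:
    cur.execute("SELECT COUNT(*) FROM news")
    print("rows:", cur.fetchone()[0])
    cur.execute("SELECT title, pub_date FROM news ORDER BY pub_date DESC LIMIT 5")
    for title, pub_date in cur.fetchall():
        print(pub_date, title)
conn.close()
```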
--------------------------------------------------------------------------------
/0-Spider/beidaNewsSpider/spider.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 |
3 | import pymysql
4 | from bs4 import BeautifulSoup
5 | import urllib.request
6 | import time
7 |
8 | '''
9 | 创建数据库和数据表语句
10 | create database beidaspider default charset utf8;
11 |
12 | create table news(
13 | title varchar(100),
14 | pub_date date,
15 | from_ varchar(50),
16 | content varchar(20000)
17 | );
18 |
19 | 数据库备份
20 | /usr/bin/mysqldump -uroot -proot beidaspider --default-character-set=utf8 --opt -Q -R >./news.sql
21 |
22 | 数据库恢复
23 | /usr/bin/mysql -uroot -proot beidaspider <./news.sql
24 | '''
25 |
26 |
27 | class BeiDaSpider:
28 | # 初始化
29 | def __init__(self):
30 | self.root_href = "http://pkunews.pku.edu.cn/xxfz/"
31 |
32 | # 连接数据库
33 | def connMysql(self):
34 | conn = pymysql.connect(host='127.0.0.1', port=3306, user='root', passwd='root',db='beidaspider',charset='utf8')
35 | cur = conn.cursor()
36 | return cur,conn
37 |
38 | # 写入数据库
39 | def write(self,title,date,from_,content):
40 | cur,conn = self.connMysql()
41 | sql = "INSERT INTO news (title, pub_date, from_, content) VALUES (%s, %s, %s, %s)"
42 | cur.execute(sql, (title, date, from_, content))  # parameterized query: quotes in the scraped text can no longer break the SQL
43 | conn.commit()
44 | conn.close()
45 |
46 | with open("news.txt","a") as fp:
47 | fp.write(title+"\t"+date+"\t"+from_+"\t"+content+"\n")
48 | fp.close()
49 |
50 | # 解析每页,获取该页所有的新闻链接
51 | def parse_onePage_href(self,url):
52 | res = urllib.request.urlopen(url)
53 | body = BeautifulSoup(res.read(), "html.parser")  # specify the parser explicitly (bs4 warns otherwise)
54 | table = body.find('table',cellspacing="0",cellpadding="0",id="nav2_7Tabcontent_10")
55 | a_list = table.find_all('a')
56 | href_list = []
57 | for a in a_list:
58 | href_list.append(self.root_href + a.get('href'))
59 | return href_list
60 |
61 | # 解析每个新闻,获取数据
62 | def parse_oneNew(self,url):
63 | res = urllib.request.urlopen(url)
64 | body = BeautifulSoup(res.read(), "html.parser")  # specify the parser explicitly (bs4 warns otherwise)
65 |
66 | # 获取标题
67 | title = body.title.get_text().strip()
68 | print(title)
69 |
70 | # 获取时间和来源
71 | #dataAndfrom =
72 | dataAndfrom = body.find('table',width="560",border="0",cellspacing="0",cellpadding="0")
73 | datafrom_list = dataAndfrom.find_all('tr')[0].get_text().strip().split(" ")
74 | date = datafrom_list[0].split(":")[1].strip()
75 | from_ = datafrom_list[1].split(":")[1].strip()
76 | print(date)
77 | #print(from_)
78 |
79 | # 获取新闻内容
80 | content = body.find('table',width="710",border="0",cellspacing="0",cellpadding="0",style="margin-left:15px;").find_all('tr')[3].get_text().strip().replace("\n"," ")
81 | #print(content)
82 |
83 | self.write(title,date,from_,content)
84 |
85 | def start(self):
86 | for i in range(1,21):
87 | if i==1:
88 | href_list = self.parse_onePage_href(self.root_href + "node_185.htm")
89 | for href in href_list:
90 | try:
91 | self.parse_oneNew(href)
92 | except Exception as e:
93 | print(e)
94 | finally:
95 | pass
96 | # time.sleep(1)
97 | # break
98 | else:
99 | href_list = self.parse_onePage_href(self.root_href + "node_185_" + str(i) + ".htm")
100 | for href in href_list:
101 | try:
102 | self.parse_oneNew(href)
103 | except Exception as e:
104 | print(e)
105 | finally:
106 | pass
107 | # time.sleep(1)
108 | #time.sleep(2)
109 | # break
110 |
111 |
112 | if __name__=="__main__":
113 | spi = BeiDaSpider()
114 | spi.start()
115 |
--------------------------------------------------------------------------------
/0-Spider/tiebaSpider/.idea/misc.xml:
--------------------------------------------------------------------------------
(PyCharm .idea project file; XML content not preserved in this dump)
--------------------------------------------------------------------------------
/0-Spider/tiebaSpider/.idea/modules.xml:
--------------------------------------------------------------------------------
(PyCharm .idea project file; XML content not preserved in this dump)
--------------------------------------------------------------------------------
/0-Spider/tiebaSpider/.idea/tiebaSpider.iml:
--------------------------------------------------------------------------------
(PyCharm .idea project file; XML content not preserved in this dump)
--------------------------------------------------------------------------------
/0-Spider/tiebaSpider/.idea/workspace.xml:
--------------------------------------------------------------------------------
(PyCharm .idea workspace file; XML content not preserved in this dump)
--------------------------------------------------------------------------------
/0-Spider/tiebaSpider/README.md:
--------------------------------------------------------------------------------
1 | The crawlers in these two folders both scrape the poster and repliers of every thread on the first three pages of a given Tieba forum; spider1 uses BeautifulSoup + urllib2, spider2 uses Scrapy.
2 |
3 |
4 | [Scrapy crawler for the poster and repliers of a given Baidu Tieba thread](http://blog.csdn.net/Gamer_gyt/article/details/75043398)
5 |
6 |
7 | CSDN blog:
8 | http://blog.csdn.net/gamer_gyt/
9 |
10 | Questions? Contact:
11 | QQ: 1923361654
12 | WeChat: gyt13342445911
13 | Email: thinkgamer_gyt@gmail.com
14 |
--------------------------------------------------------------------------------
/0-Spider/tiebaSpider/spider1/README.md:
--------------------------------------------------------------------------------
1 | # Project notes
2 |
3 | This project crawls the usernames of the original poster and all repliers for the threads on the first three pages of the specified Tieba forums.
4 |
5 | The data directory stores the results: one sub-directory per day, and inside it one file per forum, named after the forum.
6 |
7 | The layout looks like:
8 |
9 | data
10 |
11 | --20170626
12 |
13 | -----戒赌吧.txt
14 |
15 | -----网易吧.txt
16 |
17 | The tiebaname directory holds the forum names: write the names of the forums to crawl into its name.txt file, one per line (see the setup sketch at the end of this README).
18 |
19 | The layout looks like:
20 |
21 | tiebaname
22 |
23 | --name.txt
24 |
25 | The crawler uses Python's BeautifulSoup library; the results are not ideal yet, but it will be improved step by step and may be ported to another framework.
26 |
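A minimal setup sketch (an editor's addition, not part of the original README), assuming the layout described above; it prepares the directories spider.py expects and a sample name list, after which the crawler is started with `python main.py`:

```python
import os

# create the directories spider.py writes into, plus the name-list directory
for d in ("data", "error", "tiebaname"):
    if not os.path.isdir(d):
        os.makedirs(d)

# one forum name per line; replace with the forums you actually want to crawl
with open("tiebaname/name.txt", "w") as fw:
    fw.write("戒赌\nnba\n")
```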
--------------------------------------------------------------------------------
/0-Spider/tiebaSpider/spider1/main.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 |
3 | from spider import Spider
4 |
5 | if __name__ == "__main__":
6 | import time
7 | print("Start At:",time.asctime( time.localtime(time.time()) ))
8 | spider = Spider()
9 | spider.start()
10 | print("Stop At:",time.asctime( time.localtime(time.time()) ))
--------------------------------------------------------------------------------
/0-Spider/tiebaSpider/spider1/spider.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 |
3 | from bs4 import BeautifulSoup
4 | import urllib2
5 | import urllib
6 | import time,os
7 |
8 | class Spider:
9 |
10 | def __init__(self):
11 | self.search_url = 'https://tieba.baidu.com/f?kw='
12 | self.tieba_list = [] # 存储要爬取的若干个贴吧的链接
13 | self.url_list = [] # 存放每个贴吧前三页的帖子链接
14 | self.timesleep = 2 # 每次访问tieba的url时间间隔
15 | self.pages = 3 # 设置要抓取多少页
16 | self.current_href = '' # 当前爬取的贴吧链接url
17 |
18 | # create data/<today>/ (crawl results) and error/ (error.log) if they do not exist yet
19 | for d in ('data/%s' % time.strftime('%Y%m%d'), 'error'):
20 | if not os.path.isdir(d): os.makedirs(d)
21 |
22 | def error(self,loc,url,e):
23 | fw = open("error/error.log","a")
24 | fw.write(time.asctime( time.localtime(time.time()) )+"\t"+loc+"\t"+url+"\t"+str(e))
25 | fw.close()
26 |
27 | # fetch a page and return its body on HTTP 200, otherwise return an empty string
28 | def get_page(self,href):
29 | res = urllib2.urlopen(href)
30 | # 如果访问成功的话返回读取的内容,否则返回空的字符串
31 | if res.code == 200:
32 | return res.read()
33 | else:
34 | return ""
35 |
36 | # 从文件中加载贴吧名并组成url
37 | def read(self):
38 | try:
39 | with open("tiebaname/name.txt", "r") as fr:
40 | for line in fr.readlines():
41 | # urllib.quote(line.strip()) 将关键字转变成url 格式
42 | self.tieba_list.append(self.search_url + urllib.quote(line.strip()) + "&ie=utf-8&pn=")
43 | fr.close()
44 | except Exception as e:
45 | self.error("read", "read error", e)
46 | pass
47 | finally:
48 | return self.tieba_list
49 |
50 |
51 | # 解析每个帖子共有几页
52 | def get_num(self,url):
53 | try:
54 | if self.get_page(url):
55 | body = BeautifulSoup(self.get_page(url), "html.parser")
56 | num_li = body.find_all("li", class_="l_reply_num", style="margin-left:8px")[0]
57 | num = num_li.findAll('span', class_='red')[1].get_text()
58 | # print(num)
59 | return int(num)
60 | else:
61 | pass
62 | except Exception as e:
63 | self.error("get_num",url,e)
64 | return 1
65 |
66 | # 解析每一个贴吧前三页的所有帖子连接
67 | def parse_href(self,one_tieba_url):
68 | self.url_list = [] # 存放一个贴吧前三页所有帖子的链接
69 | try:
70 | for i in range(0,self.pages):
71 | url = one_tieba_url + str(i * 50)
72 | try:
73 | # i* 50 控制翻页,每页显示50个
74 | if self.get_page(one_tieba_url+str(i*50)):
75 | body = BeautifulSoup(self.get_page(url), "html.parser")
76 | div_list = body.find_all("div", class_="threadlist_title pull_left j_th_tit ") # 解析到每一个帖子
77 | for div in div_list:
78 | # print(div.a.get('href'),div.a.get_text())
79 | # print("https://tieba.baidu.com" + div.a.get('href'))
80 | self.url_list.append("https://tieba.baidu.com" + div.a.get('href'))
81 | else:
82 | pass
83 | except Exception as e:
84 | self.error("parse_href",url,e)
85 | pass
86 | # time.sleep(self.timesleep)
87 | except Exception as e:
88 | self.error("parse_href",one_tieba_url,e)
89 | pass
90 |
91 | # 解析每个贴吧前三页所有帖子的发帖人和回帖人的用户名
92 | def parse_username(self):
93 | try:
94 | # 解析每个帖子对应的发帖人和回帖人
95 | for url in self.url_list:
96 | filename = urllib.unquote(self.current_href.split("kw=")[1].split("&ie=")[0]) # 贴吧名字,也是文件名
97 | fw = open('data/%s/%s.txt' % (time.strftime('%Y%m%d'), filename), 'a')
98 |
99 | try:
100 | fw.write(url+"\t")
101 | num = self.get_num(url)
102 | for i in range(1,num+1):
103 | one_url = url+"?pn="+str(i) # https://tieba.baidu.com/p/5183701449?pn=1
104 | # print("total %s papges, now parse is %s page,url is:%s"%(num,i,one_url))
105 | # 解析用户名
106 | if self.get_page(one_url):
107 | li_list = BeautifulSoup(self.get_page(one_url), "html.parser").find_all('li',class_='d_name')
108 | for li in li_list:
109 | # print(li.a.get_text())
110 | fw.write(li.a.get_text().encode("utf-8")+"\t")
111 | # time.sleep(self.timesleep)
112 | else:
113 | pass
114 | fw.write("\n")
115 | fw.close()
116 | print(url)
117 | except Exception as e:
118 | self.error("parse_username",url,e)
119 | pass
120 |
121 | time.sleep(self.timesleep)
122 | except Exception as e:
123 | self.error("parse_username",url,e)
124 | pass
125 |
126 | def start(self):
127 | self.read() # load tieba_prepare name
128 | for url in self.tieba_list:
129 | try:
130 | self.current_href =url
131 | print("Start:",self.current_href,time.strftime("%Y-%m-%d %H-%M-%S")) #self.current_href,
132 | self.parse_href(url) # 解析该贴吧对应的前三页的每个帖子的链接
133 | self.parse_username() # 解析每个帖子的发帖人和回帖人
134 | except Exception as e:
135 | self.error("start","parse error at start",e)
136 | pass
137 |
138 | time.sleep(self.timesleep)
139 | print("Over:",time.strftime("%Y-%m-%d %H-%M-%S"))
--------------------------------------------------------------------------------
/0-Spider/tiebaSpider/spider1/spider.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/komeilkma/Machine-Learning-With-Python/8780eb4646292367915096bc18121e7c691a46dc/0-Spider/tiebaSpider/spider1/spider.pyc
--------------------------------------------------------------------------------
/0-Spider/tiebaSpider/spider1/tiebaname/name.txt:
--------------------------------------------------------------------------------
1 | 戒赌
2 | 足彩
3 | 福彩
4 | 汉中彩票
5 | 体彩
6 | 竞彩
7 | 双色球
8 | 深圳
9 | 上海
10 | 北京
11 | 武汉
12 | 福建
13 | 浙江
14 | 广州
15 | 哈尔滨
16 | 吉林
17 | 青岛
18 | 杭州
19 | 山东
20 | 重庆
21 | nba
22 | 曼联
23 | 科比
24 | 皇家马德里
25 | 巴塞罗那
26 | 切尔西
27 | ac米兰
28 | 北京国安
29 | 山东鲁能
30 | 国际米兰
31 | 拜仁慕尼黑
32 | 火箭
33 | 广州FC
34 | 詹姆斯
35 | 麦迪
36 | 利物浦
37 | 阿森纳
38 | 尤文图斯
39 | 洛杉矶湖人
40 | 上海申花
41 | 热火
42 | 梅西
43 | 德国队
44 | 江苏舜天
45 | 小小罗
46 | 天津泰达
47 | 死飞
48 | 欧洲杯
49 | 中超
50 | cba
51 | 河南建业
52 | 曼城
53 | 国足
54 | 意大利国家队
55 | 多特蒙德
56 | 英超
57 | 中国足球
58 | 库里
59 | 内马尔
60 | 罗伊斯
61 | 足球
62 | 篮球
63 | 网球
64 | 浙江绿城
65 | 苹果
66 | iphone
67 | 长春亚泰
68 | 英格兰
69 | 辽宁宏运
70 | 贵州人和
71 | 上海东亚
72 | 重庆力帆
73 | 西甲
74 | 马德里竞技
75 | 德甲
76 | 世界杯
77 | 艾弗森
78 | 韦德
79 | 马刺
80 | 易建联
81 | 北京金隅
82 | 广东宏远
83 | 李毅
84 | 扒皮
85 | 美女
86 | 小米
87 | 电影
88 | 内涵
89 | 动漫
90 | nba
91 | 头像
92 | 遮天
93 | exo
94 | 爆照
95 | 减肥
96 | 鹿晗
97 | 神回复
98 | dota
99 | 文字控
100 | 心理学
101 | 美食
102 | 校花
103 | 绿帽子小同学
104 | 旅行
105 | 小说
106 | 笑话
107 | 90后
108 | 高考
109 | 权志龙
110 | 吴亦凡
111 | 手绘
112 | 梦幻西游
113 | 旅游
114 | dota2
115 | les
116 | 胥渡
117 | 爱情
118 | 整形
119 | 隆鼻
120 | 腐女
121 | gay
122 | 搞笑
123 | 柯南
124 | 剑网
125 | 凡人修仙
126 | 周杰伦
127 | 刘诗诗
128 | 爱情公寓
129 | 陈奕迅
130 | 李敏浩
131 | 音乐
132 | bigbang
133 | 帅哥
134 | 淘宝
135 | 进击的巨人
136 | 张杰
137 | 网名
138 | 魅族
139 | 手机
140 | 短句
141 | 张艺兴
142 | 金秀贤
143 | 手工
144 | 路过的一只
145 | 娱乐圈
146 | 内涵图
147 | 章鱼卡
148 | 君似毒
149 | 黄子韬
150 | 秦时明月
151 | 杨幂
152 | 言情小说
153 | 化妆
154 | 天天酷跑
155 | 情感
156 | 2012
157 | 恐怖
158 | 维尼夫妇
159 | 整容
160 | vae
161 | 爱所以存在
162 | 吴世勋
163 | 吃货
--------------------------------------------------------------------------------
/0-Spider/tiebaSpider/spider2/tieba/.idea/misc.xml:
--------------------------------------------------------------------------------
(PyCharm .idea project file; XML content not preserved in this dump)
--------------------------------------------------------------------------------
/0-Spider/tiebaSpider/spider2/tieba/.idea/modules.xml:
--------------------------------------------------------------------------------
(PyCharm .idea project file; XML content not preserved in this dump)
--------------------------------------------------------------------------------
/0-Spider/tiebaSpider/spider2/tieba/.idea/tieba.iml:
--------------------------------------------------------------------------------
(PyCharm .idea project file; XML content not preserved in this dump)
--------------------------------------------------------------------------------
/0-Spider/tiebaSpider/spider2/tieba/.idea/workspace.xml:
--------------------------------------------------------------------------------
(PyCharm .idea workspace file; XML content not preserved in this dump)
--------------------------------------------------------------------------------
/0-Spider/tiebaSpider/spider2/tieba/name.txt:
--------------------------------------------------------------------------------
1 | 戒赌
2 | 足彩
3 | 福彩
4 | 汉中彩票
5 | 体彩
6 | 竞彩
7 | 双色球
8 | 深圳
9 | 上海
10 | 北京
11 | 武汉
12 | 福建
13 | 浙江
14 | 广州
15 | 哈尔滨
16 | 吉林
17 | 青岛
18 | 杭州
19 | 山东
20 | 重庆
21 | nba
22 | 曼联
23 | 科比
24 | 皇家马德里
25 | 巴塞罗那
26 | 切尔西
27 | ac米兰
28 | 北京国安
29 | 山东鲁能
30 | 国际米兰
31 | 拜仁慕尼黑
32 | 火箭
33 | 广州FC
34 | 詹姆斯
35 | 麦迪
36 | 利物浦
37 | 阿森纳
38 | 尤文图斯
39 | 洛杉矶湖人
40 | 上海申花
41 | 热火
42 | 梅西
43 | 德国队
44 | 江苏舜天
45 | 小小罗
46 | 天津泰达
47 | 死飞
48 | 欧洲杯
49 | 中超
50 | cba
51 | 河南建业
52 | 曼城
53 | 国足
54 | 意大利国家队
55 | 多特蒙德
56 | 英超
57 | 中国足球
58 | 库里
59 | 内马尔
60 | 罗伊斯
61 | 足球
62 | 篮球
63 | 网球
64 | 浙江绿城
65 | 苹果
66 | iphone
67 | 长春亚泰
68 | 英格兰
69 | 辽宁宏运
70 | 贵州人和
71 | 上海东亚
72 | 重庆力帆
73 | 西甲
74 | 马德里竞技
75 | 德甲
76 | 世界杯
77 | 艾弗森
78 | 韦德
79 | 马刺
80 | 易建联
81 | 北京金隅
82 | 广东宏远
83 | 李毅
84 | 扒皮
85 | 美女
86 | 小米
87 | 电影
88 | 内涵
89 | 动漫
90 | nba
91 | 头像
92 | 遮天
93 | exo
94 | 爆照
95 | 减肥
96 | 鹿晗
97 | 神回复
98 | dota
99 | 文字控
100 | 心理学
101 | 美食
102 | 校花
103 | 绿帽子小同学
104 | 旅行
105 | 小说
106 | 笑话
107 | 90后
108 | 高考
109 | 权志龙
110 | 吴亦凡
111 | 手绘
112 | 梦幻西游
113 | 旅游
114 | dota2
115 | les
116 | 胥渡
117 | 爱情
118 | 整形
119 | 隆鼻
120 | 腐女
121 | gay
122 | 搞笑
123 | 柯南
124 | 剑网
125 | 凡人修仙
126 | 周杰伦
127 | 刘诗诗
128 | 爱情公寓
129 | 陈奕迅
130 | 李敏浩
131 | 音乐
132 | bigbang
133 | 帅哥
134 | 淘宝
135 | 进击的巨人
136 | 张杰
137 | 网名
138 | 魅族
139 | 手机
140 | 短句
141 | 张艺兴
142 | 金秀贤
143 | 手工
144 | 路过的一只
145 | 娱乐圈
146 | 内涵图
147 | 章鱼卡
148 | 君似毒
149 | 黄子韬
150 | 秦时明月
151 | 杨幂
152 | 言情小说
153 | 化妆
154 | 天天酷跑
155 | 情感
156 | 2012
157 | 恐怖
158 | 维尼夫妇
159 | 整容
160 | vae
161 | 爱所以存在
162 | 吴世勋
163 | 吃货
164 |
--------------------------------------------------------------------------------
/0-Spider/tiebaSpider/spider2/tieba/scrapy.cfg:
--------------------------------------------------------------------------------
1 | # Automatically created by: scrapy startproject
2 | #
3 | # For more information about the [deploy] section see:
4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html
5 |
6 | [settings]
7 | default = tieba.settings
8 |
9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = tieba
12 |
--------------------------------------------------------------------------------
/0-Spider/tiebaSpider/spider2/tieba/tieba/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/komeilkma/Machine-Learning-With-Python/8780eb4646292367915096bc18121e7c691a46dc/0-Spider/tiebaSpider/spider2/tieba/tieba/__init__.py
--------------------------------------------------------------------------------
/0-Spider/tiebaSpider/spider2/tieba/tieba/__init__.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/komeilkma/Machine-Learning-With-Python/8780eb4646292367915096bc18121e7c691a46dc/0-Spider/tiebaSpider/spider2/tieba/tieba/__init__.pyc
--------------------------------------------------------------------------------
/0-Spider/tiebaSpider/spider2/tieba/tieba/items.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define here the models for your scraped items
4 | #
5 | # See documentation in:
6 | # http://doc.scrapy.org/en/latest/topics/items.html
7 |
8 | import scrapy
9 |
10 |
11 | class TiebaItem(scrapy.Item):
12 | # define the fields for your item here like:
13 | # name = scrapy.Field()
14 | pass
15 |
--------------------------------------------------------------------------------
/0-Spider/tiebaSpider/spider2/tieba/tieba/items.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/komeilkma/Machine-Learning-With-Python/8780eb4646292367915096bc18121e7c691a46dc/0-Spider/tiebaSpider/spider2/tieba/tieba/items.pyc
--------------------------------------------------------------------------------
/0-Spider/tiebaSpider/spider2/tieba/tieba/middlewares.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define here the models for your spider middleware
4 | #
5 | # See documentation in:
6 | # http://doc.scrapy.org/en/latest/topics/spider-middleware.html
7 |
8 | from scrapy import signals
9 |
10 |
11 | class TiebaSpiderMiddleware(object):
12 | # Not all methods need to be defined. If a method is not defined,
13 | # scrapy acts as if the spider middleware does not modify the
14 | # passed objects.
15 |
16 | @classmethod
17 | def from_crawler(cls, crawler):
18 | # This method is used by Scrapy to create your spiders.
19 | s = cls()
20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
21 | return s
22 |
23 | def process_spider_input(self, response, spider):
24 | # Called for each response that goes through the spider
25 | # middleware and into the spider.
26 |
27 | # Should return None or raise an exception.
28 | return None
29 |
30 | def process_spider_output(self, response, result, spider):
31 | # Called with the results returned from the Spider, after
32 | # it has processed the response.
33 |
34 | # Must return an iterable of Request, dict or Item objects.
35 | for i in result:
36 | yield i
37 |
38 | def process_spider_exception(self, response, exception, spider):
39 | # Called when a spider or process_spider_input() method
40 | # (from other spider middleware) raises an exception.
41 |
42 | # Should return either None or an iterable of Response, dict
43 | # or Item objects.
44 | pass
45 |
46 | def process_start_requests(self, start_requests, spider):
47 | # Called with the start requests of the spider, and works
48 | # similarly to the process_spider_output() method, except
49 | # that it doesn’t have a response associated.
50 |
51 | # Must return only requests (not items).
52 | for r in start_requests:
53 | yield r
54 |
55 | def spider_opened(self, spider):
56 | spider.logger.info('Spider opened: %s' % spider.name)
57 |
--------------------------------------------------------------------------------
/0-Spider/tiebaSpider/spider2/tieba/tieba/pipelines.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define your item pipelines here
4 | #
5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
7 |
8 | import time
9 |
10 |
11 | class TiebaPipeline(object):
12 |
13 | def process_item(self, item, spider):
14 | return item
15 |
--------------------------------------------------------------------------------
/0-Spider/tiebaSpider/spider2/tieba/tieba/pipelines.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/komeilkma/Machine-Learning-With-Python/8780eb4646292367915096bc18121e7c691a46dc/0-Spider/tiebaSpider/spider2/tieba/tieba/pipelines.pyc
--------------------------------------------------------------------------------
/0-Spider/tiebaSpider/spider2/tieba/tieba/settings.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Scrapy settings for tieba project
4 | #
5 | # For simplicity, this file contains only settings considered important or
6 | # commonly used. You can find more settings consulting the documentation:
7 | #
8 | # http://doc.scrapy.org/en/latest/topics/settings.html
9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
11 |
12 | BOT_NAME = 'tieba'
13 |
14 | SPIDER_MODULES = ['tieba.spiders']
15 | NEWSPIDER_MODULE = 'tieba.spiders'
16 |
17 |
18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent
19 | #USER_AGENT = 'tieba (+http://www.yourdomain.com)'
20 |
21 | # Obey robots.txt rules
22 | ROBOTSTXT_OBEY = True
23 |
24 | # Configure maximum concurrent requests performed by Scrapy (default: 16)
25 | #CONCURRENT_REQUESTS = 32
26 |
27 | # Configure a delay for requests for the same website (default: 0)
28 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
29 | # See also autothrottle settings and docs
30 | #DOWNLOAD_DELAY = 3
31 | # The download delay setting will honor only one of:
32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16
33 | #CONCURRENT_REQUESTS_PER_IP = 16
34 |
35 | # Disable cookies (enabled by default)
36 | #COOKIES_ENABLED = False
37 |
38 | # Disable Telnet Console (enabled by default)
39 | #TELNETCONSOLE_ENABLED = False
40 |
41 | # Override the default request headers:
42 | #DEFAULT_REQUEST_HEADERS = {
43 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
44 | # 'Accept-Language': 'en',
45 | #}
46 |
47 | # Enable or disable spider middlewares
48 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
49 | #SPIDER_MIDDLEWARES = {
50 | # 'tieba.middlewares.TiebaSpiderMiddleware': 543,
51 | #}
52 |
53 | # Enable or disable downloader middlewares
54 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
55 | #DOWNLOADER_MIDDLEWARES = {
56 | # 'tieba.middlewares.MyCustomDownloaderMiddleware': 543,
57 | #}
58 |
59 | # Enable or disable extensions
60 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
61 | #EXTENSIONS = {
62 | # 'scrapy.extensions.telnet.TelnetConsole': None,
63 | #}
64 |
65 | # Configure item pipelines
66 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
67 | ITEM_PIPELINES = {
68 | 'tieba.pipelines.TiebaPipeline': 300,
69 | }
70 |
71 | # Enable and configure the AutoThrottle extension (disabled by default)
72 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html
73 | #AUTOTHROTTLE_ENABLED = True
74 | # The initial download delay
75 | #AUTOTHROTTLE_START_DELAY = 5
76 | # The maximum download delay to be set in case of high latencies
77 | #AUTOTHROTTLE_MAX_DELAY = 60
78 | # The average number of requests Scrapy should be sending in parallel to
79 | # each remote server
80 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
81 | # Enable showing throttling stats for every response received:
82 | #AUTOTHROTTLE_DEBUG = False
83 |
84 | # Enable and configure HTTP caching (disabled by default)
85 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
86 | #HTTPCACHE_ENABLED = True
87 | #HTTPCACHE_EXPIRATION_SECS = 0
88 | #HTTPCACHE_DIR = 'httpcache'
89 | #HTTPCACHE_IGNORE_HTTP_CODES = []
90 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
91 |
--------------------------------------------------------------------------------
/0-Spider/tiebaSpider/spider2/tieba/tieba/settings.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/komeilkma/Machine-Learning-With-Python/8780eb4646292367915096bc18121e7c691a46dc/0-Spider/tiebaSpider/spider2/tieba/tieba/settings.pyc
--------------------------------------------------------------------------------
/0-Spider/tiebaSpider/spider2/tieba/tieba/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 |
--------------------------------------------------------------------------------
/0-Spider/tiebaSpider/spider2/tieba/tieba/spiders/__init__.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/komeilkma/Machine-Learning-With-Python/8780eb4646292367915096bc18121e7c691a46dc/0-Spider/tiebaSpider/spider2/tieba/tieba/spiders/__init__.pyc
--------------------------------------------------------------------------------
/0-Spider/tiebaSpider/spider2/tieba/tieba/spiders/tieba1.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 |
3 | import scrapy
4 | import urllib
5 | import time
6 |
7 |
8 | class TiebaSpider(scrapy.Spider):
9 |
10 | name = 'tieba'
11 |
12 | def __init__(self):
13 | self.urls = []
14 |
15 | # 加载贴吧名
16 | fr = open("name.txt", "r")
17 |
18 | for one in fr.readlines():
19 | for i in range(0, 3):
20 | self.urls.append('https://tieba.baidu.com/f?kw=' +
21 | urllib.quote(one.strip()) + '&ie=utf-8&pn=' + str(i * 50))
22 | fr.close()
23 |
24 | def start_requests(self):
25 | urls = self.urls
26 |
27 | for url in urls:
28 | yield scrapy.Request(url=url, callback=self.parse)
29 |
30 | def parse(self, response):
31 | sel = scrapy.Selector(response)
32 | ahref_list = sel.xpath(
33 | '//a[re:test(@class, "j_th_tit ")]//@href').extract()
34 |
35 | fw = open("data/%s_all_href.txt" % time.strftime('%Y%m%d'), "a")
36 | for ahref in ahref_list:
37 | href = "https://tieba.baidu.com" + ahref
38 | fw.write(href + "\n")
39 | fw.close()
40 |
--------------------------------------------------------------------------------
/0-Spider/tiebaSpider/spider2/tieba/tieba/spiders/tieba1.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/komeilkma/Machine-Learning-With-Python/8780eb4646292367915096bc18121e7c691a46dc/0-Spider/tiebaSpider/spider2/tieba/tieba/spiders/tieba1.pyc
--------------------------------------------------------------------------------
/0-Spider/tiebaSpider/spider2/tieba/tieba/spiders/tieba2.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 |
3 | import scrapy
4 | import time
5 | from scrapy.http.request import Request
6 | from scrapy.http import HtmlResponse
7 |
8 | class TiebaSpider2(scrapy.Spider):
9 |
10 | name = 'tieba2'
11 |
12 | def __init__(self):
13 | self.urls = []
14 |
15 | # 加载贴吧名
16 | fr = open("data/%s_all_href.txt" % time.strftime('%Y%m%d'), "r")
17 |
18 | for one in fr.readlines():
19 | self.urls.append(one.strip())
20 | fr.close()
21 |
22 | def start_requests(self):
23 | urls = self.urls
24 |
25 | for one in urls:
26 | yield scrapy.Request(url=one, callback=self.parse)
27 |
28 | def parse_uname(self, response):
29 | # response = HtmlResponse(url=page_url.url)
30 | sel = scrapy.Selector(response)
31 | name_list = sel.xpath('//li[re:test(@class, "d_name")]//a/text()').extract()
32 | # print respons
33 | fw = open("data/%s_all_name.txt" % time.strftime('%Y%m%d'), "a")
34 | for name in list(set(name_list)):
35 | fw.write(name.encode("utf-8"))
36 | fw.write("\n")
37 | fw.close()
38 |
39 | def parse(self, response):
40 | sel = scrapy.Selector(response)
41 |
42 | # 可能有些帖子被删除
43 | try:
44 | # 得到每个帖子有多少页
45 | num = int(sel.xpath('//span[re:test(@class,"red")]//text()').extract()[1])
46 | # 遍历每页获得用户名
47 | for page_num in range(1, num + 1):
48 | one_url = response.url + "?pn=" + str(page_num)
49 |
50 | yield Request(url=one_url, callback=self.parse_uname)
51 | except Exception as e:
52 | pass
53 |
54 |
55 |
56 |
57 |
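# --- Usage note (an editor's addition, not part of the original file) ---
# Based on the spider names defined in tieba1.py ("tieba") and here ("tieba2"),
# and on the data/<date>_all_href.txt file handed from one to the other, a typical
# run from the Scrapy project root (next to scrapy.cfg, with name.txt and a data/
# directory already present) would be:
#   scrapy crawl tieba     # collect thread URLs into data/<date>_all_href.txt
#   scrapy crawl tieba2    # visit those URLs and write data/<date>_all_name.txt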
--------------------------------------------------------------------------------
/0-Spider/tiebaSpider/spider2/tieba/tieba/spiders/tieba2.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/komeilkma/Machine-Learning-With-Python/8780eb4646292367915096bc18121e7c691a46dc/0-Spider/tiebaSpider/spider2/tieba/tieba/spiders/tieba2.pyc
--------------------------------------------------------------------------------
/AdaBoost/AdaBoost.py:
--------------------------------------------------------------------------------
1 | #-*-coding:utf-8-*-
2 | '''
3 | Created on 2016年5月10日
4 |
5 | @author: Gamer Think
6 | '''
7 |
8 |
9 | __author__="thinkgamer"
10 |
11 | from numpy import *
12 |
13 | #加载数据集
14 | def loadSimData():
15 | datMat = matrix([[1.0 , 2.1],
16 | [2. , 1.1],
17 | [1.3 , 1. ],
18 | [1. , 1. ],
19 | [2. , 1. ]])
20 |
21 | classLabels = [1.0, 1.0, -1.0, -1.0, 1.0]
22 | return datMat,classLabels
23 |
24 | #单层决策树生成函数
25 | def stumpClassify(dataMatrix, dimen,threshVal, threshInsq):
26 | retArray = ones((shape(dataMatrix)[0],1))
27 | if threshInsq == 'lt':
28 | retArray[dataMatrix[:,dimen] <= threshVal] = -1.0
29 | else:
30 | retArray[dataMatrix[:,dimen] > threshVal] = -1.0
31 | return retArray
32 |
33 | def buildStump(dataArr,classLabels,D):
34 | dataMatrix = mat(dataArr)
35 | #matrix必须是二维的,numpy可以是多维的
36 | labelMat = mat(classLabels).T #.T表示转置矩阵
37 | m,n = shape(dataMatrix) #给定数据集的行列数
38 | numSteps = 10.0 # number of steps used when sweeping over each feature's value range
39 | bestStump = {} # dict holding the best single-layer decision stump found for the given weight vector D
40 | bestClassEnt = mat(zeros((m,1)))
41 | minError = inf #首先将minError初始化为正无穷大
42 | for i in range(n):
43 | rangeMin = dataMatrix[:,i].min()
44 | rangeMax = dataMatrix[:,i].max()
45 | stepSize = (rangeMax-rangeMin)/numSteps
46 | for j in range(-1,int(numSteps)+1):
47 | #lt :小于,lte,le:小于等于
48 | #gt:大于,,gte,ge:大于等于
49 | #eq:等于 ne,neq:不等于
50 | for inequal in ['lt','gt']:
51 | threshVal = (rangeMin + float(j) * stepSize)
52 | predictedVals = stumpClassify(dataMatrix,i,threshVal, inequal)
53 | errArr = mat(ones((m,1)))
54 | errArr[predictedVals==labelMat]=0
55 | weightedError = D.T * errArr #计算加权错误概率
56 | # print "split: dim %d, thresh % .2f, thresh inequal: %s, the weighted error is %.3f" % (i, threshVal,inequal,weightedError)
57 | #更新bestStump中保存的最佳单层决策树的相关信息
58 | if weightedError < minError:
59 | minError = weightedError
60 | bestClassEnt = predictedVals.copy()
61 | bestStump['dim'] = i
62 | bestStump['thresh'] = threshVal
63 | bestStump['ineq'] = inequal
64 |
65 | return bestStump,minError,bestClassEnt
66 |
67 | #基于单层决策树的AdaBoost训练过程
68 | #numIt:迭代次数,默认为40
69 | def adaBoostTrainDS(dataArr,classLabels,numIt=40):
70 | weakClassArr = []
71 | m= shape(dataArr)[0]
72 | D = mat(ones((m,1))/m)
73 | aggClassEst = mat(zeros((m,1)))
74 | #迭代
75 | for i in range(numIt):
76 | #调用单层决策树
77 | bestStump,error,classEst = buildStump(dataArr, classLabels, D)
78 | print "D:",D.T #打印D的转置矩阵
79 | alpha = float(0.5 * log((1.0 - error) / max(error,1e-16)))# max(error,1e-16)))用于确保没有错误时,不会发生溢出
80 | bestStump['alpha'] = alpha
81 | weakClassArr.append(bestStump)
82 | print "classEst:",classEst.T
83 | #为下一次迭代计算D
84 | expon = multiply(-1 * alpha * mat(classLabels).T,classEst)
85 | D = multiply(D,exp(expon))
86 | D = D /D.sum()
87 | #错误率累加计算
88 | aggClassEst += alpha* classEst
89 | print "aggClassEst:",aggClassEst.T
90 | aggErrors = multiply(sign(aggClassEst) != mat(classLabels).T, ones((m,1)))
91 | errorRate = aggErrors.sum()/m
92 | print "total error:",errorRate
93 | #如果不发生错误,返回
94 | if errorRate == 0.0:
95 | break
96 | return weakClassArr
97 |
98 |
99 | #AdaBoost分类函数
100 | #输入参数为待分类样例datToClass和多个弱分类器classifierArr
101 | def adaClassify(datToClass,classifierArr):
102 | dataMatrix = mat(datToClass)
103 | m = shape(dataMatrix)[0]
104 | aggClassEst = mat(zeros((m,1)))
105 | for i in range(len(classifierArr)):
106 | classEst = stumpClassify(dataMatrix,classifierArr[i]['dim'],\
107 | classifierArr[i]['thresh'],\
108 | classifierArr[i]['ineq'])
109 | aggClassEst+= classifierArr[i]['alpha'] * classEst
110 | print aggClassEst
111 | return sign(aggClassEst)
112 |
113 |
114 | #main函数
115 | if __name__=="__main__":
116 | #加载数据集
117 | datMat,classLabels = loadSimData()
118 | # print "datMat:",datMat
119 | # print "classLabels:",classLabels
120 |
121 | #单层决策树生成函数
122 | # D = mat(ones((5,1))/5)
123 | # print buildStump(datMat, classLabels, D)
124 |
125 | #基于单层决策树的Adaboost训练过程
126 | classifierArray = adaBoostTrainDS(datMat, classLabels, 30)
127 | # for classifier in classifierArray:
128 | # print classifier
129 |
130 | #测试AdaBoost分类函数
131 | print "[0,0]:\n",adaClassify([0,0], classifierArray)
132 | print "\n\n[[5,5],[0,0]]:\n",adaClassify([[5,5],[0,0]], classifierArray)
133 |
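# --- Reference formulas (an editor's addition, restating what the code above computes) ---
# Weak-learner weight:   alpha = 0.5 * ln((1 - epsilon) / epsilon), with epsilon the weighted error
# Weight update:         D_i <- D_i * exp(-alpha * y_i * h(x_i)), then D is renormalised to sum to 1
# Aggregate classifier:  H(x) = sign( sum_t alpha_t * h_t(x) )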
--------------------------------------------------------------------------------
/Apriori/Apriori.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/komeilkma/Machine-Learning-With-Python/8780eb4646292367915096bc18121e7c691a46dc/Apriori/Apriori.py
--------------------------------------------------------------------------------
/Bayes/bayes.py:
--------------------------------------------------------------------------------
1 | #encoding:utf-8
2 | '''
3 | Created on 2016年5月12日
4 |
5 | @author: Gamer Think
6 | '''
7 |
8 | from numpy import *
9 |
10 | #词表到向量的转换函数
11 | def loadDataSet():
12 | postingList = [['my','dog','has','flea','problems','help','please'],
13 | ['maybe','not','take','him','to','dog','park','stupid'],
14 | ['my','dalmation','is','so','cute','I','love','him'],
15 | ['stop','posting','stupid','worthless','garbage'],
16 | ['mr','licks','ate','my','steak','how','to','stop','him'],
17 | ['quit','buying','worthless','dog','food','stupid']]
18 | classVec = [0,1,0,1,0,1] #1,侮辱 0,正常
19 | return postingList,classVec
20 |
21 | def createVocabList(dataSet):
22 | vocabSet = set([]) #调用set方法,创建一个空集
23 | for document in dataSet:
24 | vocabSet = vocabSet | set(document) #创建两个集合的并集
25 | return list(vocabSet)
26 |
27 | def setOfWords2Vec(vocabList,inputSet):
28 | returnVec = [0]*len(vocabList) #创建一个所含元素都为0的向量
29 | for word in inputSet:
30 | if word in vocabList:
31 | returnVec[vocabList.index(word)] = 1
32 | else:
33 | print "the word:%s is not in my Vocabulary" % word
34 | return returnVec
35 |
36 |
37 | def bagOfWords2VecMN(vocabList,inputSet):
38 | returnVec = [0]*len(vocabList) #创建一个所含元素都为0的向量
39 | for word in inputSet:
40 | if word in vocabList:
41 | returnVec[vocabList.index(word)] += 1
42 | return returnVec
43 |
44 |
45 | #朴素贝叶斯分类器训练集
46 | def trainNB0(trainMatrix,trainCategory): #传入参数为文档矩阵,每篇文档类别标签所构成的向量
47 | numTrainDocs = len(trainMatrix) #文档矩阵的长度
48 | numWords = len(trainMatrix[0]) #第一个文档的单词个数
49 | pAbusive = sum(trainCategory)/float(numTrainDocs) #任意文档属于侮辱性文档概率
50 | #p0Num = zeros(numWords);p1Num = zeros(numWords) #初始化两个矩阵,长度为numWords,内容值为0
51 | p0Num = ones(numWords);p1Num = ones(numWords) #初始化两个矩阵,长度为numWords,内容值为1
52 | #p0Denom = 0.0;p1Denom = 0.0 #初始化概率
53 | p0Denom = 2.0;p1Denom = 2.0
54 | for i in range(numTrainDocs):
55 | if trainCategory[i]==1:
56 | p1Num +=trainMatrix[i]
57 | p1Denom += sum(trainMatrix[i])
58 | else:
59 | p0Num +=trainMatrix[i]
60 | p0Denom += sum(trainMatrix[i])
61 | #p1Vect = p1Num/p1Denom #对每个元素做除法
62 | #p0Vect = p0Num/p0Denom
63 | p1Vect = log(p1Num/p1Denom)
64 | p0Vect = log(p0Num/p0Denom)
65 | return p0Vect,p1Vect,pAbusive
66 |
67 | #朴素贝叶斯分类函数
68 | def classifyNB(vec2Classify,p0Vec,p1Vec,pClass1):
69 | p1 = sum(vec2Classify * p1Vec) + log(pClass1) #元素相乘
70 | p0 = sum(vec2Classify * p0Vec) + log(1.0 - pClass1)
71 | if p1>p0:
72 | return 1
73 | else:
74 | return 0
75 |
76 | def testingNB():
77 | listOPosts,listClasses = loadDataSet() #产生文档矩阵和对应的标签
78 | myVocabList = createVocabList(listOPosts) #创建并集
79 | trainMat = [] #创建一个空的列表
80 | for postinDoc in listOPosts:
81 | trainMat.append(setOfWords2Vec(myVocabList,postinDoc)) #使用词向量来填充trainMat列表
82 | p0V,p1V,pAb = trainNB0(array(trainMat),array(listClasses)) #训练函数
83 | testEntry = ['love','my','dalmation'] #测试文档列表
84 | thisDoc = array(setOfWords2Vec(myVocabList,testEntry)) #声明矩阵
85 | print testEntry,'classified as:',classifyNB(thisDoc,p0V,p1V,pAb)
86 | testEntry = ['stupid','garbage']
87 | thisDoc = array(setOfWords2Vec(myVocabList,testEntry)) #声明矩阵
88 | print testEntry,'classified as:',classifyNB(thisDoc,p0V,p1V,pAb)
89 |
90 | if __name__=="__main__":
91 | testingNB()
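# --- Reference note (an editor's addition, restating what the code above computes) ---
# trainNB0 applies Laplace (add-one) smoothing: P(w|c) = (count(w, c) + 1) / (total words in c + 2),
# and stores log-probabilities, so classifyNB compares
#   log P(c=1) + sum_i x_i * log P(w_i|c=1)   vs.   log P(c=0) + sum_i x_i * log P(w_i|c=0)
# which avoids numerical underflow from multiplying many small probabilities.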
--------------------------------------------------------------------------------
/Decision-Tree/DecisionTree-ID3.py:
--------------------------------------------------------------------------------
1 | #coding=utf-8
2 | '''
3 | '''
4 | from math import log
5 | import operator
6 |
7 | def createDataSet():
8 | dataSet =[[1,1,'yes'],
9 | [1,1,'yes'],
10 | [1,0,'no'],
11 | [0,1,'no'],
12 | [0,1,'no']]
13 | labels = ['no surfacing','flippers'] #分类的属性
14 | return dataSet,labels
15 |
16 | #计算给定数据的香农熵
17 | def calcShannonEnt(dataSet):
18 | numEntries = len(dataSet)
19 | labelCounts = {}
20 | for featVec in dataSet:
21 | currentLabel = featVec[-1] #获得标签
22 | #构造存放标签的字典
23 | if currentLabel not in labelCounts.keys():
24 | labelCounts[currentLabel]=0
25 | labelCounts[currentLabel]+=1 #对应的标签数目+1
26 | #计算香农熵
27 | shannonEnt = 0.0
28 | for key in labelCounts:
29 | prob = float(labelCounts[key])/numEntries
30 | shannonEnt -=prob*log(prob,2)
31 | return shannonEnt
32 |
33 | #划分数据集,三个参数为带划分的数据集,划分数据集的特征,特征的返回值
34 | def splitDataSet(dataSet,axis,value):
35 | retDataSet = []
36 | for featVec in dataSet:
37 | if featVec[axis] ==value:
38 | #将相同数据集特征的抽取出来
39 | reducedFeatVec = featVec[:axis]
40 | reducedFeatVec.extend(featVec[axis+1:])
41 | retDataSet.append(reducedFeatVec)
42 | return retDataSet #返回一个列表
43 |
44 | #选择最好的数据集划分方式
45 | def chooseBestFeatureToSplit(dataSet):
46 | numFeature = len(dataSet[0])-1
47 | baseEntropy = calcShannonEnt(dataSet)
48 | bestInfoGain = 0.0
49 | bestFeature = -1 # default if no split improves the information gain
50 | for i in range(numFeature):
51 | featureList = [example[i] for example in dataSet] #获取第i个特征所有的可能取值
52 | uniqueVals = set(featureList) # build a set from the list to get the distinct values of this feature
53 | newEntropy = 0.0
54 | for value in uniqueVals:
55 | subDataSet = splitDataSet(dataSet,i,value) #以i为数据集特征,value为返回值,划分数据集
56 | prob = len(subDataSet)/float(len(dataSet)) #数据集特征为i的所占的比例
57 | newEntropy +=prob * calcShannonEnt(subDataSet) #计算每种数据集的信息熵
58 | infoGain = baseEntropy- newEntropy
59 | #计算最好的信息增益,增益越大说明所占决策权越大
60 | if (infoGain > bestInfoGain):
61 | bestInfoGain = infoGain
62 | bestFeature = i
63 | return bestFeature
64 |
65 | #递归构建决策树
66 | def majorityCnt(classList):
67 | classCount = {}
68 | for vote in classList:
69 | if vote not in classCount.keys():
70 | classCount[vote]=0
71 | classCount[vote]+=1
72 | sortedClassCount = sorted(classCount.iteritems(), key=operator.itemgetter(1), reverse=True) # sort by count, descending
73 | return sortedClassCount[0][0] #返回出现次数最多的
74 |
75 | #创建树的函数代码
76 | def createTree(dataSet,labels):
77 | classList = [example[-1] for example in dataSet]
78 | if classList.count(classList[0])==len(classList):#类别完全相同则停止划分
79 | return classList[0]
80 | if len(dataSet[0]) ==1: #遍历完所有特征值时返回出现次数最多的
81 | return majorityCnt(classList)
82 | bestFeat = chooseBestFeatureToSplit(dataSet) #选择最好的数据集划分方式
83 | bestFeatLabel = labels[bestFeat] #得到对应的标签值
84 | myTree = {bestFeatLabel:{}}
85 | del(labels[bestFeat]) #清空labels[bestFeat],在下一次使用时清零
86 | featValues = [example[bestFeat] for example in dataSet]
87 | uniqueVals = set(featValues)
88 | for value in uniqueVals:
89 | subLabels =labels[:]
90 | #递归调用创建决策树函数
91 | myTree[bestFeatLabel][value]=createTree(splitDataSet(dataSet,bestFeat,value),subLabels)
92 | return myTree
93 |
94 | if __name__=="__main__":
95 | dataSet,labels = createDataSet()
96 | print createTree(dataSet,labels)
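# --- Reference formulas (an editor's addition, restating what the code above computes) ---
# Shannon entropy:    H(D) = -sum_k p_k * log2(p_k)                              (calcShannonEnt)
# Information gain:   Gain(D, A) = H(D) - sum_v (|D_v| / |D|) * H(D_v)           (chooseBestFeatureToSplit)
# createTree recursively splits on the feature with the largest gain, i.e. the ID3 algorithm.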
--------------------------------------------------------------------------------
/FP-growth/FP_Tree.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/komeilkma/Machine-Learning-With-Python/8780eb4646292367915096bc18121e7c691a46dc/FP-growth/FP_Tree.py
--------------------------------------------------------------------------------
/FP-growth/newsClickStream.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/komeilkma/Machine-Learning-With-Python/8780eb4646292367915096bc18121e7c691a46dc/FP-growth/newsClickStream.py
--------------------------------------------------------------------------------
/FP-growth/所用到dat文件下载地址.txt:
--------------------------------------------------------------------------------
1 | http://download.csdn.net/detail/gamer_gyt/9514873
--------------------------------------------------------------------------------
/K-means/kMeans.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/komeilkma/Machine-Learning-With-Python/8780eb4646292367915096bc18121e7c691a46dc/K-means/kMeans.py
--------------------------------------------------------------------------------
/Logistic Regession/LogisticRegession.py:
--------------------------------------------------------------------------------
1 | #coding:utf-8
2 | '''
3 | Created on 2016/4/24
4 |
5 | @author: Gamer Think
6 | '''
7 |
8 | from numpy import *
9 |
10 | #load the data set
11 | def loadDataSet():
12 | dataMat = []
13 | labelMat = []
14 | fp = open("ex1.txt")
15 | for line in fp.readlines():
16 |         lineArr = line.strip().split()   #split on whitespace
17 | dataMat.append([1.0,float(lineArr[0]), float(lineArr[1])])
18 | labelMat.append( int(lineArr[2]))
19 |
20 | return dataMat,labelMat
21 |
22 | #the sigmoid function
23 | def sigmoid(inX):
24 | return 1.0/(1+exp(-inX))
25 |
26 | #gradient ascent: solve for the best regression coefficients
27 | def gradAscent(dataMatIn,classLabels):
28 |     dataMatrix = mat(dataMatIn)                #convert the array to a matrix
29 |     labelMat = mat(classLabels).transpose()
30 |     m,n = shape(dataMatrix)                    #number of rows and columns
31 |     alpha = 0.001                              #step size
32 |     maxCycles = 500                            #maximum number of iterations
33 |     weights = ones((n,1))                      #initial regression coefficients
34 |     for i in range(0,maxCycles):
35 |         #gradient step, following the original book code
36 | h = sigmoid(dataMatrix*weights)
37 | error = labelMat - h
38 | weights = weights + alpha * dataMatrix.transpose() * error
39 |
40 | return weights
41 |
42 | #stochastic gradient ascent
43 | def stocGradAscent0(dataMatrix,labelMat):
44 | dataMatrix = array(dataMatrix)
45 | m,n = shape(dataMatrix)
46 | alpha = 0.01
47 | weights = ones(n)
48 | for i in range(0,m):
49 | h = sigmoid(sum(dataMatrix[i]*weights))
50 | error = labelMat[i] - h
51 | weights = weights + alpha * error * dataMatrix[i]
52 |
53 | return weights
54 |
55 |
56 | #improved stochastic gradient ascent (decaying alpha, random sample order)
57 | def stocGradAscent1(dataMatrix,labelMat,numIter=150):
58 | m,n = shape(dataMatrix)
59 | weights = ones(n)
60 | for i in range(0,numIter):
61 | dataIndex = range(m)
62 | for j in range(0,m):
63 | alpha = 4/(1.0+j+i)+0.01
64 | randIndex = int(random.uniform(0,len(dataIndex)))
65 | h = sigmoid(sum(dataMatrix[randIndex] * weights))
66 | error = labelMat[randIndex] - h
67 | weights = weights + alpha * error * dataMatrix[randIndex]
68 | del(dataIndex[randIndex])
69 |
70 | return weights
71 |
72 | #analyze the data and plot the decision boundary
73 | def plotBestFit(wei,dataMatrix,labelMat):
74 |     import matplotlib.pyplot as plt
75 |     weights = wei                       #expects a plain array (use weights.getA() for a numpy matrix)
76 |     dataArr = array(dataMatrix)         #convert the matrix to an array
77 | n = shape(dataMatrix)[0]
78 | xcord1 = [];ycord1=[]
79 | xcord2 = [];ycord2=[]
80 |
81 | for i in range(n):
82 | if int(labelMat[i])==1:
83 | xcord1.append(dataArr[i,1])
84 | ycord1.append(dataArr[i,2])
85 | else:
86 | xcord2.append(dataArr[i,1])
87 | ycord2.append(dataArr[i,2])
88 |
89 | fig = plt.figure()
90 | ax = fig.add_subplot(111)
91 | ax.scatter(xcord1,ycord1,s=30,c='red', marker='s')
92 | ax.scatter(xcord2,ycord2,s=30,c="green")
93 | x = arange(-3.0,3.0,0.1)
94 | y = (-weights[0]-weights[1] * x)/weights[2]
95 | ax.plot(x,y)
96 |     plt.xlabel("x1")   #label of the X axis
97 |     plt.ylabel("x2")   #label of the Y axis
98 | plt.show()
99 |
100 |
101 |
102 | if __name__=="__main__":
103 | dataMatrix,labelMat = loadDataSet()
104 |     #batch gradient ascent
105 | # weight = gradAscent(dataMatrix, labelMat)
106 | # print weight
107 | # plotBestFit(weight.getA(),dataMatrix,labelMat)
108 |
109 |     #stochastic gradient ascent
110 | # weight = stocGradAscent0(dataMatrix, labelMat)
111 | # print weight
112 | # plotBestFit(weight,dataMatrix,labelMat)
113 |
114 |     #improved stochastic gradient ascent
115 | weight = stocGradAscent1(array(dataMatrix), labelMat)
116 | print weight
117 | plotBestFit(weight,dataMatrix,labelMat)
118 |
119 |
--------------------------------------------------------------------------------
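The update rule inside gradAscent above is plain gradient ascent on the logistic log-likelihood: w <- w + alpha * X^T (y - sigmoid(Xw)). A small standalone sketch (hypothetical, written with NumPy ndarrays rather than the matrix type used in the file) applies the same rule to synthetic two-class data:

    import numpy as np

    def sigmoid(z):
        return 1.0 / (1.0 + np.exp(-z))

    rng = np.random.RandomState(0)
    X = np.vstack([rng.randn(50, 2) + [2, 2], rng.randn(50, 2) - [2, 2]])
    X = np.hstack([np.ones((100, 1)), X])            # prepend a bias column, as loadDataSet does
    y = np.hstack([np.ones(50), np.zeros(50)])

    w = np.ones(3)
    for _ in range(500):
        w += 0.001 * X.T.dot(y - sigmoid(X.dot(w)))  # same step as gradAscent, in ndarray form

    pred = (sigmoid(X.dot(w)) > 0.5).astype(int)
    print("training accuracy:", (pred == y).mean())  # close to 1.0 on this separable toy data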
/Logistic Regession/LogisticRegessionExample.py:
--------------------------------------------------------------------------------
1 | #coding:utf-8
2 | '''
3 | Created on 2016/4/25
4 |
5 | @author: Gamer Think
6 | '''
7 | import LogisticRegession as lr
8 | from numpy import *
9 |
10 | #classify a single sample for the binary problem
11 | def classifyVector(inX,weights):
12 | prob = lr.sigmoid(sum(inX * weights))
13 | if prob>0.5:
14 | return 1.0
15 | else:
16 | return 0.0
17 |
18 | #train and test on the horse colic data
19 | def colicTest():
20 | frTrain = open('horseColicTraining.txt'); frTest = open('horseColicTest.txt')
21 | trainingSet = []; trainingLabels = []
22 |     #train the regression model
23 | for line in frTrain.readlines():
24 | currLine = line.strip().split('\t')
25 | lineArr =[]
26 | for i in range(21):
27 | lineArr.append(float(currLine[i]))
28 | trainingSet.append(lineArr)
29 | trainingLabels.append(float(currLine[21]))
30 | trainWeights = lr.stocGradAscent1(array(trainingSet), trainingLabels, 1000)
31 | errorCount = 0; numTestVec = 0.0
32 |     #test the regression model
33 | for line in frTest.readlines():
34 | numTestVec += 1.0
35 | currLine = line.strip().split('\t')
36 | lineArr =[]
37 | for i in range(21):
38 | lineArr.append(float(currLine[i]))
39 | if int(classifyVector(array(lineArr), trainWeights))!= int(currLine[21]):
40 | errorCount += 1
41 | errorRate = (float(errorCount)/numTestVec)
42 | print "the error rate of this test is: %f" % errorRate
43 | return errorRate
44 |
45 | def multiTest():
46 | numTests = 10
47 | errorSum = 0.0
48 | for k in range(numTests):
49 | errorSum += colicTest()
50 | print "after %d iterations the average error rate is: %f" % (numTests,errorSum/float(numTests))
51 |
52 |
53 | if __name__=="__main__":
54 | multiTest()
55 |
--------------------------------------------------------------------------------
/Logistic Regession/ex1.txt:
--------------------------------------------------------------------------------
1 | -0.017612 14.053064 0
2 | -1.395634 4.662541 1
3 | -0.752157 6.538620 0
4 | -1.322371 7.152853 0
5 | 0.423363 11.054677 0
6 | 0.406704 7.067335 1
7 | 0.667394 12.741452 0
8 | -2.460150 6.866805 1
9 | 0.569411 9.548755 0
10 | -0.026632 10.427743 0
11 | 0.850433 6.920334 1
12 | 1.347183 13.175500 0
13 | 1.176813 3.167020 1
14 | -1.781871 9.097953 0
15 | -0.566606 5.749003 1
16 | 0.931635 1.589505 1
17 | -0.024205 6.151823 1
18 | -0.036453 2.690988 1
19 | -0.196949 0.444165 1
20 | 1.014459 5.754399 1
21 | 1.985298 3.230619 1
22 | -1.693453 -0.557540 1
23 | -0.576525 11.778922 0
24 | -0.346811 -1.678730 1
25 | -2.124484 2.672471 1
26 | 1.217916 9.597015 0
27 | -0.733928 9.098687 0
28 | -3.642001 -1.618087 1
29 | 0.315985 3.523953 1
30 | 1.416614 9.619232 0
31 | -0.386323 3.989286 1
32 | 0.556921 8.294984 1
33 | 1.224863 11.587360 0
34 | -1.347803 -2.406051 1
35 | 1.196604 4.951851 1
36 | 0.275221 9.543647 0
37 | 0.470575 9.332488 0
38 | -1.889567 9.542662 0
39 | -1.527893 12.150579 0
40 | -1.185247 11.309318 0
41 | -0.445678 3.297303 1
42 | 1.042222 6.105155 1
43 | -0.618787 10.320986 0
44 | 1.152083 0.548467 1
45 | 0.828534 2.676045 1
46 | -1.237728 10.549033 0
47 | -0.683565 -2.166125 1
48 | 0.229456 5.921938 1
49 | -0.959885 11.555336 0
50 | 0.492911 10.993324 0
51 | 0.184992 8.721488 0
52 | -0.355715 10.325976 0
53 | -0.397822 8.058397 0
54 | 0.824839 13.730343 0
55 | 1.507278 5.027866 1
56 | 0.099671 6.835839 1
57 | -0.344008 10.717485 0
58 | 1.785928 7.718645 1
59 | -0.918801 11.560217 0
60 | -0.364009 4.747300 1
61 | -0.841722 4.119083 1
62 | 0.490426 1.960539 1
63 | -0.007194 9.075792 0
64 | 0.356107 12.447863 0
65 | 0.342578 12.281162 0
66 | -0.810823 -1.466018 1
67 | 2.530777 6.476801 1
68 | 1.296683 11.607559 0
69 | 0.475487 12.040035 0
70 | -0.783277 11.009725 0
71 | 0.074798 11.023650 0
72 | -1.337472 0.468339 1
73 | -0.102781 13.763651 0
74 | -0.147324 2.874846 1
75 | 0.518389 9.887035 0
76 | 1.015399 7.571882 0
77 | -1.658086 -0.027255 1
78 | 1.319944 2.171228 1
79 | 2.056216 5.019981 1
80 | -0.851633 4.375691 1
81 | -1.510047 6.061992 0
82 | -1.076637 -3.181888 1
83 | 1.821096 10.283990 0
84 | 3.010150 8.401766 1
85 | -1.099458 1.688274 1
86 | -0.834872 -1.733869 1
87 | -0.846637 3.849075 1
88 | 1.400102 12.628781 0
89 | 1.752842 5.468166 1
90 | 0.078557 0.059736 1
91 | 0.089392 -0.715300 1
92 | 1.825662 12.693808 0
93 | 0.197445 9.744638 0
94 | 0.126117 0.922311 1
95 | -0.679797 1.220530 1
96 | 0.677983 2.556666 1
97 | 0.761349 10.693862 0
98 | -2.168791 0.143632 1
99 | 1.388610 9.341997 0
100 | 0.317029 14.739025 0
--------------------------------------------------------------------------------
/Logistic Regession/horseColicTest.txt:
--------------------------------------------------------------------------------
1 | 2 1 38.50 54 20 0 1 2 2 3 4 1 2 2 5.90 0 2 42.00 6.30 0 0 1
2 | 2 1 37.60 48 36 0 0 1 1 0 3 0 0 0 0 0 0 44.00 6.30 1 5.00 1
3 | 1 1 37.7 44 28 0 4 3 2 5 4 4 1 1 0 3 5 45 70 3 2 1
4 | 1 1 37 56 24 3 1 4 2 4 4 3 1 1 0 0 0 35 61 3 2 0
5 | 2 1 38.00 42 12 3 0 3 1 1 0 1 0 0 0 0 2 37.00 5.80 0 0 1
6 | 1 1 0 60 40 3 0 1 1 0 4 0 3 2 0 0 5 42 72 0 0 1
7 | 2 1 38.40 80 60 3 2 2 1 3 2 1 2 2 0 1 1 54.00 6.90 0 0 1
8 | 2 1 37.80 48 12 2 1 2 1 3 0 1 2 0 0 2 0 48.00 7.30 1 0 1
9 | 2 1 37.90 45 36 3 3 3 2 2 3 1 2 1 0 3 0 33.00 5.70 3 0 1
10 | 2 1 39.00 84 12 3 1 5 1 2 4 2 1 2 7.00 0 4 62.00 5.90 2 2.20 0
11 | 2 1 38.20 60 24 3 1 3 2 3 3 2 3 3 0 4 4 53.00 7.50 2 1.40 1
12 | 1 1 0 140 0 0 0 4 2 5 4 4 1 1 0 0 5 30 69 0 0 0
13 | 1 1 37.90 120 60 3 3 3 1 5 4 4 2 2 7.50 4 5 52.00 6.60 3 1.80 0
14 | 2 1 38.00 72 36 1 1 3 1 3 0 2 2 1 0 3 5 38.00 6.80 2 2.00 1
15 | 2 9 38.00 92 28 1 1 2 1 1 3 2 3 0 7.20 0 0 37.00 6.10 1 1.10 1
16 | 1 1 38.30 66 30 2 3 1 1 2 4 3 3 2 8.50 4 5 37.00 6.00 0 0 1
17 | 2 1 37.50 48 24 3 1 1 1 2 1 0 1 1 0 3 2 43.00 6.00 1 2.80 1
18 | 1 1 37.50 88 20 2 3 3 1 4 3 3 0 0 0 0 0 35.00 6.40 1 0 0
19 | 2 9 0 150 60 4 4 4 2 5 4 4 0 0 0 0 0 0 0 0 0 0
20 | 1 1 39.7 100 30 0 0 6 2 4 4 3 1 0 0 4 5 65 75 0 0 0
21 | 1 1 38.30 80 0 3 3 4 2 5 4 3 2 1 0 4 4 45.00 7.50 2 4.60 1
22 | 2 1 37.50 40 32 3 1 3 1 3 2 3 2 1 0 0 5 32.00 6.40 1 1.10 1
23 | 1 1 38.40 84 30 3 1 5 2 4 3 3 2 3 6.50 4 4 47.00 7.50 3 0 0
24 | 1 1 38.10 84 44 4 0 4 2 5 3 1 1 3 5.00 0 4 60.00 6.80 0 5.70 0
25 | 2 1 38.70 52 0 1 1 1 1 1 3 1 0 0 0 1 3 4.00 74.00 0 0 1
26 | 2 1 38.10 44 40 2 1 3 1 3 3 1 0 0 0 1 3 35.00 6.80 0 0 1
27 | 2 1 38.4 52 20 2 1 3 1 1 3 2 2 1 0 3 5 41 63 1 1 1
28 | 1 1 38.20 60 0 1 0 3 1 2 1 1 1 1 0 4 4 43.00 6.20 2 3.90 1
29 | 2 1 37.70 40 18 1 1 1 0 3 2 1 1 1 0 3 3 36.00 3.50 0 0 1
30 | 1 1 39.1 60 10 0 1 1 0 2 3 0 0 0 0 4 4 0 0 0 0 1
31 | 2 1 37.80 48 16 1 1 1 1 0 1 1 2 1 0 4 3 43.00 7.50 0 0 1
32 | 1 1 39.00 120 0 4 3 5 2 2 4 3 2 3 8.00 0 0 65.00 8.20 3 4.60 1
33 | 1 1 38.20 76 0 2 3 2 1 5 3 3 1 2 6.00 1 5 35.00 6.50 2 0.90 1
34 | 2 1 38.30 88 0 0 0 6 0 0 0 0 0 0 0 0 0 0 0 0 0 0
35 | 1 1 38.00 80 30 3 3 3 1 0 0 0 0 0 6.00 0 0 48.00 8.30 0 4.30 1
36 | 1 1 0 0 0 3 1 1 1 2 3 3 1 3 6.00 4 4 0 0 2 0 0
37 | 1 1 37.60 40 0 1 1 1 1 1 1 1 0 0 0 1 1 0 0 2 2.10 1
38 | 2 1 37.50 44 0 1 1 1 1 3 3 2 0 0 0 0 0 45.00 5.80 2 1.40 1
39 | 2 1 38.2 42 16 1 1 3 1 1 3 1 0 0 0 1 0 35 60 1 1 1
40 | 2 1 38 56 44 3 3 3 0 0 1 1 2 1 0 4 0 47 70 2 1 1
41 | 2 1 38.30 45 20 3 3 2 2 2 4 1 2 0 0 4 0 0 0 0 0 1
42 | 1 1 0 48 96 1 1 3 1 0 4 1 2 1 0 1 4 42.00 8.00 1 0 1
43 | 1 1 37.70 55 28 2 1 2 1 2 3 3 0 3 5.00 4 5 0 0 0 0 1
44 | 2 1 36.00 100 20 4 3 6 2 2 4 3 1 1 0 4 5 74.00 5.70 2 2.50 0
45 | 1 1 37.10 60 20 2 0 4 1 3 0 3 0 2 5.00 3 4 64.00 8.50 2 0 1
46 | 2 1 37.10 114 40 3 0 3 2 2 2 1 0 0 0 0 3 32.00 0 3 6.50 1
47 | 1 1 38.1 72 30 3 3 3 1 4 4 3 2 1 0 3 5 37 56 3 1 1
48 | 1 1 37.00 44 12 3 1 1 2 1 1 1 0 0 0 4 2 40.00 6.70 3 8.00 1
49 | 1 1 38.6 48 20 3 1 1 1 4 3 1 0 0 0 3 0 37 75 0 0 1
50 | 1 1 0 82 72 3 1 4 1 2 3 3 0 3 0 4 4 53 65 3 2 0
51 | 1 9 38.20 78 60 4 4 6 0 3 3 3 0 0 0 1 0 59.00 5.80 3 3.10 0
52 | 2 1 37.8 60 16 1 1 3 1 2 3 2 1 2 0 3 0 41 73 0 0 0
53 | 1 1 38.7 34 30 2 0 3 1 2 3 0 0 0 0 0 0 33 69 0 2 0
54 | 1 1 0 36 12 1 1 1 1 1 2 1 1 1 0 1 5 44.00 0 0 0 1
55 | 2 1 38.30 44 60 0 0 1 1 0 0 0 0 0 0 0 0 6.40 36.00 0 0 1
56 | 2 1 37.40 54 18 3 0 1 1 3 4 3 2 2 0 4 5 30.00 7.10 2 0 1
57 | 1 1 0 0 0 4 3 0 2 2 4 1 0 0 0 0 0 54 76 3 2 1
58 | 1 1 36.6 48 16 3 1 3 1 4 1 1 1 1 0 0 0 27 56 0 0 0
59 | 1 1 38.5 90 0 1 1 3 1 3 3 3 2 3 2 4 5 47 79 0 0 1
60 | 1 1 0 75 12 1 1 4 1 5 3 3 0 3 5.80 0 0 58.00 8.50 1 0 1
61 | 2 1 38.20 42 0 3 1 1 1 1 1 2 2 1 0 3 2 35.00 5.90 2 0 1
62 | 1 9 38.20 78 60 4 4 6 0 3 3 3 0 0 0 1 0 59.00 5.80 3 3.10 0
63 | 2 1 38.60 60 30 1 1 3 1 4 2 2 1 1 0 0 0 40.00 6.00 1 0 1
64 | 2 1 37.80 42 40 1 1 1 1 1 3 1 0 0 0 3 3 36.00 6.20 0 0 1
65 | 1 1 38 60 12 1 1 2 1 2 1 1 1 1 0 1 4 44 65 3 2 0
66 | 2 1 38.00 42 12 3 0 3 1 1 1 1 0 0 0 0 1 37.00 5.80 0 0 1
67 | 2 1 37.60 88 36 3 1 1 1 3 3 2 1 3 1.50 0 0 44.00 6.00 0 0 0
--------------------------------------------------------------------------------
/PCA/PCA.py:
--------------------------------------------------------------------------------
1 | #-*-coding:utf8-*-
2 | '''
3 | Created on 2016-5-15
4 |
5 | @author: thinkgamer
6 | '''
7 | from numpy import *
8 |
9 | def loadDataSet(filename,delim = "\t"):
10 | fr = open(filename)
11 | stringArr = [line.strip().split(delim) for line in fr.readlines()]
12 | datArr = [map(float, line) for line in stringArr]
13 | return mat(datArr)
14 |
15 | #dataMat is the data set with N features
16 | def pca(dataMat, topNfeat=9999999):
17 |     meanVals = mean(dataMat, axis = 0)    #column means
18 |     meanRemoved = dataMat - meanVals      #center the data
19 |     covMat = cov(meanRemoved,rowvar=0)    #compute the covariance matrix
20 | eigVals, eigVects = linalg.eig(mat(covMat))
21 | eigValInd = argsort(eigVals)
22 |     #eigenvalues are sorted ascending; the slice below keeps the topNfeat largest
23 | eigValInd = eigValInd[: -(topNfeat + 1) : -1]
24 | redEigVects = eigVects[:, eigValInd]
25 |     #project the data into the new space
26 | lowDDataMat = meanRemoved * redEigVects
27 | reconMat = (lowDDataMat * redEigVects.T) + meanVals
28 | return lowDDataMat, reconMat
29 |
30 | #quick test
31 | dataMat = loadDataSet("testSet.txt")
32 | lowDMat, reconMat = pca(dataMat,1)
33 | print shape(lowDMat)
34 |
35 | '''
36 | #show
37 | import matplotlib
38 | import matplotlib.pyplot as plt
39 | fig = plt.figure()
40 | ax = fig.add_subplot(111)
41 | ax.scatter(dataMat[:,0].flatten().A[0], dataMat[:,1].flatten().A[0], marker='^', s = 90 )
42 | ax.scatter(reconMat[:,0].flatten().A[0], reconMat[:,1].flatten().A[0],marker='o', s = 50 , c ='red' )
43 | plt.show()
44 | '''
45 |
46 | #replace NaN entries with the column mean
47 | def replaceNanWithMean():
48 | datMat = loadDataSet('secom.data', ' ')
49 | numFeat = shape(datMat)[1]
50 | for i in range(numFeat):
51 | meanVal = mean(datMat[nonzero(~isnan(datMat[:,i].A))[0],i]) #values that are not NaN (a number)
52 | datMat[nonzero(isnan(datMat[:,i].A))[0],i] = meanVal #set NaN values to mean
53 | return datMat
54 |
55 | #load the data
56 | dataMat = replaceNanWithMean()
57 | #remove the mean
58 | meanVals = mean(dataMat, axis=0)
59 | meanRemoved = dataMat - meanVals
60 | #compute the covariance matrix
61 | covMat = cov(meanRemoved, rowvar=0)
62 | 
63 | #eigenvalue analysis
64 | eigVals, eigVects = linalg.eig(mat(covMat))
65 | print eigVals
--------------------------------------------------------------------------------
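The eigendecomposition route in pca() can be cross-checked against an SVD of the centered data, whose right singular vectors are the same principal directions. A hypothetical, self-contained sketch (plain NumPy arrays and random toy data, not the repository's testSet.txt):

    import numpy as np

    rng = np.random.RandomState(1)
    data = rng.randn(200, 2).dot([[3.0, 1.0], [1.0, 0.5]])   # correlated 2-D toy data

    centered = data - data.mean(axis=0)
    # covariance eigendecomposition (what PCA.py does)
    eigvals, eigvecs = np.linalg.eigh(np.cov(centered, rowvar=False))
    top = eigvecs[:, np.argmax(eigvals)]
    # SVD of the centered data
    _, _, vt = np.linalg.svd(centered, full_matrices=False)

    # the two first principal axes agree up to sign
    print(np.allclose(np.abs(top), np.abs(vt[0])))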
/PCA/testSet.txt:
--------------------------------------------------------------------------------
1 | 10.235186 11.321997
2 | 10.122339 11.810993
3 | 9.190236 8.904943
4 | 9.306371 9.847394
5 | 8.330131 8.340352
6 | 10.152785 10.123532
7 | 10.408540 10.821986
8 | 9.003615 10.039206
9 | 9.534872 10.096991
10 | 9.498181 10.825446
11 | 9.875271 9.233426
12 | 10.362276 9.376892
13 | 10.191204 11.250851
14 | 7.720499 6.476300
15 | 9.334379 8.471268
16 | 7.963186 6.731333
17 | 8.244973 9.013785
18 | 9.569196 10.568949
19 | 8.854793 9.076536
20 | 9.382171 7.240862
21 | 8.179055 8.944502
22 | 8.267896 8.797017
23 | 9.047165 8.725068
24 | 8.741043 7.901385
25 | 7.190216 7.804587
26 | 8.081227 9.314431
27 | 8.047460 5.720780
28 | 7.917584 7.543254
29 | 8.676942 10.102220
30 | 9.210251 9.424717
31 | 7.732998 9.840202
32 | 7.681754 8.609897
33 | 7.925863 10.079159
34 | 8.261509 8.242080
35 | 8.514872 7.527561
36 | 10.324450 10.804481
37 | 7.856710 7.931543
38 | 7.858608 7.995340
39 | 9.196808 6.577598
40 | 9.644415 10.935081
41 | 9.579833 9.085021
42 | 7.888484 5.976428
43 | 9.072624 9.703344
44 | 8.914184 9.298515
45 | 7.822869 7.086663
46 | 10.538554 11.061464
47 | 8.280187 8.709012
48 | 8.884223 8.670105
49 | 9.359927 10.575055
50 | 9.078611 9.710833
51 | 7.935134 8.586173
52 | 8.805945 10.575145
53 | 9.584316 9.614076
54 | 11.269714 11.717254
55 | 9.120444 9.019774
56 | 7.977520 8.313923
57 | 8.104648 9.456128
58 | 8.617126 7.331723
59 | 9.033811 9.469706
60 | 8.327680 5.122092
61 | 8.532272 10.100909
62 | 9.295434 8.933824
63 | 9.905202 9.027559
64 | 10.585764 10.912733
65 | 10.427584 11.532578
66 | 9.072767 9.960144
67 | 9.164275 8.645121
68 | 9.746058 10.717080
69 | 9.286072 9.340024
70 | 8.188233 7.432415
71 | 7.948598 8.445419
72 | 7.563350 5.656178
73 | 8.972405 8.801869
74 | 9.980868 8.788996
75 | 7.753490 7.714248
76 | 7.431143 9.032819
77 | 8.943403 8.359354
78 | 10.481890 9.988969
79 | 9.150454 10.278760
80 | 8.123894 9.060351
81 | 8.626164 8.469342
82 | 7.354185 7.631252
83 | 11.323046 11.015032
84 | 8.190008 6.860792
85 | 8.412598 7.661358
86 | 9.258404 8.580382
87 | 11.007915 11.443881
88 | 8.279403 8.347003
89 | 8.931149 10.105221
90 | 10.239245 10.077473
91 | 8.129346 7.096877
92 | 8.485823 9.373561
93 | 10.703640 11.651618
94 | 9.500728 8.150228
95 | 9.712414 9.910445
96 | 9.333374 9.407557
97 | 8.787865 10.168021
98 | 9.238180 10.253478
99 | 9.577388 8.895150
100 | 10.447753 10.318227
101 | 9.303944 9.223136
102 | 9.883268 11.662945
103 | 9.471921 10.443792
104 | 10.007753 9.579912
105 | 8.110298 7.106263
106 | 6.964069 6.585040
107 | 10.413499 9.649309
108 | 8.032629 7.053254
109 | 8.015549 9.166753
110 | 10.462924 8.656612
111 | 9.530788 10.134130
112 | 9.202658 9.314222
113 | 10.103241 10.235159
114 | 7.849264 6.624856
115 | 9.059071 7.992555
116 | 10.172889 10.724789
117 | 9.528439 6.420990
118 | 7.190422 6.789792
119 | 9.085716 9.846328
120 | 9.452887 8.735386
121 | 7.417322 7.348594
122 | 8.468639 8.715086
123 | 8.303642 9.463231
124 | 9.939052 10.026771
125 | 8.701989 7.516978
126 | 9.737541 10.587281
127 | 8.280233 7.852444
128 | 10.648386 10.259203
129 | 9.173893 10.520372
130 | 9.135397 10.751406
131 | 7.594580 8.488833
132 | 8.587520 8.463406
133 | 8.581887 7.888644
134 | 9.448768 8.707422
135 | 7.882664 7.772030
136 | 10.050635 9.859720
137 | 9.012078 9.533899
138 | 8.770020 8.882996
139 | 9.428804 9.446306
140 | 8.504209 8.319693
141 | 9.800003 10.964667
142 | 8.069660 7.683099
143 | 10.012217 10.320644
144 | 8.704677 8.918146
145 | 8.198722 7.297786
146 | 9.868322 9.901657
147 | 9.426997 11.480353
148 | 9.228767 9.262976
149 | 8.952359 9.528471
150 | 8.186847 8.600587
151 | 9.026371 8.705143
152 | 9.483364 9.807079
153 | 7.826587 7.975401
154 | 11.197846 10.959298
155 | 7.632421 8.769745
156 | 8.761605 8.309365
157 | 9.353670 8.728758
158 | 6.466637 6.038996
159 | 8.370634 9.178830
160 | 10.337451 11.075600
161 | 8.917679 8.288367
162 | 9.076621 8.487626
163 | 7.278948 4.634097
164 | 10.153017 11.219183
165 | 7.132603 5.853118
166 | 9.338644 9.805940
167 | 9.878602 9.187000
168 | 10.009505 10.924505
169 | 9.384438 10.691860
170 | 7.535322 8.160481
171 | 6.808732 8.268469
172 | 8.302965 8.075009
173 | 8.345379 8.305356
174 | 9.517530 8.249839
175 | 9.267825 9.999109
176 | 10.291511 11.032664
177 | 8.605909 8.705207
178 | 8.331145 7.812295
179 | 8.632412 10.574287
180 | 8.766397 8.712107
181 | 9.407070 9.732756
182 | 9.709495 9.729569
183 | 10.422201 11.070360
184 | 6.831495 6.466763
185 | 8.187122 8.405929
186 | 8.523093 9.041844
187 | 7.952394 6.801220
188 | 10.490780 10.001468
189 | 10.813791 9.802494
190 | 7.861113 7.541475
191 | 8.800399 8.738974
192 | 7.542152 6.612838
193 | 9.446981 9.378659
194 | 8.281684 7.358572
195 | 8.473801 8.208343
196 | 11.736767 11.022029
197 | 8.379578 8.714348
198 | 8.313718 8.832381
199 | 9.342589 10.416659
200 | 7.560710 6.889648
201 | 9.295344 9.739040
202 | 9.176612 9.718781
203 | 8.614385 10.150521
204 | 9.079373 8.839794
205 | 10.333289 10.921255
206 | 9.453502 7.335134
207 | 10.174590 10.292500
208 | 9.693713 9.793636
209 | 7.474925 7.751391
210 | 10.107905 10.156997
211 | 9.257241 7.854266
212 | 10.209794 11.410157
213 | 7.248050 6.433676
214 | 10.150091 9.288597
215 | 10.077713 10.321500
216 | 8.191122 8.931519
217 | 8.791469 10.287216
218 | 9.229434 9.095193
219 | 8.682571 8.546005
220 | 7.524099 7.709751
221 | 8.442410 8.326037
222 | 9.364851 9.095989
223 | 9.061222 7.557899
224 | 7.989999 8.555363
225 | 8.801275 8.868732
226 | 10.351932 9.497796
227 | 10.230710 10.496151
228 | 9.783163 9.891408
229 | 10.651481 9.431617
230 | 8.387393 6.400507
231 | 9.003921 7.050003
232 | 8.483723 8.314886
233 | 9.020501 7.545771
234 | 9.329105 11.095661
235 | 9.583687 9.271929
236 | 8.908705 8.407529
237 | 8.835406 8.083517
238 | 9.736362 8.296735
239 | 10.030302 9.737178
240 | 8.287142 6.993460
241 | 9.173211 9.306335
242 | 9.026355 9.696531
243 | 9.128391 9.921247
244 | 11.486346 12.910777
245 | 11.519458 11.472111
246 | 9.027707 10.263974
247 | 9.351935 8.542200
248 | 9.421701 11.403201
249 | 9.005687 8.100969
250 | 7.015279 6.614278
251 | 8.213607 8.340948
252 | 8.226646 8.718997
253 | 8.144753 8.366877
254 | 10.133642 12.790169
255 | 10.763481 10.847016
256 | 10.003622 10.337716
257 | 9.007955 9.792482
258 | 8.670506 10.782931
259 | 10.386414 9.956162
260 | 10.104761 10.123044
261 | 8.079502 8.304075
262 | 9.945424 11.855409
263 | 8.642497 9.998066
264 | 9.349722 8.690328
265 | 9.034991 8.826490
266 | 8.738746 7.518464
267 | 8.919532 9.740312
268 | 9.464136 10.444588
269 | 10.710057 12.666857
270 | 10.042007 10.532091
271 | 8.447996 7.426363
272 | 9.509351 9.030516
273 | 11.946359 10.553075
274 | 9.981617 9.912651
275 | 9.853876 9.632967
276 | 10.560648 11.881714
277 | 8.370952 9.989491
278 | 8.323209 10.102529
279 | 9.828359 11.702462
280 | 8.515623 8.426754
281 | 9.004363 9.628036
282 | 10.529847 10.458031
283 | 10.028765 10.624880
284 | 9.448114 9.313227
285 | 8.332617 7.382295
286 | 8.323006 8.276608
287 | 7.740771 8.799750
288 | 8.379615 8.146192
289 | 8.340764 9.184458
290 | 9.863614 8.254694
291 | 9.969563 9.405134
292 | 9.164394 9.182127
293 | 10.622098 9.722592
294 | 9.592072 10.029446
295 | 8.212027 7.477366
296 | 9.080225 8.244448
297 | 8.555774 7.842325
298 | 9.958046 9.696221
299 | 8.972573 9.797128
300 | 9.213223 7.128437
301 | 8.737239 9.385138
302 | 10.333907 10.994856
303 | 8.797511 8.643075
304 | 11.044848 9.623160
305 | 8.539260 9.097113
306 | 11.582163 11.884333
307 | 7.863848 7.176199
308 | 6.218103 5.283562
309 | 9.120602 7.250190
310 | 9.001166 9.635203
311 | 8.081476 8.844224
312 | 9.369802 8.230911
313 | 8.768925 8.666987
314 | 9.841098 8.543896
315 | 10.451522 9.549511
316 | 9.755402 9.117522
317 | 7.988961 6.869854
318 | 8.872507 9.787118
319 | 10.363980 10.716608
320 | 6.315671 5.765953
321 | 9.638879 9.202355
322 | 8.588126 8.037966
323 | 8.947408 9.144386
324 | 9.051130 7.195132
325 | 9.321709 8.380668
326 | 10.146531 9.754745
327 | 9.843373 8.891437
328 | 9.213148 11.700632
329 | 7.630078 7.294753
330 | 8.093088 7.967590
331 | 7.488915 6.090652
332 | 8.126036 8.586472
333 | 8.760350 7.268987
334 | 10.201347 9.141013
335 | 7.838208 7.307700
336 | 6.155653 5.563997
337 | 7.767841 6.254528
338 | 8.425656 8.615832
339 | 10.362168 10.886815
340 | 10.180024 10.378934
341 | 9.794665 10.047812
342 | 9.970394 9.668279
343 | 7.030217 7.060471
344 | 9.275414 9.095738
345 | 10.314911 10.456539
346 | 9.259774 8.204851
347 | 10.023919 9.558307
348 | 8.887540 9.866704
349 | 9.851608 9.410989
350 | 8.710882 7.268012
351 | 9.017007 10.217673
352 | 7.976369 9.000979
353 | 8.738332 8.664734
354 | 8.344510 8.977600
355 | 8.959613 12.324240
356 | 9.169982 8.624635
357 | 7.487451 8.154859
358 | 8.706316 7.719455
359 | 9.564832 8.940403
360 | 8.327775 9.044509
361 | 9.734032 10.195255
362 | 8.021343 6.445092
363 | 9.081048 11.024397
364 | 7.626651 6.549263
365 | 10.725858 8.575374
366 | 8.731381 8.307788
367 | 10.394237 10.596874
368 | 7.029311 7.658832
369 | 9.517907 7.509904
370 | 10.394064 10.060898
371 | 10.752500 9.431601
372 | 9.692431 10.332130
373 | 9.651897 7.876862
374 | 8.592329 10.096837
375 | 10.212801 10.827496
376 | 9.045043 9.265524
377 | 8.901643 8.036115
378 | 10.794525 9.318830
379 | 11.040915 12.021746
380 | 8.390836 9.672469
381 | 9.840166 11.226568
382 | 10.806810 12.205633
383 | 8.924285 10.934056
384 | 8.411251 8.289672
385 | 7.808891 9.663290
386 | 9.733437 8.486958
387 | 8.300026 7.477374
388 | 8.221756 10.278308
389 | 9.096867 9.619677
390 | 9.410116 9.289188
391 | 10.097176 9.768470
392 | 9.387954 8.844855
393 | 9.376134 7.704630
394 | 8.231599 9.101203
395 | 9.910738 10.694855
396 | 8.645689 7.764589
397 | 8.090245 7.109596
398 | 9.253483 9.813672
399 | 9.331546 8.039386
400 | 9.843256 10.208792
401 | 9.713131 9.247665
402 | 9.259369 10.704622
403 | 10.243948 9.695883
404 | 6.396262 6.456390
405 | 8.936289 8.703871
406 | 8.750846 9.347273
407 | 6.497155 4.130251
408 | 9.516552 10.164848
409 | 9.125766 8.858775
410 | 8.374387 7.300114
411 | 8.132816 7.621107
412 | 10.099505 9.159134
413 | 9.356477 6.869999
414 | 8.112934 7.587547
415 | 7.265396 6.987031
416 | 11.950505 13.715109
417 | 10.745959 10.822171
418 | 8.893270 7.887332
419 | 6.003473 4.960219
420 | 7.498851 6.451334
421 | 10.162072 9.935954
422 | 8.732617 9.177679
423 | 9.300827 9.952360
424 | 11.908436 12.256801
425 | 9.371215 9.188645
426 | 9.943640 9.245037
427 | 7.386450 7.046819
428 | 8.410374 8.293218
429 | 7.830419 6.440253
430 | 8.263140 8.279446
431 | 11.448164 12.192363
432 | 8.216533 9.186628
433 | 9.316128 10.046697
434 | 8.156927 6.834792
435 | 9.951421 11.240598
436 | 9.059607 8.458446
437 | 10.476339 10.560461
438 | 7.548200 7.227127
439 | 9.432204 7.236705
440 | 9.402750 9.126413
441 | 11.188095 13.853426
442 | 9.520201 11.028131
443 | 8.884154 9.764071
444 | 8.961105 8.833117
445 | 8.549663 8.865765
446 | 10.111708 10.515462
447 | 9.024761 9.169368
448 | 7.904149 8.048756
449 | 9.240995 7.796142
450 | 8.126538 6.116125
451 | 7.442148 7.931335
452 | 9.486821 10.091359
453 | 9.834289 11.694720
454 | 9.009714 11.599170
455 | 9.761314 11.344083
456 | 6.993941 6.562988
457 | 8.659524 8.410107
458 | 7.685363 8.097297
459 | 7.793217 6.519109
460 | 8.883454 9.257347
461 | 8.781821 9.231980
462 | 7.946281 7.658978
463 | 8.523959 10.646480
464 | 9.031525 8.649648
465 | 8.317140 7.758978
466 | 9.192417 11.151218
467 | 8.408486 8.282182
468 | 10.327702 11.459048
469 | 8.389687 8.548727
470 | 8.642250 7.056870
471 | 8.833447 9.267638
472 | 8.805261 8.320281
473 | 9.726211 9.095997
474 | 8.477631 9.507530
475 | 9.738838 9.652110
476 | 8.272108 7.582696
477 | 9.258089 8.495931
478 | 8.334144 8.810766
479 | 8.150904 6.486032
480 | 7.259669 7.270156
481 | 11.034180 11.519954
482 | 10.705432 10.642527
483 | 8.388814 7.159137
484 | 8.559369 7.846284
485 | 7.187988 6.519313
486 | 8.811453 7.765900
487 | 8.492762 7.992941
488 | 8.739752 8.502909
489 | 10.150752 10.420295
490 | 7.062378 5.365289
491 | 8.448195 7.480000
492 | 10.224333 11.592750
493 | 9.533795 9.212845
494 | 9.519492 7.690501
495 | 9.661847 10.376189
496 | 7.963877 8.597193
497 | 10.184486 9.136709
498 | 8.505234 9.159210
499 | 8.187646 8.518690
500 | 9.167590 9.405917
501 | 8.612162 8.518755
502 | 10.970868 10.392229
503 | 9.603649 9.141095
504 | 9.704263 8.830178
505 | 9.657506 8.132449
506 | 9.337882 11.045306
507 | 9.521722 9.537764
508 | 8.954197 8.728179
509 | 8.635658 10.352662
510 | 8.910816 9.020317
511 | 9.900933 9.392002
512 | 10.247105 8.289649
513 | 9.571690 8.171237
514 | 7.388627 7.668071
515 | 8.354008 10.074590
516 | 9.775598 8.835696
517 | 8.768913 7.983604
518 | 8.330199 8.474098
519 | 8.169356 9.361172
520 | 10.346522 10.086434
521 | 7.976144 9.266702
522 | 8.429648 7.865824
523 | 11.261674 11.788587
524 | 10.051066 10.112425
525 | 8.954626 9.789343
526 | 8.382220 8.121012
527 | 9.820642 9.426441
528 | 8.125950 9.695087
529 | 8.646465 7.291808
530 | 8.190202 8.003737
531 | 8.773887 7.306175
532 | 8.731000 10.300436
533 | 9.163098 7.816769
534 | 9.456346 9.223922
535 | 9.645180 9.324053
536 | 8.835060 8.966915
537 | 9.325950 10.943248
538 | 9.941912 9.548535
539 | 9.282799 10.119488
540 | 9.567591 9.462164
541 | 8.529019 9.768001
542 | 9.314824 10.153727
543 | 8.264439 8.273860
544 | 8.307262 8.214036
545 | 9.122041 8.657861
546 | 8.404258 8.389365
547 | 7.828355 8.419433
548 | 9.803180 10.108286
549 | 8.662439 8.581953
550 | 8.883265 8.978377
551 | 8.012330 8.262451
552 | 9.420258 8.974878
553 | 7.015415 6.365940
554 | 9.888832 11.163036
555 | 9.677549 10.346431
556 | 8.410158 7.912899
557 | 9.464147 10.762900
558 | 7.067227 7.035717
559 | 9.320923 10.583089
560 | 9.056917 8.771241
561 | 8.110004 8.387789
562 | 10.310021 10.970014
563 | 8.211185 8.809627
564 | 8.942883 8.840746
565 | 9.479958 8.328700
566 | 8.973982 8.702291
567 | 8.519257 8.764855
568 | 9.424556 8.956911
569 | 7.222919 8.177787
570 | 8.257007 9.700619
571 | 9.778795 9.296134
572 | 8.028806 8.575974
573 | 9.886464 9.965076
574 | 9.090552 6.978930
575 | 9.605548 10.256751
576 | 9.959004 9.610229
577 | 8.308701 9.509124
578 | 7.748293 9.685933
579 | 8.311108 9.428114
580 | 9.697068 10.217956
581 | 9.582991 9.478773
582 | 9.167265 10.198412
583 | 10.329753 10.406602
584 | 8.908819 7.428789
585 | 10.072908 10.393294
586 | 7.992905 9.226629
587 | 8.907696 7.269366
588 | 8.421948 9.342968
589 | 7.481399 7.225033
590 | 10.358408 10.166130
591 | 8.786556 10.279943
592 | 9.658701 11.379367
593 | 10.167807 9.417552
594 | 8.653449 8.656681
595 | 8.020304 8.671270
596 | 8.364348 10.004068
597 | 9.119183 9.788199
598 | 8.405504 9.740580
599 | 11.020930 11.904350
600 | 9.755232 9.515713
601 | 10.059542 9.589748
602 | 8.727131 9.777998
603 | 7.666182 6.028642
604 | 8.870733 8.367501
605 | 9.340446 7.707269
606 | 9.919283 10.796813
607 | 7.905837 8.326034
608 | 10.181187 10.089865
609 | 8.797328 8.981988
610 | 8.466272 7.765032
611 | 10.335914 12.620539
612 | 9.365003 8.609115
613 | 8.011017 7.249489
614 | 10.923993 13.901513
615 | 7.074631 7.558720
616 | 9.824598 8.851297
617 | 8.861026 8.370857
618 | 10.127296 10.861535
619 | 10.548377 10.855695
620 | 8.880470 7.948761
621 | 8.901619 9.674705
622 | 7.813710 9.246912
623 | 10.128808 10.560668
624 | 11.096699 10.911644
625 | 8.551471 6.871514
626 | 8.907241 8.677815
627 | 10.571647 10.294838
628 | 8.815314 8.810725
629 | 8.453396 8.339296
630 | 9.594819 11.487580
631 | 10.714211 9.628908
632 | 7.428788 7.712869
633 | 10.892119 12.747752
634 | 9.024071 11.112692
635 | 7.803375 7.847038
636 | 8.521558 8.881848
637 | 9.742818 11.520203
638 | 9.832836 9.180396
639 | 8.703132 10.028498
640 | 9.905029 11.347606
641 | 10.037536 8.882688
642 | 8.629995 8.392863
643 | 9.583497 9.219663
644 | 8.781687 9.650598
645 | 9.344119 9.537024
646 | 10.407510 9.223929
647 | 7.244488 6.559021
648 | 10.643616 10.288383
649 | 8.757557 6.947901
650 | 10.784590 11.233350
651 | 10.028427 11.330033
652 | 7.968361 6.830308
653 | 8.925954 8.539113
654 | 7.738692 7.114987
655 | 8.192398 8.352016
656 | 10.412017 12.431122
657 | 8.208801 5.777678
658 | 7.820077 7.790720
659 | 9.542754 11.542541
660 | 6.817938 7.429229
661 | 7.365218 7.956797
662 | 9.274391 7.932700
663 | 9.546475 8.803412
664 | 7.471734 6.797870
665 | 8.016969 7.848070
666 | 8.852701 8.458114
667 | 8.215012 8.468330
668 | 6.975507 6.846980
669 | 9.435134 10.609700
670 | 9.228075 9.342622
671 | 8.388410 7.637856
672 | 7.111456 9.289163
673 | 9.403508 8.482654
674 | 9.133894 8.343575
675 | 10.670801 9.750821
676 | 9.983542 10.074537
677 | 10.012865 8.537017
678 | 8.929895 8.951909
679 | 7.666951 7.473615
680 | 9.493839 7.821783
681 | 8.894081 7.059413
682 | 9.593382 9.859732
683 | 9.126847 8.395700
684 | 9.532945 9.850696
685 | 9.459384 9.384213
686 | 8.982743 8.217062
687 | 10.107798 8.790772
688 | 10.563574 9.044890
689 | 8.278963 9.518790
690 | 8.734960 10.494129
691 | 9.597940 9.530895
692 | 10.025478 9.508270
693 | 10.335922 10.974063
694 | 8.404390 8.146748
695 | 7.108699 6.038469
696 | 8.873951 7.474227
697 | 8.731459 8.154455
698 | 8.795146 7.534687
699 | 6.407165 6.810352
700 | 9.979312 10.287430
701 | 8.786715 8.396736
702 | 10.753339 10.360567
703 | 10.508031 10.321976
704 | 10.636925 10.193797
705 | 10.614322 11.215420
706 | 8.916411 8.965286
707 | 8.112756 8.304769
708 | 10.833109 10.497542
709 | 8.319758 9.727691
710 | 9.945336 11.820097
711 | 10.150461 9.914715
712 | 10.185024 10.388722
713 | 9.793569 9.079955
714 | 10.590128 11.811596
715 | 8.505584 6.884282
716 | 10.461428 10.745439
717 | 8.755781 9.418427
718 | 7.488249 7.172072
719 | 10.238905 10.428659
720 | 9.887827 10.427821
721 | 8.529971 8.838217
722 | 8.375208 10.242837
723 | 8.901724 8.398304
724 | 8.607694 9.173198
725 | 8.691369 9.964261
726 | 9.584578 9.641546
727 | 10.265792 11.405078
728 | 7.592968 6.683355
729 | 8.692791 9.389031
730 | 7.589852 6.005793
731 | 10.550386 11.736584
732 | 8.578351 7.227055
733 | 7.526931 6.875134
734 | 8.577081 9.877115
735 | 9.272136 11.050928
736 | 10.300809 10.653059
737 | 8.642013 9.006681
738 | 9.720491 10.265202
739 | 9.029005 9.646928
740 | 8.736201 7.975603
741 | 8.672886 9.070759
742 | 8.370633 8.412170
743 | 9.483776 9.183341
744 | 6.790842 7.594992
745 | 9.842146 10.156810
746 | 9.563336 7.962532
747 | 8.724669 9.870732
748 | 9.012145 9.171326
749 | 9.116948 9.791167
750 | 6.219094 7.988420
751 | 9.468422 8.359975
752 | 8.825231 8.475208
753 | 9.572224 9.696428
754 | 9.609128 8.488175
755 | 9.428590 10.468998
756 | 8.293266 8.617701
757 | 9.423584 10.355688
758 | 9.240796 9.517228
759 | 10.915423 13.026252
760 | 10.854684 11.130866
761 | 9.226816 9.391796
762 | 9.580264 10.359235
763 | 7.289907 6.898208
764 | 9.338857 10.374025
765 | 9.523176 11.332190
766 | 10.162233 10.357396
767 | 8.873930 9.207398
768 | 8.607259 7.794804
769 | 8.852325 8.215797
770 | 8.077272 6.501042
771 | 8.169273 8.269613
772 | 6.806421 7.544423
773 | 8.793151 9.691549
774 | 11.640981 11.365702
775 | 9.544082 11.576545
776 | 9.009266 9.605596
777 | 9.726552 9.426719
778 | 9.495888 10.626624
779 | 8.683982 9.337864
780 | 8.322105 8.631099
781 | 8.887895 8.644931
782 | 8.662659 11.373025
783 | 9.263321 7.536016
784 | 7.802624 7.171625
785 | 8.773183 8.561565
786 | 8.730443 10.197596
787 | 8.942915 7.758383
788 | 8.057618 8.774996
789 | 8.112081 8.202349
790 | 10.378884 12.103755
791 | 9.248876 8.637249
792 | 9.739599 9.708576
793 | 8.126345 8.278487
794 | 8.894788 7.966117
795 | 9.683165 9.019221
796 | 10.886957 12.053843
797 | 9.668852 10.902132
798 | 7.486692 6.471138
799 | 8.794850 9.173609
800 | 8.835915 8.296727
801 | 9.443984 11.375344
802 | 8.696621 6.434580
803 | 9.645560 9.233722
804 | 9.623857 7.915590
805 | 10.840632 12.620268
806 | 7.298135 7.356141
807 | 9.639644 8.902389
808 | 9.849802 7.682624
809 | 10.609964 10.259615
810 | 9.768229 11.382811
811 | 7.646351 7.571849
812 | 10.230300 9.470859
813 | 8.224402 8.496866
814 | 6.879671 8.393648
815 | 7.976247 8.667221
816 | 9.183268 8.694550
817 | 11.471853 12.786280
818 | 10.428349 10.615726
819 | 8.090828 5.902504
820 | 9.738627 8.485792
821 | 8.139709 8.396333
822 | 9.508055 8.990529
823 | 8.857260 8.497732
824 | 8.902558 7.014433
825 | 9.660607 11.040833
826 | 8.772221 10.512150
827 | 11.020038 9.354134
828 | 7.918527 7.742062
829 | 7.630835 7.756260
830 | 11.043272 11.041613
831 | 9.299376 8.674157
832 | 9.795087 8.431837
833 | 9.415683 8.312101
834 | 7.942037 6.942913
835 | 9.724790 11.766496
836 | 10.222032 11.550876
837 | 8.894163 8.306020
838 | 8.394309 8.070420
839 | 9.012776 6.880548
840 | 9.661093 10.138921
841 | 9.896472 9.762372
842 | 9.135628 8.759928
843 | 8.762656 10.306028
844 | 8.602473 8.861956
845 | 10.085297 10.464774
846 | 10.644983 10.945767
847 | 9.034571 8.391668
848 | 8.602920 8.501944
849 | 8.224766 7.402758
850 | 8.755050 9.431085
851 | 9.669937 8.641049
852 | 10.693530 10.287124
853 | 9.462806 7.611153
854 | 9.287707 10.082363
855 | 10.941260 10.783728
856 | 9.263080 7.913328
857 | 10.167111 10.225338
858 | 8.783830 9.465345
859 | 8.958624 8.662136
860 | 9.841649 9.926781
861 | 7.205691 6.790638
862 | 8.629089 9.135461
863 | 7.469440 8.450442
864 | 8.179133 7.790434
865 | 8.083984 7.875520
866 | 9.271300 8.135359
867 | 8.652349 8.254397
868 | 7.983920 6.609684
869 | 7.836860 9.785238
870 | 7.418535 7.011256
871 | 8.458288 10.095364
872 | 9.387605 9.726911
873 | 8.663951 8.206705
874 | 10.146507 11.698577
875 | 8.937103 10.990924
876 | 11.218687 11.141945
877 | 8.363142 9.106936
878 | 7.877643 7.122922
879 | 9.620978 9.905689
880 | 9.509649 10.773209
881 | 6.748743 6.705385
882 | 9.300919 8.085029
883 | 9.332257 9.818791
884 | 7.898610 8.366643
885 | 9.841914 9.480675
886 | 6.920484 8.959501
887 | 8.544713 9.563136
888 | 8.162266 6.715277
889 | 8.659552 9.282008
890 | 10.673398 13.174824
891 | 9.024000 10.379238
892 | 8.183292 6.647572
893 | 10.544919 10.649602
894 | 7.201266 6.529605
895 | 9.557407 11.096821
896 | 8.304605 6.940929
897 | 9.742855 9.920897
898 | 10.024587 9.645222
899 | 10.002296 9.998940
900 | 8.965876 8.665419
901 | 7.823136 6.949572
902 | 8.125088 7.654065
903 | 6.569589 6.046863
904 | 10.195497 8.689129
905 | 11.730011 10.374221
906 | 8.739105 7.457571
907 | 9.820059 10.278526
908 | 9.547456 10.398198
909 | 8.375072 8.416302
910 | 8.889533 8.308929
911 | 8.861201 9.290408
912 | 12.677687 12.788463
913 | 9.100735 8.620537
914 | 7.728350 6.328219
915 | 7.955373 8.355028
916 | 8.733352 8.645414
917 | 10.257527 11.191813
918 | 9.246413 9.497014
919 | 9.745302 9.642035
920 | 7.785652 8.147621
921 | 7.431673 8.566399
922 | 8.654384 8.466701
923 | 8.475392 6.744677
924 | 9.968440 10.765192
925 | 10.163616 10.806963
926 | 10.238135 10.036636
927 | 9.902889 10.746730
928 | 9.523850 8.749708
929 | 9.214363 9.149178
930 | 9.266040 10.841502
931 | 8.494292 7.770942
932 | 10.821158 10.410192
933 | 8.645888 7.970308
934 | 9.885204 10.098080
935 | 9.084990 10.886349
936 | 9.277874 8.871449
937 | 8.135131 7.137064
938 | 7.917379 9.080522
939 | 9.685586 8.822850
940 | 8.558141 7.848112
941 | 9.502917 10.061255
942 | 6.409004 5.164774
943 | 10.149235 10.579951
944 | 7.847304 8.411351
945 | 8.846930 6.819939
946 | 8.675153 9.411147
947 | 9.476276 9.061508
948 | 11.099184 10.644263
949 | 8.792411 10.379405
950 | 8.400418 7.072706
951 | 8.555713 7.923805
952 | 8.024763 8.426993
953 | 8.642696 10.453412
954 | 7.906117 7.920408
955 | 8.793393 9.722878
956 | 8.280364 7.669854
957 | 9.387766 9.706245
958 | 9.626853 10.762499
959 | 10.163631 10.919007
960 | 9.375543 11.513524
961 | 9.309440 8.575699
962 | 10.055329 10.297255
963 | 8.706241 9.097172
964 | 10.032934 11.951897
965 | 10.812974 11.311435
966 | 10.352603 10.819865
967 | 8.276870 9.055403
968 | 8.397389 7.944434
969 | 9.371741 10.395790
970 | 10.825710 10.144099
971 | 9.158483 11.385382
972 | 10.658639 11.389856
973 | 8.091762 6.631039
974 | 10.734892 10.054598
975 | 11.535880 11.604912
976 | 9.799077 11.371677
977 | 8.478725 9.078455
978 | 9.399902 8.947744
979 | 7.305377 8.144973
980 | 7.613377 6.668798
981 | 10.681308 10.830845
982 | 9.973855 10.004133
983 | 9.369918 7.855433
984 | 8.838223 7.429033
985 | 9.521831 10.623930
986 | 9.724419 10.447452
987 | 8.890224 9.275923
988 | 9.932763 11.589953
989 | 10.839337 9.051250
990 | 8.497708 7.521701
991 | 8.440236 8.705670
992 | 9.063566 9.755744
993 | 8.449647 8.929485
994 | 8.554576 8.063231
995 | 10.348606 10.550718
996 | 5.985254 5.186844
997 | 9.931937 10.175582
998 | 9.854922 9.201393
999 | 9.114580 9.134215
1000 | 10.334899 8.543604
1001 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | Machine-Learning-With-Python
2 | ========================
3 | Fix bugs and add new features for personalized projects
4 |
5 |
--------------------------------------------------------------------------------
/Recommend/基于item的协同过滤推荐BasedItem.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | #-*-coding:utf-8-*-
3 | '''
4 | Created on 2016-5-30
5 |
6 | @author: thinkgamer
7 | '''
8 | import math
9 |
10 | class ItemBasedCF:
11 | def __init__(self,train_file):
12 | self.train_file = train_file
13 | self.readData()
14 | def readData(self):
15 |         #read the file and build the user-item rating table
16 |         self.train = dict()      #user-item rating table
17 | for line in open(self.train_file):
18 | # user,item,score = line.strip().split(",")
19 | user,score,item = line.strip().split(",")
20 | self.train.setdefault(user,{})
21 | self.train[user][item] = int(float(score))
22 |
23 | def ItemSimilarity(self):
24 |         #build the item-item co-occurrence matrix
25 |         C = dict()   #item-item co-occurrence counts
26 |         N = dict()   #number of distinct users who rated each item
27 | for user,items in self.train.items():
28 | for i in items.keys():
29 | N.setdefault(i,0)
30 | N[i] += 1
31 | C.setdefault(i,{})
32 | for j in items.keys():
33 | if i == j : continue
34 | C[i].setdefault(j,0)
35 | C[i][j] += 1
36 |         #compute the similarity matrix
37 | self.W = dict()
38 | for i,related_items in C.items():
39 | self.W.setdefault(i,{})
40 | for j,cij in related_items.items():
41 | self.W[i][j] = cij / (math.sqrt(N[i] * N[j]))
42 | return self.W
43 |
44 |     #recommend for user, using the K most similar items of each item the user rated
45 | def Recommend(self,user,K=3,N=10):
46 | rank = dict()
47 |         action_item = self.train[user]     #items the user has interacted with, and their scores
48 | for item,score in action_item.items():
49 | for j,wj in sorted(self.W[item].items(),key=lambda x:x[1],reverse=True)[0:K]:
50 | if j in action_item.keys():
51 | continue
52 | rank.setdefault(j,0)
53 | rank[j] += score * wj
54 | return dict(sorted(rank.items(),key=lambda x:x[1],reverse=True)[0:N])
55 |
56 | #create an item-based recommender object
57 | Item = ItemBasedCF("uid_score_bid.dat")
58 | Item.ItemSimilarity()
59 | recommedDic = Item.Recommend("xiyuweilan")
60 | for k,v in recommedDic.iteritems():
61 | print k,"\t",v
--------------------------------------------------------------------------------
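The weight that ItemSimilarity() assigns to a pair of items is the co-occurrence count normalized by item popularity: w_ij = |U(i) ∩ U(j)| / sqrt(|U(i)| * |U(j)|), where U(x) is the set of users who rated item x. A tiny hand-worked example (hypothetical user and book names, not taken from uid_score_bid.dat):

    import math

    train = {
        "u1": {"bookA": 5, "bookB": 3},
        "u2": {"bookA": 4, "bookC": 2},
        "u3": {"bookA": 1, "bookB": 4, "bookC": 5},
    }

    def users(item):
        return set(u for u, items in train.items() if item in items)

    wAB = len(users("bookA") & users("bookB")) / math.sqrt(len(users("bookA")) * len(users("bookB")))
    print(round(wAB, 3))   # 2 shared users / sqrt(3 * 2) ~= 0.816, i.e. C[i][j] / sqrt(N[i] * N[j])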
/Recommend/基于图的推荐PersonalRank.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/komeilkma/Machine-Learning-With-Python/8780eb4646292367915096bc18121e7c691a46dc/Recommend/基于图的推荐PersonalRank.py
--------------------------------------------------------------------------------
/Recommend/基于标签的推荐.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | #-*-coding:utf-8-*-
3 | import random
4 | import math
5 | #accumulate counts into a nested dict
6 | def addValueToMat(theMat,key,value,incr):
7 |     if key not in theMat:   #key not seen in theMat yet
8 |         theMat[key]=dict()
9 |         theMat[key][value]=incr
10 |     else:
11 |         if value not in theMat[key]:
12 |             theMat[key][value]=incr
13 |         else:
14 |             theMat[key][value]+=incr   #already present: increment
15 |
16 | user_tags = dict()
17 | tag_items = dict()
18 | user_items = dict()
19 | user_items_test = dict()   #test-set dictionary
20 | item_tags = dict()         #used for the diversity measure
21 |
22 | #initialization: read the data file and build all the count tables
23 | def InitStat():
24 |     data_file = open('delicious.dat')
25 |     line = data_file.readline()
26 |     while line:
27 |         terms = line.split("\t")        #each record has the form [user, item, tag]
28 |         user = terms[0]
29 |         item = terms[1]
30 |         tag = terms[2]
31 |         if random.random()>0.1:         #90% of the data as training set, the remaining 10% as test set
32 |             addValueToMat(user_tags,user,tag,1)
33 |             addValueToMat(tag_items,tag,item,1)
34 |             addValueToMat(user_items,user,item,1)
35 |             addValueToMat(item_tags,item,tag,1)
36 |         else:
37 |             addValueToMat(user_items_test,user,item,1)
38 |         line = data_file.readline()
39 |     data_file.close()
40 |
41 | #recommendation algorithm
42 | def Recommend(usr):
43 |     recommend_list = dict()
44 |     tagged_item = user_items[usr]   #items this user has already tagged
45 |     for tag_,wut in user_tags[usr].items():          #tags the user has used, with counts
46 |         for item_,wit in tag_items[tag_].items():    #items carrying that tag, with counts
47 |             if item_ not in tagged_item:             #do not recommend items the user already has
48 |                 if item_ not in recommend_list:
49 |                     recommend_list[item_]=wut*wit    #score formula: n(u,t) * n(t,i)
50 |                 else:
51 |                     recommend_list[item_]+=wut*wit
52 |     return sorted(recommend_list.iteritems(), key=lambda a:a[1],reverse=True)
53 |
54 | #tag popularity statistics
55 | def TagPopularity():
56 | tagfreq = {}
57 | for user in user_tags.keys():
58 | for tag in user_tags[user].keys():
59 | if tag not in tagfreq:
60 | tagfreq[tag] = 1
61 | else:
62 | tagfreq[tag] +=1
63 | return sorted(tagfreq.iteritems(), key=lambda a:a[1],reverse=True)
64 |
65 | #cosine similarity between two items, over their tag count vectors
66 | def CosineSim(item_tags,i,j):
67 |     ret = 0
68 |     for b,wib in item_tags[i].items():   #overlap of the tag vectors of items i and j
69 |         if b in item_tags[j]:
70 |             ret += wib * item_tags[j][b]
71 |     ni = 0
72 |     nj = 0
73 |     for b, w in item_tags[i].items():    #squared norm of item i's tag vector
74 |         ni += w * w
75 |     for b, w in item_tags[j].items():    #squared norm of item j's tag vector
76 |         nj += w * w
77 |     if ret == 0:
78 |         return 0
79 |     return ret/math.sqrt(ni * nj)        #the cosine value
80 |
81 | #diversity of a recommendation list (average pairwise cosine similarity)
82 | def Diversity(item_tags,recommend_items):
83 | ret = 0
84 | n = 0
85 | for i in dict(recommend_items).keys():
86 | for j in dict(recommend_items).keys():
87 | if i == j:
88 | continue
89 | ret += CosineSim(item_tags,i,j)
90 | n += 1
91 | return ret/(n * 1.0)
92 |
93 | InitStat()
94 | recommend_list = Recommend("48411")
95 | # print recommend_list
96 | for recommend in recommend_list[:10]:   #the ten item ids with the highest scores
97 |     print recommend
98 | 
99 | #tag popularity statistics
100 | tagFreq = TagPopularity()
101 | for tag in tagFreq[:20]:
102 |     print tag
103 | 
104 | #recommendation-list diversity; this computation takes a while
105 | diversityNum = Diversity(item_tags, recommend_list)
106 | print diversityNum
--------------------------------------------------------------------------------
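The score that Recommend() gives a candidate item i for user u is p(u,i) = sum over tags t of n(u,t) * n(t,i), where n(u,t) is how often u used tag t and n(t,i) how often item i received tag t. A hypothetical worked example (invented tags and items, not from delicious.dat):

    user_tags = {"u1": {"python": 3, "ml": 1}}
    tag_items = {"python": {"itemA": 2, "itemB": 1}, "ml": {"itemB": 4}}

    scores = {}
    for tag, n_ut in user_tags["u1"].items():
        for item, n_ti in tag_items.get(tag, {}).items():
            scores[item] = scores.get(item, 0) + n_ut * n_ti

    print(sorted(scores.items(), key=lambda kv: kv[1], reverse=True))
    # [('itemB', 7), ('itemA', 6)]: itemB scores 3*1 + 1*4, itemA scores 3*2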
/Recommend/基于用户的协同过滤推荐BasedUserCF.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/komeilkma/Machine-Learning-With-Python/8780eb4646292367915096bc18121e7c691a46dc/Recommend/基于用户的协同过滤推荐BasedUserCF.py
--------------------------------------------------------------------------------
/Regession/ex0.txt:
--------------------------------------------------------------------------------
1 | 1.000000 0.067732 3.176513
2 | 1.000000 0.427810 3.816464
3 | 1.000000 0.995731 4.550095
4 | 1.000000 0.738336 4.256571
5 | 1.000000 0.981083 4.560815
6 | 1.000000 0.526171 3.929515
7 | 1.000000 0.378887 3.526170
8 | 1.000000 0.033859 3.156393
9 | 1.000000 0.132791 3.110301
10 | 1.000000 0.138306 3.149813
11 | 1.000000 0.247809 3.476346
12 | 1.000000 0.648270 4.119688
13 | 1.000000 0.731209 4.282233
14 | 1.000000 0.236833 3.486582
15 | 1.000000 0.969788 4.655492
16 | 1.000000 0.607492 3.965162
17 | 1.000000 0.358622 3.514900
18 | 1.000000 0.147846 3.125947
19 | 1.000000 0.637820 4.094115
20 | 1.000000 0.230372 3.476039
21 | 1.000000 0.070237 3.210610
22 | 1.000000 0.067154 3.190612
23 | 1.000000 0.925577 4.631504
24 | 1.000000 0.717733 4.295890
25 | 1.000000 0.015371 3.085028
26 | 1.000000 0.335070 3.448080
27 | 1.000000 0.040486 3.167440
28 | 1.000000 0.212575 3.364266
29 | 1.000000 0.617218 3.993482
30 | 1.000000 0.541196 3.891471
31 | 1.000000 0.045353 3.143259
32 | 1.000000 0.126762 3.114204
33 | 1.000000 0.556486 3.851484
34 | 1.000000 0.901144 4.621899
35 | 1.000000 0.958476 4.580768
36 | 1.000000 0.274561 3.620992
37 | 1.000000 0.394396 3.580501
38 | 1.000000 0.872480 4.618706
39 | 1.000000 0.409932 3.676867
40 | 1.000000 0.908969 4.641845
41 | 1.000000 0.166819 3.175939
42 | 1.000000 0.665016 4.264980
43 | 1.000000 0.263727 3.558448
44 | 1.000000 0.231214 3.436632
45 | 1.000000 0.552928 3.831052
46 | 1.000000 0.047744 3.182853
47 | 1.000000 0.365746 3.498906
48 | 1.000000 0.495002 3.946833
49 | 1.000000 0.493466 3.900583
50 | 1.000000 0.792101 4.238522
51 | 1.000000 0.769660 4.233080
52 | 1.000000 0.251821 3.521557
53 | 1.000000 0.181951 3.203344
54 | 1.000000 0.808177 4.278105
55 | 1.000000 0.334116 3.555705
56 | 1.000000 0.338630 3.502661
57 | 1.000000 0.452584 3.859776
58 | 1.000000 0.694770 4.275956
59 | 1.000000 0.590902 3.916191
60 | 1.000000 0.307928 3.587961
61 | 1.000000 0.148364 3.183004
62 | 1.000000 0.702180 4.225236
63 | 1.000000 0.721544 4.231083
64 | 1.000000 0.666886 4.240544
65 | 1.000000 0.124931 3.222372
66 | 1.000000 0.618286 4.021445
67 | 1.000000 0.381086 3.567479
68 | 1.000000 0.385643 3.562580
69 | 1.000000 0.777175 4.262059
70 | 1.000000 0.116089 3.208813
71 | 1.000000 0.115487 3.169825
72 | 1.000000 0.663510 4.193949
73 | 1.000000 0.254884 3.491678
74 | 1.000000 0.993888 4.533306
75 | 1.000000 0.295434 3.550108
76 | 1.000000 0.952523 4.636427
77 | 1.000000 0.307047 3.557078
78 | 1.000000 0.277261 3.552874
79 | 1.000000 0.279101 3.494159
80 | 1.000000 0.175724 3.206828
81 | 1.000000 0.156383 3.195266
82 | 1.000000 0.733165 4.221292
83 | 1.000000 0.848142 4.413372
84 | 1.000000 0.771184 4.184347
85 | 1.000000 0.429492 3.742878
86 | 1.000000 0.162176 3.201878
87 | 1.000000 0.917064 4.648964
88 | 1.000000 0.315044 3.510117
89 | 1.000000 0.201473 3.274434
90 | 1.000000 0.297038 3.579622
91 | 1.000000 0.336647 3.489244
92 | 1.000000 0.666109 4.237386
93 | 1.000000 0.583888 3.913749
94 | 1.000000 0.085031 3.228990
95 | 1.000000 0.687006 4.286286
96 | 1.000000 0.949655 4.628614
97 | 1.000000 0.189912 3.239536
98 | 1.000000 0.844027 4.457997
99 | 1.000000 0.333288 3.513384
100 | 1.000000 0.427035 3.729674
101 | 1.000000 0.466369 3.834274
102 | 1.000000 0.550659 3.811155
103 | 1.000000 0.278213 3.598316
104 | 1.000000 0.918769 4.692514
105 | 1.000000 0.886555 4.604859
106 | 1.000000 0.569488 3.864912
107 | 1.000000 0.066379 3.184236
108 | 1.000000 0.335751 3.500796
109 | 1.000000 0.426863 3.743365
110 | 1.000000 0.395746 3.622905
111 | 1.000000 0.694221 4.310796
112 | 1.000000 0.272760 3.583357
113 | 1.000000 0.503495 3.901852
114 | 1.000000 0.067119 3.233521
115 | 1.000000 0.038326 3.105266
116 | 1.000000 0.599122 3.865544
117 | 1.000000 0.947054 4.628625
118 | 1.000000 0.671279 4.231213
119 | 1.000000 0.434811 3.791149
120 | 1.000000 0.509381 3.968271
121 | 1.000000 0.749442 4.253910
122 | 1.000000 0.058014 3.194710
123 | 1.000000 0.482978 3.996503
124 | 1.000000 0.466776 3.904358
125 | 1.000000 0.357767 3.503976
126 | 1.000000 0.949123 4.557545
127 | 1.000000 0.417320 3.699876
128 | 1.000000 0.920461 4.613614
129 | 1.000000 0.156433 3.140401
130 | 1.000000 0.656662 4.206717
131 | 1.000000 0.616418 3.969524
132 | 1.000000 0.853428 4.476096
133 | 1.000000 0.133295 3.136528
134 | 1.000000 0.693007 4.279071
135 | 1.000000 0.178449 3.200603
136 | 1.000000 0.199526 3.299012
137 | 1.000000 0.073224 3.209873
138 | 1.000000 0.286515 3.632942
139 | 1.000000 0.182026 3.248361
140 | 1.000000 0.621523 3.995783
141 | 1.000000 0.344584 3.563262
142 | 1.000000 0.398556 3.649712
143 | 1.000000 0.480369 3.951845
144 | 1.000000 0.153350 3.145031
145 | 1.000000 0.171846 3.181577
146 | 1.000000 0.867082 4.637087
147 | 1.000000 0.223855 3.404964
148 | 1.000000 0.528301 3.873188
149 | 1.000000 0.890192 4.633648
150 | 1.000000 0.106352 3.154768
151 | 1.000000 0.917886 4.623637
152 | 1.000000 0.014855 3.078132
153 | 1.000000 0.567682 3.913596
154 | 1.000000 0.068854 3.221817
155 | 1.000000 0.603535 3.938071
156 | 1.000000 0.532050 3.880822
157 | 1.000000 0.651362 4.176436
158 | 1.000000 0.901225 4.648161
159 | 1.000000 0.204337 3.332312
160 | 1.000000 0.696081 4.240614
161 | 1.000000 0.963924 4.532224
162 | 1.000000 0.981390 4.557105
163 | 1.000000 0.987911 4.610072
164 | 1.000000 0.990947 4.636569
165 | 1.000000 0.736021 4.229813
166 | 1.000000 0.253574 3.500860
167 | 1.000000 0.674722 4.245514
168 | 1.000000 0.939368 4.605182
169 | 1.000000 0.235419 3.454340
170 | 1.000000 0.110521 3.180775
171 | 1.000000 0.218023 3.380820
172 | 1.000000 0.869778 4.565020
173 | 1.000000 0.196830 3.279973
174 | 1.000000 0.958178 4.554241
175 | 1.000000 0.972673 4.633520
176 | 1.000000 0.745797 4.281037
177 | 1.000000 0.445674 3.844426
178 | 1.000000 0.470557 3.891601
179 | 1.000000 0.549236 3.849728
180 | 1.000000 0.335691 3.492215
181 | 1.000000 0.884739 4.592374
182 | 1.000000 0.918916 4.632025
183 | 1.000000 0.441815 3.756750
184 | 1.000000 0.116598 3.133555
185 | 1.000000 0.359274 3.567919
186 | 1.000000 0.814811 4.363382
187 | 1.000000 0.387125 3.560165
188 | 1.000000 0.982243 4.564305
189 | 1.000000 0.780880 4.215055
190 | 1.000000 0.652565 4.174999
191 | 1.000000 0.870030 4.586640
192 | 1.000000 0.604755 3.960008
193 | 1.000000 0.255212 3.529963
194 | 1.000000 0.730546 4.213412
195 | 1.000000 0.493829 3.908685
196 | 1.000000 0.257017 3.585821
197 | 1.000000 0.833735 4.374394
198 | 1.000000 0.070095 3.213817
199 | 1.000000 0.527070 3.952681
200 | 1.000000 0.116163 3.129283
201 |
--------------------------------------------------------------------------------
/Regession/ex1.txt:
--------------------------------------------------------------------------------
1 | 1.000000 0.635975 4.093119
2 | 1.000000 0.552438 3.804358
3 | 1.000000 0.855922 4.456531
4 | 1.000000 0.083386 3.187049
5 | 1.000000 0.975802 4.506176
6 | 1.000000 0.181269 3.171914
7 | 1.000000 0.129156 3.053996
8 | 1.000000 0.605648 3.974659
9 | 1.000000 0.301625 3.542525
10 | 1.000000 0.698805 4.234199
11 | 1.000000 0.226419 3.405937
12 | 1.000000 0.519290 3.932469
13 | 1.000000 0.354424 3.514051
14 | 1.000000 0.118380 3.105317
15 | 1.000000 0.512811 3.843351
16 | 1.000000 0.236795 3.576074
17 | 1.000000 0.353509 3.544471
18 | 1.000000 0.481447 3.934625
19 | 1.000000 0.060509 3.228226
20 | 1.000000 0.174090 3.300232
21 | 1.000000 0.806818 4.331785
22 | 1.000000 0.531462 3.908166
23 | 1.000000 0.853167 4.386918
24 | 1.000000 0.304804 3.617260
25 | 1.000000 0.612021 4.082411
26 | 1.000000 0.620880 3.949470
27 | 1.000000 0.580245 3.984041
28 | 1.000000 0.742443 4.251907
29 | 1.000000 0.110770 3.115214
30 | 1.000000 0.742687 4.234319
31 | 1.000000 0.574390 3.947544
32 | 1.000000 0.986378 4.532519
33 | 1.000000 0.294867 3.510392
34 | 1.000000 0.472125 3.927832
35 | 1.000000 0.872321 4.631825
36 | 1.000000 0.843537 4.482263
37 | 1.000000 0.864577 4.487656
38 | 1.000000 0.341874 3.486371
39 | 1.000000 0.097980 3.137514
40 | 1.000000 0.757874 4.212660
41 | 1.000000 0.877656 4.506268
42 | 1.000000 0.457993 3.800973
43 | 1.000000 0.475341 3.975979
44 | 1.000000 0.848391 4.494447
45 | 1.000000 0.746059 4.244715
46 | 1.000000 0.153462 3.019251
47 | 1.000000 0.694256 4.277945
48 | 1.000000 0.498712 3.812414
49 | 1.000000 0.023580 3.116973
50 | 1.000000 0.976826 4.617363
51 | 1.000000 0.624004 4.005158
52 | 1.000000 0.472220 3.874188
53 | 1.000000 0.390551 3.630228
54 | 1.000000 0.021349 3.145849
55 | 1.000000 0.173488 3.192618
56 | 1.000000 0.971028 4.540226
57 | 1.000000 0.595302 3.835879
58 | 1.000000 0.097638 3.141948
59 | 1.000000 0.745972 4.323316
60 | 1.000000 0.676390 4.204829
61 | 1.000000 0.488949 3.946710
62 | 1.000000 0.982873 4.666332
63 | 1.000000 0.296060 3.482348
64 | 1.000000 0.228008 3.451286
65 | 1.000000 0.671059 4.186388
66 | 1.000000 0.379419 3.595223
67 | 1.000000 0.285170 3.534446
68 | 1.000000 0.236314 3.420891
69 | 1.000000 0.629803 4.115553
70 | 1.000000 0.770272 4.257463
71 | 1.000000 0.493052 3.934798
72 | 1.000000 0.631592 4.154963
73 | 1.000000 0.965676 4.587470
74 | 1.000000 0.598675 3.944766
75 | 1.000000 0.351997 3.480517
76 | 1.000000 0.342001 3.481382
77 | 1.000000 0.661424 4.253286
78 | 1.000000 0.140912 3.131670
79 | 1.000000 0.373574 3.527099
80 | 1.000000 0.223166 3.378051
81 | 1.000000 0.908785 4.578960
82 | 1.000000 0.915102 4.551773
83 | 1.000000 0.410940 3.634259
84 | 1.000000 0.754921 4.167016
85 | 1.000000 0.764453 4.217570
86 | 1.000000 0.101534 3.237201
87 | 1.000000 0.780368 4.353163
88 | 1.000000 0.819868 4.342184
89 | 1.000000 0.173990 3.236950
90 | 1.000000 0.330472 3.509404
91 | 1.000000 0.162656 3.242535
92 | 1.000000 0.476283 3.907937
93 | 1.000000 0.636391 4.108455
94 | 1.000000 0.758737 4.181959
95 | 1.000000 0.778372 4.251103
96 | 1.000000 0.936287 4.538462
97 | 1.000000 0.510904 3.848193
98 | 1.000000 0.515737 3.974757
99 | 1.000000 0.437823 3.708323
100 | 1.000000 0.828607 4.385210
101 | 1.000000 0.556100 3.927788
102 | 1.000000 0.038209 3.187881
103 | 1.000000 0.321993 3.444542
104 | 1.000000 0.067288 3.199263
105 | 1.000000 0.774989 4.285745
106 | 1.000000 0.566077 3.878557
107 | 1.000000 0.796314 4.155745
108 | 1.000000 0.746600 4.197772
109 | 1.000000 0.360778 3.524928
110 | 1.000000 0.397321 3.525692
111 | 1.000000 0.062142 3.211318
112 | 1.000000 0.379250 3.570495
113 | 1.000000 0.248238 3.462431
114 | 1.000000 0.682561 4.206177
115 | 1.000000 0.355393 3.562322
116 | 1.000000 0.889051 4.595215
117 | 1.000000 0.733806 4.182694
118 | 1.000000 0.153949 3.320695
119 | 1.000000 0.036104 3.122670
120 | 1.000000 0.388577 3.541312
121 | 1.000000 0.274481 3.502135
122 | 1.000000 0.319401 3.537559
123 | 1.000000 0.431653 3.712609
124 | 1.000000 0.960398 4.504875
125 | 1.000000 0.083660 3.262164
126 | 1.000000 0.122098 3.105583
127 | 1.000000 0.415299 3.742634
128 | 1.000000 0.854192 4.566589
129 | 1.000000 0.925574 4.630884
130 | 1.000000 0.109306 3.190539
131 | 1.000000 0.805161 4.289105
132 | 1.000000 0.344474 3.406602
133 | 1.000000 0.769116 4.251899
134 | 1.000000 0.182003 3.183214
135 | 1.000000 0.225972 3.342508
136 | 1.000000 0.413088 3.747926
137 | 1.000000 0.964444 4.499998
138 | 1.000000 0.203334 3.350089
139 | 1.000000 0.285574 3.539554
140 | 1.000000 0.850209 4.443465
141 | 1.000000 0.061561 3.290370
142 | 1.000000 0.426935 3.733302
143 | 1.000000 0.389376 3.614803
144 | 1.000000 0.096918 3.175132
145 | 1.000000 0.148938 3.164284
146 | 1.000000 0.893738 4.619629
147 | 1.000000 0.195527 3.426648
148 | 1.000000 0.407248 3.670722
149 | 1.000000 0.224357 3.412571
150 | 1.000000 0.045963 3.110330
151 | 1.000000 0.944647 4.647928
152 | 1.000000 0.756552 4.164515
153 | 1.000000 0.432098 3.730603
154 | 1.000000 0.990511 4.609868
155 | 1.000000 0.649699 4.094111
156 | 1.000000 0.584879 3.907636
157 | 1.000000 0.785934 4.240814
158 | 1.000000 0.029945 3.106915
159 | 1.000000 0.075747 3.201181
160 | 1.000000 0.408408 3.872302
161 | 1.000000 0.583851 3.860890
162 | 1.000000 0.497759 3.884108
163 | 1.000000 0.421301 3.696816
164 | 1.000000 0.140320 3.114540
165 | 1.000000 0.546465 3.791233
166 | 1.000000 0.843181 4.443487
167 | 1.000000 0.295390 3.535337
168 | 1.000000 0.825059 4.417975
169 | 1.000000 0.946343 4.742471
170 | 1.000000 0.350404 3.470964
171 | 1.000000 0.042787 3.113381
172 | 1.000000 0.352487 3.594600
173 | 1.000000 0.590736 3.914875
174 | 1.000000 0.120748 3.108492
175 | 1.000000 0.143140 3.152725
176 | 1.000000 0.511926 3.994118
177 | 1.000000 0.496358 3.933417
178 | 1.000000 0.382802 3.510829
179 | 1.000000 0.252464 3.498402
180 | 1.000000 0.845894 4.460441
181 | 1.000000 0.132023 3.245277
182 | 1.000000 0.442301 3.771067
183 | 1.000000 0.266889 3.434771
184 | 1.000000 0.008575 2.999612
185 | 1.000000 0.897632 4.454221
186 | 1.000000 0.533171 3.985348
187 | 1.000000 0.285243 3.557982
188 | 1.000000 0.377258 3.625972
189 | 1.000000 0.486995 3.922226
190 | 1.000000 0.305993 3.547421
191 | 1.000000 0.277528 3.580944
192 | 1.000000 0.750899 4.268081
193 | 1.000000 0.694756 4.278096
194 | 1.000000 0.870158 4.517640
195 | 1.000000 0.276457 3.555461
196 | 1.000000 0.017761 3.055026
197 | 1.000000 0.802046 4.354819
198 | 1.000000 0.559275 3.894387
199 | 1.000000 0.941305 4.597773
200 | 1.000000 0.856877 4.523616
201 |
--------------------------------------------------------------------------------
/Regession/regession.py:
--------------------------------------------------------------------------------
1 | #-*-coding:utf8-*-
2 | '''
3 | Created on May 14, 2016
4 |
5 | @author: Gamer Think
6 | '''
7 |
8 | from numpy import *
9 | # Load the data set: each tab-separated line holds the features followed by the target value
10 | def loadDataSet(filename):
11 | numFeat = len(open(filename).readline().split("\t")) -1
12 | dataMat = []; labelMat = []
13 | fr = open(filename)
14 | for line in fr.readlines():
15 | lineArr = []
16 | curLine = line.strip().split("\t")
17 | for i in range(numFeat):
18 | lineArr.append(float(curLine[i]))
19 |
20 | dataMat.append(lineArr)
21 | labelMat.append(float(curLine[-1]))
22 |
23 | return dataMat,labelMat
24 |
25 | #==================== Find the best-fit line with ordinary least squares ===========
26 | # Compute the regression coefficients of the best-fit line
27 | def standRegress(xArr,yArr):
28 |     xMat = mat(xArr); yMat = mat(yArr).T   # .T is the matrix transpose
29 |     xTx = xMat.T * xMat
30 |     if linalg.det(xTx) == 0.0:             # linalg.det computes the determinant
31 |         print "This matrix is singular, cannot do inverse"
32 |         return
33 |     ws = xTx.I * (xMat.T * yMat)           # normal equation: w = (X^T X)^-1 X^T y
34 | return ws
35 |
36 | # Test the functions above
37 | xArr,yArr = loadDataSet("ex0.txt")
38 | ws = standRegress(xArr, yArr)
39 | print "ws (regression coefficients):", ws   # ws holds the fitted regression coefficients
40 |
41 | # Plot the data and the fitted line
42 | def show():
43 | import matplotlib.pyplot as plt
44 | xMat = mat(xArr); yMat = mat(yArr)
45 | yHat = xMat*ws
46 |     fig = plt.figure()          # create the figure
47 |     ax = fig.add_subplot(111)   # 111 = a 1x1 grid of subplots, first (and only) axes
48 |     # draw the raw data as a scatter plot
49 |     ax.scatter(xMat[:,1].flatten().A[0],yMat.T[:,0].flatten().A[0])
50 |     # copy and sort x so the fitted line is drawn from left to right
51 | xCopy =xMat.copy()
52 | xCopy.sort(0)
53 | yHat = xCopy * ws
54 |     # draw the fitted line
55 | ax.plot(xCopy[:,1],yHat)
56 | plt.show()
57 |
58 | # show()
59 |
60 | # Use numpy's corrcoef to measure how well the predictions track the true values
61 | yHat = mat(xArr) * ws   # yHat = xMat * ws
62 | print "Correlation:", corrcoef(yHat.T, mat(yArr))
63 | #==================== End: best-fit line with ordinary least squares ===========
64 |
65 | '''
66 | #================== Locally weighted linear regression (LWLR) ================
67 |
68 | def lwlr(testPoint,xArr,yArr,k=1.0):
69 | xMat = mat(xArr); yMat = mat(yArr).T
70 | m = shape(xMat)[0]
71 |     weights = mat(eye((m)))   # identity matrix: one weight per training sample
72 | for j in range(m):
73 | diffMat = testPoint - xMat[j,:]
74 |         # Gaussian kernel: the weight decays exponentially with distance from the test point
75 | weights[j,j] = exp(diffMat * diffMat.T /(-2.0*k**2))
76 | xTx = xMat.T * (weights * xMat)
77 | if linalg.det(xTx) == 0.0:
78 | print "this matrix is singular,cannot do inverse"
79 | return
80 | ws = xTx.I * (xMat.T * (weights * yMat))
81 | return testPoint * ws
82 |
83 | def lwlrTest(testArr,xArr,yArr,k=1.0):
84 | m = shape(testArr)[0]
85 | yHat = zeros(m)
86 | for i in range(m):
87 | yHat[i] =lwlr(testArr[i],xArr,yArr,k)
88 | return yHat
89 |
90 |
91 | xArr,yArr = loadDataSet('ex0.txt')
92 | print "k=1.0:",lwlr(xArr[0],xArr,yArr,1.0)
93 | print "k=0.001:",lwlr(xArr[0],xArr,yArr,0.001)
94 | print "k=0.003:",lwlr(xArr[0],xArr,yArr,0.003)
95 |
96 | # Plot the LWLR fit
97 | def showlwlr():
98 | yHat = lwlrTest(xArr, xArr, yArr, 0.01)
99 | xMat = mat(xArr)
100 | srtInd = xMat[:,1].argsort(0)
101 | xSort = xMat[srtInd][:,0,:]
102 |
103 | import matplotlib.pyplot as plt
104 |     fig = plt.figure()          # create the figure
105 |     ax = fig.add_subplot(111)   # 111 = a 1x1 grid of subplots, first (and only) axes
106 |     ax.plot(xSort[:,1],yHat[srtInd])
107 |     # draw the raw data as a scatter plot
108 | ax.scatter(xMat[:,1].flatten().A[0],mat(yArr).T[:,0].flatten().A[0],s=2,c='red')
109 | plt.show()
110 |
111 | showlwlr()
112 | '''
113 | '''
114 | #========================= Ridge regression ==================
115 | # Compute the ridge regression coefficients for a given lambda
116 | def ridgeRegres(xMat,yMat,lam=0.2):
117 |     xTx = xMat.T * xMat
118 |     denom = xTx + eye(shape(xMat)[1]) * lam   # X^T X + lambda * I
119 | if linalg.det(denom)==0.0:
120 | print "This matrix is singular, cannot do inverse"
121 | return
122 | ws = denom.I * (xMat.T * yMat)
123 | return ws
124 |
125 | # Evaluate the ridge coefficients over a range of lambda values
126 | def ridgeTest(xArr,yArr):
127 | xMat = mat(xArr); yMat = mat(yArr).T
128 | yMean = mean(yMat,0)
129 |     # Standardize: center y, then center and scale each column of X
130 | yMat = yMat - yMean
131 | xMeans = mean(xMat,0)
132 | xVar = var(xMat,0)
133 | xMat = (xMat - xMeans)/xVar
134 |
135 | numTestPts = 30
136 | wMat = zeros((numTestPts, shape(xMat)[1]))
137 | for i in range(numTestPts):
138 | ws = ridgeRegres(xMat, yMat, exp(i-10))
139 | wMat[i,:]=ws.T
140 | return wMat
141 |
142 | abX,abY = loadDataSet('abalone.txt')
143 | ridgeWeights = ridgeTest(abX,abY)
144 | # print ridgeWeights
145 |
146 | def showRidge():
147 | import matplotlib.pyplot as plt
148 | fig = plt.figure()
149 | ax = fig.add_subplot(111)
150 | ax.plot(ridgeWeights)
151 | plt.show()
152 |
153 | showRidge()
154 | #=================== End: ridge regression =============
155 | '''
156 | #=================== Forward stagewise regression ============
157 |
158 | # Residual sum of squares (squared error)
159 | def rssError(yArr,yHatArr): #yArr and yHatArr both need to be arrays
160 | return ((yArr-yHatArr)**2).sum()
161 |
162 | # Standardize each column (zero mean, unit variance)
163 | def regularize(xMat):#regularize by columns
164 | inMat = xMat.copy()
165 | inMeans = mean(inMat,0) #calc mean then subtract it off
166 | inVar = var(inMat,0) #calc variance of Xi then divide by it
167 | inMat = (inMat - inMeans)/inVar
168 | return inMat
169 |
170 |
171 | def stageWise(xArr,yArr,eps=0.01,numIt=100):
172 | xMat = mat(xArr); yMat=mat(yArr).T
173 | yMean = mean(yMat,0)
174 | yMat = yMat - yMean #can also regularize ys but will get smaller coef
175 | xMat = regularize(xMat)
176 | m,n=shape(xMat)
177 | returnMat = zeros((numIt,n)) #testing code remove
178 | ws = zeros((n,1)); wsTest = ws.copy(); wsMax = ws.copy()
179 | for i in range(numIt):#could change this to while loop
180 | #print ws.T
181 |         lowestError = inf
182 | for j in range(n):
183 | for sign in [-1,1]:
184 | wsTest = ws.copy()
185 | wsTest[j] += eps*sign
186 | yTest = xMat*wsTest
187 | rssE = rssError(yMat.A,yTest.A)
188 | if rssE < lowestError:
189 | lowestError = rssE
190 | wsMax = wsTest
191 | ws = wsMax.copy()
192 | returnMat[i,:]=ws.T
193 | return returnMat
194 |
195 | xArr,yArr = loadDataSet('abalone.txt')
196 | print stageWise(xArr, yArr, 0.01, 200),"\n\n"
197 |
198 | # print stageWise(xArr, yArr, 0.001, 200)
199 |
200 | xMat = mat(xArr)
201 | yMat = mat(yArr).T
202 | xMat = regularize(xMat)
203 | yM = mean(yMat,0)
204 | yMat = yMat - yM
205 | weights = standRegress(xMat, yMat.T)
206 | print weights.T
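207 | 
208 | # Optional sanity check (a minimal sketch, left commented out): compare the
209 | # squared error of the final forward-stagewise weights with the ordinary
210 | # least-squares fit computed just above.  Only names already defined in this
211 | # file are used (stageWise, rssError, xMat, yMat, weights); uncomment to run.
212 | # stageW = stageWise(xArr, yArr, 0.01, 200)[-1]   # last row = final weight vector
213 | # print "stagewise RSS:", rssError(yMat.A, (xMat * mat(stageW).T).A)
214 | # print "OLS RSS:", rssError(yMat.A, (xMat * weights).A)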
--------------------------------------------------------------------------------
/sklearn/README.md:
--------------------------------------------------------------------------------
1 | 0: line_regression - Regression analysis: predicting power output with Sklearn
2 | http://blog.csdn.net/Gamer_gyt/article/details/78467021
3 |
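4 | A minimal quick-start sketch of the workflow in `line_regression/sk_linreg.py`.
5 | It assumes scikit-learn >= 0.18 (so `train_test_split` lives in
6 | `sklearn.model_selection`) and that `Folds5x2_pp.csv` is in the working directory:
7 | 
8 | ```python
9 | import numpy as np
10 | import pandas as pd
11 | from sklearn.model_selection import train_test_split
12 | from sklearn.linear_model import LinearRegression
13 | from sklearn import metrics
14 | 
15 | data = pd.read_csv("Folds5x2_pp.csv")
16 | X = data[["AT", "V", "AP", "RH"]]   # feature columns
17 | y = data["PE"]                      # target column (power output)
18 | 
19 | # hold out a test set, fit a linear model, report the test RMSE
20 | X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
21 | model = LinearRegression().fit(X_train, y_train)
22 | rmse = np.sqrt(metrics.mean_squared_error(y_test, model.predict(X_test)))
23 | print("Test RMSE: %.3f" % rmse)
24 | ```
25 | 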
--------------------------------------------------------------------------------
/sklearn/line_regression/sk_linreg.py:
--------------------------------------------------------------------------------
1 | #! /usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | # vim:fenc=utf-8
4 | #
5 | # Copyright © 2017 Register
6 | #
7 | # Distributed under terms of the GPLv3 license.
8 |
9 | """
10 | """
11 | from sklearn.model_selection import train_test_split
12 | from sklearn.linear_model import LinearRegression
13 | import pandas as pd
14 | import numpy as np
15 | 
16 | # Read the data with pandas
17 | data = pd.read_csv("Folds5x2_pp.csv")
18 | print data.shape
19 | 
20 | # Prepare the feature matrix and the target column
21 | X = data[["AT","V","AP","RH"]]
22 | print X.shape
23 | y = data[["PE"]]
24 | print y.shape
25 | 
26 | # Split into training and test sets (default 75%/25% split)
27 | X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
28 | 
29 | linreg = LinearRegression()
30 | linreg.fit(X_train, y_train)
31 | 
32 | # Training is done; inspect the fitted parameters
33 | print linreg.intercept_
34 | print linreg.coef_
35 | 
36 | y_pred = linreg.predict(X_test)
37 | from sklearn import metrics
38 | 
39 | # Use sklearn to compute MSE and RMSE on the held-out test set
40 | print "MSE:",metrics.mean_squared_error(y_test, y_pred)
41 | print "RMSE:",np.sqrt(metrics.mean_squared_error(y_test, y_pred))
42 | 
43 | # 10-fold cross-validation over the full data set
44 | from sklearn.model_selection import cross_val_predict
45 | predicted = cross_val_predict(linreg, X, y, cv=10)
46 | print "MSE:",metrics.mean_squared_error(y, predicted)
47 | print "RMSE:",np.sqrt(metrics.mean_squared_error(y, predicted))
48 | 
49 | # Plot predicted vs. measured values
50 | import matplotlib.pyplot as plt
51 | fig, ax = plt.subplots()
52 | ax.scatter(y, predicted)
53 | ax.plot([y.min(), y.max()], [y.min(), y.max()], 'k--', lw=4)
54 | ax.set_xlabel('Measured')
55 | ax.set_ylabel('Predicted')
56 | plt.show()
57 | 
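58 | # Interpretation note (sketch): linreg.coef_ holds one weight per feature
59 | # column (AT, V, AP, RH) and linreg.intercept_ is the bias term, so a single
60 | # prediction is y_hat = intercept_ + dot(coef_, x) for one feature row x.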
--------------------------------------------------------------------------------