├── .gitattributes ├── .gitignore ├── 0-Distance └── blog_ml_distance.py ├── 0-Spider ├── README.md ├── beidaNewsSpider │ ├── .idea │ │ ├── beidaSpider.iml │ │ ├── inspectionProfiles │ │ │ └── profiles_settings.xml │ │ ├── misc.xml │ │ ├── modules.xml │ │ └── workspace.xml │ ├── README.md │ ├── news.sql │ ├── news.txt │ └── spider.py └── tiebaSpider │ ├── .idea │ ├── misc.xml │ ├── modules.xml │ ├── tiebaSpider.iml │ └── workspace.xml │ ├── README.md │ ├── spider1 │ ├── README.md │ ├── main.py │ ├── spider.py │ ├── spider.pyc │ └── tiebaname │ │ └── name.txt │ └── spider2 │ └── tieba │ ├── .idea │ ├── misc.xml │ ├── modules.xml │ ├── tieba.iml │ └── workspace.xml │ ├── data │ └── 20170630_all_href.txt │ ├── name.txt │ ├── scrapy.cfg │ └── tieba │ ├── __init__.py │ ├── __init__.pyc │ ├── items.py │ ├── items.pyc │ ├── middlewares.py │ ├── pipelines.py │ ├── pipelines.pyc │ ├── settings.py │ ├── settings.pyc │ └── spiders │ ├── __init__.py │ ├── __init__.pyc │ ├── tieba1.py │ ├── tieba1.pyc │ ├── tieba2.py │ └── tieba2.pyc ├── AdaBoost └── AdaBoost.py ├── Apriori └── Apriori.py ├── Bayes └── bayes.py ├── Decision-Tree └── DecisionTree-ID3.py ├── FP-growth ├── FP_Tree.py ├── newsClickStream.py └── 所用到dat文件下载地址.txt ├── K-means └── kMeans.py ├── Logistic Regession ├── LogisticRegession.py ├── LogisticRegessionExample.py ├── ex1.txt ├── horseColicTest.txt └── horseColicTraining.txt ├── PCA ├── PCA.py ├── secom.data └── testSet.txt ├── README.md ├── Recommend ├── uid_score_bid.dat ├── 基于item的协同过滤推荐BasedItem.py ├── 基于图的推荐PersonalRank.py ├── 基于标签的推荐.py └── 基于用户的协同过滤推荐BasedUserCF.py ├── Regession ├── abalone.txt ├── ex0.txt ├── ex1.txt └── regession.py └── sklearn ├── README.md └── line_regression ├── Folds5x2_pp.csv └── sk_linreg.py /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | 4 | # Custom for Visual Studio 5 | *.cs diff=csharp 6 | 7 | # Standard to msysgit 8 | *.doc diff=astextplain 9 | *.DOC diff=astextplain 10 | *.docx diff=astextplain 11 | *.DOCX diff=astextplain 12 | *.dot diff=astextplain 13 | *.DOT diff=astextplain 14 | *.pdf diff=astextplain 15 | *.PDF diff=astextplain 16 | *.rtf diff=astextplain 17 | *.RTF diff=astextplain 18 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Windows image file caches 2 | Thumbs.db 3 | ehthumbs.db 4 | 5 | # Folder config file 6 | Desktop.ini 7 | 8 | # Recycle Bin used on file shares 9 | $RECYCLE.BIN/ 10 | 11 | # Windows Installer files 12 | *.cab 13 | *.msi 14 | *.msm 15 | *.msp 16 | 17 | # Windows shortcuts 18 | *.lnk 19 | 20 | # ========================= 21 | # Operating System Files 22 | # ========================= 23 | 24 | # OSX 25 | # ========================= 26 | 27 | .DS_Store 28 | .AppleDouble 29 | .LSOverride 30 | 31 | # Thumbnails 32 | ._* 33 | 34 | # Files that might appear in the root of a volume 35 | .DocumentRevisions-V100 36 | .fseventsd 37 | .Spotlight-V100 38 | .TemporaryItems 39 | .Trashes 40 | .VolumeIcon.icns 41 | 42 | # Directories potentially created on remote AFP share 43 | .AppleDB 44 | .AppleDesktop 45 | Network Trash Folder 46 | Temporary Items 47 | .apdisk 48 | -------------------------------------------------------------------------------- /0-Distance/blog_ml_distance.py: -------------------------------------------------------------------------------- 1 | # coding: 
utf-8 2 | 3 | from numpy import * 4 | 5 | print '[+]------------欧式距离-----------' 6 | def twoPointDistance(a,b): 7 | d = sqrt( (a[0]-b[0])**2 + (a[1]-b[1])**2 ) 8 | return d 9 | 10 | print 'a,b 二维距离为:',twoPointDistance((1,1),(2,2)) 11 | 12 | def threePointDistance(a,b): 13 | d = sqrt( (a[0]-b[0])**2 + (a[1]-b[1])**2 + (a[2]-b[2])**2 ) 14 | return d 15 | 16 | print 'a,b 三维距离为:',threePointDistance((1,1,1),(2,2,2)) 17 | 18 | def distance(a,b): 19 | sum = 0 20 | for i in range(len(a)): 21 | sum += (a[i]-b[i])**2 22 | return sqrt(sum) 23 | 24 | print 'a,b 多维距离为:',distance((1,1,2,2),(2,2,4,4)) 25 | 26 | print '[+]------------标准欧式距离-----------' 27 | 28 | def moreBZOSdis(a,b): 29 | sumnum = 0 30 | for i in range(len(a)): 31 | # 计算si 分量标准差 32 | avg = (a[i]-b[i])/2 33 | si = sqrt( (a[i] - avg) ** 2 + (b[i] - avg) ** 2 ) 34 | sumnum += ((a[i]-b[i])/si ) ** 2 35 | 36 | return sqrt(sumnum) 37 | 38 | print 'a,b 标准欧式距离:',moreBZOSdis((1,2,1,2),(3,3,3,4)) 39 | 40 | print '[+]------------曼哈顿距离-----------' 41 | def twoMHDdis(a,b): 42 | return abs(a[0]-b[0])+abs(a[1]-b[1]) 43 | 44 | print 'a,b 二维曼哈顿距离为:', twoMHDdis((1,1),(2,2)) 45 | 46 | def threeMHDdis(a,b): 47 | return abs(a[0]-b[0])+abs(a[1]-b[1]) + abs(a[2]-b[2]) 48 | 49 | print 'a,b 三维曼哈顿距离为:', threeMHDdis((1,1,1),(2,2,2)) 50 | 51 | 52 | def moreMHDdis(a,b): 53 | sum = 0 54 | for i in range(len(a)): 55 | sum += abs(a[i]-b[i]) 56 | return sum 57 | 58 | print 'a,b 多维曼哈顿距离为:', moreMHDdis((1,1,1,1),(2,2,2,2)) 59 | 60 | print '[+]------------切比雪夫距离-----------' 61 | def twoQBXFdis(a,b): 62 | return max( abs(a[0]-b[0]), abs(a[1]-b[1])) 63 | 64 | print 'a,b二维切比雪夫距离:' , twoQBXFdis((1,2),(3,4)) 65 | 66 | def moreQBXFdis(a,b): 67 | maxnum = 0 68 | for i in range(len(a)): 69 | if abs(a[i]-b[i]) > maxnum: 70 | maxnum = abs(a[i]-b[i]) 71 | return maxnum 72 | 73 | print 'a,b多维切比雪夫距离:' , moreQBXFdis((1,1,1,1),(3,4,3,4)) 74 | 75 | 76 | print '[+]------------夹角余弦-----------' 77 | 78 | def twoCos(a,b): 79 | cos = (a[0]*b[0]+a[1]*b[1]) / (sqrt(a[0]**2 + b[0]**2) * sqrt(a[1]**2 + b[1]**2) ) 80 | 81 | return cos 82 | print 'a,b 二维夹角余弦距离:',twoCos((1,1),(2,2)) 83 | 84 | def moreCos(a,b): 85 | sum_fenzi = 0.0 86 | sum_fenmu_1,sum_fenmu_2 = 0,0 87 | for i in range(len(a)): 88 | sum_fenzi += a[i]*b[i] 89 | sum_fenmu_1 += a[i]**2 90 | sum_fenmu_2 += b[i]**2 91 | 92 | return sum_fenzi/( sqrt(sum_fenmu_1) * sqrt(sum_fenmu_2) ) 93 | print 'a,b 多维夹角余弦距离:',moreCos((1,1,1,1),(2,2,2,2)) 94 | 95 | print '[+]------------汉明距离-----------' 96 | 97 | def hanmingDis(a,b): 98 | sumnum = 0 99 | for i in range(len(a)): 100 | if a[i]!=b[i]: 101 | sumnum += 1 102 | return sumnum 103 | 104 | print 'a,b 汉明距离:',hanmingDis((1,1,2,3),(2,2,1,3)) 105 | 106 | print '[+]------------杰卡德距离-----------' 107 | 108 | def jiekadeDis(a,b): 109 | set_a = set(a) 110 | set_b = set(b) 111 | dis = float(len( (set_a | set_b) - (set_a & set_b) ) )/ len(set_a | set_b) 112 | return dis 113 | 114 | print 'a,b 杰卡德距离:', jiekadeDis((1,2,3),(2,3,4)) 115 | 116 | def jiekadeXSDis(a,b): 117 | set_a = set(a) 118 | set_b = set(b) 119 | dis = float(len(set_a & set_b) )/ len(set_a | set_b) 120 | return dis 121 | 122 | print 'a,b 杰卡德相似系数:', jiekadeXSDis((1,2,3),(2,3,4)) 123 | -------------------------------------------------------------------------------- /0-Spider/README.md: -------------------------------------------------------------------------------- 1 | > 此部分我会上传一些spider的代码吧,大部分会是以目标进行分类,部分对应的会有csdn的blog,路过的大神不要嘲笑我等小白 2 | 3 | 4 | 1: Scrapy 爬取百度贴吧指定帖子的发帖人和回帖人
5 | http://blog.csdn.net/gamer_gyt/article/details/75043398
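A minimal usage sketch for item 1, assuming the spiders are launched from the Scrapy project root (0-Spider/tiebaSpider/spider2/tieba, the directory containing scrapy.cfg): that project defines two spiders, `tieba` (tieba1.py, which collects the thread URLs for every tieba listed in name.txt into data/<YYYYMMDD>_all_href.txt) and `tieba2` (tieba2.py, which reads that file and writes the poster/replier usernames to data/<YYYYMMDD>_all_name.txt). A typical two-step run would be:

    cd 0-Spider/tiebaSpider/spider2/tieba
    scrapy crawl tieba      # step 1: collect thread URLs into data/<YYYYMMDD>_all_href.txt
    scrapy crawl tieba2     # step 2: visit each collected thread and extract usernames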
6 | -------------------------------------------------------------------------------- /0-Spider/beidaNewsSpider/.idea/beidaSpider.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 11 | -------------------------------------------------------------------------------- /0-Spider/beidaNewsSpider/.idea/inspectionProfiles/profiles_settings.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 7 | -------------------------------------------------------------------------------- /0-Spider/beidaNewsSpider/.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | -------------------------------------------------------------------------------- /0-Spider/beidaNewsSpider/.idea/modules.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /0-Spider/beidaNewsSpider/.idea/workspace.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 12 | 13 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 38 | 39 | 40 | 45 | 46 | 47 | 48 | 49 | true 50 | DEFINITION_ORDER 51 | 52 | 53 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 82 | 83 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | 143 | 144 | 157 | 158 | 175 | 176 | 188 | 189 | project 190 | 191 | 192 | 193 | 194 | 195 | 196 | 197 | 198 | 199 | 200 | 201 | 202 | 203 | 204 | 205 | 224 | 225 | 244 | 245 | 266 | 267 | 289 | 290 | 314 | 315 | 316 | 318 | 319 | 320 | 321 | 1494037431357 322 | 326 | 327 | 328 | 329 | 330 | 331 | 332 | 333 | 334 | 335 | 336 | 337 | 338 | 339 | 340 | 341 | 342 | 343 | 344 | 345 | 346 | 347 | 348 | 349 | 350 | 351 | 352 | 353 | 355 | 356 | 358 | 359 | 360 | 361 | 362 | 363 | 364 | 365 | 366 | 367 | 368 | 369 | 370 | 371 | 372 | 373 | 374 | 375 | 376 | 377 | 378 | 379 | 380 | 381 | 382 | 383 | 384 | 385 | 386 | 387 | 388 | 389 | 390 | 391 | 392 | 393 | 394 | 395 | 396 | 397 | 398 | 399 | 404 | 405 | 406 | 407 | 408 | 409 | -------------------------------------------------------------------------------- /0-Spider/beidaNewsSpider/README.md: -------------------------------------------------------------------------------- 1 | 爬取北大要闻的所有新闻 2 | 3 | url:http://pkunews.pku.edu.cn/xxfz/node_185.htm 4 | 5 | news.sql 为数据备份(Mysql) 6 | 7 | 数据库文件备份与恢复 8 | 9 | 备份:/usr/bin/mysqldump -uroot -proot beidaspider --default-character-set=utf8 --opt -Q -R >./news.sql 10 | 11 | 恢复:/usr/bin/mysql -uroot -proot beidaspider <./news.sql 12 | -------------------------------------------------------------------------------- /0-Spider/beidaNewsSpider/spider.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | import pymysql 4 | from bs4 import BeautifulSoup 5 | import urllib.request 6 | import time 7 | 8 | ''' 9 | 创建数据库和数据表语句 10 | create database beidaspider default charset utf8; 11 | 12 | create table news( 13 | title varchar(100), 14 | pub_date date, 15 | from_ varchar(50), 16 | content varchar(20000) 17 | ); 18 | 19 | 数据库备份 20 | /usr/bin/mysqldump -uroot -proot beidaspider --default-character-set=utf8 --opt -Q -R >./news.sql 21 | 22 | 数据库恢复 23 
| /usr/bin/mysql -uroot -proot beidaspider <./news.sql 24 | ''' 25 | 26 | 27 | class BeiDaSpider: 28 | # 初始化 29 | def __init__(self): 30 | self.root_href = "http://pkunews.pku.edu.cn/xxfz/" 31 | 32 | # 连接数据库 33 | def connMysql(self): 34 | conn = pymysql.connect(host='127.0.0.1', port=3306, user='root', passwd='root',db='beidaspider',charset='utf8') 35 | cur = conn.cursor() 36 | return cur,conn 37 | 38 | # 写入数据库 39 | def write(self,title,date,from_,content): 40 | cur,conn = self.connMysql() 41 | sql = """INSERT INTO news (title, pub_date, from_, content) VALUES ("%s", "%s", "%s", "%s")""" % (title,date,from_,content) 42 | cur.execute(sql) 43 | conn.commit() 44 | conn.close() 45 | 46 | with open("news.txt","a") as fp: 47 | fp.write(title+"\t"+date+"\t"+from_+"\t"+content+"\n") 48 | fp.close() 49 | 50 | # 解析每页,获取该页所有的新闻链接 51 | def parse_onePage_href(self,url): 52 | res = urllib.request.urlopen(url) 53 | body = BeautifulSoup(res.read()) 54 | table = body.find('table',cellspacing="0",cellpadding="0",id="nav2_7Tabcontent_10") 55 | a_list = table.find_all('a') 56 | href_list = [] 57 | for a in a_list: 58 | href_list.append(self.root_href + a.get('href')) 59 | return href_list 60 | 61 | # 解析每个新闻,获取数据 62 | def parse_oneNew(self,url): 63 | res = urllib.request.urlopen(url) 64 | body = BeautifulSoup(res.read()) 65 | 66 | # 获取标题 67 | title = body.title.get_text().strip() 68 | print(title) 69 | 70 | # 获取时间和来源 71 | #dataAndfrom = 72 | dataAndfrom = body.find('table',width="560",border="0",cellspacing="0",cellpadding="0") 73 | datafrom_list = dataAndfrom.find_all('tr')[0].get_text().strip().split("  ") 74 | date = datafrom_list[0].split(":")[1].strip() 75 | from_ = datafrom_list[1].split(":")[1].strip() 76 | print(date) 77 | #print(from_) 78 | 79 | # 获取新闻内容 80 | content = body.find('table',width="710",border="0",cellspacing="0",cellpadding="0",style="margin-left:15px;").find_all('tr')[3].get_text().strip().replace("\n"," ") 81 | #print(content) 82 | 83 | self.write(title,date,from_,content) 84 | 85 | def start(self): 86 | for i in range(1,21): 87 | if i==1: 88 | href_list = self.parse_onePage_href(self.root_href + "node_185.htm") 89 | for href in href_list: 90 | try: 91 | self.parse_oneNew(href) 92 | except Exception as e: 93 | print(e) 94 | finally: 95 | pass 96 | # time.sleep(1) 97 | # break 98 | else: 99 | href_list = self.parse_onePage_href(self.root_href + "node_185_" + str(i) + ".htm") 100 | for href in href_list: 101 | try: 102 | self.parse_oneNew(href) 103 | except Exception as e: 104 | print(e) 105 | finally: 106 | pass 107 | # time.sleep(1) 108 | #time.sleep(2) 109 | # break 110 | 111 | 112 | if __name__=="__main__": 113 | spi = BeiDaSpider() 114 | spi.start() 115 | -------------------------------------------------------------------------------- /0-Spider/tiebaSpider/.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | -------------------------------------------------------------------------------- /0-Spider/tiebaSpider/.idea/modules.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /0-Spider/tiebaSpider/.idea/tiebaSpider.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 11 | -------------------------------------------------------------------------------- /0-Spider/tiebaSpider/.idea/workspace.xml: 
-------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 55 | 56 | 57 | 58 | print 59 | time.sleep(self.timesleep) 60 | 61 | 62 | 63 | 76 | 77 | 78 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 105 | 106 | 109 | 110 | 111 | 112 | 115 | 116 | 119 | 120 | 123 | 124 | 125 | 126 | 129 | 130 | 133 | 134 | 137 | 138 | 139 | 140 | 141 | 142 | 143 | 144 | 145 | 146 | 147 | 157 | 158 | 159 | 160 | 177 | 178 | 189 | 190 | 208 | 209 | 223 | 224 | 225 | 227 | 228 | 229 | 230 | 1498495498538 231 | 235 | 236 | 237 | 238 | 239 | 240 | 241 | 242 | 243 | 244 | 245 | 246 | 247 | 248 | 249 | 250 | 251 | 252 | 253 | 254 | 255 | 256 | 257 | 258 | 259 | 260 | 261 | 262 | 264 | 265 | 266 | 267 | 268 | 269 | 270 | 271 | 272 | 273 | 274 | 275 | 276 | 277 | 278 | 279 | 280 | 281 | 282 | 283 | 284 | 285 | 286 | 287 | 288 | 289 | 290 | 291 | 292 | 293 | 294 | 295 | 296 | 297 | 298 | 299 | 300 | 301 | 302 | 303 | 304 | 305 | 306 | 307 | 308 | 309 | 310 | 311 | 312 | 313 | 314 | 315 | 316 | 317 | 318 | 319 | 320 | 321 | 322 | 323 | 324 | 325 | 326 | 327 | 328 | 329 | 330 | 331 | 332 | 333 | 334 | 335 | 336 | 337 | 338 | 339 | 340 | 341 | 342 | 343 | 344 | 345 | 346 | 347 | 348 | 349 | 350 | 351 | 352 | 353 | 354 | 355 | 356 | 357 | 358 | -------------------------------------------------------------------------------- /0-Spider/tiebaSpider/README.md: -------------------------------------------------------------------------------- 1 | 这两个文件夹下的爬虫都是为了实现爬取贴吧前三页帖子的发帖人和回帖人,spider1使用的是BeautifulSoup+urllib2,spider2使用的是scrapy 2 | 3 | 4 | [ Scrapy 爬取百度贴吧指定帖子的发帖人和回帖人](http://blog.csdn.net/Gamer_gyt/article/details/75043398) 5 | 6 | 7 | CSDN博客地址: 8 | http://blog.csdn.net/gamer_gyt/ 9 | 10 | 如有问题请联系: 11 | QQ:1923361654 12 | WeChat:gyt13342445911 13 | Email:thinkgamer_gyt@gmail.com 14 | -------------------------------------------------------------------------------- /0-Spider/tiebaSpider/spider1/README.md: -------------------------------------------------------------------------------- 1 | # 项目说明 2 | 3 | 该项目为爬取指定贴吧的前三页帖子的发帖用户和回帖用户的用户名 4 | 5 | data 目录为存放数据的目录,其中以天为单位创建二级目录,以贴吧名为三级单位存储抓取结果 6 | 7 | 目录结构类似于: 8 | 9 | data 10 | 11 | --20170626 12 | 13 | -----戒赌吧.txt 14 | 15 | -----网易吧.txt 16 | 17 | tiebaname 目录为存放贴吧名字的目录,将要爬取的贴吧名字写入该目录下的name.txt文件中 18 | 19 | 目录结构类似于: 20 | 21 | tiebaname 22 | 23 | --name.txt 24 | 25 | 采用的是python 的beautifulSoup库,效果不太理想,但后续会逐步改善,可能换成别的框架 26 | -------------------------------------------------------------------------------- /0-Spider/tiebaSpider/spider1/main.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | from spider import Spider 4 | 5 | if __name__ == "__main__": 6 | import time 7 | print("Start At:",time.asctime( time.localtime(time.time()) )) 8 | spider = Spider() 9 | spider.start() 10 | print("Stop At:",time.asctime( time.localtime(time.time()) )) -------------------------------------------------------------------------------- /0-Spider/tiebaSpider/spider1/spider.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | from bs4 import BeautifulSoup 4 | import urllib2 5 | import urllib 6 | import time,os 7 | 8 | class Spider: 9 | 10 | def __init__(self): 11 | self.search_url = 
'https://tieba.baidu.com/f?kw=' 12 | self.tieba_list = [] # 存储要爬取的若干个贴吧的链接 13 | self.url_list = [] # 存放每个贴吧前三页的帖子链接 14 | self.timesleep = 2 # 每次访问tieba的url时间间隔 15 | self.pages = 3 # 设置要抓取多少页 16 | self.current_href = '' # 当前爬取的贴吧链接url 17 | 18 | # 在data目录下创建日期和贴吧名的txt文件 19 | if not os.path.exists('data/%s' % time.strftime('%Y%m%d')): 20 | os.mkdir('data/%s' % time.strftime('%Y%m%d')) 21 | 22 | def error(self,loc,url,e): 23 | fw = open("error/error.log","a") 24 | fw.write(time.asctime( time.localtime(time.time()) )+"\t"+loc+"\t"+url+"\t"+str(e)) 25 | fw.close() 26 | 27 | # 模拟浏览器进行登录 28 | def get_page(self,href): 29 | res = urllib2.urlopen(href) 30 | # 如果访问成功的话返回读取的内容,否则返回空的字符串 31 | if res.code == 200: 32 | return res.read() 33 | else: 34 | return "" 35 | 36 | # 从文件中加载贴吧名并组成url 37 | def read(self): 38 | try: 39 | with open("tiebaname/name.txt", "r") as fr: 40 | for line in fr.readlines(): 41 | # urllib.quote(line.strip()) 将关键字转变成url 格式 42 | self.tieba_list.append(self.search_url + urllib.quote(line.strip()) + "&ie=utf-8&pn=") 43 | fr.close() 44 | except Exception as e: 45 | self.error("read", "read error", e) 46 | pass 47 | finally: 48 | return self.tieba_list 49 | 50 | 51 | # 解析每个帖子共有几页 52 | def get_num(self,url): 53 | try: 54 | if self.get_page(url): 55 | body = BeautifulSoup(self.get_page(url), "html.parser") 56 | num_li = body.find_all("li", class_="l_reply_num", style="margin-left:8px")[0] 57 | num = num_li.findAll('span', class_='red')[1].get_text() 58 | # print(num) 59 | return int(num) 60 | else: 61 | pass 62 | except Exception as e: 63 | self.error("get_num",url,e) 64 | return 1 65 | 66 | # 解析每一个贴吧前三页的所有帖子连接 67 | def parse_href(self,one_tieba_url): 68 | self.url_list = [] # 存放一个贴吧前三页所有帖子的链接 69 | try: 70 | for i in range(0,self.pages): 71 | url = one_tieba_url + str(i * 50) 72 | try: 73 | # i* 50 控制翻页,每页显示50个 74 | if self.get_page(one_tieba_url+str(i*50)): 75 | body = BeautifulSoup(self.get_page(url), "html.parser") 76 | div_list = body.find_all("div", class_="threadlist_title pull_left j_th_tit ") # 解析到每一个帖子 77 | for div in div_list: 78 | # print(div.a.get('href'),div.a.get_text()) 79 | # print("https://tieba.baidu.com" + div.a.get('href')) 80 | self.url_list.append("https://tieba.baidu.com" + div.a.get('href')) 81 | else: 82 | pass 83 | except Exception as e: 84 | self.error("parse_href",url,e) 85 | pass 86 | # time.sleep(self.timesleep) 87 | except Exception as e: 88 | self.error("parse_href",one_tieba_url,e) 89 | pass 90 | 91 | # 解析每个贴吧前三页所有帖子的发帖人和回帖人的用户名 92 | def parse_username(self): 93 | try: 94 | # 解析每个帖子对应的发帖人和回帖人 95 | for url in self.url_list: 96 | filename = urllib.unquote(self.current_href.split("kw=")[1].split("&ie=")[0]) # 贴吧名字,也是文件名 97 | fw = open('data/%s/%s.txt' % (time.strftime('%Y%m%d'), filename), 'a') 98 | 99 | try: 100 | fw.write(url+"\t") 101 | num = self.get_num(url) 102 | for i in range(1,num+1): 103 | one_url = url+"?pn="+str(i) # https://tieba.baidu.com/p/5183701449?pn=1 104 | # print("total %s papges, now parse is %s page,url is:%s"%(num,i,one_url)) 105 | # 解析用户名 106 | if self.get_page(one_url): 107 | li_list = BeautifulSoup(self.get_page(one_url), "html.parser").find_all('li',class_='d_name') 108 | for li in li_list: 109 | # print(li.a.get_text()) 110 | fw.write(li.a.get_text().encode("utf-8")+"\t") 111 | # time.sleep(self.timesleep) 112 | else: 113 | pass 114 | fw.write("\n") 115 | fw.close() 116 | print(url) 117 | except Exception as e: 118 | self.error("parse_username",url,e) 119 | pass 120 | 121 | time.sleep(self.timesleep) 122 | except Exception as e: 
123 | self.error("parse_username",url,e) 124 | pass 125 | 126 | def start(self): 127 | self.read() # load tieba_prepare name 128 | for url in self.tieba_list: 129 | try: 130 | self.current_href =url 131 | print("Start:",self.current_href,time.strftime("%Y-%m-%d %H-%M-%S")) #self.current_href, 132 | self.parse_href(url) # 解析该贴吧对应的前三页的每个帖子的链接 133 | self.parse_username() # 解析每个帖子的发帖人和回帖人 134 | except Exception as e: 135 | self.error("start","parse error at start",e) 136 | pass 137 | 138 | time.sleep(self.timesleep) 139 | print("Over:",time.strftime("%Y-%m-%d %H-%M-%S")) -------------------------------------------------------------------------------- /0-Spider/tiebaSpider/spider1/spider.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/komeilkma/Machine-Learning-With-Python/8780eb4646292367915096bc18121e7c691a46dc/0-Spider/tiebaSpider/spider1/spider.pyc -------------------------------------------------------------------------------- /0-Spider/tiebaSpider/spider1/tiebaname/name.txt: -------------------------------------------------------------------------------- 1 | 戒赌 2 | 足彩 3 | 福彩 4 | 汉中彩票 5 | 体彩 6 | 竞彩 7 | 双色球 8 | 深圳 9 | 上海 10 | 北京 11 | 武汉 12 | 福建 13 | 浙江 14 | 广州 15 | 哈尔滨 16 | 吉林 17 | 青岛 18 | 杭州 19 | 山东 20 | 重庆 21 | nba 22 | 曼联 23 | 科比 24 | 皇家马德里 25 | 巴塞罗那 26 | 切尔西 27 | ac米兰 28 | 北京国安 29 | 山东鲁能 30 | 国际米兰 31 | 拜仁慕尼黑 32 | 火箭 33 | 广州FC 34 | 詹姆斯 35 | 麦迪 36 | 利物浦 37 | 阿森纳 38 | 尤文图斯 39 | 洛杉矶湖人 40 | 上海申花 41 | 热火 42 | 梅西 43 | 德国队 44 | 江苏舜天 45 | 小小罗 46 | 天津泰达 47 | 死飞 48 | 欧洲杯 49 | 中超 50 | cba 51 | 河南建业 52 | 曼城 53 | 国足 54 | 意大利国家队 55 | 多特蒙德 56 | 英超 57 | 中国足球 58 | 库里 59 | 内马尔 60 | 罗伊斯 61 | 足球 62 | 篮球 63 | 网球 64 | 浙江绿城 65 | 苹果 66 | iphone 67 | 长春亚泰 68 | 英格兰 69 | 辽宁宏运 70 | 贵州人和 71 | 上海东亚 72 | 重庆力帆 73 | 西甲 74 | 马德里竞技 75 | 德甲 76 | 世界杯 77 | 艾弗森 78 | 韦德 79 | 马刺 80 | 易建联 81 | 北京金隅 82 | 广东宏远 83 | 李毅 84 | 扒皮 85 | 美女 86 | 小米 87 | 电影 88 | 内涵 89 | 动漫 90 | nba 91 | 头像 92 | 遮天 93 | exo 94 | 爆照 95 | 减肥 96 | 鹿晗 97 | 神回复 98 | dota 99 | 文字控 100 | 心理学 101 | 美食 102 | 校花 103 | 绿帽子小同学 104 | 旅行 105 | 小说 106 | 笑话 107 | 90后 108 | 高考 109 | 权志龙 110 | 吴亦凡 111 | 手绘 112 | 梦幻西游 113 | 旅游 114 | dota2 115 | les 116 | 胥渡 117 | 爱情 118 | 整形 119 | 隆鼻 120 | 腐女 121 | gay 122 | 搞笑 123 | 柯南 124 | 剑网 125 | 凡人修仙 126 | 周杰伦 127 | 刘诗诗 128 | 爱情公寓 129 | 陈奕迅 130 | 李敏浩 131 | 音乐 132 | bigbang 133 | 帅哥 134 | 淘宝 135 | 进击的巨人 136 | 张杰 137 | 网名 138 | 魅族 139 | 手机 140 | 短句 141 | 张艺兴 142 | 金秀贤 143 | 手工 144 | 路过的一只 145 | 娱乐圈 146 | 内涵图 147 | 章鱼卡 148 | 君似毒 149 | 黄子韬 150 | 秦时明月 151 | 杨幂 152 | 言情小说 153 | 化妆 154 | 天天酷跑 155 | 情感 156 | 2012 157 | 恐怖 158 | 维尼夫妇 159 | 整容 160 | vae 161 | 爱所以存在 162 | 吴世勋 163 | 吃货 -------------------------------------------------------------------------------- /0-Spider/tiebaSpider/spider2/tieba/.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | -------------------------------------------------------------------------------- /0-Spider/tiebaSpider/spider2/tieba/.idea/modules.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /0-Spider/tiebaSpider/spider2/tieba/.idea/tieba.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 11 | -------------------------------------------------------------------------------- /0-Spider/tiebaSpider/spider2/tieba/.idea/workspace.xml: 
-------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 45 | 46 | 47 | 54 | 55 | 56 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 81 | 82 | 85 | 86 | 87 | 88 | 91 | 92 | 95 | 96 | 99 | 100 | 101 | 102 | 105 | 106 | 109 | 110 | 113 | 114 | 117 | 118 | 119 | 120 | 123 | 124 | 127 | 128 | 131 | 132 | 133 | 134 | 135 | 136 | 137 | 138 | 139 | 140 | 141 | 142 | 143 | 144 | 154 | 155 | 156 | 157 | 174 | 175 | 186 | 187 | 205 | 206 | 220 | 221 | 222 | 224 | 225 | 226 | 227 | 1498758628713 228 | 232 | 233 | 234 | 235 | 236 | 237 | 238 | 239 | 240 | 241 | 242 | 243 | 244 | 245 | 246 | 247 | 248 | 249 | 250 | 251 | 252 | 253 | 254 | 255 | 256 | 257 | 258 | 259 | 261 | 262 | 263 | 264 | 265 | 266 | 267 | 268 | 269 | 270 | 271 | 272 | 273 | 274 | 275 | 276 | 277 | 278 | 279 | 280 | 281 | 282 | 283 | 284 | 285 | 286 | 287 | 288 | 289 | 290 | 291 | 292 | 293 | 294 | 295 | 296 | 297 | 298 | 299 | 300 | 301 | 302 | 303 | -------------------------------------------------------------------------------- /0-Spider/tiebaSpider/spider2/tieba/name.txt: -------------------------------------------------------------------------------- 1 | 戒赌 2 | 足彩 3 | 福彩 4 | 汉中彩票 5 | 体彩 6 | 竞彩 7 | 双色球 8 | 深圳 9 | 上海 10 | 北京 11 | 武汉 12 | 福建 13 | 浙江 14 | 广州 15 | 哈尔滨 16 | 吉林 17 | 青岛 18 | 杭州 19 | 山东 20 | 重庆 21 | nba 22 | 曼联 23 | 科比 24 | 皇家马德里 25 | 巴塞罗那 26 | 切尔西 27 | ac米兰 28 | 北京国安 29 | 山东鲁能 30 | 国际米兰 31 | 拜仁慕尼黑 32 | 火箭 33 | 广州FC 34 | 詹姆斯 35 | 麦迪 36 | 利物浦 37 | 阿森纳 38 | 尤文图斯 39 | 洛杉矶湖人 40 | 上海申花 41 | 热火 42 | 梅西 43 | 德国队 44 | 江苏舜天 45 | 小小罗 46 | 天津泰达 47 | 死飞 48 | 欧洲杯 49 | 中超 50 | cba 51 | 河南建业 52 | 曼城 53 | 国足 54 | 意大利国家队 55 | 多特蒙德 56 | 英超 57 | 中国足球 58 | 库里 59 | 内马尔 60 | 罗伊斯 61 | 足球 62 | 篮球 63 | 网球 64 | 浙江绿城 65 | 苹果 66 | iphone 67 | 长春亚泰 68 | 英格兰 69 | 辽宁宏运 70 | 贵州人和 71 | 上海东亚 72 | 重庆力帆 73 | 西甲 74 | 马德里竞技 75 | 德甲 76 | 世界杯 77 | 艾弗森 78 | 韦德 79 | 马刺 80 | 易建联 81 | 北京金隅 82 | 广东宏远 83 | 李毅 84 | 扒皮 85 | 美女 86 | 小米 87 | 电影 88 | 内涵 89 | 动漫 90 | nba 91 | 头像 92 | 遮天 93 | exo 94 | 爆照 95 | 减肥 96 | 鹿晗 97 | 神回复 98 | dota 99 | 文字控 100 | 心理学 101 | 美食 102 | 校花 103 | 绿帽子小同学 104 | 旅行 105 | 小说 106 | 笑话 107 | 90后 108 | 高考 109 | 权志龙 110 | 吴亦凡 111 | 手绘 112 | 梦幻西游 113 | 旅游 114 | dota2 115 | les 116 | 胥渡 117 | 爱情 118 | 整形 119 | 隆鼻 120 | 腐女 121 | gay 122 | 搞笑 123 | 柯南 124 | 剑网 125 | 凡人修仙 126 | 周杰伦 127 | 刘诗诗 128 | 爱情公寓 129 | 陈奕迅 130 | 李敏浩 131 | 音乐 132 | bigbang 133 | 帅哥 134 | 淘宝 135 | 进击的巨人 136 | 张杰 137 | 网名 138 | 魅族 139 | 手机 140 | 短句 141 | 张艺兴 142 | 金秀贤 143 | 手工 144 | 路过的一只 145 | 娱乐圈 146 | 内涵图 147 | 章鱼卡 148 | 君似毒 149 | 黄子韬 150 | 秦时明月 151 | 杨幂 152 | 言情小说 153 | 化妆 154 | 天天酷跑 155 | 情感 156 | 2012 157 | 恐怖 158 | 维尼夫妇 159 | 整容 160 | vae 161 | 爱所以存在 162 | 吴世勋 163 | 吃货 164 | -------------------------------------------------------------------------------- /0-Spider/tiebaSpider/spider2/tieba/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html 5 | 6 | [settings] 7 | default = tieba.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = tieba 12 | -------------------------------------------------------------------------------- /0-Spider/tiebaSpider/spider2/tieba/tieba/__init__.py: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/komeilkma/Machine-Learning-With-Python/8780eb4646292367915096bc18121e7c691a46dc/0-Spider/tiebaSpider/spider2/tieba/tieba/__init__.py -------------------------------------------------------------------------------- /0-Spider/tiebaSpider/spider2/tieba/tieba/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/komeilkma/Machine-Learning-With-Python/8780eb4646292367915096bc18121e7c691a46dc/0-Spider/tiebaSpider/spider2/tieba/tieba/__init__.pyc -------------------------------------------------------------------------------- /0-Spider/tiebaSpider/spider2/tieba/tieba/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class TiebaItem(scrapy.Item): 12 | # define the fields for your item here like: 13 | # name = scrapy.Field() 14 | pass 15 | -------------------------------------------------------------------------------- /0-Spider/tiebaSpider/spider2/tieba/tieba/items.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/komeilkma/Machine-Learning-With-Python/8780eb4646292367915096bc18121e7c691a46dc/0-Spider/tiebaSpider/spider2/tieba/tieba/items.pyc -------------------------------------------------------------------------------- /0-Spider/tiebaSpider/spider2/tieba/tieba/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | 10 | 11 | class TiebaSpiderMiddleware(object): 12 | # Not all methods need to be defined. If a method is not defined, 13 | # scrapy acts as if the spider middleware does not modify the 14 | # passed objects. 15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | # This method is used by Scrapy to create your spiders. 19 | s = cls() 20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 21 | return s 22 | 23 | def process_spider_input(self, response, spider): 24 | # Called for each response that goes through the spider 25 | # middleware and into the spider. 26 | 27 | # Should return None or raise an exception. 28 | return None 29 | 30 | def process_spider_output(self, response, result, spider): 31 | # Called with the results returned from the Spider, after 32 | # it has processed the response. 33 | 34 | # Must return an iterable of Request, dict or Item objects. 35 | for i in result: 36 | yield i 37 | 38 | def process_spider_exception(self, response, exception, spider): 39 | # Called when a spider or process_spider_input() method 40 | # (from other spider middleware) raises an exception. 41 | 42 | # Should return either None or an iterable of Response, dict 43 | # or Item objects. 44 | pass 45 | 46 | def process_start_requests(self, start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 
52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info('Spider opened: %s' % spider.name) 57 | -------------------------------------------------------------------------------- /0-Spider/tiebaSpider/spider2/tieba/tieba/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | 8 | import time 9 | 10 | 11 | class TiebaPipeline(object): 12 | 13 | def process_item(self, item, spider): 14 | return item 15 | -------------------------------------------------------------------------------- /0-Spider/tiebaSpider/spider2/tieba/tieba/pipelines.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/komeilkma/Machine-Learning-With-Python/8780eb4646292367915096bc18121e7c691a46dc/0-Spider/tiebaSpider/spider2/tieba/tieba/pipelines.pyc -------------------------------------------------------------------------------- /0-Spider/tiebaSpider/spider2/tieba/tieba/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for tieba project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. You can find more settings consulting the documentation: 7 | # 8 | # http://doc.scrapy.org/en/latest/topics/settings.html 9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'tieba' 13 | 14 | SPIDER_MODULES = ['tieba.spiders'] 15 | NEWSPIDER_MODULE = 'tieba.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | #USER_AGENT = 'tieba (+http://www.yourdomain.com)' 20 | 21 | # Obey robots.txt rules 22 | ROBOTSTXT_OBEY = True 23 | 24 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 25 | #CONCURRENT_REQUESTS = 32 26 | 27 | # Configure a delay for requests for the same website (default: 0) 28 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay 29 | # See also autothrottle settings and docs 30 | #DOWNLOAD_DELAY = 3 31 | # The download delay setting will honor only one of: 32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 33 | #CONCURRENT_REQUESTS_PER_IP = 16 34 | 35 | # Disable cookies (enabled by default) 36 | #COOKIES_ENABLED = False 37 | 38 | # Disable Telnet Console (enabled by default) 39 | #TELNETCONSOLE_ENABLED = False 40 | 41 | # Override the default request headers: 42 | #DEFAULT_REQUEST_HEADERS = { 43 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 44 | # 'Accept-Language': 'en', 45 | #} 46 | 47 | # Enable or disable spider middlewares 48 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 49 | #SPIDER_MIDDLEWARES = { 50 | # 'tieba.middlewares.TiebaSpiderMiddleware': 543, 51 | #} 52 | 53 | # Enable or disable downloader middlewares 54 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 55 | #DOWNLOADER_MIDDLEWARES = { 56 | # 'tieba.middlewares.MyCustomDownloaderMiddleware': 543, 57 | #} 58 | 59 | # Enable or disable extensions 60 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html 61 
| #EXTENSIONS = { 62 | # 'scrapy.extensions.telnet.TelnetConsole': None, 63 | #} 64 | 65 | # Configure item pipelines 66 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 67 | ITEM_PIPELINES = { 68 | 'tieba.pipelines.TiebaPipeline': 300, 69 | } 70 | 71 | # Enable and configure the AutoThrottle extension (disabled by default) 72 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html 73 | #AUTOTHROTTLE_ENABLED = True 74 | # The initial download delay 75 | #AUTOTHROTTLE_START_DELAY = 5 76 | # The maximum download delay to be set in case of high latencies 77 | #AUTOTHROTTLE_MAX_DELAY = 60 78 | # The average number of requests Scrapy should be sending in parallel to 79 | # each remote server 80 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 81 | # Enable showing throttling stats for every response received: 82 | #AUTOTHROTTLE_DEBUG = False 83 | 84 | # Enable and configure HTTP caching (disabled by default) 85 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 86 | #HTTPCACHE_ENABLED = True 87 | #HTTPCACHE_EXPIRATION_SECS = 0 88 | #HTTPCACHE_DIR = 'httpcache' 89 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 90 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 91 | -------------------------------------------------------------------------------- /0-Spider/tiebaSpider/spider2/tieba/tieba/settings.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/komeilkma/Machine-Learning-With-Python/8780eb4646292367915096bc18121e7c691a46dc/0-Spider/tiebaSpider/spider2/tieba/tieba/settings.pyc -------------------------------------------------------------------------------- /0-Spider/tiebaSpider/spider2/tieba/tieba/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 
5 | -------------------------------------------------------------------------------- /0-Spider/tiebaSpider/spider2/tieba/tieba/spiders/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/komeilkma/Machine-Learning-With-Python/8780eb4646292367915096bc18121e7c691a46dc/0-Spider/tiebaSpider/spider2/tieba/tieba/spiders/__init__.pyc -------------------------------------------------------------------------------- /0-Spider/tiebaSpider/spider2/tieba/tieba/spiders/tieba1.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | import scrapy 4 | import urllib 5 | import time 6 | 7 | 8 | class TiebaSpider(scrapy.Spider): 9 | 10 | name = 'tieba' 11 | 12 | def __init__(self): 13 | self.urls = [] 14 | 15 | # 加载贴吧名 16 | fr = open("name.txt", "r") 17 | 18 | for one in fr.readlines(): 19 | for i in range(0, 3): 20 | self.urls.append('https://tieba.baidu.com/f?kw=' + 21 | urllib.quote(one.strip()) + '&ie=utf-8&pn=' + str(i * 50)) 22 | fr.close() 23 | 24 | def start_requests(self): 25 | urls = self.urls 26 | 27 | for url in urls: 28 | yield scrapy.Request(url=url, callback=self.parse) 29 | 30 | def parse(self, response): 31 | sel = scrapy.Selector(response) 32 | ahref_list = sel.xpath( 33 | '//a[re:test(@class, "j_th_tit ")]//@href').extract() 34 | 35 | fw = open("data/%s_all_href.txt" % time.strftime('%Y%m%d'), "a") 36 | for ahref in ahref_list: 37 | href = "https://tieba.baidu.com" + ahref 38 | fw.write(href + "\n") 39 | fw.close() 40 | -------------------------------------------------------------------------------- /0-Spider/tiebaSpider/spider2/tieba/tieba/spiders/tieba1.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/komeilkma/Machine-Learning-With-Python/8780eb4646292367915096bc18121e7c691a46dc/0-Spider/tiebaSpider/spider2/tieba/tieba/spiders/tieba1.pyc -------------------------------------------------------------------------------- /0-Spider/tiebaSpider/spider2/tieba/tieba/spiders/tieba2.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | import scrapy 4 | import time 5 | from scrapy.http.request import Request 6 | from scrapy.http import HtmlResponse 7 | 8 | class TiebaSpider2(scrapy.Spider): 9 | 10 | name = 'tieba2' 11 | 12 | def __init__(self): 13 | self.urls = [] 14 | 15 | # 加载贴吧名 16 | fr = open("data/%s_all_href.txt" % time.strftime('%Y%m%d'), "r") 17 | 18 | for one in fr.readlines(): 19 | self.urls.append(one.strip()) 20 | fr.close() 21 | 22 | def start_requests(self): 23 | urls = self.urls 24 | 25 | for one in urls: 26 | yield scrapy.Request(url=one, callback=self.parse) 27 | 28 | def parse_uname(self, response): 29 | # response = HtmlResponse(url=page_url.url) 30 | sel = scrapy.Selector(response) 31 | name_list = sel.xpath('//li[re:test(@class, "d_name")]//a/text()').extract() 32 | # print respons 33 | fw = open("data/%s_all_name.txt" % time.strftime('%Y%m%d'), "a") 34 | for name in list(set(name_list)): 35 | fw.write(name.encode("utf-8")) 36 | fw.write("\n") 37 | fw.close() 38 | 39 | def parse(self, response): 40 | sel = scrapy.Selector(response) 41 | 42 | # 可能有些帖子被删除 43 | try: 44 | # 得到每个帖子有多少页 45 | num = int(sel.xpath('//span[re:test(@class,"red")]//text()').extract()[1]) 46 | # 遍历每页获得用户名 47 | for page_num in range(1, num + 1): 48 | one_url = response.url + "?pn=" + str(page_num) 49 | 50 | yield Request(url=one_url, 
callback=self.parse_uname) 51 | except Exception as e: 52 | pass 53 | 54 | 55 | 56 | 57 | -------------------------------------------------------------------------------- /0-Spider/tiebaSpider/spider2/tieba/tieba/spiders/tieba2.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/komeilkma/Machine-Learning-With-Python/8780eb4646292367915096bc18121e7c691a46dc/0-Spider/tiebaSpider/spider2/tieba/tieba/spiders/tieba2.pyc -------------------------------------------------------------------------------- /AdaBoost/AdaBoost.py: -------------------------------------------------------------------------------- 1 | #-*-coding:utf-8-*- 2 | ''' 3 | Created on 2016年5月10日 4 | 5 | @author: Gamer Think 6 | ''' 7 | from test.inspect_fodder import StupidGit 8 | 9 | __author__="thinkgamer" 10 | 11 | from numpy import * 12 | 13 | #加载数据集 14 | def loadSimData(): 15 | datMat = matrix([[1.0 , 2.1], 16 | [2. , 1.1], 17 | [1.3 , 1. ], 18 | [1. , 1. ], 19 | [2. , 1. ]]) 20 | 21 | classLabels = [1.0, 1.0, -1.0, -1.0, 1.0] 22 | return datMat,classLabels 23 | 24 | #单层决策树生成函数 25 | def stumpClassify(dataMatrix, dimen,threshVal, threshInsq): 26 | retArray = ones((shape(dataMatrix)[0],1)) 27 | if threshInsq == 'lt': 28 | retArray[dataMatrix[:,dimen] <= threshVal] = -1.0 29 | else: 30 | retArray[dataMatrix[:,dimen] > threshVal] = -1.0 31 | return retArray 32 | 33 | def buildStump(dataArr,classLabels,D): 34 | dataMatrix = mat(dataArr) 35 | #matrix必须是二维的,numpy可以是多维的 36 | labelMat = mat(classLabels).T #.T表示转置矩阵 37 | m,n = shape(dataMatrix) #给定数据集的行列数 38 | numSteps = 10.0 #变用于在特征的所有可能值上进行遍历 39 | bestStump = {} #字典用于存储给定权重向量0时所得到的最佳单层决策树的相关信息 40 | bestClassEnt = mat(zeros((m,1))) 41 | minError = inf #首先将minError初始化为正无穷大 42 | for i in range(n): 43 | rangeMin = dataMatrix[:,i].min() 44 | rangeMax = dataMatrix[:,i].max() 45 | stepSize = (rangeMax-rangeMin)/numSteps 46 | for j in range(-1,int(numSteps)+1): 47 | #lt :小于,lte,le:小于等于 48 | #gt:大于,,gte,ge:大于等于 49 | #eq:等于 ne,neq:不等于 50 | for inequal in ['lt','gt']: 51 | threshVal = (rangeMin + float(j) * stepSize) 52 | predictedVals = stumpClassify(dataMatrix,i,threshVal, inequal) 53 | errArr = mat(ones((m,1))) 54 | errArr[predictedVals==labelMat]=0 55 | weightedError = D.T * errArr #计算加权错误概率 56 | # print "split: dim %d, thresh % .2f, thresh inequal: %s, the weighted error is %.3f" % (i, threshVal,inequal,weightedError) 57 | #更新bestStump中保存的最佳单层决策树的相关信息 58 | if weightedError < minError: 59 | minError = weightedError 60 | bestClassEnt = predictedVals.copy() 61 | bestStump['dim'] = i 62 | bestStump['thresh'] = threshVal 63 | bestStump['ineq'] = inequal 64 | 65 | return bestStump,minError,bestClassEnt 66 | 67 | #基于单层决策树的AdaBoost训练过程 68 | #numIt:迭代次数,默认为40 69 | def adaBoostTrainDS(dataArr,classLabels,numIt=40): 70 | weakClassArr = [] 71 | m= shape(dataArr)[0] 72 | D = mat(ones((m,1))/m) 73 | aggClassEst = mat(zeros((m,1))) 74 | #迭代 75 | for i in range(numIt): 76 | #调用单层决策树 77 | bestStump,error,classEst = buildStump(dataArr, classLabels, D) 78 | print "D:",D.T #打印D的转置矩阵 79 | alpha = float(0.5 * log((1.0 - error) / max(error,1e-16)))# max(error,1e-16)))用于确保没有错误时,不会发生溢出 80 | bestStump['alpha'] = alpha 81 | weakClassArr.append(bestStump) 82 | print "classEst:",classEst.T 83 | #为下一次迭代计算D 84 | expon = multiply(-1 * alpha * mat(classLabels).T,classEst) 85 | D = multiply(D,exp(expon)) 86 | D = D /D.sum() 87 | #错误率累加计算 88 | aggClassEst += alpha* classEst 89 | print "aggClassEst:",aggClassEst.T 90 | aggErrors = multiply(sign(aggClassEst) 
!= mat(classLabels).T, ones((m,1))) 91 | errorRate = aggErrors.sum()/m 92 | print "total error:",errorRate 93 | #如果不发生错误,返回 94 | if errorRate == 0.0: 95 | break 96 | return weakClassArr 97 | 98 | 99 | #AdaBoost分类函数 100 | #输入参数为待分类样例datToClass和多个弱分类器classifierArr 101 | def adaClassify(datToClass,classifierArr): 102 | dataMatrix = mat(datToClass) 103 | m = shape(dataMatrix)[0] 104 | aggClassEst = mat(zeros((m,1))) 105 | for i in range(len(classifierArr)): 106 | classEst = stumpClassify(dataMatrix,classifierArr[i]['dim'],\ 107 | classifierArr[i]['thresh'],\ 108 | classifierArr[i]['ineq']) 109 | aggClassEst+= classifierArr[i]['alpha'] * classEst 110 | print aggClassEst 111 | return sign(aggClassEst) 112 | 113 | 114 | #main函数 115 | if __name__=="__main__": 116 | #加载数据集 117 | datMat,classLabels = loadSimData() 118 | # print "datMat:",datMat 119 | # print "classLabels:",classLabels 120 | 121 | #单层决策树生成函数 122 | # D = mat(ones((5,1))/5) 123 | # print buildStump(datMat, classLabels, D) 124 | 125 | #基于单层决策树的Adaboost训练过程 126 | classifierArray = adaBoostTrainDS(datMat, classLabels, 30) 127 | # for classifier in classifierArray: 128 | # print classifier 129 | 130 | #测试AdaBoost分类函数 131 | print "[0,0]:\n",adaClassify([0,0], classifierArray) 132 | print "\n\n[[5,5],[0,0]]:\n",adaClassify([[5,5],[0,0]], classifierArray) 133 | -------------------------------------------------------------------------------- /Apriori/Apriori.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/komeilkma/Machine-Learning-With-Python/8780eb4646292367915096bc18121e7c691a46dc/Apriori/Apriori.py -------------------------------------------------------------------------------- /Bayes/bayes.py: -------------------------------------------------------------------------------- 1 | #encoding:utf-8 2 | ''' 3 | Created on 2016年5月12日 4 | 5 | @author: Gamer Think 6 | ''' 7 | 8 | from numpy import * 9 | 10 | #词表到向量的转换函数 11 | def loadDataSet(): 12 | postingList = [['my','dog','has','flea','problems','help','please'], 13 | ['maybe','not','take','him','to','dog','park','stupid'], 14 | ['my','dalmation','is','so','cute','I','love','him'], 15 | ['stop','posting','stupid','worthless','garbage'], 16 | ['mr','licks','ate','my','steak','how','to','stop','him'], 17 | ['quit','buying','worthless','dog','food','stupid']] 18 | classVec = [0,1,0,1,0,1] #1,侮辱 0,正常 19 | return postingList,classVec 20 | 21 | def createVocabList(dataSet): 22 | vocabSet = set([]) #调用set方法,创建一个空集 23 | for document in dataSet: 24 | vocabSet = vocabSet | set(document) #创建两个集合的并集 25 | return list(vocabSet) 26 | 27 | def setOfWords2Vec(vocabList,inputSet): 28 | returnVec = [0]*len(vocabList) #创建一个所含元素都为0的向量 29 | for word in inputSet: 30 | if word in vocabList: 31 | returnVec[vocabList.index(word)] = 1 32 | else: 33 | print "the word:%s is not in my Vocabulary" % word 34 | return returnVec 35 | 36 | 37 | def bagOfWords2VecMN(vocabList,inputSet): 38 | returnVec = [0]*len(vocabList) #创建一个所含元素都为0的向量 39 | for word in inputSet: 40 | if word in vocabList: 41 | returnVec[vocabList.index(word)] += 1 42 | return returnVec 43 | 44 | 45 | #朴素贝叶斯分类器训练集 46 | def trainNB0(trainMatrix,trainCategory): #传入参数为文档矩阵,每篇文档类别标签所构成的向量 47 | numTrainDocs = len(trainMatrix) #文档矩阵的长度 48 | numWords = len(trainMatrix[0]) #第一个文档的单词个数 49 | pAbusive = sum(trainCategory)/float(numTrainDocs) #任意文档属于侮辱性文档概率 50 | #p0Num = zeros(numWords);p1Num = zeros(numWords) #初始化两个矩阵,长度为numWords,内容值为0 51 | p0Num = ones(numWords);p1Num = ones(numWords) 
#初始化两个矩阵,长度为numWords,内容值为1 52 | #p0Denom = 0.0;p1Denom = 0.0 #初始化概率 53 | p0Denom = 2.0;p1Denom = 2.0 54 | for i in range(numTrainDocs): 55 | if trainCategory[i]==1: 56 | p1Num +=trainMatrix[i] 57 | p1Denom += sum(trainMatrix[i]) 58 | else: 59 | p0Num +=trainMatrix[i] 60 | p0Denom += sum(trainMatrix[i]) 61 | #p1Vect = p1Num/p1Denom #对每个元素做除法 62 | #p0Vect = p0Num/p0Denom 63 | p1Vect = log(p1Num/p1Denom) 64 | p0Vect = log(p0Num/p0Denom) 65 | return p0Vect,p1Vect,pAbusive 66 | 67 | #朴素贝叶斯分类函数 68 | def classifyNB(vec2Classify,p0Vec,p1Vec,pClass1): 69 | p1 = sum(vec2Classify * p1Vec) + log(pClass1) #元素相乘 70 | p0 = sum(vec2Classify * p0Vec) + log(1.0 - pClass1) 71 | if p1>p0: 72 | return 1 73 | else: 74 | return 0 75 | 76 | def testingNB(): 77 | listOPosts,listClasses = loadDataSet() #产生文档矩阵和对应的标签 78 | myVocabList = createVocabList(listOPosts) #创建并集 79 | trainMat = [] #创建一个空的列表 80 | for postinDoc in listOPosts: 81 | trainMat.append(setOfWords2Vec(myVocabList,postinDoc)) #使用词向量来填充trainMat列表 82 | p0V,p1V,pAb = trainNB0(array(trainMat),array(listClasses)) #训练函数 83 | testEntry = ['love','my','dalmation'] #测试文档列表 84 | thisDoc = array(setOfWords2Vec(myVocabList,testEntry)) #声明矩阵 85 | print testEntry,'classified as:',classifyNB(thisDoc,p0V,p1V,pAb) 86 | testEntry = ['stupid','garbage'] 87 | thisDoc = array(setOfWords2Vec(myVocabList,testEntry)) #声明矩阵 88 | print testEntry,'classified as:',classifyNB(thisDoc,p0V,p1V,pAb) 89 | 90 | if __name__=="__main__": 91 | testingNB() -------------------------------------------------------------------------------- /Decision-Tree/DecisionTree-ID3.py: -------------------------------------------------------------------------------- 1 | #coding=utf-8 2 | ''' 3 | ''' 4 | from math import log 5 | import operator 6 | 7 | def createDataSet(): 8 | dataSet =[[1,1,'yes'], 9 | [1,1,'yes'], 10 | [1,0,'no'], 11 | [0,1,'no'], 12 | [0,1,'no']] 13 | labels = ['no surfacing','flippers'] #分类的属性 14 | return dataSet,labels 15 | 16 | #计算给定数据的香农熵 17 | def calcShannonEnt(dataSet): 18 | numEntries = len(dataSet) 19 | labelCounts = {} 20 | for featVec in dataSet: 21 | currentLabel = featVec[-1] #获得标签 22 | #构造存放标签的字典 23 | if currentLabel not in labelCounts.keys(): 24 | labelCounts[currentLabel]=0 25 | labelCounts[currentLabel]+=1 #对应的标签数目+1 26 | #计算香农熵 27 | shannonEnt = 0.0 28 | for key in labelCounts: 29 | prob = float(labelCounts[key])/numEntries 30 | shannonEnt -=prob*log(prob,2) 31 | return shannonEnt 32 | 33 | #划分数据集,三个参数为带划分的数据集,划分数据集的特征,特征的返回值 34 | def splitDataSet(dataSet,axis,value): 35 | retDataSet = [] 36 | for featVec in dataSet: 37 | if featVec[axis] ==value: 38 | #将相同数据集特征的抽取出来 39 | reducedFeatVec = featVec[:axis] 40 | reducedFeatVec.extend(featVec[axis+1:]) 41 | retDataSet.append(reducedFeatVec) 42 | return retDataSet #返回一个列表 43 | 44 | #选择最好的数据集划分方式 45 | def chooseBestFeatureToSplit(dataSet): 46 | numFeature = len(dataSet[0])-1 47 | baseEntropy = calcShannonEnt(dataSet) 48 | bestInfoGain = 0.0 49 | beatFeature = -1 50 | for i in range(numFeature): 51 | featureList = [example[i] for example in dataSet] #获取第i个特征所有的可能取值 52 | uniqueVals = set(featureList) #从列表中创建集合,得到不重复的所有可能取值ֵ 53 | newEntropy = 0.0 54 | for value in uniqueVals: 55 | subDataSet = splitDataSet(dataSet,i,value) #以i为数据集特征,value为返回值,划分数据集 56 | prob = len(subDataSet)/float(len(dataSet)) #数据集特征为i的所占的比例 57 | newEntropy +=prob * calcShannonEnt(subDataSet) #计算每种数据集的信息熵 58 | infoGain = baseEntropy- newEntropy 59 | #计算最好的信息增益,增益越大说明所占决策权越大 60 | if (infoGain > bestInfoGain): 61 | bestInfoGain = infoGain 62 | bestFeature = i 63 
| return bestFeature 64 | 65 | #递归构建决策树 66 | def majorityCnt(classList): 67 | classCount = {} 68 | for vote in classList: 69 | if vote not in classCount.keys(): 70 | classCount[vote]=0 71 | classCount[vote]+=1 72 | sortedClassCount = sorted(classCount.iteritems(),key =operator.itemgetter(1),reverse=True)#排序,True升序 73 | return sortedClassCount[0][0] #返回出现次数最多的 74 | 75 | #创建树的函数代码 76 | def createTree(dataSet,labels): 77 | classList = [example[-1] for example in dataSet] 78 | if classList.count(classList[0])==len(classList):#类别完全相同则停止划分 79 | return classList[0] 80 | if len(dataSet[0]) ==1: #遍历完所有特征值时返回出现次数最多的 81 | return majorityCnt(classList) 82 | bestFeat = chooseBestFeatureToSplit(dataSet) #选择最好的数据集划分方式 83 | bestFeatLabel = labels[bestFeat] #得到对应的标签值 84 | myTree = {bestFeatLabel:{}} 85 | del(labels[bestFeat]) #清空labels[bestFeat],在下一次使用时清零 86 | featValues = [example[bestFeat] for example in dataSet] 87 | uniqueVals = set(featValues) 88 | for value in uniqueVals: 89 | subLabels =labels[:] 90 | #递归调用创建决策树函数 91 | myTree[bestFeatLabel][value]=createTree(splitDataSet(dataSet,bestFeat,value),subLabels) 92 | return myTree 93 | 94 | if __name__=="__main__": 95 | dataSet,labels = createDataSet() 96 | print createTree(dataSet,labels) -------------------------------------------------------------------------------- /FP-growth/FP_Tree.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/komeilkma/Machine-Learning-With-Python/8780eb4646292367915096bc18121e7c691a46dc/FP-growth/FP_Tree.py -------------------------------------------------------------------------------- /FP-growth/newsClickStream.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/komeilkma/Machine-Learning-With-Python/8780eb4646292367915096bc18121e7c691a46dc/FP-growth/newsClickStream.py -------------------------------------------------------------------------------- /FP-growth/所用到dat文件下载地址.txt: -------------------------------------------------------------------------------- 1 | http://download.csdn.net/detail/gamer_gyt/9514873 -------------------------------------------------------------------------------- /K-means/kMeans.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/komeilkma/Machine-Learning-With-Python/8780eb4646292367915096bc18121e7c691a46dc/K-means/kMeans.py -------------------------------------------------------------------------------- /Logistic Regession/LogisticRegession.py: -------------------------------------------------------------------------------- 1 | #coding:utf-8 2 | ''' 3 | Created on 2016/4/24 4 | 5 | @author: Gamer Think 6 | ''' 7 | 8 | from numpy import * 9 | 10 | #加载数据集 11 | def loadDataSet(): 12 | dataMat = [] 13 | labelMat = [] 14 | fp = open("ex1.txt") 15 | for line in fp.readlines(): 16 | lineArr = line.strip().split() #分割 17 | dataMat.append([1.0,float(lineArr[0]), float(lineArr[1])]) 18 | labelMat.append( int(lineArr[2])) 19 | 20 | return dataMat,labelMat 21 | 22 | #定义Sigmoid函数 23 | def sigmoid(inX): 24 | return 1.0/(1+exp(-inX)) 25 | 26 | #梯度上升算法求解最佳回归系数 27 | def gradAscent(dataMatIn,classLabels): 28 | dataMatrix = mat(dataMatIn) #将数组转为矩阵 29 | labelMat = mat(classLabels).transpose() 30 | m,n = shape(dataMatrix) #返回矩阵的行和列 31 | alpha = 0.001 #初始化 alpha的值 32 | maxCycles = 500 #最大迭代次数 33 | weights = ones((n,1)) #初始化最佳回归系数 34 | for i in range(0,maxCycles): 35 | #引用原书的代码,求梯度 36 | h = sigmoid(dataMatrix*weights) 37 
| error = labelMat - h 38 | weights = weights + alpha * dataMatrix.transpose() * error 39 | 40 | return weights 41 | 42 | #随机梯度上升算法求回归系数 43 | def stocGradAscent0(dataMatrix,labelMat): 44 | dataMatrix = array(dataMatrix) 45 | m,n = shape(dataMatrix) 46 | alpha = 0.01 47 | weights = ones(n) 48 | for i in range(0,m): 49 | h = sigmoid(sum(dataMatrix[i]*weights)) 50 | error = labelMat[i] - h 51 | weights = weights + alpha * error * dataMatrix[i] 52 | 53 | return weights 54 | 55 | 56 | #改进版的随机梯度上升算法 57 | def stocGradAscent1(dataMatrix,labelMat,numIter=150): 58 | m,n = shape(dataMatrix) 59 | weights = ones(n) 60 | for i in range(0,numIter): 61 | dataIndex = range(m) 62 | for j in range(0,m): 63 | alpha = 4/(1.0+j+i)+0.01 64 | randIndex = int(random.uniform(0,len(dataIndex))) 65 | h = sigmoid(sum(dataMatrix[randIndex] * weights)) 66 | error = labelMat[randIndex] - h 67 | weights = weights + alpha * error * dataMatrix[randIndex] 68 | del(dataIndex[randIndex]) 69 | 70 | return weights 71 | 72 | #分析数据,画出决策边界 73 | def plotBestFit(wei,dataMatrix,labelMat): 74 | import matplotlib.pyplot as plt 75 | weights = wei #将矩阵wei转化为list 76 | dataArr = array(dataMatrix) #将矩阵转化为数组 77 | n = shape(dataMatrix)[0] 78 | xcord1 = [];ycord1=[] 79 | xcord2 = [];ycord2=[] 80 | 81 | for i in range(n): 82 | if int(labelMat[i])==1: 83 | xcord1.append(dataArr[i,1]) 84 | ycord1.append(dataArr[i,2]) 85 | else: 86 | xcord2.append(dataArr[i,1]) 87 | ycord2.append(dataArr[i,2]) 88 | 89 | fig = plt.figure() 90 | ax = fig.add_subplot(111) 91 | ax.scatter(xcord1,ycord1,s=30,c='red', marker='s') 92 | ax.scatter(xcord2,ycord2,s=30,c="green") 93 | x = arange(-3.0,3.0,0.1) 94 | y = (-weights[0]-weights[1] * x)/weights[2] 95 | ax.plot(x,y) 96 | plt.xlabel("x1") #X轴的标签 97 | plt.ylabel("x2") #Y轴的标签 98 | plt.show() 99 | 100 | 101 | 102 | if __name__=="__main__": 103 | dataMatrix,labelMat = loadDataSet() 104 | #梯度上升算法 105 | # weight = gradAscent(dataMatrix, labelMat) 106 | # print weight 107 | # plotBestFit(weight.getA(),dataMatrix,labelMat) 108 | 109 | #随机梯度上升算法 110 | # weight = stocGradAscent0(dataMatrix, labelMat) 111 | # print weight 112 | # plotBestFit(weight,dataMatrix,labelMat) 113 | 114 | #改进版的随机梯度上升算法 115 | weight = stocGradAscent1(array(dataMatrix), labelMat) 116 | print weight 117 | plotBestFit(weight,dataMatrix,labelMat) 118 | 119 | -------------------------------------------------------------------------------- /Logistic Regession/LogisticRegessionExample.py: -------------------------------------------------------------------------------- 1 | #coding:utf-8 2 | ''' 3 | Created on 2016/4/25 4 | 5 | @author: Gamer Think 6 | ''' 7 | import LogisticRegession as lr 8 | from numpy import * 9 | 10 | #二分类问题进行分类 11 | def classifyVector(inX,weights): 12 | prob = lr.sigmoid(sum(inX * weights)) 13 | if prob>0.5: 14 | return 1.0 15 | else: 16 | return 0.0 17 | 18 | #训练和测试 19 | def colicTest(): 20 | frTrain = open('horseColicTraining.txt'); frTest = open('horseColicTest.txt') 21 | trainingSet = []; trainingLabels = [] 22 | #训练回归模型 23 | for line in frTrain.readlines(): 24 | currLine = line.strip().split('\t') 25 | lineArr =[] 26 | for i in range(21): 27 | lineArr.append(float(currLine[i])) 28 | trainingSet.append(lineArr) 29 | trainingLabels.append(float(currLine[21])) 30 | trainWeights = lr.stocGradAscent1(array(trainingSet), trainingLabels, 1000) 31 | errorCount = 0; numTestVec = 0.0 32 | #测试回归模型 33 | for line in frTest.readlines(): 34 | numTestVec += 1.0 35 | currLine = line.strip().split('\t') 36 | lineArr =[] 37 | for i in range(21): 38 | 
lineArr.append(float(currLine[i])) 39 | if int(classifyVector(array(lineArr), trainWeights))!= int(currLine[21]): 40 | errorCount += 1 41 | errorRate = (float(errorCount)/numTestVec) 42 | print "the error rate of this test is: %f" % errorRate 43 | return errorRate 44 | 45 | def multiTest(): 46 | numTests = 10 47 | errorSum = 0.0 48 | for k in range(numTests): 49 | errorSum += colicTest() 50 | print "after %d iterations the average error rate is: %f" % (numTests,errorSum/float(numTests)) 51 | 52 | 53 | if __name__=="__main__": 54 | multiTest() 55 | -------------------------------------------------------------------------------- /Logistic Regession/ex1.txt: -------------------------------------------------------------------------------- 1 | -0.017612 14.053064 0 2 | -1.395634 4.662541 1 3 | -0.752157 6.538620 0 4 | -1.322371 7.152853 0 5 | 0.423363 11.054677 0 6 | 0.406704 7.067335 1 7 | 0.667394 12.741452 0 8 | -2.460150 6.866805 1 9 | 0.569411 9.548755 0 10 | -0.026632 10.427743 0 11 | 0.850433 6.920334 1 12 | 1.347183 13.175500 0 13 | 1.176813 3.167020 1 14 | -1.781871 9.097953 0 15 | -0.566606 5.749003 1 16 | 0.931635 1.589505 1 17 | -0.024205 6.151823 1 18 | -0.036453 2.690988 1 19 | -0.196949 0.444165 1 20 | 1.014459 5.754399 1 21 | 1.985298 3.230619 1 22 | -1.693453 -0.557540 1 23 | -0.576525 11.778922 0 24 | -0.346811 -1.678730 1 25 | -2.124484 2.672471 1 26 | 1.217916 9.597015 0 27 | -0.733928 9.098687 0 28 | -3.642001 -1.618087 1 29 | 0.315985 3.523953 1 30 | 1.416614 9.619232 0 31 | -0.386323 3.989286 1 32 | 0.556921 8.294984 1 33 | 1.224863 11.587360 0 34 | -1.347803 -2.406051 1 35 | 1.196604 4.951851 1 36 | 0.275221 9.543647 0 37 | 0.470575 9.332488 0 38 | -1.889567 9.542662 0 39 | -1.527893 12.150579 0 40 | -1.185247 11.309318 0 41 | -0.445678 3.297303 1 42 | 1.042222 6.105155 1 43 | -0.618787 10.320986 0 44 | 1.152083 0.548467 1 45 | 0.828534 2.676045 1 46 | -1.237728 10.549033 0 47 | -0.683565 -2.166125 1 48 | 0.229456 5.921938 1 49 | -0.959885 11.555336 0 50 | 0.492911 10.993324 0 51 | 0.184992 8.721488 0 52 | -0.355715 10.325976 0 53 | -0.397822 8.058397 0 54 | 0.824839 13.730343 0 55 | 1.507278 5.027866 1 56 | 0.099671 6.835839 1 57 | -0.344008 10.717485 0 58 | 1.785928 7.718645 1 59 | -0.918801 11.560217 0 60 | -0.364009 4.747300 1 61 | -0.841722 4.119083 1 62 | 0.490426 1.960539 1 63 | -0.007194 9.075792 0 64 | 0.356107 12.447863 0 65 | 0.342578 12.281162 0 66 | -0.810823 -1.466018 1 67 | 2.530777 6.476801 1 68 | 1.296683 11.607559 0 69 | 0.475487 12.040035 0 70 | -0.783277 11.009725 0 71 | 0.074798 11.023650 0 72 | -1.337472 0.468339 1 73 | -0.102781 13.763651 0 74 | -0.147324 2.874846 1 75 | 0.518389 9.887035 0 76 | 1.015399 7.571882 0 77 | -1.658086 -0.027255 1 78 | 1.319944 2.171228 1 79 | 2.056216 5.019981 1 80 | -0.851633 4.375691 1 81 | -1.510047 6.061992 0 82 | -1.076637 -3.181888 1 83 | 1.821096 10.283990 0 84 | 3.010150 8.401766 1 85 | -1.099458 1.688274 1 86 | -0.834872 -1.733869 1 87 | -0.846637 3.849075 1 88 | 1.400102 12.628781 0 89 | 1.752842 5.468166 1 90 | 0.078557 0.059736 1 91 | 0.089392 -0.715300 1 92 | 1.825662 12.693808 0 93 | 0.197445 9.744638 0 94 | 0.126117 0.922311 1 95 | -0.679797 1.220530 1 96 | 0.677983 2.556666 1 97 | 0.761349 10.693862 0 98 | -2.168791 0.143632 1 99 | 1.388610 9.341997 0 100 | 0.317029 14.739025 0 -------------------------------------------------------------------------------- /Logistic Regession/horseColicTest.txt: -------------------------------------------------------------------------------- 1 | 2 1 38.50 54 20 0 1 2 2 
3 4 1 2 2 5.90 0 2 42.00 6.30 0 0 1 2 | 2 1 37.60 48 36 0 0 1 1 0 3 0 0 0 0 0 0 44.00 6.30 1 5.00 1 3 | 1 1 37.7 44 28 0 4 3 2 5 4 4 1 1 0 3 5 45 70 3 2 1 4 | 1 1 37 56 24 3 1 4 2 4 4 3 1 1 0 0 0 35 61 3 2 0 5 | 2 1 38.00 42 12 3 0 3 1 1 0 1 0 0 0 0 2 37.00 5.80 0 0 1 6 | 1 1 0 60 40 3 0 1 1 0 4 0 3 2 0 0 5 42 72 0 0 1 7 | 2 1 38.40 80 60 3 2 2 1 3 2 1 2 2 0 1 1 54.00 6.90 0 0 1 8 | 2 1 37.80 48 12 2 1 2 1 3 0 1 2 0 0 2 0 48.00 7.30 1 0 1 9 | 2 1 37.90 45 36 3 3 3 2 2 3 1 2 1 0 3 0 33.00 5.70 3 0 1 10 | 2 1 39.00 84 12 3 1 5 1 2 4 2 1 2 7.00 0 4 62.00 5.90 2 2.20 0 11 | 2 1 38.20 60 24 3 1 3 2 3 3 2 3 3 0 4 4 53.00 7.50 2 1.40 1 12 | 1 1 0 140 0 0 0 4 2 5 4 4 1 1 0 0 5 30 69 0 0 0 13 | 1 1 37.90 120 60 3 3 3 1 5 4 4 2 2 7.50 4 5 52.00 6.60 3 1.80 0 14 | 2 1 38.00 72 36 1 1 3 1 3 0 2 2 1 0 3 5 38.00 6.80 2 2.00 1 15 | 2 9 38.00 92 28 1 1 2 1 1 3 2 3 0 7.20 0 0 37.00 6.10 1 1.10 1 16 | 1 1 38.30 66 30 2 3 1 1 2 4 3 3 2 8.50 4 5 37.00 6.00 0 0 1 17 | 2 1 37.50 48 24 3 1 1 1 2 1 0 1 1 0 3 2 43.00 6.00 1 2.80 1 18 | 1 1 37.50 88 20 2 3 3 1 4 3 3 0 0 0 0 0 35.00 6.40 1 0 0 19 | 2 9 0 150 60 4 4 4 2 5 4 4 0 0 0 0 0 0 0 0 0 0 20 | 1 1 39.7 100 30 0 0 6 2 4 4 3 1 0 0 4 5 65 75 0 0 0 21 | 1 1 38.30 80 0 3 3 4 2 5 4 3 2 1 0 4 4 45.00 7.50 2 4.60 1 22 | 2 1 37.50 40 32 3 1 3 1 3 2 3 2 1 0 0 5 32.00 6.40 1 1.10 1 23 | 1 1 38.40 84 30 3 1 5 2 4 3 3 2 3 6.50 4 4 47.00 7.50 3 0 0 24 | 1 1 38.10 84 44 4 0 4 2 5 3 1 1 3 5.00 0 4 60.00 6.80 0 5.70 0 25 | 2 1 38.70 52 0 1 1 1 1 1 3 1 0 0 0 1 3 4.00 74.00 0 0 1 26 | 2 1 38.10 44 40 2 1 3 1 3 3 1 0 0 0 1 3 35.00 6.80 0 0 1 27 | 2 1 38.4 52 20 2 1 3 1 1 3 2 2 1 0 3 5 41 63 1 1 1 28 | 1 1 38.20 60 0 1 0 3 1 2 1 1 1 1 0 4 4 43.00 6.20 2 3.90 1 29 | 2 1 37.70 40 18 1 1 1 0 3 2 1 1 1 0 3 3 36.00 3.50 0 0 1 30 | 1 1 39.1 60 10 0 1 1 0 2 3 0 0 0 0 4 4 0 0 0 0 1 31 | 2 1 37.80 48 16 1 1 1 1 0 1 1 2 1 0 4 3 43.00 7.50 0 0 1 32 | 1 1 39.00 120 0 4 3 5 2 2 4 3 2 3 8.00 0 0 65.00 8.20 3 4.60 1 33 | 1 1 38.20 76 0 2 3 2 1 5 3 3 1 2 6.00 1 5 35.00 6.50 2 0.90 1 34 | 2 1 38.30 88 0 0 0 6 0 0 0 0 0 0 0 0 0 0 0 0 0 0 35 | 1 1 38.00 80 30 3 3 3 1 0 0 0 0 0 6.00 0 0 48.00 8.30 0 4.30 1 36 | 1 1 0 0 0 3 1 1 1 2 3 3 1 3 6.00 4 4 0 0 2 0 0 37 | 1 1 37.60 40 0 1 1 1 1 1 1 1 0 0 0 1 1 0 0 2 2.10 1 38 | 2 1 37.50 44 0 1 1 1 1 3 3 2 0 0 0 0 0 45.00 5.80 2 1.40 1 39 | 2 1 38.2 42 16 1 1 3 1 1 3 1 0 0 0 1 0 35 60 1 1 1 40 | 2 1 38 56 44 3 3 3 0 0 1 1 2 1 0 4 0 47 70 2 1 1 41 | 2 1 38.30 45 20 3 3 2 2 2 4 1 2 0 0 4 0 0 0 0 0 1 42 | 1 1 0 48 96 1 1 3 1 0 4 1 2 1 0 1 4 42.00 8.00 1 0 1 43 | 1 1 37.70 55 28 2 1 2 1 2 3 3 0 3 5.00 4 5 0 0 0 0 1 44 | 2 1 36.00 100 20 4 3 6 2 2 4 3 1 1 0 4 5 74.00 5.70 2 2.50 0 45 | 1 1 37.10 60 20 2 0 4 1 3 0 3 0 2 5.00 3 4 64.00 8.50 2 0 1 46 | 2 1 37.10 114 40 3 0 3 2 2 2 1 0 0 0 0 3 32.00 0 3 6.50 1 47 | 1 1 38.1 72 30 3 3 3 1 4 4 3 2 1 0 3 5 37 56 3 1 1 48 | 1 1 37.00 44 12 3 1 1 2 1 1 1 0 0 0 4 2 40.00 6.70 3 8.00 1 49 | 1 1 38.6 48 20 3 1 1 1 4 3 1 0 0 0 3 0 37 75 0 0 1 50 | 1 1 0 82 72 3 1 4 1 2 3 3 0 3 0 4 4 53 65 3 2 0 51 | 1 9 38.20 78 60 4 4 6 0 3 3 3 0 0 0 1 0 59.00 5.80 3 3.10 0 52 | 2 1 37.8 60 16 1 1 3 1 2 3 2 1 2 0 3 0 41 73 0 0 0 53 | 1 1 38.7 34 30 2 0 3 1 2 3 0 0 0 0 0 0 33 69 0 2 0 54 | 1 1 0 36 12 1 1 1 1 1 2 1 1 1 0 1 5 44.00 0 0 0 1 55 | 2 1 38.30 44 60 0 0 1 1 0 0 0 0 0 0 0 0 6.40 36.00 0 0 1 56 | 2 1 37.40 54 18 3 0 1 1 3 4 3 2 2 0 4 5 30.00 7.10 2 0 1 57 | 1 1 0 0 0 4 3 0 2 2 4 1 0 0 0 0 0 54 76 3 2 1 58 | 1 1 36.6 48 16 3 1 3 1 4 1 1 1 1 0 0 0 27 56 0 0 0 59 | 1 1 38.5 90 0 1 1 3 1 3 3 3 2 3 2 4 5 47 79 0 0 1 60 | 1 1 0 75 12 1 1 4 1 
5 3 3 0 3 5.80 0 0 58.00 8.50 1 0 1 61 | 2 1 38.20 42 0 3 1 1 1 1 1 2 2 1 0 3 2 35.00 5.90 2 0 1 62 | 1 9 38.20 78 60 4 4 6 0 3 3 3 0 0 0 1 0 59.00 5.80 3 3.10 0 63 | 2 1 38.60 60 30 1 1 3 1 4 2 2 1 1 0 0 0 40.00 6.00 1 0 1 64 | 2 1 37.80 42 40 1 1 1 1 1 3 1 0 0 0 3 3 36.00 6.20 0 0 1 65 | 1 1 38 60 12 1 1 2 1 2 1 1 1 1 0 1 4 44 65 3 2 0 66 | 2 1 38.00 42 12 3 0 3 1 1 1 1 0 0 0 0 1 37.00 5.80 0 0 1 67 | 2 1 37.60 88 36 3 1 1 1 3 3 2 1 3 1.50 0 0 44.00 6.00 0 0 0 -------------------------------------------------------------------------------- /PCA/PCA.py: -------------------------------------------------------------------------------- 1 | #-*-coding:utf8-*- 2 | ''' 3 | Created on 2016-5-15 4 | 5 | @author: thinkgamer 6 | ''' 7 | from numpy import * 8 | 9 | def loadDataSet(filename,delim = "\t"): 10 | fr = open(filename) 11 | stringArr = [line.strip().split(delim) for line in fr.readlines()] 12 | datArr = [map(float, line) for line in stringArr] 13 | return mat(datArr) 14 | 15 | #dataMat对应数据集,N个特征 16 | def pca(dataMat, topNfeat=9999999): 17 | meanVals = mean(dataMat, axis = 0) #求平均值 18 | meanRemoved = dataMat - meanVals #去平均值 19 | covMat = cov(meanRemoved,rowvar=0) #计算协防差矩阵 20 | eigVals, eigVects = linalg.eig(mat(covMat)) 21 | eigValInd = argsort(eigVals) 22 | #从小到大对N个值排序 23 | eigValInd = eigValInd[: -(topNfeat + 1) : -1] 24 | redEigVects = eigVects[:, eigValInd] 25 | #将数据转换到新空间 26 | lowDDataMat = meanRemoved * redEigVects 27 | reconMat = (lowDDataMat * redEigVects.T) + meanVals 28 | return lowDDataMat, reconMat 29 | 30 | #测试 31 | dataMat = loadDataSet("testSet.txt") 32 | lowDMat, reconMat = pca(dataMat,1) 33 | print shape(lowDMat) 34 | 35 | ''' 36 | #show 37 | import matplotlib 38 | import matplotlib.pyplot as plt 39 | fig = plt.figure() 40 | ax = fig.add_subplot(111) 41 | ax.scatter(dataMat[:,0].flatten().A[0], dataMat[:,1].flatten().A[0], marker='^', s = 90 ) 42 | ax.scatter(reconMat[:,0].flatten().A[0], reconMat[:,1].flatten().A[0],marker='o', s = 50 , c ='red' ) 43 | plt.show() 44 | ''' 45 | 46 | #将NaN替换成平均值函数 47 | def replaceNanWithMean(): 48 | datMat = loadDataSet('secom.data', ' ') 49 | numFeat = shape(datMat)[1] 50 | for i in range(numFeat): 51 | meanVal = mean(datMat[nonzero(~isnan(datMat[:,i].A))[0],i]) #values that are not NaN (a number) 52 | datMat[nonzero(isnan(datMat[:,i].A))[0],i] = meanVal #set NaN values to mean 53 | return datMat 54 | 55 | #加载数据 56 | dataMat = replaceNanWithMean() 57 | #去除均值 58 | meanVals = mean(dataMat, axis=0) 59 | meanRemoved = dataMat - meanVals 60 | #计算协方差 61 | covMat = cov(meanRemoved, rowvar=0) 62 | 63 | #特征值分析 64 | eigVals, eigVects = linalg.eig(mat(covMat)) 65 | print eigVals -------------------------------------------------------------------------------- /PCA/testSet.txt: -------------------------------------------------------------------------------- 1 | 10.235186 11.321997 2 | 10.122339 11.810993 3 | 9.190236 8.904943 4 | 9.306371 9.847394 5 | 8.330131 8.340352 6 | 10.152785 10.123532 7 | 10.408540 10.821986 8 | 9.003615 10.039206 9 | 9.534872 10.096991 10 | 9.498181 10.825446 11 | 9.875271 9.233426 12 | 10.362276 9.376892 13 | 10.191204 11.250851 14 | 7.720499 6.476300 15 | 9.334379 8.471268 16 | 7.963186 6.731333 17 | 8.244973 9.013785 18 | 9.569196 10.568949 19 | 8.854793 9.076536 20 | 9.382171 7.240862 21 | 8.179055 8.944502 22 | 8.267896 8.797017 23 | 9.047165 8.725068 24 | 8.741043 7.901385 25 | 7.190216 7.804587 26 | 8.081227 9.314431 27 | 8.047460 5.720780 28 | 7.917584 7.543254 29 | 8.676942 10.102220 30 | 9.210251 9.424717 31 | 
7.732998 9.840202 32 | 7.681754 8.609897 33 | 7.925863 10.079159 34 | 8.261509 8.242080 35 | 8.514872 7.527561 36 | 10.324450 10.804481 37 | 7.856710 7.931543 38 | 7.858608 7.995340 39 | 9.196808 6.577598 40 | 9.644415 10.935081 41 | 9.579833 9.085021 42 | 7.888484 5.976428 43 | 9.072624 9.703344 44 | 8.914184 9.298515 45 | 7.822869 7.086663 46 | 10.538554 11.061464 47 | 8.280187 8.709012 48 | 8.884223 8.670105 49 | 9.359927 10.575055 50 | 9.078611 9.710833 51 | 7.935134 8.586173 52 | 8.805945 10.575145 53 | 9.584316 9.614076 54 | 11.269714 11.717254 55 | 9.120444 9.019774 56 | 7.977520 8.313923 57 | 8.104648 9.456128 58 | 8.617126 7.331723 59 | 9.033811 9.469706 60 | 8.327680 5.122092 61 | 8.532272 10.100909 62 | 9.295434 8.933824 63 | 9.905202 9.027559 64 | 10.585764 10.912733 65 | 10.427584 11.532578 66 | 9.072767 9.960144 67 | 9.164275 8.645121 68 | 9.746058 10.717080 69 | 9.286072 9.340024 70 | 8.188233 7.432415 71 | 7.948598 8.445419 72 | 7.563350 5.656178 73 | 8.972405 8.801869 74 | 9.980868 8.788996 75 | 7.753490 7.714248 76 | 7.431143 9.032819 77 | 8.943403 8.359354 78 | 10.481890 9.988969 79 | 9.150454 10.278760 80 | 8.123894 9.060351 81 | 8.626164 8.469342 82 | 7.354185 7.631252 83 | 11.323046 11.015032 84 | 8.190008 6.860792 85 | 8.412598 7.661358 86 | 9.258404 8.580382 87 | 11.007915 11.443881 88 | 8.279403 8.347003 89 | 8.931149 10.105221 90 | 10.239245 10.077473 91 | 8.129346 7.096877 92 | 8.485823 9.373561 93 | 10.703640 11.651618 94 | 9.500728 8.150228 95 | 9.712414 9.910445 96 | 9.333374 9.407557 97 | 8.787865 10.168021 98 | 9.238180 10.253478 99 | 9.577388 8.895150 100 | 10.447753 10.318227 101 | 9.303944 9.223136 102 | 9.883268 11.662945 103 | 9.471921 10.443792 104 | 10.007753 9.579912 105 | 8.110298 7.106263 106 | 6.964069 6.585040 107 | 10.413499 9.649309 108 | 8.032629 7.053254 109 | 8.015549 9.166753 110 | 10.462924 8.656612 111 | 9.530788 10.134130 112 | 9.202658 9.314222 113 | 10.103241 10.235159 114 | 7.849264 6.624856 115 | 9.059071 7.992555 116 | 10.172889 10.724789 117 | 9.528439 6.420990 118 | 7.190422 6.789792 119 | 9.085716 9.846328 120 | 9.452887 8.735386 121 | 7.417322 7.348594 122 | 8.468639 8.715086 123 | 8.303642 9.463231 124 | 9.939052 10.026771 125 | 8.701989 7.516978 126 | 9.737541 10.587281 127 | 8.280233 7.852444 128 | 10.648386 10.259203 129 | 9.173893 10.520372 130 | 9.135397 10.751406 131 | 7.594580 8.488833 132 | 8.587520 8.463406 133 | 8.581887 7.888644 134 | 9.448768 8.707422 135 | 7.882664 7.772030 136 | 10.050635 9.859720 137 | 9.012078 9.533899 138 | 8.770020 8.882996 139 | 9.428804 9.446306 140 | 8.504209 8.319693 141 | 9.800003 10.964667 142 | 8.069660 7.683099 143 | 10.012217 10.320644 144 | 8.704677 8.918146 145 | 8.198722 7.297786 146 | 9.868322 9.901657 147 | 9.426997 11.480353 148 | 9.228767 9.262976 149 | 8.952359 9.528471 150 | 8.186847 8.600587 151 | 9.026371 8.705143 152 | 9.483364 9.807079 153 | 7.826587 7.975401 154 | 11.197846 10.959298 155 | 7.632421 8.769745 156 | 8.761605 8.309365 157 | 9.353670 8.728758 158 | 6.466637 6.038996 159 | 8.370634 9.178830 160 | 10.337451 11.075600 161 | 8.917679 8.288367 162 | 9.076621 8.487626 163 | 7.278948 4.634097 164 | 10.153017 11.219183 165 | 7.132603 5.853118 166 | 9.338644 9.805940 167 | 9.878602 9.187000 168 | 10.009505 10.924505 169 | 9.384438 10.691860 170 | 7.535322 8.160481 171 | 6.808732 8.268469 172 | 8.302965 8.075009 173 | 8.345379 8.305356 174 | 9.517530 8.249839 175 | 9.267825 9.999109 176 | 10.291511 11.032664 177 | 8.605909 8.705207 178 | 8.331145 7.812295 179 | 
8.632412 10.574287 180 | 8.766397 8.712107 181 | 9.407070 9.732756 182 | 9.709495 9.729569 183 | 10.422201 11.070360 184 | 6.831495 6.466763 185 | 8.187122 8.405929 186 | 8.523093 9.041844 187 | 7.952394 6.801220 188 | 10.490780 10.001468 189 | 10.813791 9.802494 190 | 7.861113 7.541475 191 | 8.800399 8.738974 192 | 7.542152 6.612838 193 | 9.446981 9.378659 194 | 8.281684 7.358572 195 | 8.473801 8.208343 196 | 11.736767 11.022029 197 | 8.379578 8.714348 198 | 8.313718 8.832381 199 | 9.342589 10.416659 200 | 7.560710 6.889648 201 | 9.295344 9.739040 202 | 9.176612 9.718781 203 | 8.614385 10.150521 204 | 9.079373 8.839794 205 | 10.333289 10.921255 206 | 9.453502 7.335134 207 | 10.174590 10.292500 208 | 9.693713 9.793636 209 | 7.474925 7.751391 210 | 10.107905 10.156997 211 | 9.257241 7.854266 212 | 10.209794 11.410157 213 | 7.248050 6.433676 214 | 10.150091 9.288597 215 | 10.077713 10.321500 216 | 8.191122 8.931519 217 | 8.791469 10.287216 218 | 9.229434 9.095193 219 | 8.682571 8.546005 220 | 7.524099 7.709751 221 | 8.442410 8.326037 222 | 9.364851 9.095989 223 | 9.061222 7.557899 224 | 7.989999 8.555363 225 | 8.801275 8.868732 226 | 10.351932 9.497796 227 | 10.230710 10.496151 228 | 9.783163 9.891408 229 | 10.651481 9.431617 230 | 8.387393 6.400507 231 | 9.003921 7.050003 232 | 8.483723 8.314886 233 | 9.020501 7.545771 234 | 9.329105 11.095661 235 | 9.583687 9.271929 236 | 8.908705 8.407529 237 | 8.835406 8.083517 238 | 9.736362 8.296735 239 | 10.030302 9.737178 240 | 8.287142 6.993460 241 | 9.173211 9.306335 242 | 9.026355 9.696531 243 | 9.128391 9.921247 244 | 11.486346 12.910777 245 | 11.519458 11.472111 246 | 9.027707 10.263974 247 | 9.351935 8.542200 248 | 9.421701 11.403201 249 | 9.005687 8.100969 250 | 7.015279 6.614278 251 | 8.213607 8.340948 252 | 8.226646 8.718997 253 | 8.144753 8.366877 254 | 10.133642 12.790169 255 | 10.763481 10.847016 256 | 10.003622 10.337716 257 | 9.007955 9.792482 258 | 8.670506 10.782931 259 | 10.386414 9.956162 260 | 10.104761 10.123044 261 | 8.079502 8.304075 262 | 9.945424 11.855409 263 | 8.642497 9.998066 264 | 9.349722 8.690328 265 | 9.034991 8.826490 266 | 8.738746 7.518464 267 | 8.919532 9.740312 268 | 9.464136 10.444588 269 | 10.710057 12.666857 270 | 10.042007 10.532091 271 | 8.447996 7.426363 272 | 9.509351 9.030516 273 | 11.946359 10.553075 274 | 9.981617 9.912651 275 | 9.853876 9.632967 276 | 10.560648 11.881714 277 | 8.370952 9.989491 278 | 8.323209 10.102529 279 | 9.828359 11.702462 280 | 8.515623 8.426754 281 | 9.004363 9.628036 282 | 10.529847 10.458031 283 | 10.028765 10.624880 284 | 9.448114 9.313227 285 | 8.332617 7.382295 286 | 8.323006 8.276608 287 | 7.740771 8.799750 288 | 8.379615 8.146192 289 | 8.340764 9.184458 290 | 9.863614 8.254694 291 | 9.969563 9.405134 292 | 9.164394 9.182127 293 | 10.622098 9.722592 294 | 9.592072 10.029446 295 | 8.212027 7.477366 296 | 9.080225 8.244448 297 | 8.555774 7.842325 298 | 9.958046 9.696221 299 | 8.972573 9.797128 300 | 9.213223 7.128437 301 | 8.737239 9.385138 302 | 10.333907 10.994856 303 | 8.797511 8.643075 304 | 11.044848 9.623160 305 | 8.539260 9.097113 306 | 11.582163 11.884333 307 | 7.863848 7.176199 308 | 6.218103 5.283562 309 | 9.120602 7.250190 310 | 9.001166 9.635203 311 | 8.081476 8.844224 312 | 9.369802 8.230911 313 | 8.768925 8.666987 314 | 9.841098 8.543896 315 | 10.451522 9.549511 316 | 9.755402 9.117522 317 | 7.988961 6.869854 318 | 8.872507 9.787118 319 | 10.363980 10.716608 320 | 6.315671 5.765953 321 | 9.638879 9.202355 322 | 8.588126 8.037966 323 | 8.947408 9.144386 324 | 
9.051130 7.195132 325 | 9.321709 8.380668 326 | 10.146531 9.754745 327 | 9.843373 8.891437 328 | 9.213148 11.700632 329 | 7.630078 7.294753 330 | 8.093088 7.967590 331 | 7.488915 6.090652 332 | 8.126036 8.586472 333 | 8.760350 7.268987 334 | 10.201347 9.141013 335 | 7.838208 7.307700 336 | 6.155653 5.563997 337 | 7.767841 6.254528 338 | 8.425656 8.615832 339 | 10.362168 10.886815 340 | 10.180024 10.378934 341 | 9.794665 10.047812 342 | 9.970394 9.668279 343 | 7.030217 7.060471 344 | 9.275414 9.095738 345 | 10.314911 10.456539 346 | 9.259774 8.204851 347 | 10.023919 9.558307 348 | 8.887540 9.866704 349 | 9.851608 9.410989 350 | 8.710882 7.268012 351 | 9.017007 10.217673 352 | 7.976369 9.000979 353 | 8.738332 8.664734 354 | 8.344510 8.977600 355 | 8.959613 12.324240 356 | 9.169982 8.624635 357 | 7.487451 8.154859 358 | 8.706316 7.719455 359 | 9.564832 8.940403 360 | 8.327775 9.044509 361 | 9.734032 10.195255 362 | 8.021343 6.445092 363 | 9.081048 11.024397 364 | 7.626651 6.549263 365 | 10.725858 8.575374 366 | 8.731381 8.307788 367 | 10.394237 10.596874 368 | 7.029311 7.658832 369 | 9.517907 7.509904 370 | 10.394064 10.060898 371 | 10.752500 9.431601 372 | 9.692431 10.332130 373 | 9.651897 7.876862 374 | 8.592329 10.096837 375 | 10.212801 10.827496 376 | 9.045043 9.265524 377 | 8.901643 8.036115 378 | 10.794525 9.318830 379 | 11.040915 12.021746 380 | 8.390836 9.672469 381 | 9.840166 11.226568 382 | 10.806810 12.205633 383 | 8.924285 10.934056 384 | 8.411251 8.289672 385 | 7.808891 9.663290 386 | 9.733437 8.486958 387 | 8.300026 7.477374 388 | 8.221756 10.278308 389 | 9.096867 9.619677 390 | 9.410116 9.289188 391 | 10.097176 9.768470 392 | 9.387954 8.844855 393 | 9.376134 7.704630 394 | 8.231599 9.101203 395 | 9.910738 10.694855 396 | 8.645689 7.764589 397 | 8.090245 7.109596 398 | 9.253483 9.813672 399 | 9.331546 8.039386 400 | 9.843256 10.208792 401 | 9.713131 9.247665 402 | 9.259369 10.704622 403 | 10.243948 9.695883 404 | 6.396262 6.456390 405 | 8.936289 8.703871 406 | 8.750846 9.347273 407 | 6.497155 4.130251 408 | 9.516552 10.164848 409 | 9.125766 8.858775 410 | 8.374387 7.300114 411 | 8.132816 7.621107 412 | 10.099505 9.159134 413 | 9.356477 6.869999 414 | 8.112934 7.587547 415 | 7.265396 6.987031 416 | 11.950505 13.715109 417 | 10.745959 10.822171 418 | 8.893270 7.887332 419 | 6.003473 4.960219 420 | 7.498851 6.451334 421 | 10.162072 9.935954 422 | 8.732617 9.177679 423 | 9.300827 9.952360 424 | 11.908436 12.256801 425 | 9.371215 9.188645 426 | 9.943640 9.245037 427 | 7.386450 7.046819 428 | 8.410374 8.293218 429 | 7.830419 6.440253 430 | 8.263140 8.279446 431 | 11.448164 12.192363 432 | 8.216533 9.186628 433 | 9.316128 10.046697 434 | 8.156927 6.834792 435 | 9.951421 11.240598 436 | 9.059607 8.458446 437 | 10.476339 10.560461 438 | 7.548200 7.227127 439 | 9.432204 7.236705 440 | 9.402750 9.126413 441 | 11.188095 13.853426 442 | 9.520201 11.028131 443 | 8.884154 9.764071 444 | 8.961105 8.833117 445 | 8.549663 8.865765 446 | 10.111708 10.515462 447 | 9.024761 9.169368 448 | 7.904149 8.048756 449 | 9.240995 7.796142 450 | 8.126538 6.116125 451 | 7.442148 7.931335 452 | 9.486821 10.091359 453 | 9.834289 11.694720 454 | 9.009714 11.599170 455 | 9.761314 11.344083 456 | 6.993941 6.562988 457 | 8.659524 8.410107 458 | 7.685363 8.097297 459 | 7.793217 6.519109 460 | 8.883454 9.257347 461 | 8.781821 9.231980 462 | 7.946281 7.658978 463 | 8.523959 10.646480 464 | 9.031525 8.649648 465 | 8.317140 7.758978 466 | 9.192417 11.151218 467 | 8.408486 8.282182 468 | 10.327702 11.459048 469 | 8.389687 
8.548727 470 | 8.642250 7.056870 471 | 8.833447 9.267638 472 | 8.805261 8.320281 473 | 9.726211 9.095997 474 | 8.477631 9.507530 475 | 9.738838 9.652110 476 | 8.272108 7.582696 477 | 9.258089 8.495931 478 | 8.334144 8.810766 479 | 8.150904 6.486032 480 | 7.259669 7.270156 481 | 11.034180 11.519954 482 | 10.705432 10.642527 483 | 8.388814 7.159137 484 | 8.559369 7.846284 485 | 7.187988 6.519313 486 | 8.811453 7.765900 487 | 8.492762 7.992941 488 | 8.739752 8.502909 489 | 10.150752 10.420295 490 | 7.062378 5.365289 491 | 8.448195 7.480000 492 | 10.224333 11.592750 493 | 9.533795 9.212845 494 | 9.519492 7.690501 495 | 9.661847 10.376189 496 | 7.963877 8.597193 497 | 10.184486 9.136709 498 | 8.505234 9.159210 499 | 8.187646 8.518690 500 | 9.167590 9.405917 501 | 8.612162 8.518755 502 | 10.970868 10.392229 503 | 9.603649 9.141095 504 | 9.704263 8.830178 505 | 9.657506 8.132449 506 | 9.337882 11.045306 507 | 9.521722 9.537764 508 | 8.954197 8.728179 509 | 8.635658 10.352662 510 | 8.910816 9.020317 511 | 9.900933 9.392002 512 | 10.247105 8.289649 513 | 9.571690 8.171237 514 | 7.388627 7.668071 515 | 8.354008 10.074590 516 | 9.775598 8.835696 517 | 8.768913 7.983604 518 | 8.330199 8.474098 519 | 8.169356 9.361172 520 | 10.346522 10.086434 521 | 7.976144 9.266702 522 | 8.429648 7.865824 523 | 11.261674 11.788587 524 | 10.051066 10.112425 525 | 8.954626 9.789343 526 | 8.382220 8.121012 527 | 9.820642 9.426441 528 | 8.125950 9.695087 529 | 8.646465 7.291808 530 | 8.190202 8.003737 531 | 8.773887 7.306175 532 | 8.731000 10.300436 533 | 9.163098 7.816769 534 | 9.456346 9.223922 535 | 9.645180 9.324053 536 | 8.835060 8.966915 537 | 9.325950 10.943248 538 | 9.941912 9.548535 539 | 9.282799 10.119488 540 | 9.567591 9.462164 541 | 8.529019 9.768001 542 | 9.314824 10.153727 543 | 8.264439 8.273860 544 | 8.307262 8.214036 545 | 9.122041 8.657861 546 | 8.404258 8.389365 547 | 7.828355 8.419433 548 | 9.803180 10.108286 549 | 8.662439 8.581953 550 | 8.883265 8.978377 551 | 8.012330 8.262451 552 | 9.420258 8.974878 553 | 7.015415 6.365940 554 | 9.888832 11.163036 555 | 9.677549 10.346431 556 | 8.410158 7.912899 557 | 9.464147 10.762900 558 | 7.067227 7.035717 559 | 9.320923 10.583089 560 | 9.056917 8.771241 561 | 8.110004 8.387789 562 | 10.310021 10.970014 563 | 8.211185 8.809627 564 | 8.942883 8.840746 565 | 9.479958 8.328700 566 | 8.973982 8.702291 567 | 8.519257 8.764855 568 | 9.424556 8.956911 569 | 7.222919 8.177787 570 | 8.257007 9.700619 571 | 9.778795 9.296134 572 | 8.028806 8.575974 573 | 9.886464 9.965076 574 | 9.090552 6.978930 575 | 9.605548 10.256751 576 | 9.959004 9.610229 577 | 8.308701 9.509124 578 | 7.748293 9.685933 579 | 8.311108 9.428114 580 | 9.697068 10.217956 581 | 9.582991 9.478773 582 | 9.167265 10.198412 583 | 10.329753 10.406602 584 | 8.908819 7.428789 585 | 10.072908 10.393294 586 | 7.992905 9.226629 587 | 8.907696 7.269366 588 | 8.421948 9.342968 589 | 7.481399 7.225033 590 | 10.358408 10.166130 591 | 8.786556 10.279943 592 | 9.658701 11.379367 593 | 10.167807 9.417552 594 | 8.653449 8.656681 595 | 8.020304 8.671270 596 | 8.364348 10.004068 597 | 9.119183 9.788199 598 | 8.405504 9.740580 599 | 11.020930 11.904350 600 | 9.755232 9.515713 601 | 10.059542 9.589748 602 | 8.727131 9.777998 603 | 7.666182 6.028642 604 | 8.870733 8.367501 605 | 9.340446 7.707269 606 | 9.919283 10.796813 607 | 7.905837 8.326034 608 | 10.181187 10.089865 609 | 8.797328 8.981988 610 | 8.466272 7.765032 611 | 10.335914 12.620539 612 | 9.365003 8.609115 613 | 8.011017 7.249489 614 | 10.923993 13.901513 615 | 
7.074631 7.558720 616 | 9.824598 8.851297 617 | 8.861026 8.370857 618 | 10.127296 10.861535 619 | 10.548377 10.855695 620 | 8.880470 7.948761 621 | 8.901619 9.674705 622 | 7.813710 9.246912 623 | 10.128808 10.560668 624 | 11.096699 10.911644 625 | 8.551471 6.871514 626 | 8.907241 8.677815 627 | 10.571647 10.294838 628 | 8.815314 8.810725 629 | 8.453396 8.339296 630 | 9.594819 11.487580 631 | 10.714211 9.628908 632 | 7.428788 7.712869 633 | 10.892119 12.747752 634 | 9.024071 11.112692 635 | 7.803375 7.847038 636 | 8.521558 8.881848 637 | 9.742818 11.520203 638 | 9.832836 9.180396 639 | 8.703132 10.028498 640 | 9.905029 11.347606 641 | 10.037536 8.882688 642 | 8.629995 8.392863 643 | 9.583497 9.219663 644 | 8.781687 9.650598 645 | 9.344119 9.537024 646 | 10.407510 9.223929 647 | 7.244488 6.559021 648 | 10.643616 10.288383 649 | 8.757557 6.947901 650 | 10.784590 11.233350 651 | 10.028427 11.330033 652 | 7.968361 6.830308 653 | 8.925954 8.539113 654 | 7.738692 7.114987 655 | 8.192398 8.352016 656 | 10.412017 12.431122 657 | 8.208801 5.777678 658 | 7.820077 7.790720 659 | 9.542754 11.542541 660 | 6.817938 7.429229 661 | 7.365218 7.956797 662 | 9.274391 7.932700 663 | 9.546475 8.803412 664 | 7.471734 6.797870 665 | 8.016969 7.848070 666 | 8.852701 8.458114 667 | 8.215012 8.468330 668 | 6.975507 6.846980 669 | 9.435134 10.609700 670 | 9.228075 9.342622 671 | 8.388410 7.637856 672 | 7.111456 9.289163 673 | 9.403508 8.482654 674 | 9.133894 8.343575 675 | 10.670801 9.750821 676 | 9.983542 10.074537 677 | 10.012865 8.537017 678 | 8.929895 8.951909 679 | 7.666951 7.473615 680 | 9.493839 7.821783 681 | 8.894081 7.059413 682 | 9.593382 9.859732 683 | 9.126847 8.395700 684 | 9.532945 9.850696 685 | 9.459384 9.384213 686 | 8.982743 8.217062 687 | 10.107798 8.790772 688 | 10.563574 9.044890 689 | 8.278963 9.518790 690 | 8.734960 10.494129 691 | 9.597940 9.530895 692 | 10.025478 9.508270 693 | 10.335922 10.974063 694 | 8.404390 8.146748 695 | 7.108699 6.038469 696 | 8.873951 7.474227 697 | 8.731459 8.154455 698 | 8.795146 7.534687 699 | 6.407165 6.810352 700 | 9.979312 10.287430 701 | 8.786715 8.396736 702 | 10.753339 10.360567 703 | 10.508031 10.321976 704 | 10.636925 10.193797 705 | 10.614322 11.215420 706 | 8.916411 8.965286 707 | 8.112756 8.304769 708 | 10.833109 10.497542 709 | 8.319758 9.727691 710 | 9.945336 11.820097 711 | 10.150461 9.914715 712 | 10.185024 10.388722 713 | 9.793569 9.079955 714 | 10.590128 11.811596 715 | 8.505584 6.884282 716 | 10.461428 10.745439 717 | 8.755781 9.418427 718 | 7.488249 7.172072 719 | 10.238905 10.428659 720 | 9.887827 10.427821 721 | 8.529971 8.838217 722 | 8.375208 10.242837 723 | 8.901724 8.398304 724 | 8.607694 9.173198 725 | 8.691369 9.964261 726 | 9.584578 9.641546 727 | 10.265792 11.405078 728 | 7.592968 6.683355 729 | 8.692791 9.389031 730 | 7.589852 6.005793 731 | 10.550386 11.736584 732 | 8.578351 7.227055 733 | 7.526931 6.875134 734 | 8.577081 9.877115 735 | 9.272136 11.050928 736 | 10.300809 10.653059 737 | 8.642013 9.006681 738 | 9.720491 10.265202 739 | 9.029005 9.646928 740 | 8.736201 7.975603 741 | 8.672886 9.070759 742 | 8.370633 8.412170 743 | 9.483776 9.183341 744 | 6.790842 7.594992 745 | 9.842146 10.156810 746 | 9.563336 7.962532 747 | 8.724669 9.870732 748 | 9.012145 9.171326 749 | 9.116948 9.791167 750 | 6.219094 7.988420 751 | 9.468422 8.359975 752 | 8.825231 8.475208 753 | 9.572224 9.696428 754 | 9.609128 8.488175 755 | 9.428590 10.468998 756 | 8.293266 8.617701 757 | 9.423584 10.355688 758 | 9.240796 9.517228 759 | 10.915423 13.026252 760 | 
10.854684 11.130866 761 | 9.226816 9.391796 762 | 9.580264 10.359235 763 | 7.289907 6.898208 764 | 9.338857 10.374025 765 | 9.523176 11.332190 766 | 10.162233 10.357396 767 | 8.873930 9.207398 768 | 8.607259 7.794804 769 | 8.852325 8.215797 770 | 8.077272 6.501042 771 | 8.169273 8.269613 772 | 6.806421 7.544423 773 | 8.793151 9.691549 774 | 11.640981 11.365702 775 | 9.544082 11.576545 776 | 9.009266 9.605596 777 | 9.726552 9.426719 778 | 9.495888 10.626624 779 | 8.683982 9.337864 780 | 8.322105 8.631099 781 | 8.887895 8.644931 782 | 8.662659 11.373025 783 | 9.263321 7.536016 784 | 7.802624 7.171625 785 | 8.773183 8.561565 786 | 8.730443 10.197596 787 | 8.942915 7.758383 788 | 8.057618 8.774996 789 | 8.112081 8.202349 790 | 10.378884 12.103755 791 | 9.248876 8.637249 792 | 9.739599 9.708576 793 | 8.126345 8.278487 794 | 8.894788 7.966117 795 | 9.683165 9.019221 796 | 10.886957 12.053843 797 | 9.668852 10.902132 798 | 7.486692 6.471138 799 | 8.794850 9.173609 800 | 8.835915 8.296727 801 | 9.443984 11.375344 802 | 8.696621 6.434580 803 | 9.645560 9.233722 804 | 9.623857 7.915590 805 | 10.840632 12.620268 806 | 7.298135 7.356141 807 | 9.639644 8.902389 808 | 9.849802 7.682624 809 | 10.609964 10.259615 810 | 9.768229 11.382811 811 | 7.646351 7.571849 812 | 10.230300 9.470859 813 | 8.224402 8.496866 814 | 6.879671 8.393648 815 | 7.976247 8.667221 816 | 9.183268 8.694550 817 | 11.471853 12.786280 818 | 10.428349 10.615726 819 | 8.090828 5.902504 820 | 9.738627 8.485792 821 | 8.139709 8.396333 822 | 9.508055 8.990529 823 | 8.857260 8.497732 824 | 8.902558 7.014433 825 | 9.660607 11.040833 826 | 8.772221 10.512150 827 | 11.020038 9.354134 828 | 7.918527 7.742062 829 | 7.630835 7.756260 830 | 11.043272 11.041613 831 | 9.299376 8.674157 832 | 9.795087 8.431837 833 | 9.415683 8.312101 834 | 7.942037 6.942913 835 | 9.724790 11.766496 836 | 10.222032 11.550876 837 | 8.894163 8.306020 838 | 8.394309 8.070420 839 | 9.012776 6.880548 840 | 9.661093 10.138921 841 | 9.896472 9.762372 842 | 9.135628 8.759928 843 | 8.762656 10.306028 844 | 8.602473 8.861956 845 | 10.085297 10.464774 846 | 10.644983 10.945767 847 | 9.034571 8.391668 848 | 8.602920 8.501944 849 | 8.224766 7.402758 850 | 8.755050 9.431085 851 | 9.669937 8.641049 852 | 10.693530 10.287124 853 | 9.462806 7.611153 854 | 9.287707 10.082363 855 | 10.941260 10.783728 856 | 9.263080 7.913328 857 | 10.167111 10.225338 858 | 8.783830 9.465345 859 | 8.958624 8.662136 860 | 9.841649 9.926781 861 | 7.205691 6.790638 862 | 8.629089 9.135461 863 | 7.469440 8.450442 864 | 8.179133 7.790434 865 | 8.083984 7.875520 866 | 9.271300 8.135359 867 | 8.652349 8.254397 868 | 7.983920 6.609684 869 | 7.836860 9.785238 870 | 7.418535 7.011256 871 | 8.458288 10.095364 872 | 9.387605 9.726911 873 | 8.663951 8.206705 874 | 10.146507 11.698577 875 | 8.937103 10.990924 876 | 11.218687 11.141945 877 | 8.363142 9.106936 878 | 7.877643 7.122922 879 | 9.620978 9.905689 880 | 9.509649 10.773209 881 | 6.748743 6.705385 882 | 9.300919 8.085029 883 | 9.332257 9.818791 884 | 7.898610 8.366643 885 | 9.841914 9.480675 886 | 6.920484 8.959501 887 | 8.544713 9.563136 888 | 8.162266 6.715277 889 | 8.659552 9.282008 890 | 10.673398 13.174824 891 | 9.024000 10.379238 892 | 8.183292 6.647572 893 | 10.544919 10.649602 894 | 7.201266 6.529605 895 | 9.557407 11.096821 896 | 8.304605 6.940929 897 | 9.742855 9.920897 898 | 10.024587 9.645222 899 | 10.002296 9.998940 900 | 8.965876 8.665419 901 | 7.823136 6.949572 902 | 8.125088 7.654065 903 | 6.569589 6.046863 904 | 10.195497 8.689129 905 | 
11.730011 10.374221 906 | 8.739105 7.457571 907 | 9.820059 10.278526 908 | 9.547456 10.398198 909 | 8.375072 8.416302 910 | 8.889533 8.308929 911 | 8.861201 9.290408 912 | 12.677687 12.788463 913 | 9.100735 8.620537 914 | 7.728350 6.328219 915 | 7.955373 8.355028 916 | 8.733352 8.645414 917 | 10.257527 11.191813 918 | 9.246413 9.497014 919 | 9.745302 9.642035 920 | 7.785652 8.147621 921 | 7.431673 8.566399 922 | 8.654384 8.466701 923 | 8.475392 6.744677 924 | 9.968440 10.765192 925 | 10.163616 10.806963 926 | 10.238135 10.036636 927 | 9.902889 10.746730 928 | 9.523850 8.749708 929 | 9.214363 9.149178 930 | 9.266040 10.841502 931 | 8.494292 7.770942 932 | 10.821158 10.410192 933 | 8.645888 7.970308 934 | 9.885204 10.098080 935 | 9.084990 10.886349 936 | 9.277874 8.871449 937 | 8.135131 7.137064 938 | 7.917379 9.080522 939 | 9.685586 8.822850 940 | 8.558141 7.848112 941 | 9.502917 10.061255 942 | 6.409004 5.164774 943 | 10.149235 10.579951 944 | 7.847304 8.411351 945 | 8.846930 6.819939 946 | 8.675153 9.411147 947 | 9.476276 9.061508 948 | 11.099184 10.644263 949 | 8.792411 10.379405 950 | 8.400418 7.072706 951 | 8.555713 7.923805 952 | 8.024763 8.426993 953 | 8.642696 10.453412 954 | 7.906117 7.920408 955 | 8.793393 9.722878 956 | 8.280364 7.669854 957 | 9.387766 9.706245 958 | 9.626853 10.762499 959 | 10.163631 10.919007 960 | 9.375543 11.513524 961 | 9.309440 8.575699 962 | 10.055329 10.297255 963 | 8.706241 9.097172 964 | 10.032934 11.951897 965 | 10.812974 11.311435 966 | 10.352603 10.819865 967 | 8.276870 9.055403 968 | 8.397389 7.944434 969 | 9.371741 10.395790 970 | 10.825710 10.144099 971 | 9.158483 11.385382 972 | 10.658639 11.389856 973 | 8.091762 6.631039 974 | 10.734892 10.054598 975 | 11.535880 11.604912 976 | 9.799077 11.371677 977 | 8.478725 9.078455 978 | 9.399902 8.947744 979 | 7.305377 8.144973 980 | 7.613377 6.668798 981 | 10.681308 10.830845 982 | 9.973855 10.004133 983 | 9.369918 7.855433 984 | 8.838223 7.429033 985 | 9.521831 10.623930 986 | 9.724419 10.447452 987 | 8.890224 9.275923 988 | 9.932763 11.589953 989 | 10.839337 9.051250 990 | 8.497708 7.521701 991 | 8.440236 8.705670 992 | 9.063566 9.755744 993 | 8.449647 8.929485 994 | 8.554576 8.063231 995 | 10.348606 10.550718 996 | 5.985254 5.186844 997 | 9.931937 10.175582 998 | 9.854922 9.201393 999 | 9.114580 9.134215 1000 | 10.334899 8.543604 1001 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Machine-Learning-With-Python 2 | ======================== 3 | Fix bugs and add new features for personalized projects 4 | 5 | -------------------------------------------------------------------------------- /Recommend/基于item的协同过滤推荐BasedItem.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | #-*-coding:utf-8-*- 3 | ''' 4 | Created on 2016-5-30 5 | 6 | @author: thinkgamer 7 | ''' 8 | import math 9 | 10 | class ItemBasedCF: 11 | def __init__(self,train_file): 12 | self.train_file = train_file 13 | self.readData() 14 | def readData(self): 15 | #读取文件,并生成用户-物品的评分表和测试集 16 | self.train = dict() #用户-物品的评分表 17 | for line in open(self.train_file): 18 | # user,item,score = line.strip().split(",") 19 | user,score,item = line.strip().split(",") 20 | self.train.setdefault(user,{}) 21 | self.train[user][item] = int(float(score)) 22 | 23 | def ItemSimilarity(self): 24 | #建立物品-物品的共现矩阵 25 | C = dict() #物品-物品的共现矩阵 26 | N = dict() #物品被多少个不同用户购买 27 | for 
user,items in self.train.items(): 28 | for i in items.keys(): 29 | N.setdefault(i,0) 30 | N[i] += 1 31 | C.setdefault(i,{}) 32 | for j in items.keys(): 33 | if i == j : continue 34 | C[i].setdefault(j,0) 35 | C[i][j] += 1 36 | #计算相似度矩阵 37 | self.W = dict() 38 | for i,related_items in C.items(): 39 | self.W.setdefault(i,{}) 40 | for j,cij in related_items.items(): 41 | self.W[i][j] = cij / (math.sqrt(N[i] * N[j])) 42 | return self.W 43 | 44 | #给用户user推荐,前K个相关用户 45 | def Recommend(self,user,K=3,N=10): 46 | rank = dict() 47 | action_item = self.train[user] #用户user产生过行为的item和评分 48 | for item,score in action_item.items(): 49 | for j,wj in sorted(self.W[item].items(),key=lambda x:x[1],reverse=True)[0:K]: 50 | if j in action_item.keys(): 51 | continue 52 | rank.setdefault(j,0) 53 | rank[j] += score * wj 54 | return dict(sorted(rank.items(),key=lambda x:x[1],reverse=True)[0:N]) 55 | 56 | #声明一个ItemBased推荐的对象 57 | Item = ItemBasedCF("uid_score_bid") 58 | Item.ItemSimilarity() 59 | recommedDic = Item.Recommend("xiyuweilan") 60 | for k,v in recommedDic.iteritems(): 61 | print k,"\t",v -------------------------------------------------------------------------------- /Recommend/基于图的推荐PersonalRank.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/komeilkma/Machine-Learning-With-Python/8780eb4646292367915096bc18121e7c691a46dc/Recommend/基于图的推荐PersonalRank.py -------------------------------------------------------------------------------- /Recommend/基于标签的推荐.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | #-*-coding:utf-8-*- 3 | import random 4 | import math 5 | #统计各类数量 6 | def addValueToMat(theMat,key,value,incr): 7 | if key not in theMat: #如果key没出先在theMat中 8 | theMat[key]=dict(); 9 | theMat[key][value]=incr; 10 | else: 11 | if value not in theMat[key]: 12 | theMat[key][value]=incr; 13 | else: 14 | theMat[key][value]+=incr;#若有值,则递增 15 | 16 | user_tags = dict(); 17 | tag_items = dict(); 18 | user_items = dict(); 19 | user_items_test = dict();#测试集数据字典 20 | item_tags = dict() #用于多样性测试 21 | 22 | #初始化,进行各种统计 23 | def InitStat(): 24 | data_file = open('delicious.dat') 25 | line = data_file.readline(); 26 | while line: 27 | if random.random()>0.1:#将90%的数据作为训练集,剩下10%的数据作为测试集 28 | terms = line.split("\t");#训练集的数据结构是[user, item, tag]形式 29 | user=terms[0]; 30 | item=terms[1]; 31 | tag=terms[2]; 32 | addValueToMat(user_tags,user,tag,1) 33 | addValueToMat(tag_items,tag,item,1) 34 | addValueToMat(user_items,user,item,1) 35 | addValueToMat(item_tags,item,tag,1) 36 | line = data_file.readline(); 37 | else: 38 | addValueToMat(user_items_test,user,item,1) 39 | data_file.close(); 40 | 41 | #推荐算法 42 | def Recommend(usr): 43 | recommend_list = dict(); 44 | tagged_item = user_items[usr];#得到该用户所有推荐过的物品 45 | for tag_,wut in user_tags[usr].items():#用户打过的标签及次数 46 | for item_,wit in tag_items[tag_].items():#物品被打过的标签及被打过的次数 47 | if item_ not in tagged_item:#已经推荐过的不再推荐 48 | if item_ not in recommend_list: 49 | recommend_list[item_]=wut*wit;#根据公式 50 | else: 51 | recommend_list[item_]+=wut*wit; 52 | return sorted(recommend_list.iteritems(), key=lambda a:a[1],reverse=True) 53 | 54 | #统计标签流行度 55 | def TagPopularity(): 56 | tagfreq = {} 57 | for user in user_tags.keys(): 58 | for tag in user_tags[user].keys(): 59 | if tag not in tagfreq: 60 | tagfreq[tag] = 1 61 | else: 62 | tagfreq[tag] +=1 63 | return sorted(tagfreq.iteritems(), key=lambda a:a[1],reverse=True) 64 | 65 | #计算余弦相似度 66 | def 
CosineSim(item_tags,i,j): 67 | ret = 0 68 | for b,wib in item_tags[i].items(): #求物品i,j的标签交集数目 69 | if b in item_tags[j]: 70 | ret += wib * item_tags[j][b] 71 | ni = 0 72 | nj = 0 73 | for b, w in item_tags[i].items(): #统计 i 的标签数目 74 | ni += w * w 75 | for b, w in item_tags[j].items(): #统计 j 的标签数目 76 | nj += w * w 77 | if ret == 0: 78 | return 0 79 | return ret/math.sqrt(ni * nj) #返回余弦值 80 | 81 | #计算推荐列表多样性 82 | def Diversity(item_tags,recommend_items): 83 | ret = 0 84 | n = 0 85 | for i in dict(recommend_items).keys(): 86 | for j in dict(recommend_items).keys(): 87 | if i == j: 88 | continue 89 | ret += CosineSim(item_tags,i,j) 90 | n += 1 91 | return ret/(n * 1.0) 92 | 93 | InitStat() 94 | recommend_list = Recommend("48411") 95 | # print recommend_list 96 | for recommend in recommend_list[:10]: #兴趣度最高的十个itemid 97 | print recommend 98 | 99 | #标签流行度统计 100 | tagFreq = TagPopularity() 101 | for tag in tagFreq[:20]: 102 | print tag 103 | 104 | #推荐列表多样性,计算时间较长 105 | diversityNum = Diversity(item_tags, recommend_list) 106 | print diversityNum -------------------------------------------------------------------------------- /Recommend/基于用户的协同过滤推荐BasedUserCF.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/komeilkma/Machine-Learning-With-Python/8780eb4646292367915096bc18121e7c691a46dc/Recommend/基于用户的协同过滤推荐BasedUserCF.py -------------------------------------------------------------------------------- /Regession/ex0.txt: -------------------------------------------------------------------------------- 1 | 1.000000 0.067732 3.176513 2 | 1.000000 0.427810 3.816464 3 | 1.000000 0.995731 4.550095 4 | 1.000000 0.738336 4.256571 5 | 1.000000 0.981083 4.560815 6 | 1.000000 0.526171 3.929515 7 | 1.000000 0.378887 3.526170 8 | 1.000000 0.033859 3.156393 9 | 1.000000 0.132791 3.110301 10 | 1.000000 0.138306 3.149813 11 | 1.000000 0.247809 3.476346 12 | 1.000000 0.648270 4.119688 13 | 1.000000 0.731209 4.282233 14 | 1.000000 0.236833 3.486582 15 | 1.000000 0.969788 4.655492 16 | 1.000000 0.607492 3.965162 17 | 1.000000 0.358622 3.514900 18 | 1.000000 0.147846 3.125947 19 | 1.000000 0.637820 4.094115 20 | 1.000000 0.230372 3.476039 21 | 1.000000 0.070237 3.210610 22 | 1.000000 0.067154 3.190612 23 | 1.000000 0.925577 4.631504 24 | 1.000000 0.717733 4.295890 25 | 1.000000 0.015371 3.085028 26 | 1.000000 0.335070 3.448080 27 | 1.000000 0.040486 3.167440 28 | 1.000000 0.212575 3.364266 29 | 1.000000 0.617218 3.993482 30 | 1.000000 0.541196 3.891471 31 | 1.000000 0.045353 3.143259 32 | 1.000000 0.126762 3.114204 33 | 1.000000 0.556486 3.851484 34 | 1.000000 0.901144 4.621899 35 | 1.000000 0.958476 4.580768 36 | 1.000000 0.274561 3.620992 37 | 1.000000 0.394396 3.580501 38 | 1.000000 0.872480 4.618706 39 | 1.000000 0.409932 3.676867 40 | 1.000000 0.908969 4.641845 41 | 1.000000 0.166819 3.175939 42 | 1.000000 0.665016 4.264980 43 | 1.000000 0.263727 3.558448 44 | 1.000000 0.231214 3.436632 45 | 1.000000 0.552928 3.831052 46 | 1.000000 0.047744 3.182853 47 | 1.000000 0.365746 3.498906 48 | 1.000000 0.495002 3.946833 49 | 1.000000 0.493466 3.900583 50 | 1.000000 0.792101 4.238522 51 | 1.000000 0.769660 4.233080 52 | 1.000000 0.251821 3.521557 53 | 1.000000 0.181951 3.203344 54 | 1.000000 0.808177 4.278105 55 | 1.000000 0.334116 3.555705 56 | 1.000000 0.338630 3.502661 57 | 1.000000 0.452584 3.859776 58 | 1.000000 0.694770 4.275956 59 | 1.000000 0.590902 3.916191 60 | 1.000000 0.307928 3.587961 61 | 1.000000 0.148364 3.183004 62 | 1.000000 0.702180 
4.225236 63 | 1.000000 0.721544 4.231083 64 | 1.000000 0.666886 4.240544 65 | 1.000000 0.124931 3.222372 66 | 1.000000 0.618286 4.021445 67 | 1.000000 0.381086 3.567479 68 | 1.000000 0.385643 3.562580 69 | 1.000000 0.777175 4.262059 70 | 1.000000 0.116089 3.208813 71 | 1.000000 0.115487 3.169825 72 | 1.000000 0.663510 4.193949 73 | 1.000000 0.254884 3.491678 74 | 1.000000 0.993888 4.533306 75 | 1.000000 0.295434 3.550108 76 | 1.000000 0.952523 4.636427 77 | 1.000000 0.307047 3.557078 78 | 1.000000 0.277261 3.552874 79 | 1.000000 0.279101 3.494159 80 | 1.000000 0.175724 3.206828 81 | 1.000000 0.156383 3.195266 82 | 1.000000 0.733165 4.221292 83 | 1.000000 0.848142 4.413372 84 | 1.000000 0.771184 4.184347 85 | 1.000000 0.429492 3.742878 86 | 1.000000 0.162176 3.201878 87 | 1.000000 0.917064 4.648964 88 | 1.000000 0.315044 3.510117 89 | 1.000000 0.201473 3.274434 90 | 1.000000 0.297038 3.579622 91 | 1.000000 0.336647 3.489244 92 | 1.000000 0.666109 4.237386 93 | 1.000000 0.583888 3.913749 94 | 1.000000 0.085031 3.228990 95 | 1.000000 0.687006 4.286286 96 | 1.000000 0.949655 4.628614 97 | 1.000000 0.189912 3.239536 98 | 1.000000 0.844027 4.457997 99 | 1.000000 0.333288 3.513384 100 | 1.000000 0.427035 3.729674 101 | 1.000000 0.466369 3.834274 102 | 1.000000 0.550659 3.811155 103 | 1.000000 0.278213 3.598316 104 | 1.000000 0.918769 4.692514 105 | 1.000000 0.886555 4.604859 106 | 1.000000 0.569488 3.864912 107 | 1.000000 0.066379 3.184236 108 | 1.000000 0.335751 3.500796 109 | 1.000000 0.426863 3.743365 110 | 1.000000 0.395746 3.622905 111 | 1.000000 0.694221 4.310796 112 | 1.000000 0.272760 3.583357 113 | 1.000000 0.503495 3.901852 114 | 1.000000 0.067119 3.233521 115 | 1.000000 0.038326 3.105266 116 | 1.000000 0.599122 3.865544 117 | 1.000000 0.947054 4.628625 118 | 1.000000 0.671279 4.231213 119 | 1.000000 0.434811 3.791149 120 | 1.000000 0.509381 3.968271 121 | 1.000000 0.749442 4.253910 122 | 1.000000 0.058014 3.194710 123 | 1.000000 0.482978 3.996503 124 | 1.000000 0.466776 3.904358 125 | 1.000000 0.357767 3.503976 126 | 1.000000 0.949123 4.557545 127 | 1.000000 0.417320 3.699876 128 | 1.000000 0.920461 4.613614 129 | 1.000000 0.156433 3.140401 130 | 1.000000 0.656662 4.206717 131 | 1.000000 0.616418 3.969524 132 | 1.000000 0.853428 4.476096 133 | 1.000000 0.133295 3.136528 134 | 1.000000 0.693007 4.279071 135 | 1.000000 0.178449 3.200603 136 | 1.000000 0.199526 3.299012 137 | 1.000000 0.073224 3.209873 138 | 1.000000 0.286515 3.632942 139 | 1.000000 0.182026 3.248361 140 | 1.000000 0.621523 3.995783 141 | 1.000000 0.344584 3.563262 142 | 1.000000 0.398556 3.649712 143 | 1.000000 0.480369 3.951845 144 | 1.000000 0.153350 3.145031 145 | 1.000000 0.171846 3.181577 146 | 1.000000 0.867082 4.637087 147 | 1.000000 0.223855 3.404964 148 | 1.000000 0.528301 3.873188 149 | 1.000000 0.890192 4.633648 150 | 1.000000 0.106352 3.154768 151 | 1.000000 0.917886 4.623637 152 | 1.000000 0.014855 3.078132 153 | 1.000000 0.567682 3.913596 154 | 1.000000 0.068854 3.221817 155 | 1.000000 0.603535 3.938071 156 | 1.000000 0.532050 3.880822 157 | 1.000000 0.651362 4.176436 158 | 1.000000 0.901225 4.648161 159 | 1.000000 0.204337 3.332312 160 | 1.000000 0.696081 4.240614 161 | 1.000000 0.963924 4.532224 162 | 1.000000 0.981390 4.557105 163 | 1.000000 0.987911 4.610072 164 | 1.000000 0.990947 4.636569 165 | 1.000000 0.736021 4.229813 166 | 1.000000 0.253574 3.500860 167 | 1.000000 0.674722 4.245514 168 | 1.000000 0.939368 4.605182 169 | 1.000000 0.235419 3.454340 170 | 1.000000 0.110521 3.180775 171 | 1.000000 
0.218023 3.380820 172 | 1.000000 0.869778 4.565020 173 | 1.000000 0.196830 3.279973 174 | 1.000000 0.958178 4.554241 175 | 1.000000 0.972673 4.633520 176 | 1.000000 0.745797 4.281037 177 | 1.000000 0.445674 3.844426 178 | 1.000000 0.470557 3.891601 179 | 1.000000 0.549236 3.849728 180 | 1.000000 0.335691 3.492215 181 | 1.000000 0.884739 4.592374 182 | 1.000000 0.918916 4.632025 183 | 1.000000 0.441815 3.756750 184 | 1.000000 0.116598 3.133555 185 | 1.000000 0.359274 3.567919 186 | 1.000000 0.814811 4.363382 187 | 1.000000 0.387125 3.560165 188 | 1.000000 0.982243 4.564305 189 | 1.000000 0.780880 4.215055 190 | 1.000000 0.652565 4.174999 191 | 1.000000 0.870030 4.586640 192 | 1.000000 0.604755 3.960008 193 | 1.000000 0.255212 3.529963 194 | 1.000000 0.730546 4.213412 195 | 1.000000 0.493829 3.908685 196 | 1.000000 0.257017 3.585821 197 | 1.000000 0.833735 4.374394 198 | 1.000000 0.070095 3.213817 199 | 1.000000 0.527070 3.952681 200 | 1.000000 0.116163 3.129283 201 | -------------------------------------------------------------------------------- /Regession/ex1.txt: -------------------------------------------------------------------------------- 1 | 1.000000 0.635975 4.093119 2 | 1.000000 0.552438 3.804358 3 | 1.000000 0.855922 4.456531 4 | 1.000000 0.083386 3.187049 5 | 1.000000 0.975802 4.506176 6 | 1.000000 0.181269 3.171914 7 | 1.000000 0.129156 3.053996 8 | 1.000000 0.605648 3.974659 9 | 1.000000 0.301625 3.542525 10 | 1.000000 0.698805 4.234199 11 | 1.000000 0.226419 3.405937 12 | 1.000000 0.519290 3.932469 13 | 1.000000 0.354424 3.514051 14 | 1.000000 0.118380 3.105317 15 | 1.000000 0.512811 3.843351 16 | 1.000000 0.236795 3.576074 17 | 1.000000 0.353509 3.544471 18 | 1.000000 0.481447 3.934625 19 | 1.000000 0.060509 3.228226 20 | 1.000000 0.174090 3.300232 21 | 1.000000 0.806818 4.331785 22 | 1.000000 0.531462 3.908166 23 | 1.000000 0.853167 4.386918 24 | 1.000000 0.304804 3.617260 25 | 1.000000 0.612021 4.082411 26 | 1.000000 0.620880 3.949470 27 | 1.000000 0.580245 3.984041 28 | 1.000000 0.742443 4.251907 29 | 1.000000 0.110770 3.115214 30 | 1.000000 0.742687 4.234319 31 | 1.000000 0.574390 3.947544 32 | 1.000000 0.986378 4.532519 33 | 1.000000 0.294867 3.510392 34 | 1.000000 0.472125 3.927832 35 | 1.000000 0.872321 4.631825 36 | 1.000000 0.843537 4.482263 37 | 1.000000 0.864577 4.487656 38 | 1.000000 0.341874 3.486371 39 | 1.000000 0.097980 3.137514 40 | 1.000000 0.757874 4.212660 41 | 1.000000 0.877656 4.506268 42 | 1.000000 0.457993 3.800973 43 | 1.000000 0.475341 3.975979 44 | 1.000000 0.848391 4.494447 45 | 1.000000 0.746059 4.244715 46 | 1.000000 0.153462 3.019251 47 | 1.000000 0.694256 4.277945 48 | 1.000000 0.498712 3.812414 49 | 1.000000 0.023580 3.116973 50 | 1.000000 0.976826 4.617363 51 | 1.000000 0.624004 4.005158 52 | 1.000000 0.472220 3.874188 53 | 1.000000 0.390551 3.630228 54 | 1.000000 0.021349 3.145849 55 | 1.000000 0.173488 3.192618 56 | 1.000000 0.971028 4.540226 57 | 1.000000 0.595302 3.835879 58 | 1.000000 0.097638 3.141948 59 | 1.000000 0.745972 4.323316 60 | 1.000000 0.676390 4.204829 61 | 1.000000 0.488949 3.946710 62 | 1.000000 0.982873 4.666332 63 | 1.000000 0.296060 3.482348 64 | 1.000000 0.228008 3.451286 65 | 1.000000 0.671059 4.186388 66 | 1.000000 0.379419 3.595223 67 | 1.000000 0.285170 3.534446 68 | 1.000000 0.236314 3.420891 69 | 1.000000 0.629803 4.115553 70 | 1.000000 0.770272 4.257463 71 | 1.000000 0.493052 3.934798 72 | 1.000000 0.631592 4.154963 73 | 1.000000 0.965676 4.587470 74 | 1.000000 0.598675 3.944766 75 | 1.000000 0.351997 3.480517 
76 | 1.000000 0.342001 3.481382 77 | 1.000000 0.661424 4.253286 78 | 1.000000 0.140912 3.131670 79 | 1.000000 0.373574 3.527099 80 | 1.000000 0.223166 3.378051 81 | 1.000000 0.908785 4.578960 82 | 1.000000 0.915102 4.551773 83 | 1.000000 0.410940 3.634259 84 | 1.000000 0.754921 4.167016 85 | 1.000000 0.764453 4.217570 86 | 1.000000 0.101534 3.237201 87 | 1.000000 0.780368 4.353163 88 | 1.000000 0.819868 4.342184 89 | 1.000000 0.173990 3.236950 90 | 1.000000 0.330472 3.509404 91 | 1.000000 0.162656 3.242535 92 | 1.000000 0.476283 3.907937 93 | 1.000000 0.636391 4.108455 94 | 1.000000 0.758737 4.181959 95 | 1.000000 0.778372 4.251103 96 | 1.000000 0.936287 4.538462 97 | 1.000000 0.510904 3.848193 98 | 1.000000 0.515737 3.974757 99 | 1.000000 0.437823 3.708323 100 | 1.000000 0.828607 4.385210 101 | 1.000000 0.556100 3.927788 102 | 1.000000 0.038209 3.187881 103 | 1.000000 0.321993 3.444542 104 | 1.000000 0.067288 3.199263 105 | 1.000000 0.774989 4.285745 106 | 1.000000 0.566077 3.878557 107 | 1.000000 0.796314 4.155745 108 | 1.000000 0.746600 4.197772 109 | 1.000000 0.360778 3.524928 110 | 1.000000 0.397321 3.525692 111 | 1.000000 0.062142 3.211318 112 | 1.000000 0.379250 3.570495 113 | 1.000000 0.248238 3.462431 114 | 1.000000 0.682561 4.206177 115 | 1.000000 0.355393 3.562322 116 | 1.000000 0.889051 4.595215 117 | 1.000000 0.733806 4.182694 118 | 1.000000 0.153949 3.320695 119 | 1.000000 0.036104 3.122670 120 | 1.000000 0.388577 3.541312 121 | 1.000000 0.274481 3.502135 122 | 1.000000 0.319401 3.537559 123 | 1.000000 0.431653 3.712609 124 | 1.000000 0.960398 4.504875 125 | 1.000000 0.083660 3.262164 126 | 1.000000 0.122098 3.105583 127 | 1.000000 0.415299 3.742634 128 | 1.000000 0.854192 4.566589 129 | 1.000000 0.925574 4.630884 130 | 1.000000 0.109306 3.190539 131 | 1.000000 0.805161 4.289105 132 | 1.000000 0.344474 3.406602 133 | 1.000000 0.769116 4.251899 134 | 1.000000 0.182003 3.183214 135 | 1.000000 0.225972 3.342508 136 | 1.000000 0.413088 3.747926 137 | 1.000000 0.964444 4.499998 138 | 1.000000 0.203334 3.350089 139 | 1.000000 0.285574 3.539554 140 | 1.000000 0.850209 4.443465 141 | 1.000000 0.061561 3.290370 142 | 1.000000 0.426935 3.733302 143 | 1.000000 0.389376 3.614803 144 | 1.000000 0.096918 3.175132 145 | 1.000000 0.148938 3.164284 146 | 1.000000 0.893738 4.619629 147 | 1.000000 0.195527 3.426648 148 | 1.000000 0.407248 3.670722 149 | 1.000000 0.224357 3.412571 150 | 1.000000 0.045963 3.110330 151 | 1.000000 0.944647 4.647928 152 | 1.000000 0.756552 4.164515 153 | 1.000000 0.432098 3.730603 154 | 1.000000 0.990511 4.609868 155 | 1.000000 0.649699 4.094111 156 | 1.000000 0.584879 3.907636 157 | 1.000000 0.785934 4.240814 158 | 1.000000 0.029945 3.106915 159 | 1.000000 0.075747 3.201181 160 | 1.000000 0.408408 3.872302 161 | 1.000000 0.583851 3.860890 162 | 1.000000 0.497759 3.884108 163 | 1.000000 0.421301 3.696816 164 | 1.000000 0.140320 3.114540 165 | 1.000000 0.546465 3.791233 166 | 1.000000 0.843181 4.443487 167 | 1.000000 0.295390 3.535337 168 | 1.000000 0.825059 4.417975 169 | 1.000000 0.946343 4.742471 170 | 1.000000 0.350404 3.470964 171 | 1.000000 0.042787 3.113381 172 | 1.000000 0.352487 3.594600 173 | 1.000000 0.590736 3.914875 174 | 1.000000 0.120748 3.108492 175 | 1.000000 0.143140 3.152725 176 | 1.000000 0.511926 3.994118 177 | 1.000000 0.496358 3.933417 178 | 1.000000 0.382802 3.510829 179 | 1.000000 0.252464 3.498402 180 | 1.000000 0.845894 4.460441 181 | 1.000000 0.132023 3.245277 182 | 1.000000 0.442301 3.771067 183 | 1.000000 0.266889 3.434771 184 | 1.000000 
0.008575 2.999612 185 | 1.000000 0.897632 4.454221 186 | 1.000000 0.533171 3.985348 187 | 1.000000 0.285243 3.557982 188 | 1.000000 0.377258 3.625972 189 | 1.000000 0.486995 3.922226 190 | 1.000000 0.305993 3.547421 191 | 1.000000 0.277528 3.580944 192 | 1.000000 0.750899 4.268081 193 | 1.000000 0.694756 4.278096 194 | 1.000000 0.870158 4.517640 195 | 1.000000 0.276457 3.555461 196 | 1.000000 0.017761 3.055026 197 | 1.000000 0.802046 4.354819 198 | 1.000000 0.559275 3.894387 199 | 1.000000 0.941305 4.597773 200 | 1.000000 0.856877 4.523616 201 | -------------------------------------------------------------------------------- /Regession/regession.py: -------------------------------------------------------------------------------- 1 | #-*-coding:utf8-*- 2 | ''' 3 | Created on 2016年5月14日 4 | 5 | @author: Gamer Think 6 | ''' 7 | 8 | from numpy import * 9 | #加载数据集 10 | def loadDataSet(filename): 11 | numFeat = len(open(filename).readline().split("\t")) -1 12 | dataMat = []; labelMat = [] 13 | fr = open(filename) 14 | for line in fr.readlines(): 15 | lineArr = [] 16 | curLine = line.strip().split("\t") 17 | for i in range(numFeat): 18 | lineArr.append(float(curLine[i])) 19 | 20 | dataMat.append(lineArr) 21 | labelMat.append(float(curLine[-1])) 22 | 23 | return dataMat,labelMat 24 | 25 | #====================用线性回归找到最佳拟合曲线=========== 26 | #计算最佳拟合曲线 27 | def standRegress(xArr,yArr): 28 | xMat = mat(xArr); yMat = mat(yArr).T #.T代表转置矩阵 29 | xTx = xMat.T * xMat 30 | if linalg.det(xTx) ==0.0: #linalg.det(xTx) 计算行列式的值 31 | print "This matrix is singular , cannot do inverse" 32 | return 33 | ws = xTx.I * (xMat.T * yMat) 34 | return ws 35 | 36 | #测试上边的函数 37 | xArr,yArr = loadDataSet("ex0.txt") 38 | ws = standRegress(xArr, yArr) 39 | print "ws(相关系数):",ws #ws 存放的就是回归系数 40 | 41 | #画图展示 42 | def show(): 43 | import matplotlib.pyplot as plt 44 | xMat = mat(xArr); yMat = mat(yArr) 45 | yHat = xMat*ws 46 | fig = plt.figure() #创建绘图对象 47 | ax = fig.add_subplot(111) #111表示将画布划分为1行2列选择使用从上到下第一块 48 | #scatter绘制散点图 49 | ax.scatter(xMat[:,1].flatten().A[0],yMat.T[:,0].flatten().A[0]) 50 | #复制,排序 51 | xCopy =xMat.copy() 52 | xCopy.sort(0) 53 | yHat = xCopy * ws 54 | #plot画线 55 | ax.plot(xCopy[:,1],yHat) 56 | plt.show() 57 | 58 | # show() 59 | 60 | #利用numpy库提供的corrcoef来计算预测值和真实值得相关性 61 | yHat = mat(xArr) * ws #yHat = xMat * ws 62 | print "相关性:",corrcoef(yHat.T,mat(yArr)) 63 | #====================用线性回归找到最佳拟合曲线=========== 64 | 65 | ''' 66 | #==================局部加权线性回归================ 67 | 68 | def lwlr(testPoint,xArr,yArr,k=1.0): 69 | xMat = mat(xArr); yMat = mat(yArr).T 70 | m = shape(xMat)[0] 71 | weights = mat(eye((m))) #产生对角线矩阵 72 | for j in range(m): 73 | diffMat = testPoint - xMat[j,:] 74 | #更新权重值,以指数级递减 75 | weights[j,j] = exp(diffMat * diffMat.T /(-2.0*k**2)) 76 | xTx = xMat.T * (weights * xMat) 77 | if linalg.det(xTx) == 0.0: 78 | print "this matrix is singular,cannot do inverse" 79 | return 80 | ws = xTx.I * (xMat.T * (weights * yMat)) 81 | return testPoint * ws 82 | 83 | def lwlrTest(testArr,xArr,yArr,k=1.0): 84 | m = shape(testArr)[0] 85 | yHat = zeros(m) 86 | for i in range(m): 87 | yHat[i] =lwlr(testArr[i],xArr,yArr,k) 88 | return yHat 89 | 90 | 91 | xArr,yArr = loadDataSet('ex0.txt') 92 | print "k=1.0:",lwlr(xArr[0],xArr,yArr,1.0) 93 | print "k=0.001:",lwlr(xArr[0],xArr,yArr,0.001) 94 | print "k=0.003:",lwlr(xArr[0],xArr,yArr,0.003) 95 | 96 | #画图 97 | def showlwlr(): 98 | yHat = lwlrTest(xArr, xArr, yArr, 0.01) 99 | xMat = mat(xArr) 100 | srtInd = xMat[:,1].argsort(0) 101 | xSort = xMat[srtInd][:,0,:] 102 | 
103 | import matplotlib.pyplot as plt 104 | fig = plt.figure() #创建绘图对象 105 | ax = fig.add_subplot(111) #111表示将画布划分为1行2列选择使用从上到下第一块 106 | ax.plot(xSort[:,1],yHat[srtInd]) 107 | #scatter绘制散点图 108 | ax.scatter(xMat[:,1].flatten().A[0],mat(yArr).T[:,0].flatten().A[0],s=2,c='red') 109 | plt.show() 110 | 111 | showlwlr() 112 | ''' 113 | ''' 114 | #=========================岭回归================== 115 | #用于计算回归系数 116 | def ridgeRegres(xMat,yMat,lam=0.2): 117 | xTx = xMat.T * xMat 118 | denom = xTx + eye(shape(xMat)[1]) * lam 119 | if linalg.det(denom)==0.0: 120 | print "This matrix is singular, cannot do inverse" 121 | return 122 | ws = denom.I * (xMat.T * yMat) 123 | return ws 124 | 125 | #用于在一组lambda上做测试 126 | def ridgeTest(xArr,yArr): 127 | xMat = mat(xArr); yMat = mat(yArr).T 128 | yMean = mean(yMat,0) 129 | #数据标准化 130 | yMat = yMat - yMean 131 | xMeans = mean(xMat,0) 132 | xVar = var(xMat,0) 133 | xMat = (xMat - xMeans)/xVar 134 | 135 | numTestPts = 30 136 | wMat = zeros((numTestPts, shape(xMat)[1])) 137 | for i in range(numTestPts): 138 | ws = ridgeRegres(xMat, yMat, exp(i-10)) 139 | wMat[i,:]=ws.T 140 | return wMat 141 | 142 | abX,abY = loadDataSet('abalone.txt') 143 | ridgeWeights = ridgeTest(abX,abY) 144 | # print ridgeWeights 145 | 146 | def showRidge(): 147 | import matplotlib.pyplot as plt 148 | fig = plt.figure() 149 | ax = fig.add_subplot(111) 150 | ax.plot(ridgeWeights) 151 | plt.show() 152 | 153 | showRidge() 154 | #===================岭回归============= 155 | ''' 156 | #===================向前逐步回归============ 157 | 158 | #计算平方误差 159 | def rssError(yArr,yHatArr): #yArr and yHatArr both need to be arrays 160 | return ((yArr-yHatArr)**2).sum() 161 | 162 | #数据标准化处理 163 | def regularize(xMat):#regularize by columns 164 | inMat = xMat.copy() 165 | inMeans = mean(inMat,0) #calc mean then subtract it off 166 | inVar = var(inMat,0) #calc variance of Xi then divide by it 167 | inMat = (inMat - inMeans)/inVar 168 | return inMat 169 | 170 | 171 | def stageWise(xArr,yArr,eps=0.01,numIt=100): 172 | xMat = mat(xArr); yMat=mat(yArr).T 173 | yMean = mean(yMat,0) 174 | yMat = yMat - yMean #can also regularize ys but will get smaller coef 175 | xMat = regularize(xMat) 176 | m,n=shape(xMat) 177 | returnMat = zeros((numIt,n)) #testing code remove 178 | ws = zeros((n,1)); wsTest = ws.copy(); wsMax = ws.copy() 179 | for i in range(numIt):#could change this to while loop 180 | #print ws.T 181 | lowestError = inf; 182 | for j in range(n): 183 | for sign in [-1,1]: 184 | wsTest = ws.copy() 185 | wsTest[j] += eps*sign 186 | yTest = xMat*wsTest 187 | rssE = rssError(yMat.A,yTest.A) 188 | if rssE < lowestError: 189 | lowestError = rssE 190 | wsMax = wsTest 191 | ws = wsMax.copy() 192 | returnMat[i,:]=ws.T 193 | return returnMat 194 | 195 | xArr,yArr = loadDataSet('abalone.txt') 196 | print stageWise(xArr, yArr, 0.01, 200),"\n\n" 197 | 198 | # print stageWise(xArr, yArr, 0.001, 200) 199 | 200 | xMat = mat(xArr) 201 | yMat = mat(yArr).T 202 | xMat = regularize(xMat) 203 | yM = mean(yMat,0) 204 | yMat = yMat - yM 205 | weights = standRegress(xMat, yMat.T) 206 | print weights.T -------------------------------------------------------------------------------- /sklearn/README.md: -------------------------------------------------------------------------------- 1 | 0: line_regression——回归分析之Sklearn实现电力预测
2 | http://blog.csdn.net/Gamer_gyt/article/details/78467021
3 | -------------------------------------------------------------------------------- /sklearn/line_regression/sk_linreg.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # vim:fenc=utf-8 4 | # 5 | # Copyright © 2017 Register 6 | # 7 | # Distributed under terms of the GPLv3 license. 8 | 9 | """ 10 | """ 11 | from sklearn.cross_validation import train_test_split 12 | from sklearn.linear_model import LinearRegression 13 | import pandas as pd 14 | import numpy as np 15 | 16 | # read the data with pandas 17 | data = pd.read_csv("Folds5x2_pp.csv") 18 | print data.shape 19 | 20 | # prepare the sample features and the sample target 21 | X = data[["AT","V","AP","RH"]] 22 | print X.shape 23 | y = data[["PE"]] 24 | print y.shape 25 | X_train,X_test,y_train,y_test = train_test_split(X, y, random_state=1) # hold out a test set 26 | linreg = LinearRegression() 27 | linreg.fit(X_train,y_train) 28 | 29 | # training done, inspect the fitted intercept and coefficients 30 | print linreg.intercept_ 31 | print linreg.coef_ 32 | 33 | y_pred = linreg.predict(X_test) 34 | from sklearn import metrics 35 | 36 | # use sklearn to compute MSE and RMSE on the held-out test set 37 | print "MSE:",metrics.mean_squared_error(y_test, y_pred) 38 | print "RMSE:",np.sqrt(metrics.mean_squared_error(y_test, y_pred)) 39 | 40 | # 10-fold cross validation 41 | from sklearn.model_selection import cross_val_predict 42 | predicted = cross_val_predict(linreg,X,y,cv=10) 43 | print "MSE:",metrics.mean_squared_error(y, predicted) 44 | print "RMSE:",np.sqrt(metrics.mean_squared_error(y, predicted)) 45 | 46 | # plot predicted against measured values 47 | import matplotlib.pyplot as plt 48 | fig, ax = plt.subplots() 49 | ax.scatter(y, predicted) 50 | ax.plot([y.min(), y.max()], [y.min(), y.max()], 'k--', lw=4) 51 | ax.set_xlabel('Measured') 52 | ax.set_ylabel('Predicted') 53 | plt.show() 54 | --------------------------------------------------------------------------------
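
A quick way to check the workflow in sk_linreg.py when Folds5x2_pp.csv is not on disk is to run the same split / fit / evaluate steps on synthetic data. The sketch below is only an illustration: the random feature matrix, the made-up coefficients, the noise level and the file name linreg_sanity_check.py are assumptions for the demo, not part of the repository.

# linreg_sanity_check.py -- hypothetical standalone sketch, not part of the repo
# Mirrors the split / fit / evaluate flow of sk_linreg.py on synthetic data.
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

rng = np.random.RandomState(0)
X = rng.rand(500, 4)                                # stand-in for the AT, V, AP, RH columns
true_coef = np.array([-1.9, -0.3, 0.06, -0.16])     # made-up coefficients for the demo
y = 450.0 + X.dot(true_coef) + rng.normal(scale=0.5, size=500)   # noisy linear target

# hold out a quarter of the rows for evaluation
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=1)

linreg = LinearRegression()
linreg.fit(X_train, y_train)
print("intercept: %s" % linreg.intercept_)
print("coef: %s" % linreg.coef_)

y_pred = linreg.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print("MSE: %.4f" % mse)
print("RMSE: %.4f" % np.sqrt(mse))

Swapping the synthetic X and y for the pandas columns read from Folds5x2_pp.csv should leave the rest of the steps unchanged.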