├── .idea ├── encodings.xml ├── libraries │ └── R_User_Library.xml ├── misc.xml ├── modules.xml ├── other.xml └── vcs.xml ├── 1.Python Foundation ├── 1.1 intro.py ├── 1.2 calc_e.py ├── 1.3 class_intro.py ├── 1.4 stat.py ├── README.md ├── figures │ ├── Taylor.png │ └── distribution.png └── guideline.pdf ├── 10.LDA Topic Model ├── 10.1 LDA_intro.py ├── 10.2 netease_news.py ├── 10.3 reuters.py ├── LDA_test.txt ├── README.md ├── figures │ ├── doc-topic.png │ └── topic-word.png ├── news.dat ├── principle.pdf ├── reuters.dat ├── reuters.ldac ├── reuters.titles ├── reuters.tokens └── stopword.txt ├── 11.HMM ├── 11.1 TrainHMM.py ├── 11.2 Segmentation.py ├── 11.3 jieba_intro.py ├── 11.4 GMHMM.py ├── 11.5 Stock.py ├── A.txt ├── B.txt ├── MyBook.txt ├── README.md ├── SH600000.txt ├── figures │ └── stock.png ├── novel.txt ├── pi.txt ├── pku_training.utf8 └── principle.pdf ├── 2.Application Foundation ├── 2.1 Ensumble.py ├── 2.2 Convolve.py ├── 2.3 Image_convolve.py ├── 2.4 FFT.py ├── 2.5 SVD.py ├── README.md ├── SH600000.txt ├── figures │ ├── FFT(1).png │ ├── FFT(2).png │ ├── SVD.png │ ├── convolve.png │ ├── ema.png │ └── sma.png ├── guideline.pdf ├── lena.png └── son.png ├── 3.Regression ├── 3.1 Advertising.py ├── 3.2 LinearRegression_CV.py ├── 3.3 Iris_LR.py ├── 3.4 Overfit.py ├── Advertising.csv ├── README.md ├── figures │ ├── adv.png │ ├── adv_pred.png │ ├── adv_pred_cv.png │ ├── adv_self.png │ ├── iris_LR.png │ └── overfit.png ├── guideline.pdf ├── iris.data ├── iris.names └── principle.pdf ├── 4.Decision Tree & Random Forest ├── 4.1 Iris_DecisionTree.py ├── 4.2 Iris_DecisionTree_Enum.py ├── 4.3 DecisionTreeRegressor.py ├── 4.4 MultiOutput_DTR.py ├── 4.5 Iris_RandomForest_Enum.py ├── README.md ├── figures │ ├── DT.png │ ├── DTR.png │ ├── DT_pair.png │ ├── DT_reg.png │ ├── DT_reg_depth.png │ ├── RF_pair.png │ └── depth.png ├── iris.data ├── iris_tree.dot └── principle.pdf ├── 5.Boost ├── 5.1 xgBoost_Intro.py ├── 5.2 xgBoost_Predict.py ├── 5.3 xgBoost_Wine.py ├── 5.4 xgBoost_ReadData.py ├── 5.5 Titanic.py ├── 5.6 Bagging_intro.py ├── New_Data.csv ├── README.md ├── Titanic.test.csv ├── Titanic.train.csv ├── Titanic.train_Prime.csv ├── Titannic_Meta.txt ├── agaricus.txt ├── agaricus_test.txt ├── agaricus_train.txt ├── figures │ └── bagging.png ├── iris.data ├── principle.pdf ├── wine.data └── wine_names ├── 6.SVM ├── 6.1 SVM_intro.py ├── 6.2 SVM_draw.py ├── 6.3 ClassifierIndex.py ├── 6.4 unBalance.py ├── 6.5 HandWrittenDigits.py ├── 6.6 SVR.py ├── 6.7 CV.py ├── HandWritten │ ├── 0.png │ ├── 1.png │ ├── 10.png │ ├── 11.png │ ├── 12.png │ ├── 13.png │ ├── 14.png │ ├── 15.png │ ├── 2.png │ ├── 3.png │ ├── 4.png │ ├── 5.png │ ├── 6.png │ ├── 7.png │ ├── 8.png │ └── 9.png ├── README.md ├── bipartition.txt ├── bipartition2.txt ├── figures │ ├── CV.png │ ├── SVM_parameter.png │ ├── hand_writing.png │ ├── hand_wrong.png │ ├── intro.png │ ├── kernal.png │ └── unbalance.png ├── iris.data ├── optdigits.names ├── optdigits.tes ├── optdigits.tra └── principle.pdf ├── 7.Cluster ├── 7.1 kMeans.py ├── 7.2 criteria.py ├── 7.3 VectorQuantization.py ├── 7.4 AP.py ├── 7.5 MeanShift.py ├── 7.6 DBSCAN.py ├── 7.7 SC.py ├── 7.8 SpectralClusterImage.py ├── Chrome.png ├── Lena.png ├── README.md ├── figures │ ├── 2D.png │ ├── 3D.png │ ├── AP.png │ ├── DBSCAN.png │ ├── SC.png │ ├── cluster.png │ ├── mean.png │ ├── spectral.png │ └── target.png └── principle.pdf ├── 8.EM Model ├── 8.1 EM.py ├── 8.2 GMM.py ├── 8.3 GMM_Parameter.py ├── 8.4 GMM_Iris.py ├── 8.5 DPGMM.py ├── 8.6 GMM_pdf.py ├── HeightWeight.csv ├── README.md ├── 
figures │ ├── DPGMM.png │ ├── EM.png │ ├── EM_para.png │ ├── EM_para_modi.png │ ├── GMM.png │ ├── GMM_sim.png │ └── iris.png ├── iris.data └── principle.pdf ├── 9.Bayes Network ├── 9.1 Iris_GaussianNB.py ├── 9.2 MultinomialNB_intro.py ├── 9.3 text_classification.py ├── README.md ├── figures │ ├── iris.png │ └── text.png ├── iris.data └── principle.pdf ├── Figures ├── 10-3-1.png ├── 10-3-2.png ├── 11-5.png ├── 3-1-1.png ├── 3-1-2.png ├── 3-1-3.png ├── 3-2.png ├── 3-3.png ├── 3-4.png ├── 4-1-1.png ├── 4-1-2.png ├── 4-2.png ├── 4-3-1.png ├── 4-3-2.png ├── 4-4.png ├── 4-5.png ├── 5-6.png ├── 6-1.png ├── 6-2.png ├── 6-4.png ├── 6-5-1.png ├── 6-5-2.png ├── 6-6.png ├── 6-7.png ├── 7-1.png ├── 7-3-1.png ├── 7-3-2.png ├── 7-4.png ├── 7-5.png ├── 7-6.png ├── 7-7.png ├── 8-1.png ├── 8-2.png ├── 8-3-1.png ├── 8-3-2.png ├── 8-4.png ├── 8-5.png ├── 8-6.png ├── 9-1.png └── 9-3.png ├── LittleElephant.iml └── README.md /.idea/encodings.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /.idea/libraries/R_User_Library.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 6 | 7 | -------------------------------------------------------------------------------- /.idea/modules.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /.idea/other.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 7 | -------------------------------------------------------------------------------- /.idea/vcs.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /1.Python Foundation/1.2 calc_e.py: -------------------------------------------------------------------------------- 1 | import math 2 | import matplotlib as mpl 3 | import matplotlib.pyplot as plt 4 | import numpy as np 5 | 6 | 7 | def calc_e_small(x): 8 | n = 10 9 | f = np.arange(1, n + 1).cumprod() 10 | b = np.array([x] * n).cumprod() 11 | return np.sum(b / f) + 1 12 | 13 | 14 | def calc_e(x): 15 | reverse = False 16 | if x < 0: # 处理负数 17 | x = -x 18 | reverse = True 19 | ln2 = 0.69314718055994530941723212145818 20 | c = x / ln2 21 | a = int(c + 0.5) 22 | b = x - a * ln2 23 | y = (2 ** a) * calc_e_small(b) 24 | if reverse: 25 | return 1 / y 26 | return y 27 | 28 | 29 | if __name__ == "__main__": 30 | t1 = np.linspace(-2, 0, 10, endpoint=False) 31 | t2 = np.linspace(0, 2, 20) 32 | t = np.concatenate((t1, t2)) 33 | print(t) # 横轴数据 34 | y = np.empty_like(t) 35 | for i, x in enumerate(t): 36 | y[i] = calc_e(x) 37 | print('e^', x, ' = ', y[i], '(近似值)\t', math.exp(x), '(真实值)') 38 | # print('误差:', y[i] - math.exp(x)) 39 | mpl.rcParams['font.sans-serif'] = [u'SimHei'] 40 | mpl.rcParams['axes.unicode_minus'] = False 41 | plt.plot(t, y, 'r-', t, y, 'go', linewidth=2) 42 | plt.title(u'Taylor展式的应用', fontsize=18) 43 | plt.xlabel('X', fontsize=15) 44 | plt.ylabel('exp(X)', fontsize=15) 45 | plt.grid(True) 46 | plt.show() 47 | 
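The range reduction above is what makes a 10-term series sufficient: calc_e rewrites e^x as 2^a * e^b with b = x - a*ln2, so |b| <= ln2/2 ≈ 0.347 and the truncation error of calc_e_small is on the order of b^11/11! ≈ 1e-13. A minimal sketch of a numerical check, assuming the calc_e function defined above is in scope (the grid and tolerance are illustrative choices, not from the repo):

# Quick verification of the range-reduction idea against math.exp.
import math
import numpy as np

xs = np.linspace(-10, 10, 201)
rel_err = max(abs(calc_e(x) - math.exp(x)) / math.exp(x) for x in xs)
print('max relative error on [-10, 10]: %.3e' % rel_err)
assert rel_err < 1e-10  # with |b| <= ln2/2, ten series terms are ample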
-------------------------------------------------------------------------------- /1.Python Foundation/1.3 class_intro.py: -------------------------------------------------------------------------------- 1 | class People: 2 | def __init__(self, n, a, s): 3 | self.name = n 4 | self.age = a 5 | self.__score = s 6 | self.print_people() 7 | # self.__print_people() # 私有函数的作用 8 | 9 | def print_people(self): 10 | str = u'%s的年龄:%d,成绩为:%.2f' % (self.name, self.age, self.__score) 11 | print(str) 12 | 13 | __print_people = print_people 14 | 15 | 16 | class Student(People): 17 | def __init__(self, n, a, w): 18 | People.__init__(self, n, a, w) 19 | self.name = 'Student ' + self.name 20 | 21 | def print_people(self): 22 | str = u'%s的年龄:%d' % (self.name, self.age) 23 | print(str) 24 | 25 | 26 | def func(p): 27 | p.age = 11 28 | 29 | 30 | if __name__ == '__main__': 31 | p = People('Tom', 10, 3.14159) 32 | func(p) # p传入的是引用类型 33 | p.print_people() 34 | 35 | # 注意分析下面语句的打印结果,是否觉得有些“怪异”? 36 | j = Student('Jerry', 12, 2.71828) 37 | 38 | # 成员函数 39 | p.print_people() 40 | j.print_people() 41 | 42 | People.print_people(p) 43 | People.print_people(j) 44 | 45 | ''' 46 | Tom的年龄:10,成绩为:3.14 47 | Tom的年龄:11,成绩为:3.14 48 | Jerry的年龄:12 49 | Tom的年龄:11,成绩为:3.14 50 | Student Jerry的年龄:12 51 | Tom的年龄:11,成绩为:3.14 52 | Student Jerry的年龄:12,成绩为:2.72 53 | ''' -------------------------------------------------------------------------------- /1.Python Foundation/1.4 stat.py: -------------------------------------------------------------------------------- 1 | import math 2 | import matplotlib as mpl 3 | import matplotlib.pyplot as plt 4 | import numpy as np 5 | from matplotlib import cm 6 | from scipy import stats 7 | 8 | 9 | def calc_statistics(x): 10 | n = x.shape[0] # 样本个数 11 | # 手动计算 12 | m = 0 13 | m2 = 0 14 | m3 = 0 15 | m4 = 0 16 | for t in x: 17 | m += t 18 | m2 += t * t 19 | m3 += t ** 3 20 | m4 += t ** 4 21 | m /= n 22 | m2 /= n 23 | m3 /= n 24 | m4 /= n 25 | 26 | mu = m 27 | sigma = np.sqrt(m2 - mu * mu) 28 | skew = (m3 - 3 * mu * m2 + 2 * mu ** 3) / sigma ** 3 29 | kurtosis = (m4 - 4 * mu * m3 + 6 * mu * mu * m2 - 4 * mu ** 3 * mu + mu ** 4) / sigma ** 4 - 3 30 | print('手动计算均值、标准差、偏度、峰度:', mu, sigma, skew, kurtosis) 31 | 32 | # 使用系统函数验证 33 | mu = np.mean(x, axis=0) 34 | sigma = np.std(x, axis=0) 35 | skew = stats.skew(x) 36 | kurtosis = stats.kurtosis(x) 37 | return mu, sigma, skew, kurtosis 38 | 39 | 40 | if __name__ == '__main__': 41 | d = np.random.randn(100000) 42 | print(d) 43 | mu, sigma, skew, kurtosis = calc_statistics(d) 44 | print('函数库计算均值、标准差、偏度、峰度:', mu, sigma, skew, kurtosis) 45 | # 一维直方图 46 | mpl.rcParams[u'font.sans-serif'] = 'SimHei' 47 | mpl.rcParams[u'axes.unicode_minus'] = False 48 | y1, x1, dummy = plt.hist(d, bins=50, normed=True, color='g', alpha=0.75) 49 | t = np.arange(x1.min(), x1.max(), 0.05) 50 | y = np.exp(-t ** 2 / 2) / math.sqrt(2 * math.pi) 51 | plt.plot(t, y, 'r-', lw=2) 52 | plt.title(u'高斯分布,样本个数:%d' % d.shape[0]) 53 | plt.grid(True) 54 | plt.show() 55 | 56 | d = np.random.randn(100000, 2) 57 | mu, sigma, skew, kurtosis = calc_statistics(d) 58 | print('函数库计算均值、标准差、偏度、峰度:', mu, sigma, skew, kurtosis) 59 | # 二维图像 60 | N = 30 61 | density, edges = np.histogramdd(d, bins=[N, N]) 62 | print('样本总数:', np.sum(density)) 63 | density /= density.max() 64 | x = y = np.arange(N) 65 | t = np.meshgrid(x, y) 66 | fig = plt.figure(facecolor='w') 67 | ax = fig.add_subplot(111, projection='3d') 68 | ax.scatter(t[0], t[1], density, c='r', s=15 * density, marker='o', depthshade=True) 69 | ax.plot_surface(t[0], 
t[1], density, cmap=cm.Accent, rstride=2, cstride=2, alpha=0.9, lw=0.75) 70 | ax.set_xlabel(u'X') 71 | ax.set_ylabel(u'Y') 72 | ax.set_zlabel(u'Z') 73 | plt.title(u'二元高斯分布,样本个数:%d' % d.shape[0], fontsize=20) 74 | plt.tight_layout(0.1) 75 | plt.show() 76 | -------------------------------------------------------------------------------- /1.Python Foundation/README.md: -------------------------------------------------------------------------------- 1 | ## Python Foundation 2 | ## (Python基础复习) 3 | 4 | ### 项目背景 5 | >该项目是整合一些Python将在接下来机器学习项目实践的基础内容,主要是针对一些绘图函数库的使用实践,以及一些分布图及曲线手动绘制方法,方便接下来使用机器学习算法实现一些数据分析后进行可视化操作。 6 | 7 | ### 项目简介 8 | |名称|简介| 9 | |:-------------|:-------------:| 10 | |1.1 intro|Python3.6的一些基础语言用法和知识点复习| 11 | |1.2 calc_e|手动实现Taylor展开效果| 12 | |1.3 class_intro|实践Python类函数继承特性| 13 | |1.4 stat|Matplotlib绘制图像实践| 14 | 15 | ### 效果图 16 | #### ·Taylor展开 17 | 18 | 19 | #### ·手动堆积高斯分布 20 | 21 | 22 | -------------------------------------------------------------------------------- /1.Python Foundation/figures/Taylor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/littleheap/MachineLearning-Algorithms/28f6eb352b58c58a8b80603b44d998b28e1f3434/1.Python Foundation/figures/Taylor.png -------------------------------------------------------------------------------- /1.Python Foundation/figures/distribution.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/littleheap/MachineLearning-Algorithms/28f6eb352b58c58a8b80603b44d998b28e1f3434/1.Python Foundation/figures/distribution.png -------------------------------------------------------------------------------- /1.Python Foundation/guideline.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/littleheap/MachineLearning-Algorithms/28f6eb352b58c58a8b80603b44d998b28e1f3434/1.Python Foundation/guideline.pdf -------------------------------------------------------------------------------- /10.LDA Topic Model/10.1 LDA_intro.py: -------------------------------------------------------------------------------- 1 | from pprint import pprint 2 | from gensim import corpora, models, similarities 3 | 4 | # 配置输出结果 5 | # import logging 6 | # logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO) 7 | 8 | 9 | if __name__ == '__main__': 10 | 11 | # 9个英文文档数据,每一行即一个文档 12 | f = open('LDA_test.txt') 13 | # 创造停止词 14 | stop_list = set('for a of the and to in'.split()) 15 | 16 | # 读取每一行文本数据,去掉两边空格,不考虑停止词,输出原文本分词 17 | # texts = [line.strip().split() for line in f] 18 | # pprint(texts) 19 | 20 | # 读取每一行文本数据,去掉两边空格,转小写,分开,考虑停止词,输出文本分词(除去停止词) 21 | texts = [[word for word in line.strip().lower().split() if word not in stop_list] for line in f] 22 | print('Text = ') 23 | pprint(texts) 24 | 25 | # 生成词典 26 | dictionary = corpora.Dictionary(texts) 27 | # 获取词典中词的个数 28 | V = len(dictionary) 29 | # 实际分词数据转换成词典向量语料库模型 30 | corpus = [dictionary.doc2bow(text) for text in texts] 31 | # 建立TF-IDF预料模型,此模型对原始词典向量语料加权处理了 32 | corpus_tfidf = models.TfidfModel(corpus)[corpus] 33 | 34 | # 输出原始词典向量语料库,非稀疏矩阵 35 | print('Initial Vector Data:') 36 | for c in corpus: 37 | print(c) 38 | 39 | # 输出TF-IDF 40 | print('TF-IDF:') 41 | for c in corpus_tfidf: 42 | print(c) 43 | 44 | # LSI模型(即LSA隐语义分析模型) 45 | print('\n---------------LSI Model---------------\n') 46 | 47 | # 设置语料库为TF-IDF,主题数为2,词典为dictionary 48 | lsi = models.LsiModel(corpus_tfidf, num_topics=2, 
id2word=dictionary) 49 | 50 | # 遍历获取所有文档语料的主题 51 | topic_result = [a for a in lsi[corpus_tfidf]] 52 | 53 | print('LSI Topics Result 文档的主题分布:') 54 | pprint(topic_result) 55 | 56 | print('LSI Topics Content 主题下的词分布(取前5个词相关度):') 57 | pprint(lsi.print_topics(num_topics=2, num_words=5)) 58 | 59 | # 根据主题计算文档间的相似度 60 | similarity = similarities.MatrixSimilarity(lsi[corpus_tfidf]) 61 | print('LSI Similarity:') 62 | pprint(list(similarity)) 63 | 64 | # LDA模型 65 | print('\n---------------LDA Model---------------:') 66 | 67 | # 指定主题个数 68 | num_topics = 2 69 | lda = models.LdaModel(corpus_tfidf, num_topics=num_topics, id2word=dictionary, 70 | alpha='auto', eta='auto', minimum_probability=0.001) # 主题小于0.001则忽略 71 | 72 | # 模型得到后,文档放进去,返回文档对应的主题 73 | doc_topic = [doc_t for doc_t in lda[corpus_tfidf]] 74 | print('LDA Topics Result 文档的主题分布:') 75 | pprint(doc_topic) 76 | 77 | # for doc_topic in lda.get_document_topics(corpus_tfidf): 78 | # print(doc_topic) 79 | 80 | # 显示主题内部的词分布,即相关度 81 | for topic_id in range(num_topics): 82 | print('LDA Topics Content 主题下的词分布:', topic_id) 83 | # pprint(lda.get_topic_terms(topicid=topic_id)) 84 | pprint(lda.show_topic(topic_id)) 85 | 86 | # 根据主题计算文档间的相似度 87 | similarity = similarities.MatrixSimilarity(lda[corpus_tfidf]) 88 | print('LDA Similarity:') 89 | pprint(list(similarity)) 90 | 91 | # HDA模型 92 | print('\n---------------HDA Model---------------:') 93 | 94 | hda = models.HdpModel(corpus_tfidf, id2word=dictionary) 95 | 96 | # 获取HDA分析的每个文本的主题分布 97 | topic_result = [a for a in hda[corpus_tfidf]] 98 | 99 | print('HDA Topics Result 文档的主题分布:') 100 | pprint(topic_result) 101 | 102 | print('HDA Topics Content 主题下的词分布(取前5个词相关度):') 103 | print(hda.print_topics(num_topics=2, num_words=5)) 104 | -------------------------------------------------------------------------------- /10.LDA Topic Model/10.2 netease_news.py: -------------------------------------------------------------------------------- 1 | import time 2 | import numpy as np 3 | from gensim import corpora, models 4 | 5 | 6 | # import logging 7 | # logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO) 8 | 9 | 10 | # 停止词加载函数 11 | def load_stopword(): 12 | f_stop = open('stopword.txt') 13 | sw = [line.strip() for line in f_stop] 14 | f_stop.close() 15 | return sw 16 | 17 | 18 | if __name__ == '__main__': 19 | print('初始化停止词列表 --') 20 | 21 | # 记录开始时间 22 | t_start = time.time() 23 | 24 | # 获取停止词 25 | stop_words = load_stopword() 26 | 27 | print('开始读入语料数据 -- ') 28 | f = open('news.dat', encoding='utf-8') # LDA_test.txt 29 | 30 | # 使用停止词分割 31 | texts = [[word for word in line.strip().lower().split() if word not in stop_words] for line in f] 32 | # texts = [line.strip().split() for line in f] 33 | print('读入语料数据完成,用时%.3f秒' % (time.time() - t_start)) 34 | 35 | # 关闭文件流 36 | f.close() 37 | 38 | # 获取文本数量 39 | M = len(texts) 40 | print('文本数目:%d个' % M) 41 | # pprint(texts) 42 | 43 | print('正在建立词典 --') 44 | dictionary = corpora.Dictionary(texts) 45 | 46 | # 获取词典长度 47 | V = len(dictionary) 48 | print('词典中词的个数:', V) 49 | 50 | # 计算文本向量 51 | print('正在计算文本向量 --') 52 | corpus = [dictionary.doc2bow(text) for text in texts] 53 | 54 | # 计算TF-IDF 55 | print('正在计算文档TF-IDF --') 56 | t_start = time.time() 57 | corpus_tfidf = models.TfidfModel(corpus)[corpus] # 喂养数据 58 | print('建立文档TF-IDF完成,用时%.3f秒' % (time.time() - t_start)) 59 | 60 | print('LDA模型拟合推断 --') 61 | # 设置主题数目 62 | num_topics = 10 63 | 64 | t_start = time.time() 65 | 66 | # 构建LDA模型 67 | lda = models.LdaModel(corpus_tfidf, num_topics=num_topics, 
id2word=dictionary, 68 | alpha=0.01, eta=0.01, minimum_probability=0.001, 69 | update_every=1, chunksize=100, passes=1) 70 | print('LDA模型完成,训练时间为\t%.3f秒' % (time.time() - t_start)) 71 | 72 | # 所有文档的主题分布 73 | # doc_topic = [a for a in lda[corpus_tfidf]] 74 | # print('Document-Topic:\n') 75 | # pprint(doc_topic) 76 | 77 | # 随机打印某10个文档的主题 78 | num_show_topic = 10 # 每个文档显示前几个主题 79 | print('10个文档的主题分布:') 80 | # 所有文档的主题分布 81 | doc_topics = lda.get_document_topics(corpus_tfidf) 82 | # 0~(M-1)的数组 83 | idx = np.arange(M) 84 | # 乱序 85 | np.random.shuffle(idx) 86 | # 取前十个不重复数字 87 | idx = idx[:10] 88 | for i in idx: 89 | # i号文档的主题分布 90 | topic = np.array(doc_topics[i]) 91 | # 只获取当前主题分布的分布概率数据,即第二列,第一列为0~9序号忽略 92 | topic_distribute = np.array(topic[:, 1]) 93 | # print(topic_distribute) 94 | # 主题分布排序 95 | topic_idx = topic_distribute.argsort()[:-num_show_topic - 1:-1] 96 | print(('第%d个文档的前%d个主题:' % (i, num_show_topic)), topic_idx) 97 | print(topic_distribute[topic_idx]) 98 | 99 | num_show_term = 7 # 每个主题显示几个词 100 | print('每个主题的词分布:') 101 | for topic_id in range(num_topics): 102 | print('主题#%d:\t' % topic_id) 103 | # LDA第id号主题对应的词分布 104 | term_distribute_all = lda.get_topic_terms(topicid=topic_id) 105 | # 取前7个词 106 | term_distribute = term_distribute_all[:num_show_term] 107 | # 转换array形式 108 | term_distribute = np.array(term_distribute) 109 | term_id = term_distribute[:, 0].astype(np.int) 110 | print('词:\t', ) 111 | for t in term_id: 112 | # 从词典中取出对应的词显示 113 | print(dictionary.id2token[t], ) 114 | # print('\n概率:\t', term_distribute[:, 1]) 115 | -------------------------------------------------------------------------------- /10.LDA Topic Model/10.3 reuters.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import lda.datasets 3 | import matplotlib as mpl 4 | import matplotlib.pyplot as plt 5 | from pprint import pprint 6 | 7 | if __name__ == "__main__": 8 | # document-term matrix 9 | X = lda.datasets.load_reuters() 10 | print("type(X): {}".format(type(X))) 11 | print("shape: {}\n".format(X.shape)) 12 | print(X[:10, :10]) 13 | 14 | # the vocab 15 | vocab = lda.datasets.load_reuters_vocab() 16 | print("type(vocab): {}".format(type(vocab))) 17 | print("len(vocab): {}\n".format(len(vocab))) 18 | print(vocab[:10]) 19 | 20 | # titles for each story 21 | titles = lda.datasets.load_reuters_titles() 22 | print("type(titles): {}".format(type(titles))) 23 | print("len(titles): {}\n".format(len(titles))) 24 | pprint(titles[:10]) 25 | 26 | print('LDA start ----') 27 | topic_num = 20 28 | model = lda.LDA(n_topics=topic_num, n_iter=500, random_state=1) 29 | model.fit(X) 30 | 31 | # topic-word 32 | topic_word = model.topic_word_ 33 | print("type(topic_word): {}".format(type(topic_word))) 34 | print("shape: {}".format(topic_word.shape)) 35 | print(vocab[:5]) 36 | print(topic_word[:, :5]) 37 | 38 | # Print Topic distribution 39 | n = 7 40 | for i, topic_dist in enumerate(topic_word): 41 | topic_words = np.array(vocab)[np.argsort(topic_dist)][:-(n + 1):-1] 42 | print('*Topic {}\n- {}'.format(i, ' '.join(topic_words))) 43 | 44 | # Document - topic 45 | doc_topic = model.doc_topic_ 46 | print("type(doc_topic): {}".format(type(doc_topic))) 47 | print("shape: {}".format(doc_topic.shape)) 48 | for i in range(10): 49 | topic_most_pr = doc_topic[i].argmax() 50 | print(u"文档: {} 主题: {} value: {}".format(i, topic_most_pr, doc_topic[i][topic_most_pr])) 51 | 52 | mpl.rcParams['font.sans-serif'] = [u'SimHei'] 53 | mpl.rcParams['axes.unicode_minus'] = False 54 | 55 | # Topic - 
word 56 | plt.figure(figsize=(8, 9)) 57 | # f, ax = plt.subplots(5, 1, sharex=True) 58 | for i, k in enumerate([0, 5, 9, 14, 19]): 59 | ax = plt.subplot(5, 1, i + 1) 60 | ax.plot(topic_word[k, :], 'r-') 61 | ax.set_xlim(-50, 4350) # [0,4258] 62 | ax.set_ylim(0, 0.08) 63 | ax.set_ylabel(u"概率") 64 | ax.set_title(u"主题 {}".format(k)) 65 | plt.xlabel(u"词", fontsize=14) 66 | plt.tight_layout() 67 | plt.suptitle(u'主题的词分布', fontsize=18) 68 | plt.subplots_adjust(top=0.9) 69 | plt.show() 70 | 71 | # Document - Topic 72 | plt.figure(figsize=(8, 9)) 73 | # f, ax= plt.subplots(5, 1, figsize=(8, 6), sharex=True) 74 | for i, k in enumerate([1, 3, 4, 8, 9]): 75 | ax = plt.subplot(5, 1, i + 1) 76 | ax.stem(doc_topic[k, :], linefmt='g-', markerfmt='ro') 77 | ax.set_xlim(-1, topic_num + 1) 78 | ax.set_ylim(0, 1) 79 | ax.set_ylabel(u"概率") 80 | ax.set_title(u"文档 {}".format(k)) 81 | plt.xlabel(u"主题", fontsize=14) 82 | plt.suptitle(u'文档的主题分布', fontsize=18) 83 | plt.tight_layout() 84 | plt.subplots_adjust(top=0.9) 85 | plt.show() 86 | -------------------------------------------------------------------------------- /10.LDA Topic Model/LDA_test.txt: -------------------------------------------------------------------------------- 1 | Human machine interface for lab abc computer applications 2 | A survey of user opinion of computer system response time 3 | The EPS user interface management system 4 | System and human system engineering testing of EPS 5 | Relation of user perceived response time to error measurement 6 | The generation of random binary unordered trees 7 | The intersection graph of paths in trees 8 | Graph minors IV Widths of trees and well quasi ordering 9 | Graph minors A survey -------------------------------------------------------------------------------- /10.LDA Topic Model/README.md: -------------------------------------------------------------------------------- 1 | ## LDA Topic Model 2 | ## (LDA主题模型) 3 | 4 | ### 项目背景 5 | > LDA(Latent Dirichlet Allocation)是一种文档生成模型。它认为一篇文章是有多个主题的,而每个主题又对应着不同的词。一篇文章的构造过程,首先是以一定的概率选择某个主题,然后再在这个主题下以一定的概率选出某一个词,这样就生成了这篇文章的第一个词。不断重复这个过程,就生成了整片文章。当然这里假定词与词之间是没顺序的。LDA的使用是上述文档生成的逆过程,它将根据一篇得到的文章,去寻找出这篇文章的主题,以及这些主题对应的词。 6 | 7 | 8 | ### 项目简介 9 | |名称|简介| 10 | |:-------------|:-------------:| 11 | |10.1 LDA_intro|LDA主题模型基础应用| 12 | |10.2 netease_news|基于LDA主题模型的新闻材料主题提取| 13 | |10.3 reuters|文档-主题-词分布统计| 14 | 15 | 16 | ### 效果图 17 | #### ·文档-主题分布统计 18 | 19 | 20 | #### ·主题-词分布统计 21 | 22 | 23 | 24 | 25 | -------------------------------------------------------------------------------- /10.LDA Topic Model/figures/doc-topic.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/littleheap/MachineLearning-Algorithms/28f6eb352b58c58a8b80603b44d998b28e1f3434/10.LDA Topic Model/figures/doc-topic.png -------------------------------------------------------------------------------- /10.LDA Topic Model/figures/topic-word.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/littleheap/MachineLearning-Algorithms/28f6eb352b58c58a8b80603b44d998b28e1f3434/10.LDA Topic Model/figures/topic-word.png -------------------------------------------------------------------------------- /10.LDA Topic Model/principle.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/littleheap/MachineLearning-Algorithms/28f6eb352b58c58a8b80603b44d998b28e1f3434/10.LDA Topic Model/principle.pdf 
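Condensed from 10.1 and 10.2 above, the whole gensim pipeline is: tokenize -> Dictionary -> doc2bow -> TfidfModel -> LdaModel. A minimal end-to-end sketch, assuming gensim 3.x as the scripts above use and LDA_test.txt in the working directory (num_topics=2 follows 10.1):

# tokens -> Dictionary -> bag-of-words -> TF-IDF -> LDA, in a dozen lines.
from gensim import corpora, models

stop_list = set('for a of the and to in'.split())
with open('LDA_test.txt') as f:
    texts = [[w for w in line.strip().lower().split() if w not in stop_list] for line in f]

dictionary = corpora.Dictionary(texts)            # token <-> integer id mapping
corpus = [dictionary.doc2bow(t) for t in texts]   # sparse (id, count) vectors
corpus_tfidf = models.TfidfModel(corpus)[corpus]  # re-weight raw counts by TF-IDF

lda = models.LdaModel(corpus_tfidf, num_topics=2, id2word=dictionary)
for i, doc in enumerate(lda[corpus_tfidf]):       # per-document topic mixture
    print(i, doc)
for k in range(2):
    print(k, lda.show_topic(k, topn=5))           # top words per topic

The same three objects (dictionary, corpus, corpus_tfidf) are what 10.2 netease_news.py builds at scale before fitting its 10-topic model.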
-------------------------------------------------------------------------------- /10.LDA Topic Model/stopword.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/littleheap/MachineLearning-Algorithms/28f6eb352b58c58a8b80603b44d998b28e1f3434/10.LDA Topic Model/stopword.txt -------------------------------------------------------------------------------- /11.HMM/11.3 jieba_intro.py: -------------------------------------------------------------------------------- 1 | import jieba.posseg 2 | 3 | if __name__ == "__main__": 4 | 5 | f = open('.\\novel.txt', encoding='utf-8') 6 | str = f.read() 7 | f.close() 8 | 9 | seg = jieba.posseg.cut(str) 10 | for s in seg: 11 | # print(s.word, s.flag, '|', end='') 12 | print(s.word, '|', end='') 13 | -------------------------------------------------------------------------------- /11.HMM/11.4 GMHMM.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | import numpy as np 3 | import matplotlib as mpl 4 | from hmmlearn import hmm 5 | import matplotlib.pyplot as plt 6 | from sklearn.metrics.pairwise import pairwise_distances_argmin 7 | 8 | 9 | def expand(a, b): 10 | d = (b - a) * 0.05 11 | return a - d, b + d 12 | 13 | 14 | if __name__ == "__main__": 15 | warnings.filterwarnings("ignore") # hmmlearn(0.2.0) < sklearn(0.18) 16 | np.random.seed(0) 17 | 18 | n = 5 # 隐状态数目 19 | n_samples = 1000 20 | pi = np.random.rand(n) 21 | pi /= pi.sum() 22 | print('初始概率:', pi) # 长度为5的初始概率 23 | 24 | # 生成n*n的转换概率 25 | A = np.random.rand(n, n) 26 | mask = np.zeros((n, n), dtype=np.bool) 27 | # 特殊位置清0 28 | mask[0][1] = mask[0][4] = True 29 | mask[1][0] = mask[1][2] = True 30 | mask[2][1] = mask[2][3] = True 31 | mask[3][2] = mask[3][4] = True 32 | mask[4][0] = mask[4][3] = True 33 | A[mask] = 0 34 | for i in range(n): 35 | A[i] /= A[i].sum() 36 | print('转移概率:\n', A) 37 | 38 | # 生成5个均值 39 | means = np.array(((30, 30), (0, 50), (-25, 30), (-15, 0), (15, 0))) 40 | print('均值:\n', means) 41 | 42 | # 生成5个方差 43 | covars = np.empty((n, 2, 2)) 44 | for i in range(n): 45 | # covars[i] = np.diag(np.random.randint(1, 5, size=2)) 46 | covars[i] = np.diag(np.random.rand(2) + 0.001) * 10 # np.random.rand ∈[0,1) 47 | print('方差:\n', covars) 48 | 49 | # 建立模型 50 | model = hmm.GaussianHMM(n_components=n, covariance_type='full') 51 | model.startprob_ = pi 52 | model.transmat_ = A 53 | model.means_ = means 54 | model.covars_ = covars 55 | sample, labels = model.sample(n_samples=n_samples, random_state=0) 56 | 57 | # 估计参数 58 | model = hmm.GaussianHMM(n_components=n, covariance_type='full', n_iter=10) 59 | model = model.fit(sample) 60 | y = model.predict(sample) 61 | np.set_printoptions(suppress=True) 62 | print('##估计初始概率:\n', model.startprob_) 63 | print('##估计转移概率:\n', model.transmat_) 64 | print('##估计均值:\n', model.means_) 65 | print('##估计方差:\n', model.covars_) 66 | 67 | # 类别 68 | order = pairwise_distances_argmin(means, model.means_, metric='euclidean') 69 | print(order) 70 | pi_hat = model.startprob_[order] 71 | A_hat = model.transmat_[order] 72 | A_hat = A_hat[:, order] 73 | means_hat = model.means_[order] 74 | covars_hat = model.covars_[order] 75 | change = np.empty((n, n_samples), dtype=np.bool) 76 | for i in range(n): 77 | change[i] = y == order[i] 78 | for i in range(n): 79 | y[change[i]] = i 80 | print('估计初始概率:', pi_hat) 81 | print('估计转移概率:\n', A_hat) 82 | print('估计均值:\n', means_hat) 83 | print('估计方差:\n', covars_hat) 84 | print(labels) 85 | print(y) 86 | acc = np.mean(labels == 
y) * 100 87 | print('准确率:%.2f%%' % acc) 88 | 89 | mpl.rcParams['font.sans-serif'] = [u'SimHei'] 90 | mpl.rcParams['axes.unicode_minus'] = False 91 | plt.scatter(sample[:, 0], sample[:, 1], s=50, c=labels, cmap=plt.cm.Spectral, marker='o', 92 | label=u'观测值', linewidths=0.5, zorder=20) 93 | plt.plot(sample[:, 0], sample[:, 1], 'r-', zorder=10) 94 | plt.scatter(means[:, 0], means[:, 1], s=100, c=np.random.rand(n), marker='D', label=u'中心', alpha=0.8, zorder=30) 95 | x1_min, x1_max = sample[:, 0].min(), sample[:, 0].max() 96 | x2_min, x2_max = sample[:, 1].min(), sample[:, 1].max() 97 | x1_min, x1_max = expand(x1_min, x1_max) 98 | x2_min, x2_max = expand(x2_min, x2_max) 99 | plt.xlim((x1_min, x1_max)) 100 | plt.ylim((x2_min, x2_max)) 101 | plt.legend(loc='upper left') 102 | plt.grid(True) 103 | plt.show() 104 | -------------------------------------------------------------------------------- /11.HMM/11.5 Stock.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | import numpy as np 3 | import matplotlib as mpl 4 | from hmmlearn import hmm 5 | import matplotlib.pyplot as plt 6 | 7 | 8 | def expand(a, b): 9 | d = (b - a) * 0.05 10 | return a - d, b + d 11 | 12 | 13 | if __name__ == "__main__": 14 | warnings.filterwarnings("ignore") # hmmlearn(0.2.0) < sklearn(0.18) 15 | 16 | # 0日期 1开盘 2最高 3最低 4收盘 5成交量 6成交额 17 | x = np.loadtxt('SH600000.txt', delimiter='\t', skiprows=2, usecols=(4, 5, 6, 2, 3)) 18 | close_price = x[:, 0] 19 | volumn = x[:, 1] 20 | amount = x[:, 2] 21 | amplitude_price = x[:, 3] - x[:, 4] # 每天的最高价与最低价的差 22 | diff_price = np.diff(close_price) # 涨跌值 23 | # 后者-前者,扔掉第一列 24 | volumn = volumn[1:] # 成交量 25 | amount = amount[1:] # 成交额 26 | amplitude_price = amplitude_price[1:] # 每日振幅 27 | sample = np.column_stack((diff_price, volumn, amount, amplitude_price)) # 观测值 28 | n = 5 29 | model = hmm.GaussianHMM(n_components=n, covariance_type='full') 30 | model.fit(sample) 31 | y = model.predict_proba(sample) 32 | np.set_printoptions(suppress=True) 33 | print(y) 34 | 35 | t = np.arange(len(diff_price)) 36 | mpl.rcParams['font.sans-serif'] = [u'SimHei'] 37 | mpl.rcParams['axes.unicode_minus'] = False 38 | plt.figure(figsize=(10, 8), facecolor='w') 39 | plt.subplot(421) 40 | plt.plot(t, diff_price, 'r-') 41 | plt.grid(True) 42 | plt.title(u'涨跌幅') 43 | plt.subplot(422) 44 | plt.plot(t, volumn, 'g-') 45 | plt.grid(True) 46 | plt.title(u'交易量') 47 | 48 | clrs = plt.cm.terrain(np.linspace(0, 0.8, n)) 49 | plt.subplot(423) 50 | for i, clr in enumerate(clrs): 51 | plt.plot(t, y[:, i], '-', color=clr, alpha=0.7) 52 | plt.title(u'所有组分') 53 | plt.grid(True) 54 | for i, clr in enumerate(clrs): 55 | axes = plt.subplot(4, 2, i + 4) 56 | plt.plot(t, y[:, i], '-', color=clr) 57 | plt.title(u'组分%d' % (i + 1)) 58 | plt.grid(True) 59 | plt.suptitle(u'SH600000股票:GaussianHMM分解隐变量', fontsize=18) 60 | plt.tight_layout() 61 | plt.subplots_adjust(top=0.9) 62 | plt.show() 63 | -------------------------------------------------------------------------------- /11.HMM/A.txt: -------------------------------------------------------------------------------- 1 | -2147483648.0 -1.9188474846212102 -0.158733194226798 -2147483648.0 2 | -2147483648.0 -1.0622669534219753 -0.42414545547419813 -2147483648.0 3 | -0.7204778074205844 -2147483648.0 -2147483648.0 -0.6665436878868185 4 | -0.5574137052287327 -2147483648.0 -2147483648.0 -0.8502415341754279 5 | -------------------------------------------------------------------------------- /11.HMM/MyBook.txt: 
-------------------------------------------------------------------------------- 1 | 前言 2 | 数据,数据,数据!想必在新闻、报刊、白皮书、电视等媒介的持续冲击下,人们无法摆脱大数据和数据科学的洗礼。现实需求推动了对数据的学习、分析和应用,这些数据来自于社交媒体、智能手机、硬件设备(亦称作“物联网”)、传感器等任何可以产生数据的设备。 3 | 大多数数据挖掘的宣传着重于数据规模和处理速度上。数据洪水(data flood)的预言告诉人们我们无法实时处理这些数据,硬件推销人员会进一步卖给我们需要的服务,以期能够满足处理速度的要求。从某种程度上来说,他们是对的,但是我们值得停下来思考片刻,并对手边的任务进行适当的再认识。 4 | 近年来,数据挖掘和机器学习在我们周围持续火爆,各种媒体也不断推送着海量的数据。仔细观察就能发现,实际应用中的那些机器学习算法与多年前并没有什么两样;它们只是在应用的数据规模上有些不同。历数一下产生数据的组织,至少在我看来,数目其实并不多。无非是Google、Facebook、Twitter、NetFlix以及其他为数不多的机构在使用若干学习算法和工具,这些算法和工具使得他们能够对数据进行测试分析。那么,真正的问题是:“对于其他人,大数据框架下的算法和工具的作用是什么呢?” 5 | 我承认本书将多次提及大数据和机器学习之间的关系,这是我无法忽视的一个客观问题;但是它只是一个很小的因素,终极目标是如何利用可用数据获取数据的本质内涵。请记住我是在探讨工具,关键点是选择哪个工具来胜任我们尝试完成的工作。迫于技术上的压力,可能会选择Hadoop,但是Hadoop并不一定总是完成任务最好的选择。 6 | 7 | 本书目的 8 | 本书是关于机器学习而非大数据的,书中会介绍多种用于分析数据本质的技术。读完本书,您将掌握许多有用的机器学习方法的实际运用,并分析、解释算法实现中如此组织代码的原因。本书针对具体实际问题选择哪种恰当的方法会提供推荐的建议。 9 | 本书没有固定的阅读顺序。您可以从头读到尾,或者选择性阅读您需要的内容。 10 | 11 | “实践”意味着亲自动手 12 | 过去的几年里,我读过的许多关于机器学习的书都非常重视理论,这并不是什么坏事。如果您正在看一本使用复杂公式、深入讲解数学理论的书,我要为您的严谨喝彩。对于我而言,我更加关注使用何种机器学习方法及其工程实践。我的信条很简单: 13 |  在头脑中思考一个问题; 14 |  找到我需要学习的理论; 15 |  找到和我要学习的理论最相关的例子; 16 |  在我的实践工程中让它们发挥作用。 17 | 作为软件开发人员,我个人喜欢看很多示例。作为一名老师,我喜欢尽可能多的亲自实现它并且尽可能简单的将内容传授给我的学生:大概过程是指出关键点,在IDE环境中敲完正确的代码,然后让它产生作用。这是令人振奋并且很有成就感的事情,并且我在本书中也希望传达这样的思想。 18 | 每个人都有自己的学习方式。我相信本书涵盖了大多数的通用方法,所以,相信每个人都能从中获益。 19 | 20 | “如何对待数学?” 21 | 就像论您最喜欢哪支足球队?吉他手吉米•佩奇(Jimmy Page)和杰夫•贝克(Jeff Beck)哪个更帅气(嘿,本人更喜欢贝克)?有些话题是没有标准答案的。比如这个问题:在开始做机器学习之前,我需要知道多少数学知识? 22 | 做机器学习和学习机器学习的理论是两个不同的概念。为了学习理论,显然需要一个好的数学背景。本书讨论机器学习的实践方法。现在已经有了很多可被开发者利用的机器学习工具,当前的重点不是为什么这些工具有用,而是如何让这些工具为我所用。前人已经完成了艰难的工作,他们值得我们给予尊重和掌声。 23 | 24 | “但是您需要一个博士!” 25 | 同行的一些言语或许会葬送您的研究之路。在您开始做数据分析或者敢于声称自己是“数据科学家”的时候,会长期充斥着关于您应该具备什么样的知识层次这样的争论(我将马上剖析“数据科学家”这个术语)。个人而言,我相信如果您能够花费多年去完成一个学位,追求硕士学位然后是博士学位,您应该会在那条路上感到自由。我对待事情更加注重实用性,并且喜欢边阅读资料边动手实践。 26 | 学术环境很关键;现在有大量的在线课程、论文、网站以及关于数学、统计、数据挖掘方面的书籍,足够让您跟上最新最热的思想。我从中获取了很多资源。 27 | 但是,对于我,最直接的莫过于甩开袖子,获得书籍,尝试一些方法,并且检验它们的结果。如果您想重温一下线性回归的理论,我再次向您保证有很多资料可以阅读,本书也将涉及它们。 28 | 最后,大家能够成为“数据科学家”吗?或许本书更有可能的结果是:带给大家更多的机器学习实践技能。我将在第二章再次讨论这个话题。 29 | 所以,当办公室的其他人还在争论是否需要在项目组中增加几个博士的时候,您已经开始用代码实现一个决策树并且检验它是否切实可行了。 30 | 31 | 最终您会学到什么? 
32 | 假定您将本书从头读到尾,您将学到机器学习的常用策略、机器学习的不同方法、以及如何将它们应用在实时和批处理的环境下。 33 | 或者,您也可以直接参考您需要的某一个章节。本书各章节之间联系并不紧密,每一章的内容和实例和其他章节并无太多相关性。 34 | 本书的目的是以实例的方式讲述通用的机器学习概念。在各种工具和软件库中选择适合您的那个,借助已有工具的方式能够让您快速学习掌握相关知识,同时对于深刻分析、理解当前数据不会有太大影响。 35 | 36 | 理论和实践学习的平衡 37 | 现在已经有很多关于机器学习和数据挖掘的书籍,难点在于如何找到理论和实践的平衡点。当筹划本书时,我会将重点放在实际和易用的例子上,提供逐步的指导,并且让您能看到如何把这些技术组合在一起。 38 | 我并没有说要轻视理论,恰好相反,理论是极其重要的。清楚您想学什么,或者更重要的是您想怎么学,将决定您如何阅读本书。 39 | 前两章介绍机器学习和数据挖掘的定义、实践中所使用的相关工具以及它们产生的效果,为接下来真正的机器学习算法做准备。主要章节(3到8章)聚焦于不同类型的机器学习理论上,类似游戏攻略,每章将给出代码片段的解释,再配合其他方法,保证您能够从中获得需要的知识。 40 | 最后,您将对比实时和批处理两种环境下的方法,并且考察如何将它们整合在一个大的算法框架中;您还将了解到Apache Spark和R语言,它是统计的根本语言。 41 | 42 | 章节概述 43 | 第1章阐述“什么是机器学习”,考察机器学习的定义和用途,并且指出您将遇到的算法方面的挑战。同时还讨论人类自身的“机器学习”过程,探讨下人类建立的模型是如何在将来产生作用的。 44 | 在任何实际编程之前,都应该做好计划。第2章“为机器学习做好计划”的重点就是完成这一步。计划包括一组分析和展示方法、数据处理过程、确定存储空间、数据私有性、数据质量和清洗(data cleaning);须知,没有任何一种方法能够胜任所有任务。第2章将通过一些简单的Linux命令帮助您整理需要处理的数据。 45 | 决策树是机器学习中的常用方法。在模型中使用观测结果或者样本标签以及众多的输入数据(信号、特征),机器学习的算法就能够预测新数据的可能结果。第3章考察决策树的设计,并使用Weka实现一个决策树实例。 46 | 贝叶斯网络表示了一组随机变量的条件独立性。第4章将构造一个简单的例子,展示贝叶斯网络是如何工作的,并且提供一些可用的代码。 47 | 受生物领域中枢神经系统工作方法的启发,神经网络模型被应用在深度学习系统中。第5章将考察机器学习的这一分支是如何起作用,并且阐述把输入样本传递给网络结构的具体过程。 48 | 如果您关注购物篮分析(basket analysis),您将会喜欢第六章涉及的关联规则学习,从而找到大数据集之间的联系。本章将细致的考察Apriori算法以及它是如何应用在当前的超市行业的。 49 | 支持向量机(SVM)是一个监督学习方法,用于分析数据和模式识别。第7章将给出文本分类等实例,展示SVM是如何工作的。 50 | 第8章讲解聚类——对象间的分组;例如,聚类能够在市场细分分析(Market Segmentation Analysis)的应用中工作的很好。在初始学习阶段,聚类算法是最好的逐步迭代减少分类误差的机器学习方法。 51 | 第9章和第10章是实例攻略。第9章的例子关注实时处理。您将使用Spring XD——它甚至可以称作“数据吸收引擎”(data ingesting engine),同时利用流式的推特应用程序接口(streaming Twitter API),只要有新的推特消息(Twitter),就把它们收集起来,从而准备好数据,在第10章做进一步处理。 52 | 第10章考察机器学习的批处理过程。使用第九章获得的数据,您可以搭建Hadoop的聚类模型并且运行多个作业。本章介绍使用Sqoop获得数据库中数据的常用方法,如何使用Mahout执行用户推荐, 以及如何使用Hadoop和Pig分析个性化用户数据。 53 | 第11章介绍机器学习舞台上的新成员。这一章考察Apache Spark,并且介绍Scala语言,它可以针对内存数据执行类似于SQL的查询语句。 54 | R语言是世界范围内统计工作的重要语言,第12章将介绍和分析R语言。您可以使用它来实现本书前面章节涉及的机器学习算法。 55 | 56 | 本书的源代码 57 | 本书每一章阐述的所有代码都保存在Github库上供大家下载调试。Github的地址是https://github.com/jasebell/mlbook。您也可以访问Wiley个人网站找到代码: www.wiley.com/go/machinelearning。 58 | 本书的例子是使用Java实现的,如果您想使用其他语言实现,可以搜索下Github网站,或许能找到很多有趣的例子。 59 | 代码已经按照章节分好了,代码包中每章都有各自的文件夹。如果某一部分需要额外的库,文件夹中会有一个README注释文件。 60 | 61 | 使用Git 62 | Git是一个在业界和开源软件社区广泛使用的版本控制系统。如果软件是团队开发,Git将非常重要。您可以使用它在代码的各个分支上工作,最后将分支合并在一起。 63 | 本书使用Git并不多,但是如果您希望使用本书的代码库,您需要克隆一份实例。 64 | 为了克隆本书的实例,可以使用下面的命令: 65 | $mkdir mlbookexamples 66 | $cd mlbookexamples 67 | $git clone https://github.com/jasebell/mlbook.git 68 | 您可以看到克隆进度,当它完成的时候,您能够改变文件夹,在新下载的目录中查阅代码实例。 69 | -------------------------------------------------------------------------------- /11.HMM/README.md: -------------------------------------------------------------------------------- 1 | ## Hidden Markov Model 2 | ## (隐马尔可夫模型) 3 | 4 | ### 项目背景 5 | > 隐马尔可夫模型(Hidden Markov Model,HMM)是统计模型,它用来描述一个含有隐含未知参数的马尔可夫过程。其难点是从可观察的参数中确定该过程的隐含参数。然后利用这些参数来作进一步的分析,例如模式识别。是在被建模的系统被认为是一个马尔可夫过程与未观测到的(隐藏的)的状态的统计马尔可夫模型。其实对于HMM来说,如果提前知道所有隐含状态之间的转换概率和所有隐含状态到所有可见状态之间的输出概率,做模拟是相当容易的。但是应用HMM模型时候呢,往往是缺失了一部分信息的,有时候你知道骰子有几种,每种骰子是什么,但是不知道掷出来的骰子序列;有时候你只是看到了很多次掷骰子的结果,剩下的什么都不知道。如果应用算法去估计这些缺失的信息,就成了一个很重要的问题。 6 | 7 | 8 | ### 项目简介 9 | |名称|简介| 10 | |:-------------|:-------------:| 11 | |11.1 TrainHMM|训练基本隐马尔可夫模型| 12 | |11.2 Segmentation|分词实战| 13 | |11.3 jieba_intro|jieba分词库基本引入| 14 | |11.4 GMHMM|高斯混合分布隐马尔可夫模型| 15 | |11.5 Stock|股票组分分析| 16 | 17 | 18 | ### 效果图 19 | #### ·股票组分分析 20 | 21 | 22 | 23 | 24 | 25 | -------------------------------------------------------------------------------- /11.HMM/SH600000.txt: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/littleheap/MachineLearning-Algorithms/28f6eb352b58c58a8b80603b44d998b28e1f3434/11.HMM/SH600000.txt -------------------------------------------------------------------------------- /11.HMM/figures/stock.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/littleheap/MachineLearning-Algorithms/28f6eb352b58c58a8b80603b44d998b28e1f3434/11.HMM/figures/stock.png -------------------------------------------------------------------------------- /11.HMM/pi.txt: -------------------------------------------------------------------------------- 1 | -1.138130892386048 -2.63283292153306 -1.138130892386048 -1.2472605882949992 2 | -------------------------------------------------------------------------------- /11.HMM/principle.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/littleheap/MachineLearning-Algorithms/28f6eb352b58c58a8b80603b44d998b28e1f3434/11.HMM/principle.pdf -------------------------------------------------------------------------------- /2.Application Foundation/2.1 Ensumble.py: -------------------------------------------------------------------------------- 1 | import operator 2 | from functools import reduce 3 | 4 | ''' 5 | 二分类多次迭代后准确率可以拉升 6 | ''' 7 | 8 | 9 | def c(n, k): 10 | return reduce(operator.mul, range(n - k + 1, n + 1)) / reduce(operator.mul, range(1, k + 1)) 11 | 12 | 13 | def bagging(n, p): 14 | s = 0 15 | for i in range(int(n / 2 + 1), n + 1): 16 | s += c(n, i) * p ** i * (1 - p) ** (n - i) 17 | return s 18 | 19 | 20 | if __name__ == "__main__": 21 | for t in range(10, 101, 10): 22 | print(t, '次采样正确率:', bagging(t, 0.6)) 23 | 24 | ''' 25 | 10 次采样正确率: 0.6331032576 26 | 20 次采样正确率: 0.7553372033163932 27 | 30 次采样正确率: 0.8246309464931707 28 | 40 次采样正确率: 0.8702342941780972 29 | 50 次采样正确率: 0.9021926358467504 30 | 60 次采样正确率: 0.9253763056485725 31 | 70 次采样正确率: 0.9425655385148007 32 | 80 次采样正确率: 0.9555029441181861 33 | 90 次采样正确率: 0.9653473393248491 34 | 100 次采样正确率: 0.972900802242991 35 | ''' -------------------------------------------------------------------------------- /2.Application Foundation/2.2 Convolve.py: -------------------------------------------------------------------------------- 1 | import matplotlib as mpl 2 | import matplotlib.pyplot as plt 3 | import numpy as np 4 | 5 | if __name__ == "__main__": 6 | stock_max, stock_min, stock_close, stock_amount = np.loadtxt('SH600000.txt', delimiter='\t', skiprows=2, 7 | usecols=(2, 3, 4, 5), unpack=True) 8 | N = 100 9 | stock_close = stock_close[:N] 10 | print(stock_close) 11 | 12 | n = 5 13 | weight = np.ones(n) 14 | weight /= weight.sum() 15 | print(weight) 16 | stock_sma = np.convolve(stock_close, weight, mode='valid') # simple moving average 17 | 18 | weight = np.linspace(1, 0, n) 19 | weight = np.exp(weight) 20 | weight /= weight.sum() 21 | print(weight) 22 | stock_ema = np.convolve(stock_close, weight, mode='valid') # exponential moving average 23 | 24 | t = np.arange(n - 1, N) 25 | poly = np.polyfit(t, stock_ema, 10) 26 | print(poly) 27 | stock_ema_hat = np.polyval(poly, t) 28 | 29 | mpl.rcParams['font.sans-serif'] = [u'SimHei'] 30 | mpl.rcParams['axes.unicode_minus'] = False 31 | plt.plot(np.arange(N), stock_close, 'ro-', linewidth=2, label=u'原始收盘价') 32 | t = np.arange(n - 1, N) 33 | plt.plot(t, stock_sma, 'b-', linewidth=2, label=u'简单移动平均线') 34 | plt.plot(t, stock_ema, 'g-', linewidth=2, label=u'指数移动平均线') 35 | plt.legend(loc='upper right') 36 | 
plt.grid(True)
    plt.show()

    plt.figure(figsize=(9, 6))
    plt.plot(np.arange(N), stock_close, 'r-', linewidth=1, label=u'原始收盘价')
    plt.plot(t, stock_ema, 'g-', linewidth=2, label=u'指数移动平均线')
    plt.plot(t, stock_ema_hat, 'm-', linewidth=3, label=u'指数移动平均线估计')
    plt.legend(loc='upper right')
    plt.grid(True)
    plt.show()
--------------------------------------------------------------------------------
/2.Application Foundation/2.3 Image_convolve.py:
--------------------------------------------------------------------------------
import os
import numpy as np
from PIL import Image
from matplotlib import pyplot as plt


def convolve(image, weight):
    # Naive 2D "convolution" (strictly, cross-correlation: the kernel is not
    # flipped): slide the kernel over the image and sum element-wise products.
    height, width = image.shape
    h, w = weight.shape
    height_new = height - h + 1
    width_new = width - w + 1
    image_new = np.zeros((height_new, width_new), dtype=float)  # np.float is deprecated; use float
    for i in range(height_new):
        for j in range(width_new):
            image_new[i, j] = np.sum(image[i:i + h, j:j + w] * weight)
    image_new = image_new.clip(0, 255)
    image_new = np.rint(image_new).astype('uint8')
    return image_new


# Alternative min-max normalization:
# image_new = 255 * (image_new - image_new.min()) / (image_new.max() - image_new.min())

if __name__ == "__main__":
    A = Image.open("son.png", 'r')
    output_path = './figures/'
    if not os.path.exists(output_path):
        os.mkdir(output_path)
    a = np.array(A)
    # Edge-detection kernels ("sobel" was misspelled "soble" in the original)
    sobel_x = np.array(([-1, 0, 1], [-2, 0, 2], [-1, 0, 1]))
    sobel_y = np.array(([-1, -2, -1], [0, 0, 0], [1, 2, 1]))
    sobel = np.array(([-1, -1, 0], [-1, 0, 1], [0, 1, 1]))
    prewitt_x = np.array(([-1, 0, 1], [-1, 0, 1], [-1, 0, 1]))
    prewitt_y = np.array(([-1, -1, -1], [0, 0, 0], [1, 1, 1]))
    prewitt = np.array(([-2, -1, 0], [-1, 0, 1], [0, 1, 2]))
    laplacian = np.array(([0, -1, 0], [-1, 4, -1], [0, -1, 0]))
    laplacian2 = np.array(([-1, -1, -1], [-1, 8, -1], [-1, -1, -1]))
    weight_list = ('sobel_x', 'sobel_y', 'sobel', 'prewitt_x', 'prewitt_y', 'prewitt', 'laplacian', 'laplacian2')

    # 'box-forced' was removed from Matplotlib; 'box' is the current value
    fig, axes = plt.subplots(2, 4, sharex=True, sharey=True, subplot_kw={'adjustable': 'box'})
    axes = axes.ravel()

    print('Gradient detection:')
    for ax, weight in zip(axes, weight_list):  # one subplot per kernel, looked up by name
        print(weight, 'R')
        R = convolve(a[:, :, 0], eval(weight))
        print('G')
        G = convolve(a[:, :, 1], eval(weight))
        print('B')
        B = convolve(a[:, :, 2], eval(weight))
        I = 255 - np.stack((R, G, B), 2)
        image = Image.fromarray(I)
        # Image.fromarray(I).save(output_path + weight + '.png')
        ax.set_title(weight)
        ax.imshow(image, interpolation='nearest')

    # X & Y combined
    # print('Gradient detection, combining X and Y:')
    # for w in (0, 2):
    #     weight = weight_list[w]
    #     print(weight, 'R', )
    #     R = convolve(a[:, :, 0], eval(weight))
    #     print('G')
    #     G = convolve(a[:, :, 1], eval(weight))
    #     print('B')
    #     B = convolve(a[:, :, 2], eval(weight))
    #     I1 = np.stack((R, G, B), 2)
    #
    #     weight = weight_list[w + 1]
    #     print(weight, 'R', )
    #     R = convolve(a[:, :, 0], eval(weight))
    #     print('G')
    #     G = convolve(a[:, :, 1], eval(weight))
    #     print('B')
    #     B = convolve(a[:, :, 2], eval(weight))
    #     I2 = np.stack((R, G, B), 2)
    #
    #     I = 255 - np.maximum(I1, I2)
    #     Image.fromarray(I).save(output_path + weight[:-2] + '.png')

    plt.show()
--------------------------------------------------------------------------------
/2.Application Foundation/2.4 FFT.py:
-------------------------------------------------------------------------------- 1 | import matplotlib as mpl 2 | import matplotlib.pyplot as plt 3 | import numpy as np 4 | 5 | 6 | def triangle_wave(size, T): 7 | t = np.linspace(-1, 1, size, endpoint=False) 8 | # where 9 | # y = np.where(t < 0, -t, 0) 10 | # y = np.where(t >= 0, t, y) 11 | y = np.abs(t) 12 | y = np.tile(y, T) - 0.5 13 | x = np.linspace(0, 2 * np.pi * T, size * T, endpoint=False) 14 | return x, y 15 | 16 | 17 | def sawtooth_wave(size, T): 18 | t = np.linspace(-1, 1, size) 19 | y = np.tile(t, T) 20 | x = np.linspace(0, 2 * np.pi * T, size * T, endpoint=False) 21 | return x, y 22 | 23 | 24 | def triangle_wave2(size, T): 25 | x, y = sawtooth_wave(size, T) 26 | return x, np.abs(y) 27 | 28 | 29 | def non_zero(f): 30 | f1 = np.real(f) 31 | f2 = np.imag(f) 32 | eps = 1e-4 33 | return f1[(f1 > eps) | (f1 < -eps)], f2[(f2 > eps) | (f2 < -eps)] 34 | 35 | 36 | if __name__ == "__main__": 37 | mpl.rcParams['font.sans-serif'] = [u'simHei'] 38 | mpl.rcParams['axes.unicode_minus'] = False 39 | np.set_printoptions(suppress=True) 40 | 41 | # 0~2π 42 | x = np.linspace(0, 2 * np.pi, 16, endpoint=False) 43 | print('时域采样值:', x) 44 | y = np.sin(2 * x) + np.sin(3 * x + np.pi / 4) 45 | # y = np.sin(x) 46 | 47 | N = len(x) 48 | print('采样点个数:', N) 49 | print('\n原始信号:', y) 50 | # 快速傅里叶变换 51 | f = np.fft.fft(y) 52 | print('\n频域信号:', f / N) 53 | a = np.abs(f / N) 54 | print('\n频率强度:', a) 55 | 56 | # 逆傅里叶变换 57 | iy = np.fft.ifft(f) 58 | print('\n逆傅里叶变换恢复信号:', iy) 59 | print('\n虚部:', np.imag(iy)) 60 | print('\n实部:', np.real(iy)) 61 | print('\n恢复信号与原始信号是否相同:', np.allclose(np.real(iy), y)) 62 | 63 | plt.subplot(211) 64 | plt.plot(x, y, 'go-', lw=2) 65 | plt.title(u'时域信号', fontsize=15) 66 | plt.grid(True) 67 | plt.subplot(212) 68 | w = np.arange(N) * 2 * np.pi / N 69 | print(u'频率采样值:', w) 70 | plt.stem(w, a, linefmt='r-', markerfmt='ro') 71 | plt.title(u'频域信号', fontsize=15) 72 | plt.grid(True) 73 | plt.show() 74 | 75 | # 三角/锯齿波 76 | x, y = triangle_wave(20, 5) 77 | # x, y = sawtooth_wave(20, 5) 78 | N = len(y) 79 | f = np.fft.fft(y) 80 | # print '原始频域信号:', np.real(f), np.imag(f) 81 | print('原始频域信号:', non_zero(f)) 82 | 83 | a = np.abs(f / N) 84 | 85 | # np.real_if_close 86 | f_real = np.real(f) 87 | eps = 0.1 * f_real.max() 88 | print(eps) 89 | f_real[(f_real < eps) & (f_real > -eps)] = 0 90 | f_imag = np.imag(f) 91 | eps = 0.1 * f_imag.max() 92 | print(eps) 93 | 94 | f_imag[(f_imag < eps) & (f_imag > -eps)] = 0 95 | f1 = f_real + f_imag * 1j 96 | y1 = np.fft.ifft(f1) 97 | y1 = np.real(y1) 98 | # print '恢复频域信号:', np.real(f1), np.imag(f1) 99 | print('恢复频域信号:', non_zero(f1)) 100 | 101 | plt.figure(figsize=(8, 8), facecolor='w') 102 | plt.subplot(311) 103 | plt.plot(x, y, 'g-', lw=2) 104 | plt.title(u'三角波', fontsize=15) 105 | plt.grid(True) 106 | plt.subplot(312) 107 | w = np.arange(N) * 2 * np.pi / N 108 | plt.stem(w, a, linefmt='r-', markerfmt='ro') 109 | plt.title(u'频域信号', fontsize=15) 110 | plt.grid(True) 111 | plt.subplot(313) 112 | plt.plot(x, y1, 'b-', lw=2, markersize=4) 113 | plt.title(u'三角波恢复信号', fontsize=15) 114 | plt.grid(True) 115 | plt.tight_layout(1.5, rect=[0, 0.04, 1, 0.96]) 116 | plt.suptitle(u'快速傅里叶变换FFT与频域滤波', fontsize=17) 117 | plt.show() 118 | 119 | -------------------------------------------------------------------------------- /2.Application Foundation/2.5 SVD.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | 3 | import os 4 | import matplotlib as mpl 5 | import 
matplotlib.pyplot as plt 6 | import numpy as np 7 | from PIL import Image 8 | 9 | 10 | def restore1(sigma, u, v, K): # 奇异值、左特征向量、右特征向量 11 | m = len(u) 12 | n = len(v[0]) 13 | a = np.zeros((m, n)) 14 | for k in range(K): 15 | uk = u[:, k].reshape(m, 1) 16 | vk = v[k].reshape(1, n) 17 | a += sigma[k] * np.dot(uk, vk) 18 | a[a < 0] = 0 19 | a[a > 255] = 255 20 | # a = a.clip(0, 255) 21 | return np.rint(a).astype('uint8') 22 | 23 | 24 | def restore2(sigma, u, v, K): # 奇异值、左特征向量、右特征向量 25 | m = len(u) 26 | n = len(v[0]) 27 | a = np.zeros((m, n)) 28 | for k in range(K + 1): 29 | for i in range(m): 30 | a[i] += sigma[k] * u[i][k] * v[k] 31 | a[a < 0] = 0 32 | a[a > 255] = 255 33 | return np.rint(a).astype('uint8') 34 | 35 | 36 | if __name__ == "__main__": 37 | A = Image.open("son.png", 'r') 38 | output_path = r'.\Pic' 39 | if not os.path.exists(output_path): 40 | os.mkdir(output_path) 41 | a = np.array(A) 42 | K = 50 43 | # 三通道分别进行SVD分解 44 | u_r, sigma_r, v_r = np.linalg.svd(a[:, :, 0]) 45 | u_g, sigma_g, v_g = np.linalg.svd(a[:, :, 1]) 46 | u_b, sigma_b, v_b = np.linalg.svd(a[:, :, 2]) 47 | plt.figure(figsize=(10, 10), facecolor='w') 48 | mpl.rcParams['font.sans-serif'] = [u'simHei'] 49 | mpl.rcParams['axes.unicode_minus'] = False 50 | # k为sigma前k个奇异值 51 | for k in range(1, K + 1): 52 | print(k) 53 | R = restore1(sigma_r, u_r, v_r, k) 54 | G = restore1(sigma_g, u_g, v_g, k) 55 | B = restore1(sigma_b, u_b, v_b, k) 56 | I = np.stack((R, G, B), 2) 57 | # Image.fromarray(I).save('%s\\svd_%d.png' % (output_path, k)) 58 | if k <= 12: 59 | plt.subplot(3, 4, k) 60 | plt.imshow(I) 61 | plt.axis('off') 62 | plt.title(u'奇异值个数:%d' % k) 63 | plt.suptitle(u'SVD与图像分解', fontsize=18) 64 | plt.tight_layout(2) 65 | plt.subplots_adjust(top=0.9) 66 | plt.show() 67 | 68 | -------------------------------------------------------------------------------- /2.Application Foundation/README.md: -------------------------------------------------------------------------------- 1 | ## Application Foundation 2 | ## (概念操作基础) 3 | 4 | ### 项目背景 5 | >该项目是整合一些基础算法概念的操作实现,包括bagging操作效果好处,基本的滑动均值操作效果,以及一些SVD奇异值分解降维图像效果,比较零碎,但是可以以小窥大看到一些操作的数据呈现效果。 6 | 7 | ### 项目简介 8 | |名称|简介| 9 | |:-------------|:-------------:| 10 | |2.1 Ensumble|二分类多次迭代后准确率可以拉升| 11 | |2.2 Convolve|滑动均值实现股票曲线拟合| 12 | |2.3 Image_convolve|基于不同算子的图像卷积轮毂勾勒| 13 | |2.4 FFT|FFT傅里叶变化| 14 | |2.5 SVD|SVD奇异值分解图像降维| 15 | 16 | ### 效果图 17 | #### ·简易滑动均值拟合 18 | 19 | 20 | #### ·指数滑动均值拟合 21 | 22 | 23 | #### ·不同算子边界勾勒效果 24 | 25 | 26 | #### ·时域信号&频域信号 27 | 28 | 29 | #### ·傅里叶变化滤波操作 30 | 31 | 32 | #### ·SVD奇异值分解图像压缩 33 | 34 | -------------------------------------------------------------------------------- /2.Application Foundation/SH600000.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/littleheap/MachineLearning-Algorithms/28f6eb352b58c58a8b80603b44d998b28e1f3434/2.Application Foundation/SH600000.txt -------------------------------------------------------------------------------- /2.Application Foundation/figures/FFT(1).png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/littleheap/MachineLearning-Algorithms/28f6eb352b58c58a8b80603b44d998b28e1f3434/2.Application Foundation/figures/FFT(1).png -------------------------------------------------------------------------------- /2.Application Foundation/figures/FFT(2).png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/littleheap/MachineLearning-Algorithms/28f6eb352b58c58a8b80603b44d998b28e1f3434/2.Application Foundation/figures/FFT(2).png -------------------------------------------------------------------------------- /2.Application Foundation/figures/SVD.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/littleheap/MachineLearning-Algorithms/28f6eb352b58c58a8b80603b44d998b28e1f3434/2.Application Foundation/figures/SVD.png -------------------------------------------------------------------------------- /2.Application Foundation/figures/convolve.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/littleheap/MachineLearning-Algorithms/28f6eb352b58c58a8b80603b44d998b28e1f3434/2.Application Foundation/figures/convolve.png -------------------------------------------------------------------------------- /2.Application Foundation/figures/ema.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/littleheap/MachineLearning-Algorithms/28f6eb352b58c58a8b80603b44d998b28e1f3434/2.Application Foundation/figures/ema.png -------------------------------------------------------------------------------- /2.Application Foundation/figures/sma.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/littleheap/MachineLearning-Algorithms/28f6eb352b58c58a8b80603b44d998b28e1f3434/2.Application Foundation/figures/sma.png -------------------------------------------------------------------------------- /2.Application Foundation/guideline.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/littleheap/MachineLearning-Algorithms/28f6eb352b58c58a8b80603b44d998b28e1f3434/2.Application Foundation/guideline.pdf -------------------------------------------------------------------------------- /2.Application Foundation/lena.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/littleheap/MachineLearning-Algorithms/28f6eb352b58c58a8b80603b44d998b28e1f3434/2.Application Foundation/lena.png -------------------------------------------------------------------------------- /2.Application Foundation/son.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/littleheap/MachineLearning-Algorithms/28f6eb352b58c58a8b80603b44d998b28e1f3434/2.Application Foundation/son.png -------------------------------------------------------------------------------- /3.Regression/3.1 Advertising.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import matplotlib.pyplot as plt 3 | import numpy as np 4 | import pandas as pd 5 | from sklearn.linear_model import LinearRegression 6 | from sklearn.model_selection import train_test_split 7 | 8 | ''' 9 | 线性回归:目标给每个特征分配合理权重,并额外评估出一个偏移量 10 | 数据样例:3种不同渠道投入对应的广告收入,TV || Radio || Newspaper => Sales,一共200条数据 11 | ''' 12 | 13 | if __name__ == "__main__": 14 | # 数据路径 15 | path = 'Advertising.csv' 16 | 17 | ''' 18 | # 手写读取数据 - 请自行分析,在Iris代码中给出类似的例子 19 | f = open(path) 20 | x = [] 21 | y = [] 22 | for i, d in enumerate(f): 23 | # 第0行数据类别不要 24 | if i == 0: 25 | continue 26 | # 去空格等不标准输入 27 | d = d.strip() 28 | # 如果没有数据 29 | if not d: 30 | continue 31 | # 分割数据 32 | d = list(map(float, 
d.split(','))) 33 | # 排除第一列索引,从第二列读到倒数第二列 34 | x.append(d[1:-1]) 35 | # 最后一列为sale 36 | y.append(d[-1]) 37 | print(x) 38 | print(y) 39 | x = np.array(x) 40 | y = np.array(y) 41 | print('------------------------------') 42 | ''' 43 | 44 | ''' 45 | # python自带库 46 | with open(path, "rt", encoding="utf-8") as vsvfile: 47 | reader = csv.reader(vsvfile) 48 | rows = [row for row in reader] 49 | print(rows) 50 | print('------------------------------') 51 | ''' 52 | 53 | ''' 54 | # numpy读入 55 | p = np.loadtxt(path, delimiter=',', skiprows=1) # 省略第1行 56 | print(p) 57 | print('------------------------------') 58 | ''' 59 | 60 | # pandas读入 61 | data = pd.read_csv(path) # TV、Radio、Newspaper、Sales 62 | x = data[['TV', 'Radio', 'Newspaper']] 63 | # x = data[['TV', 'Radio']] 64 | y = data['Sales'] 65 | print(x) 66 | print(y) 67 | 68 | # # 绘制1 69 | plt.plot(data['TV'], y, 'ro', label='TV') 70 | plt.plot(data['Radio'], y, 'g^', label='Radio') 71 | plt.plot(data['Newspaper'], y, 'mv', label='Newspaer') 72 | plt.legend(loc='lower right') # 图例显示位置 73 | plt.grid() 74 | plt.show() 75 | 76 | # 绘制2 77 | plt.figure(figsize=(9, 12)) 78 | plt.subplot(311) 79 | plt.plot(data['TV'], y, 'ro') 80 | plt.title('TV') 81 | plt.grid() 82 | plt.subplot(312) 83 | plt.plot(data['Radio'], y, 'g^') 84 | plt.title('Radio') 85 | plt.grid() 86 | plt.subplot(313) 87 | plt.plot(data['Newspaper'], y, 'b*') 88 | plt.title('Newspaper') 89 | plt.grid() 90 | plt.tight_layout() 91 | plt.show() 92 | 93 | print('-----------------------------------') 94 | # 分离训练测试数据,random_state是随机种子,因为Python中随机种子变化,所以此处固定 95 | x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=0.8, random_state=1) 96 | print(x_train, y_train) 97 | linreg = LinearRegression() 98 | model = linreg.fit(x_train, y_train) 99 | print(model) 100 | print(linreg.coef_) # 系数 101 | print(linreg.intercept_) # 截距 102 | 103 | y_hat = linreg.predict(np.array(x_test)) 104 | mse = np.average((y_hat - np.array(y_test)) ** 2) # 均方误差:平方和取均值 105 | rmse = np.sqrt(mse) # 求平方根 106 | print(mse, rmse) 107 | 108 | # 绘制测试值和预测值 109 | t = np.arange(len(x_test)) 110 | plt.plot(t, y_test, 'r-', linewidth=2, label='Test') 111 | plt.plot(t, y_hat, 'g-', linewidth=2, label='Predict') 112 | plt.legend(loc='upper right') 113 | plt.grid() 114 | plt.show() 115 | -------------------------------------------------------------------------------- /3.Regression/3.2 LinearRegression_CV.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import numpy as np 3 | import pandas as pd 4 | from sklearn.linear_model import Lasso, Ridge 5 | from sklearn.model_selection import GridSearchCV 6 | from sklearn.model_selection import train_test_split 7 | 8 | if __name__ == "__main__": 9 | # pandas读入 10 | data = pd.read_csv('Advertising.csv') # TV、Radio、Newspaper、Sales 11 | x = data[['TV', 'Radio', 'Newspaper']] 12 | # x = data[['TV', 'Radio']] 13 | y = data['Sales'] 14 | print(x) 15 | print(y) 16 | 17 | x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=1, train_size=0.75) 18 | # print(x_train, y_train) 19 | # 线性回归:正则项为0 20 | # L1正则,Lasso:正则项是系数绝对值和 21 | # model = Lasso() 22 | # L2正则,岭回归:正则项是系数平方和 23 | model = Ridge() 24 | 25 | # 0.001 ~ 100 取10个数成等比数列 26 | alpha_can = np.logspace(-3, 2, 10) 27 | # 5折交叉验证,cv是cross verify交叉验证,给定alpha超参数 28 | lasso_model = GridSearchCV(model, param_grid={'alpha': alpha_can}, cv=5) 29 | # 喂数据训练 30 | lasso_model.fit(x, y) 31 | print('超参数:\n', lasso_model.best_params_) 32 | 33 | # 获取预测值 34 | y_hat = 
lasso_model.predict(np.array(x_test)) 35 | # 计算误差平方均值 36 | mse = np.average((y_hat - np.array(y_test)) ** 2) 37 | # 开方衡量误差程度 38 | rmse = np.sqrt(mse) 39 | print(mse, rmse) 40 | 41 | t = np.arange(len(x_test)) 42 | plt.plot(t, y_test, 'r-', linewidth=2, label='Test') 43 | plt.plot(t, y_hat, 'g-', linewidth=2, label='Predict') 44 | plt.legend(loc='upper right') 45 | plt.grid() 46 | plt.show() 47 | -------------------------------------------------------------------------------- /3.Regression/3.3 Iris_LR.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib as mpl 3 | import matplotlib.pyplot as plt 4 | from sklearn.pipeline import Pipeline 5 | from sklearn.preprocessing import StandardScaler 6 | from sklearn.linear_model import LogisticRegression 7 | 8 | ''' 9 | LR:就是Logistic回归而不是Linner回归 10 | 莺尾花四个特征: 11 | 花萼长度 || 花萼宽度 || 花瓣长度 || 花瓣宽度 + 类别(3种各50条数据) 12 | ''' 13 | 14 | 15 | def iris_type(s): 16 | it = {b'Iris-setosa': 0, 17 | b'Iris-versicolor': 1, 18 | b'Iris-virginica': 2} 19 | return it[s] 20 | 21 | 22 | if __name__ == "__main__": 23 | path = u'iris.data' # 数据文件路径 24 | 25 | ''' 26 | # 手写读取数据 27 | f = open(path) 28 | x = [] 29 | y = [] 30 | for d in f: 31 | # 去空格 32 | d = d.strip() 33 | # 如果有数据 34 | if d: 35 | # 用逗号分割 36 | d = d.split(',') 37 | # 最后一列类别给y 38 | y.append(d[-1]) 39 | # 前面四列数据给x 40 | x.append(map(float, d[:-1])) 41 | print('原始数据X:\n', x) 42 | print('原始数据Y:\n', y) 43 | x = np.array(x) 44 | y = np.array(y) 45 | print('Numpy格式X:\n', x) 46 | print('Numpy格式Y-1:\n', y) 47 | # 用数值替换类别 48 | y[y == 'Iris-setosa'] = 0 49 | y[y == 'Iris-versicolor'] = 1 50 | y[y == 'Iris-virginica'] = 2 51 | print('Numpy格式Y-2:\n', y) 52 | y = y.astype(dtype=np.int) 53 | print('Numpy格式Y-3:\n', y) 54 | ''' 55 | 56 | ''' 57 | # 使用sklearn的数据预处理 58 | df = pd.read_csv(path, header=0) 59 | # 所有行都要,列截取到倒数第1列前 60 | x = df.values[:, :-1] 61 | # 所有行都要,列只要最后1列 62 | y = df.values[:, -1] 63 | print('x = \n', x) 64 | print('y = \n', y) 65 | # 用preprocessing预处理类型数据 66 | le = preprocessing.LabelEncoder() 67 | le.fit(['Iris-setosa', 'Iris-versicolor', 'Iris-virginica']) 68 | print(le.classes_) 69 | y = le.transform(y) 70 | print('Last Version, y = \n', y) 71 | ''' 72 | 73 | # 路径,浮点型数据,逗号分隔,第4列使用函数iris_type单独处理 74 | data = np.loadtxt(path, dtype=float, delimiter=',', converters={4: iris_type}) 75 | print(data) 76 | 77 | # 将数据的0到3列组成x,第4列得到y,4是分割位,anis为1是指:按列分割,水平方向 78 | x, y = np.split(data, (4,), axis=1) 79 | 80 | # 为了可视化,仅使用前两列特征,行全要,列只要前两列:花萼长度,花萼宽度 81 | x = x[:, :2] 82 | 83 | print(x) 84 | print(y) 85 | 86 | # x = StandardScaler().fit_transform(x) 87 | # lr = LogisticRegression() # Logistic回归模型 88 | # lr.fit(x, y.ravel()) # 根据数据[x,y],计算回归参数 89 | 90 | # 管道处理:先标准化处理,再喂给Logist回归模型 91 | lr = Pipeline([('sc', StandardScaler()), 92 | ('clf', LogisticRegression())]) 93 | lr.fit(x, y.ravel()) # ravel()将列向量转置为行向量,由于fit函数的要求 94 | 95 | # 画图 96 | N, M = 500, 500 # 横纵各采样多少个值 97 | x1_min, x1_max = x[:, 0].min(), x[:, 0].max() # 第0列的范围 98 | x2_min, x2_max = x[:, 1].min(), x[:, 1].max() # 第1列的范围 99 | t1 = np.linspace(x1_min, x1_max, N) 100 | t2 = np.linspace(x2_min, x2_max, M) 101 | x1, x2 = np.meshgrid(t1, t2) # 生成网格采样点 102 | x_test = np.stack((x1.flat, x2.flat), axis=1) # 测试点 103 | 104 | # 凑另外两个维度 105 | # x3 = np.ones(x1.size) * np.average(x[:, 2]) 106 | # x4 = np.ones(x1.size) * np.average(x[:, 3]) 107 | # x_test = np.stack((x1.flat, x2.flat, x3, x4), axis=1) # 测试点 108 | 109 | cm_light = mpl.colors.ListedColormap(['#77E0A0', '#FF8080', '#A0A0FF']) 110 | 
cm_dark = mpl.colors.ListedColormap(['g', 'r', 'b']) 111 | y_hat = lr.predict(x_test) # 预测值 112 | y_hat = y_hat.reshape(x1.shape) # 使之与输入的形状相同 113 | plt.pcolormesh(x1, x2, y_hat, cmap=cm_light) # 预测值的显示 114 | plt.scatter(x[:, 0], x[:, 1], c=y.reshape(x[:, 0].shape), edgecolors='k', s=50, cmap=cm_dark) # 样本的显示 115 | plt.xlabel('petal length') 116 | plt.ylabel('petal width') 117 | plt.xlim(x1_min, x1_max) 118 | plt.ylim(x2_min, x2_max) 119 | plt.grid() 120 | plt.savefig('Logistic.png') # 存储图片 121 | plt.show() 122 | 123 | # 训练集上的预测结果 124 | y_hat = lr.predict(x) 125 | y = y.reshape(-1) 126 | result = y_hat == y 127 | print(y_hat) 128 | print(result) 129 | acc = np.mean(result) 130 | print('准确度: %.2f%%' % (100 * acc)) 131 | -------------------------------------------------------------------------------- /3.Regression/3.4 Overfit.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib as mpl 3 | import matplotlib.pyplot as plt 4 | from sklearn.pipeline import Pipeline 5 | from sklearn.preprocessing import PolynomialFeatures 6 | from sklearn.linear_model import LinearRegression, RidgeCV, LassoCV 7 | 8 | if __name__ == "__main__": 9 | 10 | N = 9 11 | 12 | # 定义域 13 | x = np.linspace(0, 6, N) + np.random.randn(N) 14 | x = np.sort(x) 15 | 16 | # 值域 17 | y = x ** 2 - 4 * x - 3 + np.random.randn(N) 18 | 19 | x.shape = -1, 1 20 | y.shape = -1, 1 21 | 22 | # 模型管道处理:取若干阶,再线性回归 23 | model_1 = Pipeline([ 24 | ('poly', PolynomialFeatures()), # 具体超参数后面运行时候设置 25 | ('linear', LinearRegression(fit_intercept=False))]) 26 | model_2 = Pipeline([ 27 | ('poly', PolynomialFeatures()), 28 | ('linear', RidgeCV(alphas=np.logspace(-3, 2, 100), fit_intercept=False))]) 29 | model_3 = Pipeline([ 30 | ('poly', PolynomialFeatures()), 31 | ('linear', LassoCV(alphas=np.logspace(-3, 2, 100), fit_intercept=False))]) 32 | 33 | models = model_1, model_2, model_3 34 | 35 | mpl.rcParams['font.sans-serif'] = [u'simHei'] 36 | mpl.rcParams['axes.unicode_minus'] = False 37 | np.set_printoptions(suppress=True) 38 | 39 | plt.figure(figsize=(7, 11), facecolor='w') 40 | # 阶数池,从1~8 41 | d_pool = np.arange(1, N, 1) # 阶:最大为8 42 | m = d_pool.size 43 | 44 | clrs = [] # 颜色 45 | for c in np.linspace(16711680, 255, m): 46 | clrs.append('#%06x' % int(c)) 47 | 48 | line_width = np.linspace(5, 2, m) 49 | 50 | titles = u'线性回归', u'Ridge回归', u'Lasso回归' 51 | 52 | for t in range(3): 53 | # 获取当前模型 54 | model = models[t] 55 | # 绘制三行一列图片 56 | plt.subplot(3, 1, t + 1) 57 | plt.plot(x, y, 'ro', ms=10, zorder=N) 58 | for i, d in enumerate(d_pool): 59 | model.set_params(poly__degree=d) # 设置阶数这个超参量,poly+__+超参数名称 60 | model.fit(x, y) 61 | lin = model.get_params('linear')['linear'] 62 | if t == 0: 63 | # 线性回归没有alpha 64 | print(u'线性回归:%d阶,系数为:' % d, lin.coef_.ravel()) 65 | else: 66 | print(u'岭回归/Lasso:%d阶,alpha=%.6f,系数为:' % (d, lin.alpha_), lin.coef_.ravel()) 67 | x_hat = np.linspace(x.min(), x.max(), num=100) 68 | x_hat.shape = -1, 1 69 | y_hat = model.predict(x_hat) 70 | s = model.score(x, y) 71 | print(s, '\n') 72 | zorder = N - 1 if (d == 2) else 0 73 | plt.plot(x_hat, y_hat, color=clrs[i], lw=line_width[i], label=(u'%d阶,score=%.3f' % (d, s)), zorder=zorder) 74 | plt.legend(loc='upper left') 75 | plt.grid(True) 76 | plt.title(titles[t], fontsize=16) 77 | plt.xlabel('X', fontsize=14) 78 | plt.ylabel('Y', fontsize=14) 79 | plt.tight_layout(1, rect=(0, 0, 1, 0.95)) 80 | plt.suptitle(u'多项式曲线拟合', fontsize=18) 81 | plt.savefig('Overfit.png') # 存储图片 82 | plt.show() 83 | 
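# ----- 补充示例(新增说明,非原脚本内容)-----
# 上图的高阶震荡(龙格现象)可以量化:真实函数是二次的,阶数升高后模型开始拟合噪声,
# 训练分数继续上升而交叉验证分数反而下降。复用上文的 model_1 / x / y 做一个简单检查:
from sklearn.model_selection import cross_val_score
for d in (2, 5, 8):
    model_1.set_params(poly__degree=d)
    print(u'%d阶 交叉验证score:' % d, cross_val_score(model_1, x, y.ravel(), cv=3).mean())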
-------------------------------------------------------------------------------- /3.Regression/README.md: -------------------------------------------------------------------------------- 1 | ## Regression 2 | ## (回归) 3 | 4 | ### 项目背景 5 | >该项目整合了一些基础回归算法,主要是线性回归的应用。分别尝试将最简单的线性回归应用到广告投放数据集,以及鸢尾花数据集。此外还引入了常用的cross validation交叉验证,尝试提升精度。最后实现了非线性回归的拟合效果。在一些情况下会产生龙格(Runge)现象,也就是过拟合现象,可分别对应调参,调整过拟合和欠拟合效果。 6 | 7 | ### 项目简介 8 | |名称|简介| 9 | |:-------------|:-------------:| 10 | |3.1 Advertising|广告投放数据集应用线性回归拟合预期收入| 11 | |3.2 LinearRegression_CV|线性回归基础上引入交叉验证| 12 | |3.3 Iris_LR|Iris鸢尾花数据集引入Logistic回归分类花的种类| 13 | |3.4 Overfit|非线性回归过拟合现象调整| 14 | 15 | ### 效果图 16 | #### ·广告数据特征分布离散图 17 | 18 | 19 | #### ·广告数据分布各自离散图 20 | 21 | 22 | #### ·广告数据线性回归预测拟合 23 | 24 | 25 | #### ·引入交叉验证的广告数据线性回归预测拟合 26 | 27 | 28 | #### ·Iris鸢尾花数据集应用Logistic回归分类 29 | 30 | 31 | #### ·Overfit过拟合及龙格现象展示 32 | -------------------------------------------------------------------------------- /3.Regression/figures/adv.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/littleheap/MachineLearning-Algorithms/28f6eb352b58c58a8b80603b44d998b28e1f3434/3.Regression/figures/adv.png -------------------------------------------------------------------------------- /3.Regression/figures/adv_pred.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/littleheap/MachineLearning-Algorithms/28f6eb352b58c58a8b80603b44d998b28e1f3434/3.Regression/figures/adv_pred.png -------------------------------------------------------------------------------- /3.Regression/figures/adv_pred_cv.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/littleheap/MachineLearning-Algorithms/28f6eb352b58c58a8b80603b44d998b28e1f3434/3.Regression/figures/adv_pred_cv.png -------------------------------------------------------------------------------- /3.Regression/figures/adv_self.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/littleheap/MachineLearning-Algorithms/28f6eb352b58c58a8b80603b44d998b28e1f3434/3.Regression/figures/adv_self.png -------------------------------------------------------------------------------- /3.Regression/figures/iris_LR.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/littleheap/MachineLearning-Algorithms/28f6eb352b58c58a8b80603b44d998b28e1f3434/3.Regression/figures/iris_LR.png -------------------------------------------------------------------------------- /3.Regression/figures/overfit.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/littleheap/MachineLearning-Algorithms/28f6eb352b58c58a8b80603b44d998b28e1f3434/3.Regression/figures/overfit.png -------------------------------------------------------------------------------- /3.Regression/guideline.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/littleheap/MachineLearning-Algorithms/28f6eb352b58c58a8b80603b44d998b28e1f3434/3.Regression/guideline.pdf -------------------------------------------------------------------------------- /3.Regression/iris.names: -------------------------------------------------------------------------------- 1 | 1. Title: Iris Plants Database 2 | Updated Sept 21 by C.Blake - Added discrepency information 3 | 4 | 2. 
Sources: 5 | (a) Creator: R.A. Fisher 6 | (b) Donor: Michael Marshall (MARSHALL%PLU@io.arc.nasa.gov) 7 | (c) Date: July, 1988 8 | 9 | 3. Past Usage: 10 | - Publications: too many to mention!!! Here are a few. 11 | 1. Fisher,R.A. "The use of multiple measurements in taxonomic problems" 12 | Annual Eugenics, 7, Part II, 179-188 (1936); also in "Contributions 13 | to Mathematical Statistics" (John Wiley, NY, 1950). 14 | 2. Duda,R.O., & Hart,P.E. (1973) Pattern Classification and Scene Analysis. 15 | (Q327.D83) John Wiley & Sons. ISBN 0-471-22361-1. See page 218. 16 | 3. Dasarathy, B.V. (1980) "Nosing Around the Neighborhood: A New System 17 | Structure and Classification Rule for Recognition in Partially Exposed 18 | Environments". IEEE Transactions on Pattern Analysis and Machine 19 | Intelligence, Vol. PAMI-2, No. 1, 67-71. 20 | -- Results: 21 | -- very low misclassification rates (0% for the setosa class) 22 | 4. Gates, G.W. (1972) "The Reduced Nearest Neighbor Rule". IEEE 23 | Transactions on Information Theory, May 1972, 431-433. 24 | -- Results: 25 | -- very low misclassification rates again 26 | 5. See also: 1988 MLC Proceedings, 54-64. Cheeseman et al's AUTOCLASS II 27 | conceptual clustering system finds 3 classes in the data. 28 | 29 | 4. Relevant Information: 30 | --- This is perhaps the best known database to be found in the pattern 31 | recognition literature. Fisher's paper is a classic in the field 32 | and is referenced frequently to this day. (See Duda & Hart, for 33 | example.) The data set contains 3 classes of 50 instances each, 34 | where each class refers to a type of iris plant. One class is 35 | linearly separable from the other 2; the latter are NOT linearly 36 | separable from each other. 37 | --- Predicted attribute: class of iris plant. 38 | --- This is an exceedingly simple domain. 39 | --- This data differs from the data presented in Fishers article 40 | (identified by Steve Chadwick, spchadwick@espeedaz.net ) 41 | The 35th sample should be: 4.9,3.1,1.5,0.2,"Iris-setosa" 42 | where the error is in the fourth feature. 43 | The 38th sample: 4.9,3.6,1.4,0.1,"Iris-setosa" 44 | where the errors are in the second and third features. 45 | 46 | 5. Number of Instances: 150 (50 in each of three classes) 47 | 48 | 6. Number of Attributes: 4 numeric, predictive attributes and the class 49 | 50 | 7. Attribute Information: 51 | 1. sepal length in cm 52 | 2. sepal width in cm 53 | 3. petal length in cm 54 | 4. petal width in cm 55 | 5. class: 56 | -- Iris Setosa 57 | -- Iris Versicolour 58 | -- Iris Virginica 59 | 60 | 8. Missing Attribute Values: None 61 | 62 | Summary Statistics: 63 | Min Max Mean SD Class Correlation 64 | sepal length: 4.3 7.9 5.84 0.83 0.7826 65 | sepal width: 2.0 4.4 3.05 0.43 -0.4194 66 | petal length: 1.0 6.9 3.76 1.76 0.9490 (high!) 67 | petal width: 0.1 2.5 1.20 0.76 0.9565 (high!) 68 | 69 | 9. Class Distribution: 33.3% for each of 3 classes. 
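(Added illustrative note, not part of the original iris.names: the "Class Correlation" column in the summary statistics above can be reproduced directly from iris.data; a minimal sketch in Python, assuming iris.data is in the working directory:)
import numpy as np
feats = np.loadtxt('iris.data', delimiter=',', usecols=(0, 1, 2, 3))
names = np.loadtxt('iris.data', delimiter=',', usecols=(4,), dtype=str)
codes = np.unique(names, return_inverse=True)[1]  # setosa=0, versicolor=1, virginica=2
for label, col in zip(('sepal length', 'sepal width', 'petal length', 'petal width'), feats.T):
    print(label, round(float(np.corrcoef(col, codes)[0, 1]), 4))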
70 | -------------------------------------------------------------------------------- /3.Regression/principle.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/littleheap/MachineLearning-Algorithms/28f6eb352b58c58a8b80603b44d998b28e1f3434/3.Regression/principle.pdf -------------------------------------------------------------------------------- /4.Decision Tree & Random Forest/4.1 Iris_DecisionTree.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | import matplotlib as mpl 4 | from sklearn import tree 5 | from sklearn.tree import DecisionTreeClassifier 6 | from sklearn.model_selection import train_test_split 7 | from sklearn.preprocessing import StandardScaler 8 | from sklearn.pipeline import Pipeline 9 | 10 | 11 | def iris_type(s): 12 | it = {b'Iris-setosa': 0, 13 | b'Iris-versicolor': 1, 14 | b'Iris-virginica': 2} 15 | return it[s] 16 | 17 | 18 | # 花萼长度、花萼宽度,花瓣长度,花瓣宽度 19 | # iris_feature = 'sepal length', 'sepal width', 'petal length', 'petal width' 20 | iris_feature = u'花萼长度', u'花萼宽度', u'花瓣长度', u'花瓣宽度' 21 | 22 | if __name__ == "__main__": 23 | mpl.rcParams['font.sans-serif'] = [u'SimHei'] 24 | mpl.rcParams['axes.unicode_minus'] = False 25 | 26 | path = '.\\iris.data' # 数据文件路径 27 | data = np.loadtxt(path, dtype=float, delimiter=',', converters={4: iris_type}) 28 | x, y = np.split(data, (4,), axis=1) 29 | 30 | # 为了可视化,仅使用前两列特征 31 | x = x[:, :2] 32 | x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=1) 33 | # ss = StandardScaler() 34 | # ss = ss.fit(x_train) 35 | 36 | # 决策树参数估计 37 | # min_samples_split = 10:如果该结点包含的样本数目大于10,则(有可能)对其分支 38 | # min_samples_leaf = 10:若将某结点分支后,得到的每个子结点样本数目都大于10,则完成分支;否则,不进行分支 39 | model = Pipeline([ 40 | ('ss', StandardScaler()), # 预处理标准化:均值为0 || 方差为1 41 | ('DTC', DecisionTreeClassifier(criterion='entropy', max_depth=3))]) # 熵准则,最大深度为3 42 | # clf = DecisionTreeClassifier(criterion='entropy', max_depth=3) 43 | 44 | # 训练模型 45 | model = model.fit(x_train, y_train) 46 | 47 | # 测试数据获取预测值 48 | y_test_hat = model.predict(x_test) 49 | 50 | # 保存 51 | # dot -Tpng -o 1.png 1.dot 52 | f = open('.\\iris_tree.dot', 'w') 53 | tree.export_graphviz(model.get_params('DTC')['DTC'], out_file=f) 54 | 55 | # 画图 56 | N, M = 100, 100 # 横纵各采样多少个值 57 | x1_min, x1_max = x[:, 0].min(), x[:, 0].max() # 第0列的数据值范围 58 | x2_min, x2_max = x[:, 1].min(), x[:, 1].max() # 第1列的数据值范围 59 | t1 = np.linspace(x1_min, x1_max, N) # 线性等差取值间隔 60 | t2 = np.linspace(x2_min, x2_max, M) # 线性等差取值间隔 61 | x1, x2 = np.meshgrid(t1, t2) # 生成网格采样点 62 | x_show = np.stack((x1.flat, x2.flat), axis=1) # 测试点,x1和x2拉平,按垂直列的方式,意义匹配 63 | 64 | # 加上另外两个维度 65 | # 打开该注释前,确保注释掉x = x[:, :2] 66 | # x3 = np.ones(x1.size) * np.average(x[:, 2]) 67 | # x4 = np.ones(x1.size) * np.average(x[:, 3]) 68 | # x_test = np.stack((x1.flat, x2.flat, x3, x4), axis=1) # 测试点 69 | 70 | cm_light = mpl.colors.ListedColormap(['#A0FFA0', '#FFA0A0', '#A0A0FF']) 71 | cm_dark = mpl.colors.ListedColormap(['g', 'r', 'b']) 72 | 73 | y_show_hat = model.predict(x_show) # 预测值 74 | y_show_hat = y_show_hat.reshape(x1.shape) # 使之与输入的形状相同 75 | 76 | plt.figure(facecolor='w') # 白底 77 | plt.pcolormesh(x1, x2, y_show_hat, cmap=cm_light) # 预测值的显示 78 | plt.scatter(x_test[:, 0], x_test[:, 1], c=y_test.ravel(), edgecolors='k', s=100, cmap=cm_dark, marker='o') # 测试数据 79 | plt.scatter(x[:, 0], x[:, 1], c=y.ravel(), edgecolors='k', s=40, cmap=cm_dark) # 全部数据 80 | 
plt.xlabel(iris_feature[0], fontsize=15) 81 | plt.ylabel(iris_feature[1], fontsize=15) 82 | plt.xlim(x1_min, x1_max) # 限定x轴范围 83 | plt.ylim(x2_min, x2_max) # 限定x轴范围 84 | plt.grid(True) 85 | plt.title(u'鸢尾花数据的决策树分类', fontsize=17) 86 | plt.show() 87 | 88 | # 训练集上的预测结果 89 | y_test = y_test.reshape(-1) 90 | print('y_test_hat : \n', y_test_hat) 91 | print('y_test_hat : \n', y_test) 92 | result = (y_test_hat == y_test) # True则预测正确,False则预测错误 93 | acc = np.mean(result) 94 | print('准确度: %.2f%%' % (100 * acc)) 95 | 96 | # 过拟合:错误率 97 | depth = np.arange(1, 15) # 层数深度1~15 98 | err_list = [] 99 | for d in depth: 100 | clf = DecisionTreeClassifier(criterion='entropy', max_depth=d) 101 | clf = clf.fit(x_train, y_train) 102 | y_test_hat = clf.predict(x_test) # 测试数据 103 | result = (y_test_hat == y_test) # True则预测正确,False则预测错误 104 | err = 1 - np.mean(result) 105 | err_list.append(err) 106 | print(d, ' 错误率: %.2f%%' % (100 * err)) 107 | 108 | plt.figure(facecolor='w') 109 | plt.plot(depth, err_list, 'ro-', lw=2) 110 | plt.xlabel(u'决策树深度', fontsize=15) 111 | plt.ylabel(u'错误率', fontsize=15) 112 | plt.title(u'决策树深度与过拟合', fontsize=17) 113 | plt.grid(True) 114 | plt.show() 115 | -------------------------------------------------------------------------------- /4.Decision Tree & Random Forest/4.2 Iris_DecisionTree_Enum.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | import matplotlib as mpl 4 | from sklearn.tree import DecisionTreeClassifier 5 | 6 | 7 | def iris_type(s): 8 | it = {b'Iris-setosa': 0, 9 | b'Iris-versicolor': 1, 10 | b'Iris-virginica': 2} 11 | return it[s] 12 | 13 | 14 | # 'sepal length', 'sepal width', 'petal length', 'petal width' 15 | iris_feature = u'花萼长度', u'花萼宽度', u'花瓣长度', u'花瓣宽度' 16 | 17 | if __name__ == "__main__": 18 | mpl.rcParams['font.sans-serif'] = [u'SimHei'] # 黑体 FangSong/KaiTi 19 | mpl.rcParams['axes.unicode_minus'] = False 20 | 21 | path = '.\\iris.data' # 数据文件路径 22 | data = np.loadtxt(path, dtype=float, delimiter=',', converters={4: iris_type}) 23 | x_prime, y = np.split(data, (4,), axis=1) 24 | 25 | # 枚举特征组合 26 | feature_pairs = [[0, 1], [0, 2], [0, 3], [1, 2], [1, 3], [2, 3]] 27 | plt.figure(figsize=(10, 9), facecolor='#FFFFFF') 28 | 29 | for i, pair in enumerate(feature_pairs): 30 | # 准备数据 31 | x = x_prime[:, pair] 32 | 33 | # 决策树学习 34 | clf = DecisionTreeClassifier(criterion='entropy', min_samples_leaf=3) # 最小叶子节点数目,小于3个就不分了 35 | dt_clf = clf.fit(x, y) 36 | 37 | # 画图 38 | N, M = 500, 500 # 横纵各采样多少个值 39 | x1_min, x1_max = x[:, 0].min(), x[:, 0].max() # 第0列的范围 40 | x2_min, x2_max = x[:, 1].min(), x[:, 1].max() # 第1列的范围 41 | t1 = np.linspace(x1_min, x1_max, N) 42 | t2 = np.linspace(x2_min, x2_max, M) 43 | x1, x2 = np.meshgrid(t1, t2) # 生成网格采样点 44 | x_test = np.stack((x1.flat, x2.flat), axis=1) # 测试点 45 | 46 | # 训练集上的预测结果 47 | y_hat = dt_clf.predict(x) 48 | y = y.reshape(-1) 49 | c = np.count_nonzero(y_hat == y) # 统计预测正确的个数 50 | print('特征: ', iris_feature[pair[0]], ' + ', iris_feature[pair[1]]) 51 | print('\t预测正确数目:', c) 52 | print('\t准确率: %.2f%%' % (100 * float(c) / float(len(y)))) 53 | 54 | # 显示 55 | cm_light = mpl.colors.ListedColormap(['#A0FFA0', '#FFA0A0', '#A0A0FF']) 56 | cm_dark = mpl.colors.ListedColormap(['g', 'r', 'b']) 57 | y_hat = dt_clf.predict(x_test) # 预测值 58 | y_hat = y_hat.reshape(x1.shape) # 使之与输入的形状相同 59 | plt.subplot(2, 3, i + 1) 60 | plt.pcolormesh(x1, x2, y_hat, cmap=cm_light) # 预测值 61 | plt.scatter(x[:, 0], x[:, 1], c=y, edgecolors='k', cmap=cm_dark) # 样本 62 | 
plt.xlabel(iris_feature[pair[0]], fontsize=14) 63 | plt.ylabel(iris_feature[pair[1]], fontsize=14) 64 | plt.xlim(x1_min, x1_max) 65 | plt.ylim(x2_min, x2_max) 66 | plt.grid() 67 | 68 | plt.suptitle(u'决策树对鸢尾花数据的两特征组合的分类结果', fontsize=18) 69 | plt.tight_layout(2) 70 | plt.subplots_adjust(top=0.92) 71 | plt.show() 72 | -------------------------------------------------------------------------------- /4.Decision Tree & Random Forest/4.3 DecisionTreeRegressor.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | from sklearn.tree import DecisionTreeRegressor 4 | 5 | if __name__ == "__main__": 6 | N = 100 7 | x = np.random.rand(N) * 6 - 3 # [-3,3) 8 | x.sort() 9 | y = np.sin(x) + np.random.randn(N) * 0.05 # 加上一点噪声 10 | print('y : \n', y) 11 | x = x.reshape(-1, 1) # 转置后,得到N个样本,每个样本都是1维的 12 | print('x : \n', x) 13 | 14 | # 决策树回归:均方误差最小的地方劈开,而不是熵下降速度最快的地方 15 | reg = DecisionTreeRegressor(criterion='mse', max_depth=9) 16 | dt = reg.fit(x, y) 17 | 18 | x_test = np.linspace(-3, 3, 50).reshape(-1, 1) 19 | y_hat = dt.predict(x_test) 20 | 21 | plt.plot(x, y, 'r*', linewidth=2, label='Actual') 22 | plt.plot(x_test, y_hat, 'g-', linewidth=2, label='Predict') 23 | plt.legend(loc='upper left') 24 | plt.grid() 25 | plt.show() 26 | 27 | # 比较决策树的深度影响 28 | depth = [2, 4, 6, 8, 10] 29 | clr = 'rgbmy' 30 | reg = [DecisionTreeRegressor(criterion='mse', max_depth=depth[0]), 31 | DecisionTreeRegressor(criterion='mse', max_depth=depth[1]), 32 | DecisionTreeRegressor(criterion='mse', max_depth=depth[2]), 33 | DecisionTreeRegressor(criterion='mse', max_depth=depth[3]), 34 | DecisionTreeRegressor(criterion='mse', max_depth=depth[4])] 35 | plt.plot(x, y, 'k^', linewidth=2, label='Actual') 36 | x_test = np.linspace(-3, 3, 50).reshape(-1, 1) 37 | for i, r in enumerate(reg): 38 | dt = r.fit(x, y) 39 | y_hat = dt.predict(x_test) 40 | plt.plot(x_test, y_hat, '-', color=clr[i], linewidth=2, label='Depth=%d' % depth[i]) 41 | plt.legend(loc='upper left') 42 | plt.grid() 43 | plt.show() 44 | -------------------------------------------------------------------------------- /4.Decision Tree & Random Forest/4.4 MultiOutput_DTR.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | from sklearn.tree import DecisionTreeRegressor 4 | 5 | if __name__ == "__main__": 6 | N = 300 7 | x = np.random.rand(N) * 8 - 4 # [-4,4) 8 | x.sort() 9 | # y1 = np.sin(x) + 3 + np.random.randn(N) * 0.1 10 | # y2 = np.cos(0.3 * x) + np.random.randn(N) * 0.01 11 | y1 = np.sin(x) + np.random.randn(N) * 0.05 12 | y2 = np.cos(x) + np.random.randn(N) * 0.1 13 | y = np.vstack((y1, y2)) 14 | y = np.vstack((y1, y2)).T 15 | x = x.reshape(-1, 1) # 转置后,得到N个样本,每个样本都是1维的 16 | 17 | deep = 3 18 | reg = DecisionTreeRegressor(criterion='mse', max_depth=deep) 19 | dt = reg.fit(x, y) 20 | 21 | x_test = np.linspace(-4, 4, num=1000).reshape(-1, 1) 22 | print('x_test : \n', x_test) 23 | y_hat = dt.predict(x_test) 24 | print('y_hat : \n', y_hat) 25 | 26 | plt.scatter(y[:, 0], y[:, 1], c='r', s=40, label='Actual') 27 | plt.scatter(y_hat[:, 0], y_hat[:, 1], c='g', marker='s', s=100, label='Depth=%d' % deep, alpha=1) 28 | plt.legend(loc='upper left') 29 | plt.xlabel('y1') 30 | plt.ylabel('y2') 31 | plt.grid() 32 | plt.show() 33 | -------------------------------------------------------------------------------- /4.Decision Tree & Random Forest/4.5 Iris_RandomForest_Enum.py: 
-------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | import matplotlib as mpl 4 | from sklearn.ensemble import RandomForestClassifier 5 | 6 | 7 | def iris_type(s): 8 | it = {b'Iris-setosa': 0, 9 | b'Iris-versicolor': 1, 10 | b'Iris-virginica': 2} 11 | return it[s] 12 | 13 | 14 | # 'sepal length', 'sepal width', 'petal length', 'petal width' 15 | iris_feature = u'花萼长度', u'花萼宽度', u'花瓣长度', u'花瓣宽度' 16 | 17 | if __name__ == "__main__": 18 | mpl.rcParams['font.sans-serif'] = [u'SimHei'] # 黑体 FangSong/KaiTi 19 | mpl.rcParams['axes.unicode_minus'] = False 20 | 21 | path = '.\\iris.data' # 数据文件路径 22 | data = np.loadtxt(path, dtype=float, delimiter=',', converters={4: iris_type}) 23 | x_prime, y = np.split(data, (4,), axis=1) 24 | 25 | feature_pairs = [[0, 1], [0, 2], [0, 3], [1, 2], [1, 3], [2, 3]] 26 | plt.figure(figsize=(10, 9), facecolor='#FFFFFF') 27 | for i, pair in enumerate(feature_pairs): 28 | # 准备数据 29 | x = x_prime[:, pair] 30 | 31 | # 随机森林 32 | clf = RandomForestClassifier(n_estimators=200, criterion='entropy', max_depth=4) 33 | rf_clf = clf.fit(x, y.ravel()) 34 | 35 | # 画图 36 | N, M = 500, 500 # 横纵各采样多少个值 37 | x1_min, x1_max = x[:, 0].min(), x[:, 0].max() # 第0列的范围 38 | x2_min, x2_max = x[:, 1].min(), x[:, 1].max() # 第1列的范围 39 | t1 = np.linspace(x1_min, x1_max, N) 40 | t2 = np.linspace(x2_min, x2_max, M) 41 | x1, x2 = np.meshgrid(t1, t2) # 生成网格采样点 42 | x_test = np.stack((x1.flat, x2.flat), axis=1) # 测试点 43 | 44 | # 训练集上的预测结果 45 | y_hat = rf_clf.predict(x) 46 | y = y.reshape(-1) 47 | c = np.count_nonzero(y_hat == y) # 统计预测正确的个数 48 | print('特征: ', iris_feature[pair[0]], ' + ', iris_feature[pair[1]]) 49 | print('\t预测正确数目:', c) 50 | print('\t准确率: %.2f%%' % (100 * float(c) / float(len(y)))) 51 | 52 | # 显示 53 | cm_light = mpl.colors.ListedColormap(['#A0FFA0', '#FFA0A0', '#A0A0FF']) 54 | cm_dark = mpl.colors.ListedColormap(['g', 'r', 'b']) 55 | y_hat = rf_clf.predict(x_test) # 预测值 56 | y_hat = y_hat.reshape(x1.shape) # 使之与输入的形状相同 57 | plt.subplot(2, 3, i + 1) 58 | plt.pcolormesh(x1, x2, y_hat, cmap=cm_light) # 预测值 59 | plt.scatter(x[:, 0], x[:, 1], c=y, edgecolors='k', cmap=cm_dark) # 样本 60 | plt.xlabel(iris_feature[pair[0]], fontsize=14) 61 | plt.ylabel(iris_feature[pair[1]], fontsize=14) 62 | plt.xlim(x1_min, x1_max) 63 | plt.ylim(x2_min, x2_max) 64 | plt.grid() 65 | plt.tight_layout(2.5) 66 | plt.subplots_adjust(top=0.92) 67 | plt.suptitle(u'随机森林对鸢尾花数据的两特征组合的分类结果', fontsize=18) 68 | plt.show() 69 | -------------------------------------------------------------------------------- /4.Decision Tree & Random Forest/README.md: -------------------------------------------------------------------------------- 1 | ## Decision Tree & Random Forest 2 | ## (决策树 & 随机森林) 3 | 4 | ### 项目背景 5 | > 决策树思想,实际上就是寻找最纯净的划分方法,这个最纯净在数学上叫纯度,纯度通俗点理解就是目标变量要分得足够开(y=1的和y=0的混到一起就会不纯)。另一种理解是分类误差率的一种衡量。实际决策树算法往往用到的是,纯度的另一面也即不纯度,下面是不纯度的公式。不纯度的选取有多种方法,每种方法也就形成了不同的决策树方法,比如ID3算法使用信息增益作为不纯度;C4.5算法使用信息增益率作为不纯度;CART算法使用基尼系数作为不纯度。尽管有剪枝等等方法,一棵树的生成肯定还是不如多棵树,因此就有了随机森林,解决决策树泛化能力弱的缺点。(可以理解成三个臭皮匠顶过诸葛亮)。 6 | 7 | ### 项目简介 8 | |名称|简介| 9 | |:-------------|:-------------:| 10 | |4.1 Iris_DecisionTree|Iris莺尾花数据集决策树模型搭建| 11 | |4.2 Iris_DecisionTree_Enum|决策树特征组合效果比较| 12 | |4.3 DecisionTreeRegressor|决策树回归| 13 | |4.4 MultiOutput_DTR|尝试不同深度对决策树的影响| 14 | |4.5 Iris_RandomForest_Enum|随机森林模型搭建和特征选择比对| 15 | 16 | ### 效果图 17 | #### ·决策树分类莺尾花数据效果 18 | 19 | 20 | #### ·决策树深度对模型精度影响 21 | 22 | 23 | #### ·决策树模型特征选择比较 24 | 25 | 26 | #### ·树回归 27 | 28 | 29 | #### 
·不同深度的树回归效果比较 30 | 31 | 32 | #### ·DTR 33 | 34 | 35 | #### ·随机森林模型特征选择效果比较 36 | 37 | -------------------------------------------------------------------------------- /4.Decision Tree & Random Forest/figures/DT.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/littleheap/MachineLearning-Algorithms/28f6eb352b58c58a8b80603b44d998b28e1f3434/4.Decision Tree & Random Forest/figures/DT.png -------------------------------------------------------------------------------- /4.Decision Tree & Random Forest/figures/DTR.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/littleheap/MachineLearning-Algorithms/28f6eb352b58c58a8b80603b44d998b28e1f3434/4.Decision Tree & Random Forest/figures/DTR.png -------------------------------------------------------------------------------- /4.Decision Tree & Random Forest/figures/DT_pair.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/littleheap/MachineLearning-Algorithms/28f6eb352b58c58a8b80603b44d998b28e1f3434/4.Decision Tree & Random Forest/figures/DT_pair.png -------------------------------------------------------------------------------- /4.Decision Tree & Random Forest/figures/DT_reg.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/littleheap/MachineLearning-Algorithms/28f6eb352b58c58a8b80603b44d998b28e1f3434/4.Decision Tree & Random Forest/figures/DT_reg.png -------------------------------------------------------------------------------- /4.Decision Tree & Random Forest/figures/DT_reg_depth.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/littleheap/MachineLearning-Algorithms/28f6eb352b58c58a8b80603b44d998b28e1f3434/4.Decision Tree & Random Forest/figures/DT_reg_depth.png -------------------------------------------------------------------------------- /4.Decision Tree & Random Forest/figures/RF_pair.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/littleheap/MachineLearning-Algorithms/28f6eb352b58c58a8b80603b44d998b28e1f3434/4.Decision Tree & Random Forest/figures/RF_pair.png -------------------------------------------------------------------------------- /4.Decision Tree & Random Forest/figures/depth.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/littleheap/MachineLearning-Algorithms/28f6eb352b58c58a8b80603b44d998b28e1f3434/4.Decision Tree & Random Forest/figures/depth.png -------------------------------------------------------------------------------- /4.Decision Tree & Random Forest/iris_tree.dot: -------------------------------------------------------------------------------- 1 | digraph Tree { 2 | node [shape=box] ; 3 | 0 [label="X[0] <= -0.416\nentropy = 1.582\nsamples = 105\nvalue = [36, 32, 37]"] ; 4 | 1 [label="X[1] <= -0.565\nentropy = 0.657\nsamples = 38\nvalue = [33, 4, 1]"] ; 5 | 0 -> 1 [labeldistance=2.5, labelangle=45, headlabel="True"] ; 6 | 2 [label="X[0] <= -1.309\nentropy = 1.252\nsamples = 6\nvalue = [1, 4, 1]"] ; 7 | 1 -> 2 ; 8 | 3 [label="entropy = 0.0\nsamples = 1\nvalue = [1, 0, 0]"] ; 9 | 2 -> 3 ; 10 | 4 [label="entropy = 0.722\nsamples = 5\nvalue = [0, 4, 1]"] ; 11 | 2 -> 4 ; 12 | 5 [label="entropy = 0.0\nsamples = 32\nvalue = [32, 0, 0]"] ; 13 | 1 
-> 5 ; 14 | 6 [label="X[1] <= 1.008\nentropy = 1.208\nsamples = 67\nvalue = [3, 28, 36]"] ; 15 | 0 -> 6 [labeldistance=2.5, labelangle=-45, headlabel="False"] ; 16 | 7 [label="X[0] <= 0.535\nentropy = 0.993\nsamples = 62\nvalue = [0, 28, 34]"] ; 17 | 6 -> 7 ; 18 | 8 [label="entropy = 0.918\nsamples = 30\nvalue = [0, 20, 10]"] ; 19 | 7 -> 8 ; 20 | 9 [label="entropy = 0.811\nsamples = 32\nvalue = [0, 8, 24]"] ; 21 | 7 -> 9 ; 22 | 10 [label="X[0] <= 0.773\nentropy = 0.971\nsamples = 5\nvalue = [3, 0, 2]"] ; 23 | 6 -> 10 ; 24 | 11 [label="entropy = 0.0\nsamples = 3\nvalue = [3, 0, 0]"] ; 25 | 10 -> 11 ; 26 | 12 [label="entropy = 0.0\nsamples = 2\nvalue = [0, 0, 2]"] ; 27 | 10 -> 12 ; 28 | } -------------------------------------------------------------------------------- /4.Decision Tree & Random Forest/principle.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/littleheap/MachineLearning-Algorithms/28f6eb352b58c58a8b80603b44d998b28e1f3434/4.Decision Tree & Random Forest/principle.pdf -------------------------------------------------------------------------------- /5.Boost/5.1 xgBoost_Intro.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import xgboost as xgb 3 | 4 | ''' 5 | 数据内容:蘑菇125特征对应是否有毒,测试集训练集简化存储模式,只将125特征列中,存在为是的特征标为1并集合。 6 | ''' 7 | 8 | 9 | # 自定义损失函数的梯度和二阶导 10 | def log_reg(y_hat, y): 11 | p = 1.0 / (1.0 + np.exp(-y_hat)) 12 | g = p - y.get_label() 13 | h = p * (1.0 - p) 14 | return g, h 15 | 16 | 17 | # 错误率定义函数 18 | def error_rate(y_hat, y): 19 | return 'error', float(sum(y.get_label() != (y_hat > 0.5))) / len(y_hat) 20 | 21 | 22 | if __name__ == "__main__": 23 | # 读取训练数据和测试数据 24 | data_train = xgb.DMatrix('agaricus_train.txt') 25 | data_test = xgb.DMatrix('agaricus_test.txt') 26 | 27 | # 设置参数: 28 | ''' 29 | max_depth-树深度 30 | eta-衰减因子:防止过拟合,1为原始模型 31 | silent-是否输出树的生成情况:1表示不输出 32 | objective-输出情况:binary是二分类,softmax是多分类。Logistic分类界限为0.5,logitraw的输出值为实数域,分类界限为0。 33 | ''' 34 | param = {'max_depth': 2, 'eta': 1, 'silent': 1, 'objective': 'binary:logistic'} 35 | # param = {'max_depth': 3, 'eta': 0.3, 'silent': 1, 'objective': 'reg:logistic'} 36 | # eval:evaluate估计数据 || train:训练数据 37 | watchlist = [(data_test, 'eval'), (data_train, 'train')] 38 | n_round = 3 # 决策树个数 39 | bst = xgb.train(param, data_train, num_boost_round=n_round, evals=watchlist) 40 | # 自定义损失函数的梯度和二阶导 41 | # bst = xgb.train(param, data_train, num_boost_round=n_round, evals=watchlist, obj=log_reg, feval=error_rate) 42 | 43 | # 计算错误率 44 | y_hat = bst.predict(data_test) 45 | y = data_test.get_label() 46 | print(y_hat) # [0.15353635 0.84625006 0.15353635 ... 0.95912963 0.02411181 0.95912963] 47 | print(y) # [0. 1. 0. ... 1. 0. 1.] 
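# ——补充推导(新增注释,说明上面自定义目标函数 log_reg 返回 (g, h) 的来源)——
# 设 p = sigmoid(y_hat) = 1 / (1 + exp(-y_hat)),对数损失为
#     l(y, y_hat) = -[ y*ln(p) + (1-y)*ln(1-p) ]
# 对 y_hat 求一阶导得 g = p - y,再求二阶导得 h = p*(1-p),与 log_reg 的返回值一致;
# xgboost 在每一轮用 (g, h) 对损失做二阶泰勒展开,从而决定新一棵树的结构与叶子权重。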
48 | error = sum(y != (y_hat > 0.5)) 49 | error_rate = float(error) / len(y_hat) 50 | print('样本总数:\t', len(y_hat)) # 样本总数: 1611 51 | print('错误数目:\t%4d' % error) # 错误数目: 10 52 | print('错误率:\t%.5f%%' % (100 * error_rate)) # 错误率: 0.62073% 53 | -------------------------------------------------------------------------------- /5.Boost/5.2 xgBoost_Predict.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import xgboost as xgb 3 | from sklearn.model_selection import train_test_split # cross_validation 4 | 5 | 6 | def iris_type(s): 7 | it = {b'Iris-setosa': 0, 8 | b'Iris-versicolor': 1, 9 | b'Iris-virginica': 2} 10 | return it[s] 11 | 12 | 13 | if __name__ == "__main__": 14 | path = u'.\\iris.data' # 数据文件路径 15 | data = np.loadtxt(path, dtype=float, delimiter=',', converters={4: iris_type}) 16 | x, y = np.split(data, (4,), axis=1) 17 | # 测试数据设置50,训练数据则为100 18 | x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=1, test_size=50) 19 | 20 | # 训练数据和标记组装 21 | data_train = xgb.DMatrix(x_train, label=y_train) 22 | # 测试数据和标记组装 23 | data_test = xgb.DMatrix(x_test, label=y_test) 24 | # 测试数据和训练数据整合 25 | watch_list = [(data_test, 'eval'), (data_train, 'train')] 26 | # objective:多分类问题,用softmax 27 | param = {'max_depth': 3, 'eta': 1, 'silent': 1, 'objective': 'multi:softmax', 'num_class': 3} 28 | 29 | # 训练函数 30 | bst = xgb.train(param, data_train, num_boost_round=6, evals=watch_list) 31 | # 预测标记 32 | y_hat = bst.predict(data_test) 33 | # 计算结果 34 | result = y_test.reshape(1, -1) == y_hat 35 | print('正确率:\t', float(np.sum(result)) / len(y_hat)) 36 | # 正确率: 0.96 37 | -------------------------------------------------------------------------------- /5.Boost/5.3 xgBoost_Wine.py: -------------------------------------------------------------------------------- 1 | # -*- encoding:utf-8 -*- 2 | 3 | import xgboost as xgb 4 | import numpy as np 5 | from sklearn.model_selection import train_test_split # cross_validation 6 | from sklearn.linear_model import LogisticRegression 7 | 8 | ''' 9 | 数据集:分类三种酒,对应的13个特征,第1列为标记数据,后面13列为13种特征 10 | ''' 11 | 12 | 13 | # 正确率计算函数 14 | def show_accuracy(a, b, tip): 15 | acc = a.ravel() == b.ravel() 16 | print(acc) 17 | print(tip + '正确率:\t', float(acc.sum()) / a.size) 18 | 19 | 20 | if __name__ == "__main__": 21 | data = np.loadtxt('wine.data', dtype=float, delimiter=',') 22 | # 第1列是分割点,前面是标记数据y,后面的是特征向量x 23 | y, x = np.split(data, (1,), axis=1) 24 | # x正则化,保证每一列均值是0,方差为1 25 | # x = StandardScaler().fit_transform(x) 26 | x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=1, test_size=0.5) 27 | 28 | # Logistic回归 29 | lr = LogisticRegression(penalty='l2') # L2正则 30 | lr.fit(x_train, y_train.ravel()) 31 | y_hat = lr.predict(x_test) 32 | show_accuracy(y_hat, y_test, 'Logistic回归 ') 33 | 34 | # XGBoost 35 | # 因为当前版本要求标记从0开始,所以把为3的标记设置为0,形成0 1 2三类标记 36 | y_train[y_train == 3] = 0 37 | y_test[y_test == 3] = 0 38 | data_train = xgb.DMatrix(x_train, label=y_train) 39 | data_test = xgb.DMatrix(x_test, label=y_test) 40 | watch_list = [(data_test, 'eval'), (data_train, 'train')] 41 | param = {'max_depth': 3, 'eta': 1, 'silent': 0, 'objective': 'multi:softmax', 'num_class': 3} 42 | bst = xgb.train(param, data_train, num_boost_round=4, evals=watch_list) 43 | y_hat = bst.predict(data_test) 44 | show_accuracy(y_hat, y_test, 'XGBoost ') 45 | # XGBoost 正确率: 0.9887640449438202 46 | -------------------------------------------------------------------------------- /5.Boost/5.4 xgBoost_ReadData.py: 
-------------------------------------------------------------------------------- 1 | import numpy as np 2 | import scipy.sparse 3 | import xgboost as xgb 4 | from sklearn.model_selection import train_test_split 5 | from sklearn.linear_model import LogisticRegression 6 | 7 | 8 | # 读数据函数 9 | def read_data(path): 10 | y = [] 11 | row = [] 12 | col = [] 13 | values = [] 14 | r = 0 # 首行 15 | for d in open(path): 16 | d = d.strip().split() # 以空格分开 17 | y.append(int(d[0])) 18 | d = d[1:] 19 | for c in d: 20 | key, value = c.split(':') 21 | row.append(r) # 添加行 22 | col.append(int(key)) # 添加列 23 | values.append(float(value)) # 添加Value 24 | r += 1 25 | # 稀疏矩阵,只存1的地方就行 || 稠密矩阵,0 1在矩阵中全部显示 26 | x = scipy.sparse.csr_matrix((values, (row, col))).toarray() 27 | y = np.array(y) 28 | return x, y 29 | 30 | 31 | def show_accuracy(a, b, tip): 32 | acc = a.ravel() == b.ravel() 33 | print(acc) 34 | print(tip + '正确率:\t', float(acc.sum()) / a.size) 35 | 36 | 37 | if __name__ == '__main__': 38 | x, y = read_data('agaricus_train.txt') 39 | x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=1, train_size=0.6) 40 | 41 | # Logistic回归 42 | lr = LogisticRegression(penalty='l2') # L2正则 43 | lr.fit(x_train, y_train.ravel()) 44 | y_hat = lr.predict(x_test) 45 | show_accuracy(y_hat, y_test, 'Logistic回归 ') 46 | 47 | # XGBoost 48 | y_train[y_train == 3] = 0 49 | y_test[y_test == 3] = 0 50 | data_train = xgb.DMatrix(x_train, label=y_train) 51 | data_test = xgb.DMatrix(x_test, label=y_test) 52 | watch_list = [(data_test, 'eval'), (data_train, 'train')] 53 | param = {'max_depth': 3, 'eta': 1, 'silent': 0, 'objective': 'multi:softmax', 'num_class': 3} 54 | bst = xgb.train(param, data_train, num_boost_round=4, evals=watch_list) 55 | y_hat = bst.predict(data_test) 56 | show_accuracy(y_hat, y_test, 'XGBoost ') 57 | # XGBoost 正确率: 0.9992325402916347 58 | -------------------------------------------------------------------------------- /5.Boost/5.6 Bagging_intro.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib as mpl 3 | import matplotlib.pyplot as plt 4 | from sklearn.pipeline import Pipeline 5 | from sklearn.linear_model import RidgeCV 6 | from sklearn.ensemble import BaggingRegressor 7 | from sklearn.tree import DecisionTreeRegressor 8 | from sklearn.preprocessing import PolynomialFeatures 9 | 10 | 11 | def f(x): 12 | return 0.5 * np.exp(-(x + 3) ** 2) + np.exp(-x ** 2) + + 0.5 * np.exp(-(x - 3) ** 2) 13 | 14 | 15 | if __name__ == "__main__": 16 | # 设定随机种子,保证每次运行结果相同 17 | np.random.seed(0) 18 | # 200个红色样本点 19 | N = 200 20 | # x属于[-5,5) 21 | x = np.random.rand(N) * 10 - 5 22 | # 排序x定义域 23 | x = np.sort(x) 24 | # y函数+随机噪声 25 | y = f(x) + 0.05 * np.random.randn(N) 26 | # 转换成一列 27 | x.shape = -1, 1 # 或200, 1 28 | 29 | # CV:cross validation 30 | ridge = RidgeCV(alphas=np.logspace(-3, 2, 10), fit_intercept=False) 31 | ridged = Pipeline([('poly', PolynomialFeatures(degree=10)), ('Ridge', ridge)]) 32 | # bagging操作,100次,每次取30%样本 33 | bagging_ridged = BaggingRegressor(ridged, n_estimators=100, max_samples=0.3) 34 | # 决策树回归 35 | dtr = DecisionTreeRegressor(max_depth=5) 36 | # 可以整合四种回归策略 37 | regs = [ 38 | ('DecisionTree Regressor', dtr), 39 | ('Ridge Regressor(6 Degree)', ridged), 40 | ('Bagging Ridge(6 Degree)', bagging_ridged), 41 | ('Bagging DecisionTree Regressor', BaggingRegressor(dtr, n_estimators=100, max_samples=0.3))] 42 | 43 | x_test = np.linspace(1.1 * x.min(), 1.1 * x.max(), 1000) 44 | mpl.rcParams['font.sans-serif'] = [u'SimHei'] 
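# ——补充说明(新增注释):Bagging 降方差的直觉——
# 若单个基学习器的方差为 σ²、两两相关系数为 ρ,则 B 个学习器取平均后的方差约为
#     ρσ² + (1-ρ)σ²/B
# 上文 BaggingRegressor 每次只抽取 30% 样本(max_samples=0.3)以降低学习器间的相关性 ρ,
# 再用 n_estimators=100 取平均压低第二项,从而获得比单个学习器更平滑的拟合。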
45 | mpl.rcParams['axes.unicode_minus'] = False 46 | # figsize:宽度 高度 || facecolor:背景白色 47 | plt.figure(figsize=(12, 8), facecolor='w') 48 | # 训练数据:原始离散样本点,红圈, 49 | plt.plot(x, y, 'ro', label=u'训练数据') 50 | # 测试数据:黑色线,粗3.5 51 | plt.plot(x_test, f(x_test), color='k', lw=3.5, label=u'真实值') 52 | # 设定四种个颜色 53 | clrs = 'bmyg' 54 | # 提取四种评估模型 55 | for i, (name, reg) in enumerate(regs): 56 | reg.fit(x, y) 57 | y_test = reg.predict(x_test.reshape(-1, 1)) 58 | plt.plot(x_test, y_test.ravel(), color=clrs[i], lw=i + 1, label=name, zorder=6 - i) 59 | plt.legend(loc='upper left') 60 | plt.xlabel('X', fontsize=15) 61 | plt.ylabel('Y', fontsize=15) 62 | plt.title(u'回归曲线拟合', fontsize=21) 63 | plt.ylim((-0.2, 1.2)) 64 | plt.tight_layout(2) 65 | plt.grid(True) 66 | plt.show() 67 | -------------------------------------------------------------------------------- /5.Boost/README.md: -------------------------------------------------------------------------------- 1 | ## Boost 2 | ## (提升算法) 3 | 4 | ### 项目背景 5 | > boost算法是基于PAC学习理论(probably approximately correct)而建立的一套集成学习算法(ensemble learning)。其根本思想在于通过多个简单的弱分类器,构建出准确率很高的强分类器,PAC学习理论证实了这一方法的可行性。提升方法思路:对于一个复杂的问题,将多个专家的判断进行适当的综合所得出的判断,要比任何一个专家单独判断好。每一步产生一个弱预测模型(如决策树),并加权累加到总模型中,可以用于回归和分类问题;如果每一步的弱预测模型生成都是依据损失函数的梯度方向,则称之为梯度提升(Gradient boosting)。 6 | 7 | ### 项目简介 8 | |名称|简介| 9 | |:-------------|:-------------:| 10 | |5.1 xgBoost_Intro|基于XGBoost算法预测蘑菇毒性| 11 | |5.2 xgBoost_Predict|基于XGBoost算法预测莺尾花种类| 12 | |5.3 xgBoost_Wine|对比Logistic与XGBoost算法对于酒类数据预测| 13 | |5.4 xgBoost_ReadData|基于XGBoost算法预测落叶松类别| 14 | |5.5 Titanic|基于XGBoost算法预测泰坦尼克号存活率| 15 | |5.6 Bagging_intro|Bagging操作引入以及数据模拟对比| 16 | 17 | ### 效果图 18 | #### ·几种算法回归效果比对 19 | 20 | 21 | -------------------------------------------------------------------------------- /5.Boost/Titannic_Meta.txt: -------------------------------------------------------------------------------- 1 | VARIABLE DESCRIPTIONS: 2 | survival Survival 3 | (0 = No; 1 = Yes) 4 | pclass Passenger Class 5 | (1 = 1st; 2 = 2nd; 3 = 3rd) 6 | name Name 7 | sex Sex 8 | age Age 9 | sibsp Number of Siblings/Spouses Aboard 10 | parch Number of Parents/Children Aboard 11 | ticket Ticket Number 12 | fare Passenger Fare 13 | cabin Cabin 14 | embarked Port of Embarkation 15 | (C = Cherbourg; Q = Queenstown; S = Southampton) 16 | 17 | SPECIAL NOTES: 18 | Pclass is a proxy for socio-economic status (SES) 19 | 1st ~ Upper; 2nd ~ Middle; 3rd ~ Lower 20 | 21 | Age is in Years; Fractional if Age less than One (1) 22 | If the Age is Estimated, it is in the form xx.5 23 | 24 | With respect to the family relation variables (i.e. sibsp and parch) 25 | some relations were ignored. The following are the definitions used 26 | for sibsp and parch. 27 | 28 | Sibling: Brother, Sister, Stepbrother, or Stepsister of Passenger Aboard Titanic 29 | Spouse: Husband or Wife of Passenger Aboard Titanic (Mistresses and Fiances Ignored) 30 | Parent: Mother or Father of Passenger Aboard Titanic 31 | Child: Son, Daughter, Stepson, or Stepdaughter of Passenger Aboard Titanic 32 | 33 | Other family relatives excluded from this study include cousins, 34 | nephews/nieces, aunts/uncles, and in-laws. Some children travelled 35 | only with a nanny, therefore parch=0 for them. As well, some 36 | travelled with very close friends or neighbors in a village, however, 37 | the definitions do not support such relations. 
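(新增示例,非本仓库文件:上面 Boost README 所说的"每一步依据损失函数的梯度方向生成弱模型并加权累加",即 F_m(x) = F_{m-1}(x) + eta * f_m(x),其中 f_m 拟合负梯度。以平方损失为例,负梯度恰好是残差 y - F(x);下面是一个极简示意,函数名为示意性命名:)
import numpy as np
from sklearn.tree import DecisionTreeRegressor

def gbm_fit(x, y, n_rounds=50, eta=0.1):
    # F_0:用 y 的均值作为常数初始模型
    pred = np.full(len(y), y.mean(), dtype=float)
    trees = []
    for _ in range(n_rounds):
        # 对平方损失 0.5*(y-F)^2,负梯度 = 残差 y - pred
        t = DecisionTreeRegressor(max_depth=2).fit(x, y - pred)
        pred += eta * t.predict(x)  # 学习率 eta 加权后累加进总模型
        trees.append(t)
    return trees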
-------------------------------------------------------------------------------- /5.Boost/agaricus.txt: -------------------------------------------------------------------------------- 1 | 0 cap-shape=bell i 2 | 1 cap-shape=conical i 3 | 2 cap-shape=convex i 4 | 3 cap-shape=flat i 5 | 4 cap-shape=knobbed i 6 | 5 cap-shape=sunken i 7 | 6 cap-surface=fibrous i 8 | 7 cap-surface=grooves i 9 | 8 cap-surface=scaly i 10 | 9 cap-surface=smooth i 11 | 10 cap-color=brown i 12 | 11 cap-color=buff i 13 | 12 cap-color=cinnamon i 14 | 13 cap-color=gray i 15 | 14 cap-color=green i 16 | 15 cap-color=pink i 17 | 16 cap-color=purple i 18 | 17 cap-color=red i 19 | 18 cap-color=white i 20 | 19 cap-color=yellow i 21 | 20 bruises?=bruises i 22 | 21 bruises?=no i 23 | 22 odor=almond i 24 | 23 odor=anise i 25 | 24 odor=creosote i 26 | 25 odor=fishy i 27 | 26 odor=foul i 28 | 27 odor=musty i 29 | 28 odor=none i 30 | 29 odor=pungent i 31 | 30 odor=spicy i 32 | 31 gill-attachment=attached i 33 | 32 gill-attachment=descending i 34 | 33 gill-attachment=free i 35 | 34 gill-attachment=notched i 36 | 35 gill-spacing=close i 37 | 36 gill-spacing=crowded i 38 | 37 gill-spacing=distant i 39 | 38 gill-size=broad i 40 | 39 gill-size=narrow i 41 | 40 gill-color=black i 42 | 41 gill-color=brown i 43 | 42 gill-color=buff i 44 | 43 gill-color=chocolate i 45 | 44 gill-color=gray i 46 | 45 gill-color=green i 47 | 46 gill-color=orange i 48 | 47 gill-color=pink i 49 | 48 gill-color=purple i 50 | 49 gill-color=red i 51 | 50 gill-color=white i 52 | 51 gill-color=yellow i 53 | 52 stalk-shape=enlarging i 54 | 53 stalk-shape=tapering i 55 | 54 stalk-root=bulbous i 56 | 55 stalk-root=club i 57 | 56 stalk-root=cup i 58 | 57 stalk-root=equal i 59 | 58 stalk-root=rhizomorphs i 60 | 59 stalk-root=rooted i 61 | 60 stalk-root=missing i 62 | 61 stalk-surface-above-ring=fibrous i 63 | 62 stalk-surface-above-ring=scaly i 64 | 63 stalk-surface-above-ring=silky i 65 | 64 stalk-surface-above-ring=smooth i 66 | 65 stalk-surface-below-ring=fibrous i 67 | 66 stalk-surface-below-ring=scaly i 68 | 67 stalk-surface-below-ring=silky i 69 | 68 stalk-surface-below-ring=smooth i 70 | 69 stalk-color-above-ring=brown i 71 | 70 stalk-color-above-ring=buff i 72 | 71 stalk-color-above-ring=cinnamon i 73 | 72 stalk-color-above-ring=gray i 74 | 73 stalk-color-above-ring=orange i 75 | 74 stalk-color-above-ring=pink i 76 | 75 stalk-color-above-ring=red i 77 | 76 stalk-color-above-ring=white i 78 | 77 stalk-color-above-ring=yellow i 79 | 78 stalk-color-below-ring=brown i 80 | 79 stalk-color-below-ring=buff i 81 | 80 stalk-color-below-ring=cinnamon i 82 | 81 stalk-color-below-ring=gray i 83 | 82 stalk-color-below-ring=orange i 84 | 83 stalk-color-below-ring=pink i 85 | 84 stalk-color-below-ring=red i 86 | 85 stalk-color-below-ring=white i 87 | 86 stalk-color-below-ring=yellow i 88 | 87 veil-type=partial i 89 | 88 veil-type=universal i 90 | 89 veil-color=brown i 91 | 90 veil-color=orange i 92 | 91 veil-color=white i 93 | 92 veil-color=yellow i 94 | 93 ring-number=none i 95 | 94 ring-number=one i 96 | 95 ring-number=two i 97 | 96 ring-type=cobwebby i 98 | 97 ring-type=evanescent i 99 | 98 ring-type=flaring i 100 | 99 ring-type=large i 101 | 100 ring-type=none i 102 | 101 ring-type=pendant i 103 | 102 ring-type=sheathing i 104 | 103 ring-type=zone i 105 | 104 spore-print-color=black i 106 | 105 spore-print-color=brown i 107 | 106 spore-print-color=buff i 108 | 107 spore-print-color=chocolate i 109 | 108 spore-print-color=green i 110 | 109 spore-print-color=orange i 111 | 
110 spore-print-color=purple i 112 | 111 spore-print-color=white i 113 | 112 spore-print-color=yellow i 114 | 113 population=abundant i 115 | 114 population=clustered i 116 | 115 population=numerous i 117 | 116 population=scattered i 118 | 117 population=several i 119 | 118 population=solitary i 120 | 119 habitat=grasses i 121 | 120 habitat=leaves i 122 | 121 habitat=meadows i 123 | 122 habitat=paths i 124 | 123 habitat=urban i 125 | 124 habitat=waste i 126 | 125 habitat=woods i 127 | -------------------------------------------------------------------------------- /5.Boost/figures/bagging.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/littleheap/MachineLearning-Algorithms/28f6eb352b58c58a8b80603b44d998b28e1f3434/5.Boost/figures/bagging.png -------------------------------------------------------------------------------- /5.Boost/iris.data: -------------------------------------------------------------------------------- 1 | 5.1,3.5,1.4,0.2,Iris-setosa 2 | 4.9,3.0,1.4,0.2,Iris-setosa 3 | 4.7,3.2,1.3,0.2,Iris-setosa 4 | 4.6,3.1,1.5,0.2,Iris-setosa 5 | 5.0,3.6,1.4,0.2,Iris-setosa 6 | 5.4,3.9,1.7,0.4,Iris-setosa 7 | 4.6,3.4,1.4,0.3,Iris-setosa 8 | 5.0,3.4,1.5,0.2,Iris-setosa 9 | 4.4,2.9,1.4,0.2,Iris-setosa 10 | 4.9,3.1,1.5,0.1,Iris-setosa 11 | 5.4,3.7,1.5,0.2,Iris-setosa 12 | 4.8,3.4,1.6,0.2,Iris-setosa 13 | 4.8,3.0,1.4,0.1,Iris-setosa 14 | 4.3,3.0,1.1,0.1,Iris-setosa 15 | 5.8,4.0,1.2,0.2,Iris-setosa 16 | 5.7,4.4,1.5,0.4,Iris-setosa 17 | 5.4,3.9,1.3,0.4,Iris-setosa 18 | 5.1,3.5,1.4,0.3,Iris-setosa 19 | 5.7,3.8,1.7,0.3,Iris-setosa 20 | 5.1,3.8,1.5,0.3,Iris-setosa 21 | 5.4,3.4,1.7,0.2,Iris-setosa 22 | 5.1,3.7,1.5,0.4,Iris-setosa 23 | 4.6,3.6,1.0,0.2,Iris-setosa 24 | 5.1,3.3,1.7,0.5,Iris-setosa 25 | 4.8,3.4,1.9,0.2,Iris-setosa 26 | 5.0,3.0,1.6,0.2,Iris-setosa 27 | 5.0,3.4,1.6,0.4,Iris-setosa 28 | 5.2,3.5,1.5,0.2,Iris-setosa 29 | 5.2,3.4,1.4,0.2,Iris-setosa 30 | 4.7,3.2,1.6,0.2,Iris-setosa 31 | 4.8,3.1,1.6,0.2,Iris-setosa 32 | 5.4,3.4,1.5,0.4,Iris-setosa 33 | 5.2,4.1,1.5,0.1,Iris-setosa 34 | 5.5,4.2,1.4,0.2,Iris-setosa 35 | 4.9,3.1,1.5,0.1,Iris-setosa 36 | 5.0,3.2,1.2,0.2,Iris-setosa 37 | 5.5,3.5,1.3,0.2,Iris-setosa 38 | 4.9,3.1,1.5,0.1,Iris-setosa 39 | 4.4,3.0,1.3,0.2,Iris-setosa 40 | 5.1,3.4,1.5,0.2,Iris-setosa 41 | 5.0,3.5,1.3,0.3,Iris-setosa 42 | 4.5,2.3,1.3,0.3,Iris-setosa 43 | 4.4,3.2,1.3,0.2,Iris-setosa 44 | 5.0,3.5,1.6,0.6,Iris-setosa 45 | 5.1,3.8,1.9,0.4,Iris-setosa 46 | 4.8,3.0,1.4,0.3,Iris-setosa 47 | 5.1,3.8,1.6,0.2,Iris-setosa 48 | 4.6,3.2,1.4,0.2,Iris-setosa 49 | 5.3,3.7,1.5,0.2,Iris-setosa 50 | 5.0,3.3,1.4,0.2,Iris-setosa 51 | 7.0,3.2,4.7,1.4,Iris-versicolor 52 | 6.4,3.2,4.5,1.5,Iris-versicolor 53 | 6.9,3.1,4.9,1.5,Iris-versicolor 54 | 5.5,2.3,4.0,1.3,Iris-versicolor 55 | 6.5,2.8,4.6,1.5,Iris-versicolor 56 | 5.7,2.8,4.5,1.3,Iris-versicolor 57 | 6.3,3.3,4.7,1.6,Iris-versicolor 58 | 4.9,2.4,3.3,1.0,Iris-versicolor 59 | 6.6,2.9,4.6,1.3,Iris-versicolor 60 | 5.2,2.7,3.9,1.4,Iris-versicolor 61 | 5.0,2.0,3.5,1.0,Iris-versicolor 62 | 5.9,3.0,4.2,1.5,Iris-versicolor 63 | 6.0,2.2,4.0,1.0,Iris-versicolor 64 | 6.1,2.9,4.7,1.4,Iris-versicolor 65 | 5.6,2.9,3.6,1.3,Iris-versicolor 66 | 6.7,3.1,4.4,1.4,Iris-versicolor 67 | 5.6,3.0,4.5,1.5,Iris-versicolor 68 | 5.8,2.7,4.1,1.0,Iris-versicolor 69 | 6.2,2.2,4.5,1.5,Iris-versicolor 70 | 5.6,2.5,3.9,1.1,Iris-versicolor 71 | 5.9,3.2,4.8,1.8,Iris-versicolor 72 | 6.1,2.8,4.0,1.3,Iris-versicolor 73 | 6.3,2.5,4.9,1.5,Iris-versicolor 74 | 6.1,2.8,4.7,1.2,Iris-versicolor 75 | 
6.4,2.9,4.3,1.3,Iris-versicolor 76 | 6.6,3.0,4.4,1.4,Iris-versicolor 77 | 6.8,2.8,4.8,1.4,Iris-versicolor 78 | 6.7,3.0,5.0,1.7,Iris-versicolor 79 | 6.0,2.9,4.5,1.5,Iris-versicolor 80 | 5.7,2.6,3.5,1.0,Iris-versicolor 81 | 5.5,2.4,3.8,1.1,Iris-versicolor 82 | 5.5,2.4,3.7,1.0,Iris-versicolor 83 | 5.8,2.7,3.9,1.2,Iris-versicolor 84 | 6.0,2.7,5.1,1.6,Iris-versicolor 85 | 5.4,3.0,4.5,1.5,Iris-versicolor 86 | 6.0,3.4,4.5,1.6,Iris-versicolor 87 | 6.7,3.1,4.7,1.5,Iris-versicolor 88 | 6.3,2.3,4.4,1.3,Iris-versicolor 89 | 5.6,3.0,4.1,1.3,Iris-versicolor 90 | 5.5,2.5,4.0,1.3,Iris-versicolor 91 | 5.5,2.6,4.4,1.2,Iris-versicolor 92 | 6.1,3.0,4.6,1.4,Iris-versicolor 93 | 5.8,2.6,4.0,1.2,Iris-versicolor 94 | 5.0,2.3,3.3,1.0,Iris-versicolor 95 | 5.6,2.7,4.2,1.3,Iris-versicolor 96 | 5.7,3.0,4.2,1.2,Iris-versicolor 97 | 5.7,2.9,4.2,1.3,Iris-versicolor 98 | 6.2,2.9,4.3,1.3,Iris-versicolor 99 | 5.1,2.5,3.0,1.1,Iris-versicolor 100 | 5.7,2.8,4.1,1.3,Iris-versicolor 101 | 6.3,3.3,6.0,2.5,Iris-virginica 102 | 5.8,2.7,5.1,1.9,Iris-virginica 103 | 7.1,3.0,5.9,2.1,Iris-virginica 104 | 6.3,2.9,5.6,1.8,Iris-virginica 105 | 6.5,3.0,5.8,2.2,Iris-virginica 106 | 7.6,3.0,6.6,2.1,Iris-virginica 107 | 4.9,2.5,4.5,1.7,Iris-virginica 108 | 7.3,2.9,6.3,1.8,Iris-virginica 109 | 6.7,2.5,5.8,1.8,Iris-virginica 110 | 7.2,3.6,6.1,2.5,Iris-virginica 111 | 6.5,3.2,5.1,2.0,Iris-virginica 112 | 6.4,2.7,5.3,1.9,Iris-virginica 113 | 6.8,3.0,5.5,2.1,Iris-virginica 114 | 5.7,2.5,5.0,2.0,Iris-virginica 115 | 5.8,2.8,5.1,2.4,Iris-virginica 116 | 6.4,3.2,5.3,2.3,Iris-virginica 117 | 6.5,3.0,5.5,1.8,Iris-virginica 118 | 7.7,3.8,6.7,2.2,Iris-virginica 119 | 7.7,2.6,6.9,2.3,Iris-virginica 120 | 6.0,2.2,5.0,1.5,Iris-virginica 121 | 6.9,3.2,5.7,2.3,Iris-virginica 122 | 5.6,2.8,4.9,2.0,Iris-virginica 123 | 7.7,2.8,6.7,2.0,Iris-virginica 124 | 6.3,2.7,4.9,1.8,Iris-virginica 125 | 6.7,3.3,5.7,2.1,Iris-virginica 126 | 7.2,3.2,6.0,1.8,Iris-virginica 127 | 6.2,2.8,4.8,1.8,Iris-virginica 128 | 6.1,3.0,4.9,1.8,Iris-virginica 129 | 6.4,2.8,5.6,2.1,Iris-virginica 130 | 7.2,3.0,5.8,1.6,Iris-virginica 131 | 7.4,2.8,6.1,1.9,Iris-virginica 132 | 7.9,3.8,6.4,2.0,Iris-virginica 133 | 6.4,2.8,5.6,2.2,Iris-virginica 134 | 6.3,2.8,5.1,1.5,Iris-virginica 135 | 6.1,2.6,5.6,1.4,Iris-virginica 136 | 7.7,3.0,6.1,2.3,Iris-virginica 137 | 6.3,3.4,5.6,2.4,Iris-virginica 138 | 6.4,3.1,5.5,1.8,Iris-virginica 139 | 6.0,3.0,4.8,1.8,Iris-virginica 140 | 6.9,3.1,5.4,2.1,Iris-virginica 141 | 6.7,3.1,5.6,2.4,Iris-virginica 142 | 6.9,3.1,5.1,2.3,Iris-virginica 143 | 5.8,2.7,5.1,1.9,Iris-virginica 144 | 6.8,3.2,5.9,2.3,Iris-virginica 145 | 6.7,3.3,5.7,2.5,Iris-virginica 146 | 6.7,3.0,5.2,2.3,Iris-virginica 147 | 6.3,2.5,5.0,1.9,Iris-virginica 148 | 6.5,3.0,5.2,2.0,Iris-virginica 149 | 6.2,3.4,5.4,2.3,Iris-virginica 150 | 5.9,3.0,5.1,1.8,Iris-virginica 151 | 152 | -------------------------------------------------------------------------------- /5.Boost/principle.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/littleheap/MachineLearning-Algorithms/28f6eb352b58c58a8b80603b44d998b28e1f3434/5.Boost/principle.pdf -------------------------------------------------------------------------------- /5.Boost/wine_names: -------------------------------------------------------------------------------- 1 | 1. Title of Database: Wine recognition data 2 | Updated Sept 21, 1998 by C.Blake : Added attribute information 3 | 4 | 2. Sources: 5 | (a) Forina, M. 
et al, PARVUS - An Extendible Package for Data 6 | Exploration, Classification and Correlation. Institute of Pharmaceutical 7 | and Food Analysis and Technologies, Via Brigata Salerno, 8 | 16147 Genoa, Italy. 9 | 10 | (b) Stefan Aeberhard, email: stefan@coral.cs.jcu.edu.au 11 | (c) July 1991 12 | 3. Past Usage: 13 | 14 | (1) 15 | S. Aeberhard, D. Coomans and O. de Vel, 16 | Comparison of Classifiers in High Dimensional Settings, 17 | Tech. Rep. no. 92-02, (1992), Dept. of Computer Science and Dept. of 18 | Mathematics and Statistics, James Cook University of North Queensland. 19 | (Also submitted to Technometrics). 20 | 21 | The data was used with many others for comparing various 22 | classifiers. The classes are separable, though only RDA 23 | has achieved 100% correct classification. 24 | (RDA : 100%, QDA 99.4%, LDA 98.9%, 1NN 96.1% (z-transformed data)) 25 | (All results using the leave-one-out technique) 26 | 27 | In a classification context, this is a well posed problem 28 | with "well behaved" class structures. A good data set 29 | for first testing of a new classifier, but not very 30 | challenging. 31 | 32 | (2) 33 | S. Aeberhard, D. Coomans and O. de Vel, 34 | "THE CLASSIFICATION PERFORMANCE OF RDA" 35 | Tech. Rep. no. 92-01, (1992), Dept. of Computer Science and Dept. of 36 | Mathematics and Statistics, James Cook University of North Queensland. 37 | (Also submitted to Journal of Chemometrics). 38 | 39 | Here, the data was used to illustrate the superior performance of 40 | the use of a new appreciation function with RDA. 41 | 42 | 4. Relevant Information: 43 | 44 | -- These data are the results of a chemical analysis of 45 | wines grown in the same region in Italy but derived from three 46 | different cultivars. 47 | The analysis determined the quantities of 13 constituents 48 | found in each of the three types of wines. 49 | 50 | -- I think that the initial data set had around 30 variables, but 51 | for some reason I only have the 13 dimensional version. 52 | I had a list of what the 30 or so variables were, but a.) 53 | I lost it, and b.), I would not know which 13 variables 54 | are included in the set. 55 | 56 | -- The attributes are (dontated by Riccardo Leardi, 57 | riclea@anchem.unige.it ) 58 | 1) Alcohol 59 | 2) Malic acid 60 | 3) Ash 61 | 4) Alcalinity of ash 62 | 5) Magnesium 63 | 6) Total phenols 64 | 7) Flavanoids 65 | 8) Nonflavanoid phenols 66 | 9) Proanthocyanins 67 | 10)Color intensity 68 | 11)Hue 69 | 12)OD280/OD315 of diluted wines 70 | 13)Proline 71 | 72 | 5. Number of Instances 73 | 74 | class 1 59 75 | class 2 71 76 | class 3 48 77 | 78 | 6. Number of Attributes 79 | 80 | 13 81 | 82 | 7. For Each Attribute: 83 | 84 | All attributes are continuous 85 | 86 | No statistics available, but suggest to standardise 87 | variables for certain uses (e.g. for us with classifiers 88 | which are NOT scale invariant) 89 | 90 | NOTE: 1st attribute is class identifier (1-3) 91 | 92 | 8. Missing Attribute Values: 93 | 94 | None 95 | 96 | 9. 
Class Distribution: number of instances per class 97 | 98 | class 1 59 99 | class 2 71 100 | class 3 48 101 | -------------------------------------------------------------------------------- /6.SVM/6.1 SVM_intro.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib as mpl 3 | import matplotlib.pyplot as plt 4 | from sklearn import svm 5 | from sklearn.model_selection import train_test_split 6 | 7 | 8 | def iris_type(s): 9 | it = {b'Iris-setosa': 0, 10 | b'Iris-versicolor': 1, 11 | b'Iris-virginica': 2} 12 | return it[s] 13 | 14 | 15 | # 'sepal length', 'sepal width', 'petal length', 'petal width' 16 | iris_feature = u'花萼长度', u'花萼宽度', u'花瓣长度', u'花瓣宽度' 17 | 18 | 19 | def show_accuracy(a, b, tip): 20 | acc = a.ravel() == b.ravel() 21 | print(tip + '正确率:', np.mean(acc)) 22 | 23 | 24 | if __name__ == "__main__": 25 | path = '.\\iris.data' # 数据文件路径 26 | data = np.loadtxt(path, dtype=float, delimiter=',', converters={4: iris_type}) 27 | x, y = np.split(data, (4,), axis=1) 28 | x = x[:, :2] # 取前2个特征 29 | x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=1, train_size=0.6) 30 | 31 | # 分类器 32 | # clf = svm.SVC(C=0.1, kernel='linear', decision_function_shape='ovr') 33 | # 超参数C和分类过渡带的宽度成反比,rbf是计算机核函数 34 | clf = svm.SVC(C=0.8, kernel='rbf', gamma=20, decision_function_shape='ovr') 35 | clf.fit(x_train, y_train.ravel()) 36 | 37 | # 准确率 38 | print(clf.score(x_train, y_train)) # 精度 39 | y_hat = clf.predict(x_train) 40 | print('y_hat:\n', y_hat) 41 | show_accuracy(y_hat, y_train, '训练集') 42 | print(clf.score(x_test, y_test)) 43 | y_hat = clf.predict(x_test) 44 | show_accuracy(y_hat, y_test, '测试集') 45 | 46 | # 画图 47 | x1_min, x1_max = x[:, 0].min(), x[:, 0].max() # 第0列的范围 48 | x2_min, x2_max = x[:, 1].min(), x[:, 1].max() # 第1列的范围 49 | x1, x2 = np.mgrid[x1_min:x1_max:500j, x2_min:x2_max:500j] # 生成网格采样点 50 | grid_test = np.stack((x1.flat, x2.flat), axis=1) # 测试点 51 | 52 | Z = clf.decision_function(grid_test) # 样本到决策面的距离 53 | print("Z:\n", Z) 54 | 55 | grid_hat = clf.predict(grid_test) # 预测分类值 56 | print(grid_hat) 57 | 58 | grid_hat = grid_hat.reshape(x1.shape) # 使之与输入的形状相同 59 | mpl.rcParams['font.sans-serif'] = [u'SimHei'] 60 | mpl.rcParams['axes.unicode_minus'] = False 61 | 62 | cm_light = mpl.colors.ListedColormap(['#A0FFA0', '#FFA0A0', '#A0A0FF']) 63 | cm_dark = mpl.colors.ListedColormap(['g', 'r', 'b']) 64 | x1_min, x1_max = x[:, 0].min(), x[:, 0].max() # 第0列的范围 65 | x2_min, x2_max = x[:, 1].min(), x[:, 1].max() # 第1列的范围 66 | x1, x2 = np.mgrid[x1_min:x1_max:500j, x2_min:x2_max:500j] # 生成网格采样点 67 | grid_test = np.stack((x1.flat, x2.flat), axis=1) # 测试点 68 | plt.pcolormesh(x1, x2, grid_hat, cmap=cm_light) 69 | 70 | plt.scatter(x[:, 0], x[:, 1], c=y.reshape(x[:, 0].shape), edgecolors='k', s=50, cmap=cm_dark) # 样本 71 | plt.scatter(x_test[:, 0], x_test[:, 1], s=120, facecolors='none', zorder=10) # 圈中测试集样本 72 | plt.xlabel(iris_feature[0], fontsize=13) 73 | plt.ylabel(iris_feature[1], fontsize=13) 74 | plt.xlim(x1_min, x1_max) 75 | plt.ylim(x2_min, x2_max) 76 | plt.title(u'鸢尾花SVM二特征分类', fontsize=15) 77 | plt.grid() 78 | plt.show() 79 | -------------------------------------------------------------------------------- /6.SVM/6.2 SVM_draw.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.colors 3 | import matplotlib.pyplot as plt 4 | from sklearn import svm 5 | 6 | 7 | def show_accuracy(a, b): 8 | acc = a.ravel() == b.ravel() 9 | 
print('正确率:%.2f%%' % (100 * float(acc.sum()) / a.size)) 10 | 11 | 12 | if __name__ == "__main__": 13 | data = np.loadtxt('bipartition.txt', dtype=np.float, delimiter='\t') 14 | # 第0 1列给x,从第2列及其往后分给y 15 | x, y = np.split(data, (2,), axis=1) 16 | # 分类标注规范为1 -1 17 | y[y == 0] = -1 18 | # 从列形式变换为行形式 19 | y = y.ravel() 20 | 21 | # 分类器 22 | clfs = [svm.SVC(C=0.3, kernel='linear'), 23 | svm.SVC(C=10, kernel='linear'), 24 | svm.SVC(C=5, kernel='rbf', gamma=1), 25 | svm.SVC(C=5, kernel='rbf', gamma=4)] 26 | titles = 'Linear,C=0.3', 'Linear, C=10', 'RBF, gamma=1', 'RBF, gamma=4' 27 | 28 | x1_min, x1_max = x[:, 0].min(), x[:, 0].max() # 第0列的范围 29 | x2_min, x2_max = x[:, 1].min(), x[:, 1].max() # 第1列的范围 30 | x1, x2 = np.mgrid[x1_min:x1_max:500j, x2_min:x2_max:500j] # 生成网格采样点 31 | grid_test = np.stack((x1.flat, x2.flat), axis=1) # 测试点 32 | 33 | cm_light = matplotlib.colors.ListedColormap(['#77E0A0', '#FF8080']) 34 | cm_dark = matplotlib.colors.ListedColormap(['g', 'r']) 35 | matplotlib.rcParams['font.sans-serif'] = [u'SimHei'] 36 | matplotlib.rcParams['axes.unicode_minus'] = False 37 | # 窗口大小分别为10 8英寸,背景色为白色 38 | plt.figure(figsize=(10, 8), facecolor='w') 39 | for i, clf in enumerate(clfs): 40 | clf.fit(x, y) 41 | y_hat = clf.predict(x) 42 | show_accuracy(y_hat, y) # 准确率 43 | 44 | # 画图 45 | print('支撑向量的数目:', clf.n_support_) 46 | print('支撑向量的系数:', clf.dual_coef_) 47 | print('支撑向量:', clf.support_) 48 | print('\n') 49 | 50 | plt.subplot(2, 2, i + 1) 51 | grid_hat = clf.predict(grid_test) # 预测分类值 52 | grid_hat = grid_hat.reshape(x1.shape) # 使之与输入的形状相同 53 | plt.pcolormesh(x1, x2, grid_hat, cmap=cm_light, alpha=0.8) 54 | plt.scatter(x[:, 0], x[:, 1], c=y, edgecolors='k', s=40, cmap=cm_dark) # 样本的显示 55 | plt.scatter(x[clf.support_, 0], x[clf.support_, 1], edgecolors='k', facecolors='none', s=100, 56 | marker='o') # 支撑向量 57 | # 画等高线 58 | z = clf.decision_function(grid_test) 59 | print('z:\n', z) 60 | z = z.reshape(x1.shape) 61 | plt.contour(x1, x2, z, colors=list('krk'), linestyles=['--', '-', '--'], linewidths=[1, 2, 1], 62 | levels=[-1, 0, 1]) 63 | plt.xlim(x1_min, x1_max) 64 | plt.ylim(x2_min, x2_max) 65 | plt.title(titles[i]) 66 | plt.grid() 67 | plt.suptitle(u'SVM不同参数的分类', fontsize=18) 68 | plt.tight_layout(2) 69 | plt.subplots_adjust(top=0.92) 70 | plt.show() 71 | -------------------------------------------------------------------------------- /6.SVM/6.3 ClassifierIndex.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from sklearn.metrics import accuracy_score 3 | from sklearn.metrics import precision_score, recall_score, f1_score, fbeta_score 4 | from sklearn.metrics import precision_recall_fscore_support 5 | 6 | if __name__ == "__main__": 7 | y_true = np.array([1, 1, 1, 1, 0, 0]) 8 | y_hat = np.array([1, 0, 1, 1, 1, 1]) 9 | print('Accuracy:\t', accuracy_score(y_true, y_hat)) 10 | 11 | ''' 12 | The precision is the ratio 'tp / (tp + fp)' 13 | 'tp' is the number of true positives 14 | 'fp' the number of false positives 15 | The precision is intuitively the ability of the classifier not to label as positive a sample that is negative. 16 | The best value is 1 and the worst value is 0. 17 | ''' 18 | precision = precision_score(y_true, y_hat) 19 | print('Precision:\t', precision) 20 | 21 | ''' 22 | The recall is the ratio 'tp / (tp + fn)' 23 | 'tp' is the number of true positives 24 | 'fn' the number of false negatives 25 | The recall is intuitively the ability of the classifier to find all the positive samples. 
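For the y_true / y_hat arrays defined above: tp = 3 (indices 0, 2, 3) and fn = 1 (index 1), so recall = 3 / (3 + 1) = 0.75, matching the output quoted at the end of this script.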
26 | The best value is 1 and the worst value is 0. 27 | ''' 28 | recall = recall_score(y_true, y_hat) 29 | print('Recall: \t', recall) 30 | 31 | ''' 32 | F1 score, also known as balanced F-score or F-measure 33 | The F1 score can be interpreted as a weighted average of the precision and recall 34 | an F1 score reaches its best value at 1 and worst score at 0. 35 | The relative contribution of precision and recall to the F1 score are equal. 36 | The formula for the F1 score is: 37 | F1 = 2 * (precision * recall) / (precision + recall) 38 | ''' 39 | 40 | print('f1 score: \t', f1_score(y_true, y_hat)) 41 | # print(2 * (precision * recall) / (precision + recall)) 42 | 43 | ''' 44 | The F-beta score is the weighted harmonic mean of precision and recall, 45 | reaching its optimal value at 1 and its worst value at 0. 46 | The 'beta' parameter determines the weight of precision in the combined score. 47 | 'beta < 1' lends more weight to precision 48 | 'beta > 1' favors recall 49 | ('beta -> 0' considers only precision, 'beta -> infinite' only recall). 50 | ''' 51 | print('F-beta:') 52 | # beta取0.001~1000 53 | for beta in np.logspace(-3, 3, num=7, base=10): 54 | fbeta = fbeta_score(y_true, y_hat, beta=beta) 55 | print('\tbeta=%9.3f\tF-beta=%.5f' % (beta, fbeta)) 56 | # print((1+beta**2)*precision*recall / (beta**2 * precision + recall)) 57 | 58 | print(precision_recall_fscore_support(y_true, y_hat, beta=1)) 59 | 60 | ''' 61 | Accuracy: 0.5 62 | Precision: 0.6 63 | Recall: 0.75 64 | f1 score: 0.6666666666666665 65 | F-beta: 66 | beta= 0.001 F-beta=0.60000 67 | beta= 0.010 F-beta=0.60001 68 | beta= 0.100 F-beta=0.60119 69 | beta= 1.000 F-beta=0.66667 70 | beta= 10.000 F-beta=0.74815 71 | beta= 100.000 F-beta=0.74998 72 | beta= 1000.000 F-beta=0.75000 73 | (array([0. , 0.6]), array([0. , 0.75]), array([0. 
, 0.66666667]), array([2, 4], dtype=int64)) 74 | ''' -------------------------------------------------------------------------------- /6.SVM/6.4 unBalance.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | import numpy as np 3 | import matplotlib.colors 4 | import matplotlib.pyplot as plt 5 | from sklearn import svm 6 | from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, fbeta_score 7 | 8 | 9 | def show_accuracy(a, b): 10 | acc = a.ravel() == b.ravel() 11 | print('正确率:%.2f%%' % (100 * float(acc.sum()) / a.size)) 12 | 13 | 14 | def show_recall(y, y_hat): 15 | # print y_hat[y == 1] 16 | print('召回率:%.2f%%' % (100 * float(np.sum(y_hat[y == 1] == 1)) / np.extract(y == 1, y).size)) 17 | 18 | 19 | if __name__ == "__main__": 20 | warnings.filterwarnings("ignore") # UndefinedMetricWarning:忽略分母为0情况 21 | np.random.seed(0) # 保持每次生成的数据相同 22 | 23 | c1 = 990 24 | c2 = 10 25 | N = c1 + c2 26 | x_c1 = 3 * np.random.randn(c1, 2) # 990行*2列 27 | print(x_c1) 28 | x_c2 = 0.5 * np.random.randn(c2, 2) + (4, 4) # 10行*2列 围绕(4,4) 29 | x = np.vstack((x_c1, x_c2)) 30 | y = np.ones(N) 31 | # 990*-1 || 10*+1 32 | y[:c1] = -1 33 | 34 | # 显示大小 35 | s = np.ones(N) * 30 36 | s[:c1] = 10 37 | 38 | # 分类器 39 | clfs = [svm.SVC(C=1, kernel='linear'), 40 | svm.SVC(C=1, kernel='linear', class_weight={-1: 1, 1: 50}), 41 | svm.SVC(C=0.8, kernel='rbf', gamma=0.5, class_weight={-1: 1, 1: 2}), 42 | svm.SVC(C=0.8, kernel='rbf', gamma=0.5, class_weight={-1: 1, 1: 10})] 43 | titles = 'Linear', 'Linear, Weight=50', 'RBF, Weight=2', 'RBF, Weight=10' 44 | 45 | x1_min, x1_max = x[:, 0].min(), x[:, 0].max() # 第0列的范围 46 | x2_min, x2_max = x[:, 1].min(), x[:, 1].max() # 第1列的范围 47 | x1, x2 = np.mgrid[x1_min:x1_max:500j, x2_min:x2_max:500j] # 生成网格采样点 48 | grid_test = np.stack((x1.flat, x2.flat), axis=1) # 测试点 49 | 50 | cm_light = matplotlib.colors.ListedColormap(['#77E0A0', '#FF8080']) 51 | cm_dark = matplotlib.colors.ListedColormap(['g', 'r']) 52 | matplotlib.rcParams['font.sans-serif'] = [u'SimHei'] 53 | matplotlib.rcParams['axes.unicode_minus'] = False 54 | plt.figure(figsize=(10, 8), facecolor='w') 55 | for i, clf in enumerate(clfs): 56 | clf.fit(x, y) 57 | y_hat = clf.predict(x) 58 | # show_accuracy(y_hat, y) # 正确率 59 | # show_recall(y, y_hat) # 召回率 60 | print(i + 1, '次:') 61 | print('正确率:\t', accuracy_score(y, y_hat)) 62 | print(' 精度 :\t', precision_score(y, y_hat, pos_label=1)) 63 | print('召回率:\t', recall_score(y, y_hat, pos_label=1)) 64 | print('F1Score:\t', f1_score(y, y_hat, pos_label=1)) 65 | 66 | # 画图 67 | plt.subplot(2, 2, i + 1) 68 | grid_hat = clf.predict(grid_test) # 预测分类值 69 | grid_hat = grid_hat.reshape(x1.shape) # 使之与输入的形状相同 70 | plt.pcolormesh(x1, x2, grid_hat, cmap=cm_light, alpha=0.8) 71 | plt.scatter(x[:, 0], x[:, 1], c=y, edgecolors='k', s=s, cmap=cm_dark) # 样本的显示 72 | plt.xlim(x1_min, x1_max) 73 | plt.ylim(x2_min, x2_max) 74 | plt.title(titles[i]) 75 | plt.grid() 76 | plt.suptitle(u'不平衡数据的处理', fontsize=18) 77 | plt.tight_layout(1.5) 78 | plt.subplots_adjust(top=0.92) 79 | plt.show() 80 | -------------------------------------------------------------------------------- /6.SVM/6.5 HandWrittenDigits.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | import matplotlib.colors 4 | import matplotlib.pyplot as plt 5 | from PIL import Image 6 | from sklearn import svm 7 | 8 | 9 | def show_accuracy(a, b, tip): 10 | acc = a.ravel() == b.ravel() 11 | print(tip + 
'正确率:%.2f%%' % (100 * np.mean(acc))) 12 | 13 | 14 | def save_image(im, i): 15 | im *= 15.9375 16 | im = 255 - im 17 | a = im.astype(np.uint8) 18 | output_path = '.\\HandWritten' 19 | if not os.path.exists(output_path): 20 | os.mkdir(output_path) 21 | Image.fromarray(a).save(output_path + ('\\%d.png' % i)) 22 | 23 | 24 | if __name__ == "__main__": 25 | print('Load Training File Start...') 26 | data = np.loadtxt('optdigits.tra', dtype=np.float, delimiter=',') 27 | # 除了最后1列都给x,一共64列;最后1列给y 28 | x, y = np.split(data, (-1,), axis=1) 29 | images = x.reshape(-1, 8, 8) 30 | y = y.ravel().astype(np.int) 31 | 32 | print('Load Test Data Start...') 33 | data = np.loadtxt('optdigits.tes', dtype=np.float, delimiter=',') 34 | x_test, y_test = np.split(data, (-1,), axis=1) 35 | images_test = x_test.reshape(-1, 8, 8) 36 | y_test = y_test.ravel().astype(np.int) 37 | print('Load Data OK...') 38 | 39 | # x, x_test, y, y_test = train_test_split(x, y, random_state=1) 40 | # images = x.reshape(-1, 8, 8) 41 | # images_test = x_test.reshape(-1, 8, 8) 42 | 43 | matplotlib.rcParams['font.sans-serif'] = [u'SimHei'] 44 | matplotlib.rcParams['axes.unicode_minus'] = False 45 | # 宽度15英寸,长度9英寸,背景白色 46 | plt.figure(figsize=(15, 9), facecolor='w') 47 | # 取3000多个图像的前16个 48 | for index, image in enumerate(images[:16]): 49 | plt.subplot(4, 8, index + 1) 50 | plt.imshow(image, cmap=plt.cm.gray_r, interpolation='nearest') 51 | plt.title(u'训练图片: %i' % y[index]) 52 | for index, image in enumerate(images_test[:16]): 53 | plt.subplot(4, 8, index + 17) 54 | plt.imshow(image, cmap=plt.cm.gray_r, interpolation='nearest') 55 | save_image(image.copy(), index) 56 | plt.title(u'测试图片: %i' % y_test[index]) 57 | plt.tight_layout() 58 | plt.show() 59 | 60 | clf = svm.SVC(C=1, kernel='rbf', gamma=0.001) # ~ kNN 61 | print('Start Learning...') 62 | clf.fit(x, y) 63 | print('Learning is OK...') 64 | y_hat = clf.predict(x) 65 | show_accuracy(y, y_hat, '训练集') 66 | y_hat = clf.predict(x_test) 67 | print(y_hat) 68 | print(y_test) 69 | show_accuracy(y_test, y_hat, '测试集') 70 | 71 | err_images = images_test[y_test != y_hat] 72 | err_y_hat = y_hat[y_test != y_hat] 73 | err_y = y_test[y_test != y_hat] 74 | print(err_y_hat) 75 | print(err_y) 76 | plt.figure(figsize=(10, 8), facecolor='w') 77 | for index, image in enumerate(err_images): 78 | if index >= 12: 79 | break 80 | plt.subplot(3, 4, index + 1) 81 | plt.imshow(image, cmap=plt.cm.gray_r, interpolation='nearest') 82 | plt.title(u'错分为:%i,真实值:%i' % (err_y_hat[index], err_y[index])) 83 | plt.tight_layout() 84 | plt.show() 85 | -------------------------------------------------------------------------------- /6.SVM/6.6 SVR.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | from sklearn import svm 4 | 5 | if __name__ == "__main__": 6 | N = 50 7 | np.random.seed(0) 8 | x = np.sort(np.random.uniform(0, 6, N), axis=0) 9 | y = 2 * np.sin(x) + 0.1 * np.random.randn(N) 10 | x = x.reshape(-1, 1) 11 | print('x =\n', x) 12 | print('y =\n', y) 13 | 14 | print('SVR - RBF') 15 | svr_rbf = svm.SVR(kernel='rbf', gamma=0.2, C=100) 16 | svr_rbf.fit(x, y) 17 | print('SVR - Linear') 18 | svr_linear = svm.SVR(kernel='linear', C=100) 19 | svr_linear.fit(x, y) 20 | print('SVR - Polynomial') 21 | svr_poly = svm.SVR(kernel='poly', degree=3, C=100) 22 | svr_poly.fit(x, y) 23 | print('Fit OK.') 24 | 25 | # 思考:系数1.1改成1.5 26 | x_test = np.linspace(x.min(), 1.5 * x.max(), 100).reshape(-1, 1) 27 | y_rbf = svr_rbf.predict(x_test) 28 | 
y_linear = svr_linear.predict(x_test) 29 | y_poly = svr_poly.predict(x_test) 30 | 31 | plt.figure(figsize=(9, 8), facecolor='w') 32 | plt.plot(x_test, y_rbf, 'r-', linewidth=2, label='RBF Kernel') 33 | plt.plot(x_test, y_linear, 'g-', linewidth=2, label='Linear Kernel') 34 | plt.plot(x_test, y_poly, 'b-', linewidth=2, label='Polynomial Kernel') 35 | plt.plot(x, y, 'mo', markersize=6) 36 | plt.scatter(x[svr_rbf.support_], y[svr_rbf.support_], s=130, c='r', marker='*', label='RBF Support Vectors') 37 | plt.legend(loc='lower left') 38 | plt.title('SVR', fontsize=16) 39 | plt.xlabel('X') 40 | plt.ylabel('Y') 41 | plt.grid(True) 42 | plt.show() 43 | -------------------------------------------------------------------------------- /6.SVM/6.7 CV.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | from sklearn import svm 4 | from sklearn.model_selection import GridSearchCV # 0.17 grid_search 5 | 6 | if __name__ == "__main__": 7 | N = 50 8 | np.random.seed(0) 9 | x = np.sort(np.random.uniform(0, 6, N), axis=0) 10 | y = 2 * np.sin(x) + 0.1 * np.random.randn(N) 11 | x = x.reshape(-1, 1) 12 | print('x =\n', x) 13 | print('y =\n', y) 14 | 15 | model = svm.SVR(kernel='rbf') 16 | # c 取 0.01~100 17 | c_can = np.logspace(-2, 2, 10) 18 | # gamma 取 0.01~100 19 | gamma_can = np.logspace(-2, 2, 10) 20 | # 交叉验证 21 | svr = GridSearchCV(model, param_grid={'C': c_can, 'gamma': gamma_can}, cv=5) 22 | svr.fit(x, y) 23 | print('验证参数:\n', svr.best_params_) 24 | 25 | x_test = np.linspace(x.min(), x.max(), 100).reshape(-1, 1) 26 | y_hat = svr.predict(x_test) 27 | 28 | sp = svr.best_estimator_.support_ 29 | plt.figure(facecolor='w') 30 | plt.scatter(x[sp], y[sp], s=120, c='r', marker='*', label='Support Vectors', zorder=3) 31 | plt.plot(x_test, y_hat, 'r-', linewidth=2, label='RBF Kernel') 32 | plt.plot(x, y, 'go', markersize=5) 33 | plt.legend(loc='upper right') 34 | plt.title('SVR', fontsize=16) 35 | plt.xlabel('X') 36 | plt.ylabel('Y') 37 | plt.grid(True) 38 | plt.show() 39 | -------------------------------------------------------------------------------- /6.SVM/HandWritten/0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/littleheap/MachineLearning-Algorithms/28f6eb352b58c58a8b80603b44d998b28e1f3434/6.SVM/HandWritten/0.png -------------------------------------------------------------------------------- /6.SVM/HandWritten/1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/littleheap/MachineLearning-Algorithms/28f6eb352b58c58a8b80603b44d998b28e1f3434/6.SVM/HandWritten/1.png -------------------------------------------------------------------------------- /6.SVM/HandWritten/10.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/littleheap/MachineLearning-Algorithms/28f6eb352b58c58a8b80603b44d998b28e1f3434/6.SVM/HandWritten/10.png -------------------------------------------------------------------------------- /6.SVM/HandWritten/11.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/littleheap/MachineLearning-Algorithms/28f6eb352b58c58a8b80603b44d998b28e1f3434/6.SVM/HandWritten/11.png -------------------------------------------------------------------------------- /6.SVM/HandWritten/12.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/littleheap/MachineLearning-Algorithms/28f6eb352b58c58a8b80603b44d998b28e1f3434/6.SVM/HandWritten/12.png -------------------------------------------------------------------------------- /6.SVM/HandWritten/13.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/littleheap/MachineLearning-Algorithms/28f6eb352b58c58a8b80603b44d998b28e1f3434/6.SVM/HandWritten/13.png -------------------------------------------------------------------------------- /6.SVM/HandWritten/14.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/littleheap/MachineLearning-Algorithms/28f6eb352b58c58a8b80603b44d998b28e1f3434/6.SVM/HandWritten/14.png -------------------------------------------------------------------------------- /6.SVM/HandWritten/15.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/littleheap/MachineLearning-Algorithms/28f6eb352b58c58a8b80603b44d998b28e1f3434/6.SVM/HandWritten/15.png -------------------------------------------------------------------------------- /6.SVM/HandWritten/2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/littleheap/MachineLearning-Algorithms/28f6eb352b58c58a8b80603b44d998b28e1f3434/6.SVM/HandWritten/2.png -------------------------------------------------------------------------------- /6.SVM/HandWritten/3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/littleheap/MachineLearning-Algorithms/28f6eb352b58c58a8b80603b44d998b28e1f3434/6.SVM/HandWritten/3.png -------------------------------------------------------------------------------- /6.SVM/HandWritten/4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/littleheap/MachineLearning-Algorithms/28f6eb352b58c58a8b80603b44d998b28e1f3434/6.SVM/HandWritten/4.png -------------------------------------------------------------------------------- /6.SVM/HandWritten/5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/littleheap/MachineLearning-Algorithms/28f6eb352b58c58a8b80603b44d998b28e1f3434/6.SVM/HandWritten/5.png -------------------------------------------------------------------------------- /6.SVM/HandWritten/6.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/littleheap/MachineLearning-Algorithms/28f6eb352b58c58a8b80603b44d998b28e1f3434/6.SVM/HandWritten/6.png -------------------------------------------------------------------------------- /6.SVM/HandWritten/7.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/littleheap/MachineLearning-Algorithms/28f6eb352b58c58a8b80603b44d998b28e1f3434/6.SVM/HandWritten/7.png -------------------------------------------------------------------------------- /6.SVM/HandWritten/8.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/littleheap/MachineLearning-Algorithms/28f6eb352b58c58a8b80603b44d998b28e1f3434/6.SVM/HandWritten/8.png 
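These HandWritten/*.png assets are written by save_image() in 6.5 HandWrittenDigits.py above, which maps each 8x8 optdigits image (per-block pixel counts in 0-16) onto an inverted 0-255 grayscale so the digits render dark on white. A minimal sketch of that mapping, assuming an 8x8 float array im with values in 0-16 (the array here is a stand-in, not repo data):

import numpy as np

im = np.random.uniform(0, 16, (8, 8))  # stand-in for one optdigits 8x8 image
pixels = (255 - im * 15.9375).astype(np.uint8)  # 16 * 15.9375 = 255: rescale to 0-255, then invert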
-------------------------------------------------------------------------------- /6.SVM/HandWritten/9.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/littleheap/MachineLearning-Algorithms/28f6eb352b58c58a8b80603b44d998b28e1f3434/6.SVM/HandWritten/9.png -------------------------------------------------------------------------------- /6.SVM/README.md: -------------------------------------------------------------------------------- 1 | ## SVM 2 | ## (支撑向量机) 3 | 4 | ### 项目背景 5 | > SVM方法是通过一个非线性映射p,把样本空间映射到一个高维乃至无穷维的特征空间中(Hilbert空间),使得在原来的样本空间中非线性可分的问题转化为在特征空间中的线性可分的问题.简单地说,就是升维和线性化.升维,就是把样本向高维空间做映射,一般情况下这会增加计算的复杂性,甚至会引起“维数灾难”,因而人们很少问津.但是作为分类、回归等问题来说,很可能在低维样本空间无法线性处理的样本集,在高维特征空间中却可以通过一个线性超平面实现线性划分(或回归).一般的升维都会带来计算的复杂化,SVM方法巧妙地解决了这个难题:应用核函数的展开定理,就不需要知道非线性映射的显式表达式;由于是在高维特征空间中建立线性学习机,所以与线性模型相比,不但几乎不增加计算的复杂性,而且在某种程度上避免了“维数灾难”.这一切要归功于核函数的展开和计算理论. 6 | 7 | ### 项目简介 8 | |名称|简介| 9 | |:-------------|:-------------:| 10 | |6.1 SVM_intro|基于SVM算法莺尾花类别分类| 11 | |6.2 SVM_draw|SVM基于不同超参数分类效果| 12 | |6.3 ClassifierIndex|召回率等评价指标计算| 13 | |6.4 unBalance|不平衡数据SVM分类处理效果| 14 | |6.5 HandWrittenDigits|基于SVM手写体数字预测| 15 | |6.6 SVR|不同核函数预测效果对比| 16 | |6.7 CV|交叉验证配合SVM实现效果| 17 | 18 | ### 效果图 19 | #### ·基于SVM算法莺尾花类别分类 20 | 21 | 22 | #### ·SVM基于不同超参数分类效果 23 | 24 | 25 | #### ·不平衡数据SVM分类处理效果 26 | 27 | 28 | #### ·手写体数字数据集显示 29 | 30 | 31 | #### ·手写体预测错误显示 32 | 33 | 34 | #### ·不同核函数预测效果对比 35 | 36 | 37 | #### ·交叉验证配合SVM实现效果 38 | 39 | -------------------------------------------------------------------------------- /6.SVM/bipartition.txt: -------------------------------------------------------------------------------- 1 | 7 3.2 0 2 | 6.4 3.2 0 3 | 6.9 3.1 0 4 | 5.5 2.3 0 5 | 6.5 2.8 0 6 | 5.7 2.8 0 7 | 6.3 3.3 0 8 | 4.9 2.4 0 9 | 6.6 2.9 0 10 | 5.2 2.7 0 11 | 5 2 0 12 | 5.9 3 0 13 | 6 2.2 0 14 | 6.1 2.9 0 15 | 5.6 2.9 0 16 | 6.7 3.1 0 17 | 5.6 3 0 18 | 5.8 2.7 0 19 | 6.2 2.2 0 20 | 5.6 2.5 0 21 | 5.9 3.2 0 22 | 6.1 2.8 0 23 | 6.3 2.5 0 24 | 6.1 2.8 0 25 | 6.4 2.9 0 26 | 6.6 3 0 27 | 6.8 2.8 0 28 | 6.7 3 0 29 | 6 2.9 0 30 | 5.7 2.6 0 31 | 5.5 2.4 0 32 | 5.5 2.4 0 33 | 5.8 2.7 0 34 | 6 2.7 0 35 | 5.4 3 0 36 | 6 3.4 0 37 | 6.7 3.1 0 38 | 6.3 2.3 0 39 | 5.6 3 0 40 | 5.5 2.5 0 41 | 5.5 2.6 0 42 | 6.1 3 0 43 | 5.8 2.6 0 44 | 5 2.3 0 45 | 5.6 2.7 0 46 | 5.7 3 0 47 | 5.7 2.9 0 48 | 6.2 2.9 0 49 | 5.1 2.5 0 50 | 5.7 2.8 0 51 | 6.3 3.3 1 52 | 5.8 2.7 1 53 | 7.1 3 1 54 | 6.3 2.9 1 55 | 6.5 3 1 56 | 7.6 3 1 57 | 4.9 2.5 1 58 | 7.3 2.9 1 59 | 6.7 2.5 1 60 | 7.2 3.6 1 61 | 6.5 3.2 1 62 | 6.4 2.7 1 63 | 6.8 3 1 64 | 5.7 2.5 1 65 | 5.8 2.8 1 66 | 6.4 3.2 1 67 | 6.5 3 1 68 | 7.7 3.8 1 69 | 7.7 2.6 1 70 | 6 2.2 1 71 | 6.9 3.2 1 72 | 5.6 2.8 1 73 | 7.7 2.8 1 74 | 6.3 2.7 1 75 | 6.7 3.3 1 76 | 7.2 3.2 1 77 | 6.2 2.8 1 78 | 6.1 3 1 79 | 6.4 2.8 1 80 | 7.2 3 1 81 | 7.4 2.8 1 82 | 7.9 3.8 1 83 | 6.4 2.8 1 84 | 6.3 2.8 1 85 | 6.1 2.6 1 86 | 7.7 3 1 87 | 6.3 3.4 1 88 | 6.4 3.1 1 89 | 6 3 1 90 | 6.9 3.1 1 91 | 6.7 3.1 1 92 | 6.9 3.1 1 93 | 5.8 2.7 1 94 | 6.8 3.2 1 95 | 6.7 3.3 1 96 | 6.7 3 1 97 | 6.3 2.5 1 98 | 6.5 3 1 99 | 6.2 3.4 1 100 | 5.9 3 1 -------------------------------------------------------------------------------- /6.SVM/bipartition2.txt: -------------------------------------------------------------------------------- 1 | 4.7 1.4 0 2 | 4.5 1.5 0 3 | 4.9 1.5 0 4 | 4 1.3 0 5 | 4.6 1.5 0 6 | 4.5 1.3 0 7 | 4.7 1.6 0 8 | 3.3 1 0 9 | 4.6 1.3 0 10 | 3.9 1.4 0 11 | 3.5 1 0 12 | 4.2 1.5 0 13 | 4 1 0 14 | 4.7 1.4 0 15 | 3.6 1.3 0 16 | 4.4 1.4 0 17 | 4.5 1.5 0 18 | 4.1 1 0 19 | 4.5 1.5 0 
20 | 3.9 1.1 0 21 | 4.8 1.8 0 22 | 4 1.3 0 23 | 4.9 1.5 0 24 | 4.7 1.2 0 25 | 4.3 1.3 0 26 | 4.4 1.4 0 27 | 4.8 1.4 0 28 | 5 1.7 0 29 | 4.5 1.5 0 30 | 3.5 1 0 31 | 3.8 1.1 0 32 | 3.7 1 0 33 | 3.9 1.2 0 34 | 5.1 1.6 0 35 | 4.5 1.5 0 36 | 4.5 1.6 0 37 | 4.7 1.5 0 38 | 4.4 1.3 0 39 | 4.1 1.3 0 40 | 4 1.3 0 41 | 4.4 1.2 0 42 | 4.6 1.4 0 43 | 4 1.2 0 44 | 3.3 1 0 45 | 4.2 1.3 0 46 | 4.2 1.2 0 47 | 4.2 1.3 0 48 | 4.3 1.3 0 49 | 3 1.1 0 50 | 4.1 1.3 0 51 | 6 2.5 1 52 | 5.1 1.9 1 53 | 5.9 2.1 1 54 | 5.6 1.8 1 55 | 5.8 2.2 1 56 | 6.6 2.1 1 57 | 4.5 1.7 1 58 | 6.3 1.8 1 59 | 5.8 1.8 1 60 | 6.1 2.5 1 61 | 5.1 2 1 62 | 5.3 1.9 1 63 | 5.5 2.1 1 64 | 5 2 1 65 | 5.1 2.4 1 66 | 5.3 2.3 1 67 | 5.5 1.8 1 68 | 6.7 2.2 1 69 | 6.9 2.3 1 70 | 5 1.5 1 71 | 5.7 2.3 1 72 | 4.9 2 1 73 | 6.7 2 1 74 | 4.9 1.8 1 75 | 5.7 2.1 1 76 | 6 1.8 1 77 | 4.8 1.8 1 78 | 4.9 1.8 1 79 | 5.6 2.1 1 80 | 5.8 1.6 1 81 | 6.1 1.9 1 82 | 6.4 2 1 83 | 5.6 2.2 1 84 | 5.1 1.5 1 85 | 5.6 1.4 1 86 | 6.1 2.3 1 87 | 5.6 2.4 1 88 | 5.5 1.8 1 89 | 4.8 1.8 1 90 | 5.4 2.1 1 91 | 5.6 2.4 1 92 | 5.1 2.3 1 93 | 5.1 1.9 1 94 | 5.9 2.3 1 95 | 5.7 2.5 1 96 | 5.2 2.3 1 97 | 5 1.9 1 98 | 5.2 2 1 99 | 5.4 2.3 1 100 | 5.1 1.8 1 101 | -------------------------------------------------------------------------------- /6.SVM/figures/CV.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/littleheap/MachineLearning-Algorithms/28f6eb352b58c58a8b80603b44d998b28e1f3434/6.SVM/figures/CV.png -------------------------------------------------------------------------------- /6.SVM/figures/SVM_parameter.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/littleheap/MachineLearning-Algorithms/28f6eb352b58c58a8b80603b44d998b28e1f3434/6.SVM/figures/SVM_parameter.png -------------------------------------------------------------------------------- /6.SVM/figures/hand_writing.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/littleheap/MachineLearning-Algorithms/28f6eb352b58c58a8b80603b44d998b28e1f3434/6.SVM/figures/hand_writing.png -------------------------------------------------------------------------------- /6.SVM/figures/hand_wrong.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/littleheap/MachineLearning-Algorithms/28f6eb352b58c58a8b80603b44d998b28e1f3434/6.SVM/figures/hand_wrong.png -------------------------------------------------------------------------------- /6.SVM/figures/intro.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/littleheap/MachineLearning-Algorithms/28f6eb352b58c58a8b80603b44d998b28e1f3434/6.SVM/figures/intro.png -------------------------------------------------------------------------------- /6.SVM/figures/kernal.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/littleheap/MachineLearning-Algorithms/28f6eb352b58c58a8b80603b44d998b28e1f3434/6.SVM/figures/kernal.png -------------------------------------------------------------------------------- /6.SVM/figures/unbalance.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/littleheap/MachineLearning-Algorithms/28f6eb352b58c58a8b80603b44d998b28e1f3434/6.SVM/figures/unbalance.png 
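The 6.SVM scripts above all lean on the kernel trick described in this directory's README: data that no line can separate in the input space can become separable under an RBF kernel. A minimal self-contained sketch of that contrast on synthetic data (the ring-shaped labels and all parameter values are illustrative, not taken from the repo):

import numpy as np
from sklearn import svm

rng = np.random.RandomState(0)
x = rng.randn(200, 2)
y = (x[:, 0] ** 2 + x[:, 1] ** 2 > 1).astype(int)  # ring-shaped class boundary

for kernel in ('linear', 'rbf'):
    clf = svm.SVC(C=1.0, kernel=kernel, gamma=1.0)  # gamma only affects the rbf fit
    clf.fit(x, y)
    print(kernel, clf.score(x, y))  # expect rbf to score far higher on this ring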
-------------------------------------------------------------------------------- /6.SVM/iris.data: -------------------------------------------------------------------------------- 1 | 5.1,3.5,1.4,0.2,Iris-setosa 2 | 4.9,3.0,1.4,0.2,Iris-setosa 3 | 4.7,3.2,1.3,0.2,Iris-setosa 4 | 4.6,3.1,1.5,0.2,Iris-setosa 5 | 5.0,3.6,1.4,0.2,Iris-setosa 6 | 5.4,3.9,1.7,0.4,Iris-setosa 7 | 4.6,3.4,1.4,0.3,Iris-setosa 8 | 5.0,3.4,1.5,0.2,Iris-setosa 9 | 4.4,2.9,1.4,0.2,Iris-setosa 10 | 4.9,3.1,1.5,0.1,Iris-setosa 11 | 5.4,3.7,1.5,0.2,Iris-setosa 12 | 4.8,3.4,1.6,0.2,Iris-setosa 13 | 4.8,3.0,1.4,0.1,Iris-setosa 14 | 4.3,3.0,1.1,0.1,Iris-setosa 15 | 5.8,4.0,1.2,0.2,Iris-setosa 16 | 5.7,4.4,1.5,0.4,Iris-setosa 17 | 5.4,3.9,1.3,0.4,Iris-setosa 18 | 5.1,3.5,1.4,0.3,Iris-setosa 19 | 5.7,3.8,1.7,0.3,Iris-setosa 20 | 5.1,3.8,1.5,0.3,Iris-setosa 21 | 5.4,3.4,1.7,0.2,Iris-setosa 22 | 5.1,3.7,1.5,0.4,Iris-setosa 23 | 4.6,3.6,1.0,0.2,Iris-setosa 24 | 5.1,3.3,1.7,0.5,Iris-setosa 25 | 4.8,3.4,1.9,0.2,Iris-setosa 26 | 5.0,3.0,1.6,0.2,Iris-setosa 27 | 5.0,3.4,1.6,0.4,Iris-setosa 28 | 5.2,3.5,1.5,0.2,Iris-setosa 29 | 5.2,3.4,1.4,0.2,Iris-setosa 30 | 4.7,3.2,1.6,0.2,Iris-setosa 31 | 4.8,3.1,1.6,0.2,Iris-setosa 32 | 5.4,3.4,1.5,0.4,Iris-setosa 33 | 5.2,4.1,1.5,0.1,Iris-setosa 34 | 5.5,4.2,1.4,0.2,Iris-setosa 35 | 4.9,3.1,1.5,0.1,Iris-setosa 36 | 5.0,3.2,1.2,0.2,Iris-setosa 37 | 5.5,3.5,1.3,0.2,Iris-setosa 38 | 4.9,3.1,1.5,0.1,Iris-setosa 39 | 4.4,3.0,1.3,0.2,Iris-setosa 40 | 5.1,3.4,1.5,0.2,Iris-setosa 41 | 5.0,3.5,1.3,0.3,Iris-setosa 42 | 4.5,2.3,1.3,0.3,Iris-setosa 43 | 4.4,3.2,1.3,0.2,Iris-setosa 44 | 5.0,3.5,1.6,0.6,Iris-setosa 45 | 5.1,3.8,1.9,0.4,Iris-setosa 46 | 4.8,3.0,1.4,0.3,Iris-setosa 47 | 5.1,3.8,1.6,0.2,Iris-setosa 48 | 4.6,3.2,1.4,0.2,Iris-setosa 49 | 5.3,3.7,1.5,0.2,Iris-setosa 50 | 5.0,3.3,1.4,0.2,Iris-setosa 51 | 7.0,3.2,4.7,1.4,Iris-versicolor 52 | 6.4,3.2,4.5,1.5,Iris-versicolor 53 | 6.9,3.1,4.9,1.5,Iris-versicolor 54 | 5.5,2.3,4.0,1.3,Iris-versicolor 55 | 6.5,2.8,4.6,1.5,Iris-versicolor 56 | 5.7,2.8,4.5,1.3,Iris-versicolor 57 | 6.3,3.3,4.7,1.6,Iris-versicolor 58 | 4.9,2.4,3.3,1.0,Iris-versicolor 59 | 6.6,2.9,4.6,1.3,Iris-versicolor 60 | 5.2,2.7,3.9,1.4,Iris-versicolor 61 | 5.0,2.0,3.5,1.0,Iris-versicolor 62 | 5.9,3.0,4.2,1.5,Iris-versicolor 63 | 6.0,2.2,4.0,1.0,Iris-versicolor 64 | 6.1,2.9,4.7,1.4,Iris-versicolor 65 | 5.6,2.9,3.6,1.3,Iris-versicolor 66 | 6.7,3.1,4.4,1.4,Iris-versicolor 67 | 5.6,3.0,4.5,1.5,Iris-versicolor 68 | 5.8,2.7,4.1,1.0,Iris-versicolor 69 | 6.2,2.2,4.5,1.5,Iris-versicolor 70 | 5.6,2.5,3.9,1.1,Iris-versicolor 71 | 5.9,3.2,4.8,1.8,Iris-versicolor 72 | 6.1,2.8,4.0,1.3,Iris-versicolor 73 | 6.3,2.5,4.9,1.5,Iris-versicolor 74 | 6.1,2.8,4.7,1.2,Iris-versicolor 75 | 6.4,2.9,4.3,1.3,Iris-versicolor 76 | 6.6,3.0,4.4,1.4,Iris-versicolor 77 | 6.8,2.8,4.8,1.4,Iris-versicolor 78 | 6.7,3.0,5.0,1.7,Iris-versicolor 79 | 6.0,2.9,4.5,1.5,Iris-versicolor 80 | 5.7,2.6,3.5,1.0,Iris-versicolor 81 | 5.5,2.4,3.8,1.1,Iris-versicolor 82 | 5.5,2.4,3.7,1.0,Iris-versicolor 83 | 5.8,2.7,3.9,1.2,Iris-versicolor 84 | 6.0,2.7,5.1,1.6,Iris-versicolor 85 | 5.4,3.0,4.5,1.5,Iris-versicolor 86 | 6.0,3.4,4.5,1.6,Iris-versicolor 87 | 6.7,3.1,4.7,1.5,Iris-versicolor 88 | 6.3,2.3,4.4,1.3,Iris-versicolor 89 | 5.6,3.0,4.1,1.3,Iris-versicolor 90 | 5.5,2.5,4.0,1.3,Iris-versicolor 91 | 5.5,2.6,4.4,1.2,Iris-versicolor 92 | 6.1,3.0,4.6,1.4,Iris-versicolor 93 | 5.8,2.6,4.0,1.2,Iris-versicolor 94 | 5.0,2.3,3.3,1.0,Iris-versicolor 95 | 5.6,2.7,4.2,1.3,Iris-versicolor 96 | 5.7,3.0,4.2,1.2,Iris-versicolor 97 | 
5.7,2.9,4.2,1.3,Iris-versicolor 98 | 6.2,2.9,4.3,1.3,Iris-versicolor 99 | 5.1,2.5,3.0,1.1,Iris-versicolor 100 | 5.7,2.8,4.1,1.3,Iris-versicolor 101 | 6.3,3.3,6.0,2.5,Iris-virginica 102 | 5.8,2.7,5.1,1.9,Iris-virginica 103 | 7.1,3.0,5.9,2.1,Iris-virginica 104 | 6.3,2.9,5.6,1.8,Iris-virginica 105 | 6.5,3.0,5.8,2.2,Iris-virginica 106 | 7.6,3.0,6.6,2.1,Iris-virginica 107 | 4.9,2.5,4.5,1.7,Iris-virginica 108 | 7.3,2.9,6.3,1.8,Iris-virginica 109 | 6.7,2.5,5.8,1.8,Iris-virginica 110 | 7.2,3.6,6.1,2.5,Iris-virginica 111 | 6.5,3.2,5.1,2.0,Iris-virginica 112 | 6.4,2.7,5.3,1.9,Iris-virginica 113 | 6.8,3.0,5.5,2.1,Iris-virginica 114 | 5.7,2.5,5.0,2.0,Iris-virginica 115 | 5.8,2.8,5.1,2.4,Iris-virginica 116 | 6.4,3.2,5.3,2.3,Iris-virginica 117 | 6.5,3.0,5.5,1.8,Iris-virginica 118 | 7.7,3.8,6.7,2.2,Iris-virginica 119 | 7.7,2.6,6.9,2.3,Iris-virginica 120 | 6.0,2.2,5.0,1.5,Iris-virginica 121 | 6.9,3.2,5.7,2.3,Iris-virginica 122 | 5.6,2.8,4.9,2.0,Iris-virginica 123 | 7.7,2.8,6.7,2.0,Iris-virginica 124 | 6.3,2.7,4.9,1.8,Iris-virginica 125 | 6.7,3.3,5.7,2.1,Iris-virginica 126 | 7.2,3.2,6.0,1.8,Iris-virginica 127 | 6.2,2.8,4.8,1.8,Iris-virginica 128 | 6.1,3.0,4.9,1.8,Iris-virginica 129 | 6.4,2.8,5.6,2.1,Iris-virginica 130 | 7.2,3.0,5.8,1.6,Iris-virginica 131 | 7.4,2.8,6.1,1.9,Iris-virginica 132 | 7.9,3.8,6.4,2.0,Iris-virginica 133 | 6.4,2.8,5.6,2.2,Iris-virginica 134 | 6.3,2.8,5.1,1.5,Iris-virginica 135 | 6.1,2.6,5.6,1.4,Iris-virginica 136 | 7.7,3.0,6.1,2.3,Iris-virginica 137 | 6.3,3.4,5.6,2.4,Iris-virginica 138 | 6.4,3.1,5.5,1.8,Iris-virginica 139 | 6.0,3.0,4.8,1.8,Iris-virginica 140 | 6.9,3.1,5.4,2.1,Iris-virginica 141 | 6.7,3.1,5.6,2.4,Iris-virginica 142 | 6.9,3.1,5.1,2.3,Iris-virginica 143 | 5.8,2.7,5.1,1.9,Iris-virginica 144 | 6.8,3.2,5.9,2.3,Iris-virginica 145 | 6.7,3.3,5.7,2.5,Iris-virginica 146 | 6.7,3.0,5.2,2.3,Iris-virginica 147 | 6.3,2.5,5.0,1.9,Iris-virginica 148 | 6.5,3.0,5.2,2.0,Iris-virginica 149 | 6.2,3.4,5.4,2.3,Iris-virginica 150 | 5.9,3.0,5.1,1.8,Iris-virginica 151 | 152 | -------------------------------------------------------------------------------- /6.SVM/optdigits.names: -------------------------------------------------------------------------------- 1 | 2 | 1. Title of Database: Optical Recognition of Handwritten Digits 3 | 4 | 2. Source: 5 | E. Alpaydin, C. Kaynak 6 | Department of Computer Engineering 7 | Bogazici University, 80815 Istanbul Turkey 8 | alpaydin@boun.edu.tr 9 | July 1998 10 | 11 | 3. Past Usage: 12 | C. Kaynak (1995) Methods of Combining Multiple Classifiers and Their 13 | Applications to Handwritten Digit Recognition, 14 | MSc Thesis, Institute of Graduate Studies in Science and 15 | Engineering, Bogazici University. 16 | 17 | E. Alpaydin, C. Kaynak (1998) Cascading Classifiers, Kybernetika, 18 | to appear. ftp://ftp.icsi.berkeley.edu/pub/ai/ethem/kyb.ps.Z 19 | 20 | 4. Relevant Information: 21 | We used preprocessing programs made available by NIST to extract 22 | normalized bitmaps of handwritten digits from a preprinted form. From 23 | a total of 43 people, 30 contributed to the training set and different 24 | 13 to the test set. 32x32 bitmaps are divided into nonoverlapping 25 | blocks of 4x4 and the number of on pixels are counted in each block. 26 | This generates an input matrix of 8x8 where each element is an 27 | integer in the range 0..16. This reduces dimensionality and gives 28 | invariance to small distortions. 29 | 30 | For info on NIST preprocessing routines, see 31 | M. D. Garris, J. L. Blue, G. T. Candela, D. L. Dimmick, J. Geist, 32 | P. J. 
Grother, S. A. Janet, and C. L. Wilson, NIST Form-Based 33 | Handprint Recognition System, NISTIR 5469, 1994. 34 | 35 | 5. Number of Instances 36 | optdigits.tra Training 3823 37 | optdigits.tes Testing 1797 38 | 39 | The way we used the dataset was to use half of training for 40 | actual training, one-fourth for validation and one-fourth 41 | for writer-dependent testing. The test set was used for 42 | writer-independent testing and is the actual quality measure. 43 | 44 | 6. Number of Attributes 45 | 64 input+1 class attribute 46 | 47 | 7. For Each Attribute: 48 | All input attributes are integers in the range 0..16. 49 | The last attribute is the class code 0..9 50 | 51 | 8. Missing Attribute Values 52 | None 53 | 54 | 9. Class Distribution 55 | Class: No of examples in training set 56 | 0: 376 57 | 1: 389 58 | 2: 380 59 | 3: 389 60 | 4: 387 61 | 5: 376 62 | 6: 377 63 | 7: 387 64 | 8: 380 65 | 9: 382 66 | 67 | Class: No of examples in testing set 68 | 0: 178 69 | 1: 182 70 | 2: 177 71 | 3: 183 72 | 4: 181 73 | 5: 182 74 | 6: 181 75 | 7: 179 76 | 8: 174 77 | 9: 180 78 | 79 | Accuracy on the testing set with k-nn 80 | using Euclidean distance as the metric 81 | 82 | k = 1 : 98.00 83 | k = 2 : 97.38 84 | k = 3 : 97.83 85 | k = 4 : 97.61 86 | k = 5 : 97.89 87 | k = 6 : 97.77 88 | k = 7 : 97.66 89 | k = 8 : 97.66 90 | k = 9 : 97.72 91 | k = 10 : 97.55 92 | k = 11 : 97.89 93 | 94 | -------------------------------------------------------------------------------- /6.SVM/principle.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/littleheap/MachineLearning-Algorithms/28f6eb352b58c58a8b80603b44d998b28e1f3434/6.SVM/principle.pdf -------------------------------------------------------------------------------- /7.Cluster/7.1 kMeans.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.colors 3 | import sklearn.datasets as ds 4 | import matplotlib.pyplot as plt 5 | from sklearn.cluster import KMeans 6 | 7 | 8 | def expand(a, b): 9 | d = (b - a) * 0.1 10 | return a - d, b + d 11 | 12 | 13 | if __name__ == "__main__": 14 | N = 400 15 | centers = 4 16 | data, y = ds.make_blobs(N, n_features=2, centers=centers, random_state=2) 17 | data2, y2 = ds.make_blobs(N, n_features=2, centers=centers, cluster_std=(1, 2.5, 0.5, 2), random_state=2) 18 | data3 = np.vstack((data[y == 0][:], data[y == 1][:50], data[y == 2][:20], data[y == 3][:5])) 19 | y3 = np.array([0] * 100 + [1] * 50 + [2] * 20 + [3] * 5) 20 | 21 | cls = KMeans(n_clusters=4, init='k-means++') 22 | y_hat = cls.fit_predict(data) 23 | y2_hat = cls.fit_predict(data2) 24 | y3_hat = cls.fit_predict(data3) 25 | 26 | m = np.array(((1, 1), (1, 3))) 27 | data_r = data.dot(m) 28 | y_r_hat = cls.fit_predict(data_r) 29 | 30 | matplotlib.rcParams['font.sans-serif'] = [u'SimHei'] 31 | matplotlib.rcParams['axes.unicode_minus'] = False 32 | cm = matplotlib.colors.ListedColormap(list('rgbm')) 33 | 34 | plt.figure(figsize=(9, 10), facecolor='w') 35 | plt.subplot(421) 36 | plt.title(u'原始数据') 37 | plt.scatter(data[:, 0], data[:, 1], c=y, s=30, cmap=cm, edgecolors='none') 38 | x1_min, x2_min = np.min(data, axis=0) 39 | x1_max, x2_max = np.max(data, axis=0) 40 | x1_min, x1_max = expand(x1_min, x1_max) 41 | x2_min, x2_max = expand(x2_min, x2_max) 42 | plt.xlim((x1_min, x1_max)) 43 | plt.ylim((x2_min, x2_max)) 44 | plt.grid(True) 45 | 46 | plt.subplot(422) 47 | plt.title(u'KMeans++聚类') 48 | plt.scatter(data[:, 0], data[:, 1], 
c=y_hat, s=30, cmap=cm, edgecolors='none') 49 | plt.xlim((x1_min, x1_max)) 50 | plt.ylim((x2_min, x2_max)) 51 | plt.grid(True) 52 | 53 | plt.subplot(423) 54 | plt.title(u'旋转后数据') 55 | plt.scatter(data_r[:, 0], data_r[:, 1], c=y, s=30, cmap=cm, edgecolors='none') 56 | x1_min, x2_min = np.min(data_r, axis=0) 57 | x1_max, x2_max = np.max(data_r, axis=0) 58 | x1_min, x1_max = expand(x1_min, x1_max) 59 | x2_min, x2_max = expand(x2_min, x2_max) 60 | plt.xlim((x1_min, x1_max)) 61 | plt.ylim((x2_min, x2_max)) 62 | plt.grid(True) 63 | 64 | plt.subplot(424) 65 | plt.title(u'旋转后KMeans++聚类') 66 | plt.scatter(data_r[:, 0], data_r[:, 1], c=y_r_hat, s=30, cmap=cm, edgecolors='none') 67 | plt.xlim((x1_min, x1_max)) 68 | plt.ylim((x2_min, x2_max)) 69 | plt.grid(True) 70 | 71 | plt.subplot(425) 72 | plt.title(u'方差不相等数据') 73 | plt.scatter(data2[:, 0], data2[:, 1], c=y2, s=30, cmap=cm, edgecolors='none') 74 | x1_min, x2_min = np.min(data2, axis=0) 75 | x1_max, x2_max = np.max(data2, axis=0) 76 | x1_min, x1_max = expand(x1_min, x1_max) 77 | x2_min, x2_max = expand(x2_min, x2_max) 78 | plt.xlim((x1_min, x1_max)) 79 | plt.ylim((x2_min, x2_max)) 80 | plt.grid(True) 81 | 82 | plt.subplot(426) 83 | plt.title(u'方差不相等KMeans++聚类') 84 | plt.scatter(data2[:, 0], data2[:, 1], c=y2_hat, s=30, cmap=cm, edgecolors='none') 85 | plt.xlim((x1_min, x1_max)) 86 | plt.ylim((x2_min, x2_max)) 87 | plt.grid(True) 88 | 89 | plt.subplot(427) 90 | plt.title(u'数量不相等数据') 91 | plt.scatter(data3[:, 0], data3[:, 1], s=30, c=y3, cmap=cm, edgecolors='none') 92 | x1_min, x2_min = np.min(data3, axis=0) 93 | x1_max, x2_max = np.max(data3, axis=0) 94 | x1_min, x1_max = expand(x1_min, x1_max) 95 | x2_min, x2_max = expand(x2_min, x2_max) 96 | plt.xlim((x1_min, x1_max)) 97 | plt.ylim((x2_min, x2_max)) 98 | plt.grid(True) 99 | 100 | plt.subplot(428) 101 | plt.title(u'数量不相等KMeans++聚类') 102 | plt.scatter(data3[:, 0], data3[:, 1], c=y3_hat, s=30, cmap=cm, edgecolors='none') 103 | plt.xlim((x1_min, x1_max)) 104 | plt.ylim((x2_min, x2_max)) 105 | plt.grid(True) 106 | 107 | plt.tight_layout(2) 108 | plt.suptitle(u'数据分布对KMeans聚类的影响', fontsize=18) 109 | # https://github.com/matplotlib/matplotlib/issues/829 110 | plt.subplots_adjust(top=0.92) 111 | plt.show() 112 | -------------------------------------------------------------------------------- /7.Cluster/7.2 criteria.py: -------------------------------------------------------------------------------- 1 | from sklearn import metrics 2 | 3 | if __name__ == "__main__": 4 | y = [0, 0, 0, 1, 1, 1] 5 | y_hat = [0, 0, 1, 1, 2, 2] 6 | h = metrics.homogeneity_score(y, y_hat) 7 | c = metrics.completeness_score(y, y_hat) 8 | print(u'同一性(Homogeneity):', h) 9 | print(u'完整性(Completeness):', c) 10 | v2 = 2 * c * h / (c + h) 11 | v = metrics.v_measure_score(y, y_hat) 12 | print(u'V-Measure:', v2, v) 13 | ''' 14 | 同一性(Homogeneity): 0.6666666666666669 15 | 完整性(Completeness): 0.420619835714305 16 | V-Measure: 0.5158037429793889 0.5158037429793889 17 | ''' 18 | 19 | print('\n') 20 | y = [0, 0, 0, 1, 1, 1] 21 | y_hat = [0, 0, 1, 2, 3, 3] 22 | h = metrics.homogeneity_score(y, y_hat) 23 | c = metrics.completeness_score(y, y_hat) 24 | v = metrics.v_measure_score(y, y_hat) 25 | print(u'同一性(Homogeneity):', h) 26 | print(u'完整性(Completeness):', c) 27 | print(u'V-Measure:', v) 28 | ''' 29 | 同一性(Homogeneity): 1.0 30 | 完整性(Completeness): 0.52129602861432 31 | V-Measure: 0.6853314789615865 32 | ''' 33 | 34 | # 允许不同值 35 | print('\n') 36 | y = [0, 0, 0, 1, 1, 1] 37 | y_hat = [1, 1, 1, 0, 0, 0] 38 | h = metrics.homogeneity_score(y, y_hat) 39 
| c = metrics.completeness_score(y, y_hat) 40 | v = metrics.v_measure_score(y, y_hat) 41 | print(u'同一性(Homogeneity):', h) 42 | print(u'完整性(Completeness):', c) 43 | print(u'V-Measure:', v) 44 | 45 | y = [0, 0, 1, 1] 46 | y_hat = [0, 1, 0, 1] 47 | ari = metrics.adjusted_rand_score(y, y_hat) 48 | print(ari) 49 | 50 | y = [0, 0, 0, 1, 1, 1] 51 | y_hat = [0, 0, 1, 1, 2, 2] 52 | ari = metrics.adjusted_rand_score(y, y_hat) 53 | print(ari) 54 | 55 | ''' 56 | 同一性(Homogeneity): 1.0 57 | 完整性(Completeness): 1.0 58 | V-Measure: 1.0 59 | -0.49999999999999994 60 | 0.24242424242424246 61 | ''' 62 | -------------------------------------------------------------------------------- /7.Cluster/7.3 VectorQuantization.py: -------------------------------------------------------------------------------- 1 | import matplotlib 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | from PIL import Image 5 | from sklearn.cluster import KMeans 6 | from mpl_toolkits.mplot3d import Axes3D 7 | 8 | 9 | def restore_image(cb, cluster, shape): 10 | row, col, dummy = shape 11 | image = np.empty((row, col, 3)) 12 | index = 0 13 | for r in range(row): 14 | for c in range(col): 15 | image[r, c] = cb[cluster[index]] 16 | index += 1 17 | return image 18 | 19 | 20 | def show_scatter(a): 21 | N = 10 22 | print('原始数据:\n', a) 23 | density, edges = np.histogramdd(a, bins=[N, N, N], range=[(0, 1), (0, 1), (0, 1)]) 24 | density /= density.max() 25 | x = y = z = np.arange(N) 26 | d = np.meshgrid(x, y, z) 27 | 28 | fig = plt.figure(1, facecolor='w') 29 | ax = fig.add_subplot(111, projection='3d') 30 | ax.scatter(d[1], d[0], d[2], c='r', s=100 * density, marker='o', depthshade=True) 31 | ax.set_xlabel(u'红色分量') 32 | ax.set_ylabel(u'绿色分量') 33 | ax.set_zlabel(u'蓝色分量') 34 | plt.title(u'图像颜色三维频数分布', fontsize=20) 35 | 36 | plt.figure(2, facecolor='w') 37 | den = density[density > 0] 38 | den = np.sort(den)[::-1] 39 | t = np.arange(len(den)) 40 | plt.plot(t, den, 'r-', t, den, 'go', lw=2) 41 | plt.title(u'图像颜色频数分布', fontsize=18) 42 | plt.grid(True) 43 | 44 | plt.show() 45 | 46 | 47 | if __name__ == '__main__': 48 | matplotlib.rcParams['font.sans-serif'] = [u'SimHei'] 49 | matplotlib.rcParams['axes.unicode_minus'] = False 50 | 51 | num_vq = 50 52 | im = Image.open('lena.png') # son.bmp(100)/flower2.png(200)/son.png(60)/lena.png(50) 53 | image = np.array(im).astype(np.float) / 255 54 | image = image[:, :, :3] 55 | image_v = image.reshape((-1, 3)) 56 | model = KMeans(num_vq) 57 | show_scatter(image_v) 58 | 59 | N = image_v.shape[0] # 图像像素总数 60 | # 选择足够多的样本(如1000个),计算聚类中心 61 | idx = np.random.randint(0, N, size=1000) 62 | image_sample = image_v[idx] 63 | model.fit(image_sample) 64 | c = model.predict(image_v) # 聚类结果 65 | print('聚类结果:\n', c) 66 | print('聚类中心:\n', model.cluster_centers_) 67 | 68 | plt.figure(figsize=(15, 8), facecolor='w') 69 | plt.subplot(121) 70 | plt.axis('off') 71 | plt.title(u'原始图片', fontsize=18) 72 | plt.imshow(image) 73 | # plt.savefig('1.png') 74 | 75 | plt.subplot(122) 76 | vq_image = restore_image(model.cluster_centers_, c, image.shape) 77 | plt.axis('off') 78 | plt.title(u'矢量量化后图片:%d色' % num_vq, fontsize=18) 79 | plt.imshow(vq_image) 80 | # plt.savefig('2.png') 81 | 82 | plt.tight_layout(1.2) 83 | plt.show() 84 | -------------------------------------------------------------------------------- /7.Cluster/7.4 AP.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.colors 3 | import sklearn.datasets as ds 4 | import matplotlib.pyplot as plt 5 | 
from sklearn.cluster import AffinityPropagation 6 | from sklearn.metrics import euclidean_distances 7 | 8 | if __name__ == "__main__": 9 | N = 400 10 | centers = [[1, 2], [-1, -1], [1, -1], [-1, 1]] 11 | data, y = ds.make_blobs(N, n_features=2, centers=centers, cluster_std=[0.5, 0.25, 0.7, 0.5], random_state=0) 12 | m = euclidean_distances(data, squared=True) 13 | preference = -np.median(m) 14 | print('Preference:', preference) 15 | 16 | matplotlib.rcParams['font.sans-serif'] = [u'SimHei'] 17 | matplotlib.rcParams['axes.unicode_minus'] = False 18 | plt.figure(figsize=(12, 9), facecolor='w') 19 | for i, mul in enumerate(np.linspace(1, 4, 9)): 20 | print(mul) 21 | p = mul * preference 22 | model = AffinityPropagation(affinity='euclidean', preference=p) 23 | af = model.fit(data) 24 | center_indices = af.cluster_centers_indices_ 25 | n_clusters = len(center_indices) 26 | print('p = %.1f' % mul), p, '聚类簇的个数为:', n_clusters 27 | y_hat = af.labels_ 28 | 29 | plt.subplot(3, 3, i + 1) 30 | plt.title(u'Preference:%.2f,簇个数:%d' % (p, n_clusters)) 31 | clrs = [] 32 | for c in np.linspace(16711680, 255, n_clusters): 33 | clrs.append('#%06x' % np.int(c)) 34 | # clrs = plt.cm.Spectral(np.linspace(0, 1, n_clusters)) 35 | for k, clr in enumerate(clrs): 36 | cur = (y_hat == k) 37 | plt.scatter(data[cur, 0], data[cur, 1], c=clr, edgecolors='none') 38 | center = data[center_indices[k]] 39 | for x in data[cur]: 40 | plt.plot([x[0], center[0]], [x[1], center[1]], color=clr, zorder=1) 41 | plt.scatter(data[center_indices, 0], data[center_indices, 1], s=100, c=clrs, marker='*', edgecolors='k', 42 | zorder=2) 43 | plt.grid(True) 44 | plt.tight_layout() 45 | plt.suptitle(u'AP聚类', fontsize=20) 46 | plt.subplots_adjust(top=0.92) 47 | plt.show() 48 | -------------------------------------------------------------------------------- /7.Cluster/7.5 MeanShift.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.colors 3 | import sklearn.datasets as ds 4 | import matplotlib.pyplot as plt 5 | from sklearn.cluster import MeanShift 6 | from sklearn.metrics import euclidean_distances 7 | 8 | if __name__ == "__main__": 9 | N = 1000 10 | centers = [[1, 2], [-1, -1], [1, -1], [-1, 1]] 11 | data, y = ds.make_blobs(N, n_features=2, centers=centers, cluster_std=[0.5, 0.25, 0.7, 0.5], random_state=0) 12 | 13 | matplotlib.rcParams['font.sans-serif'] = [u'SimHei'] 14 | matplotlib.rcParams['axes.unicode_minus'] = False 15 | plt.figure(figsize=(10, 9), facecolor='w') 16 | m = euclidean_distances(data, squared=True) 17 | bw = np.median(m) 18 | print(bw) 19 | for i, mul in enumerate(np.linspace(0.1, 0.4, 4)): 20 | band_width = mul * bw 21 | model = MeanShift(bin_seeding=True, bandwidth=band_width) 22 | ms = model.fit(data) 23 | centers = ms.cluster_centers_ 24 | y_hat = ms.labels_ 25 | n_clusters = np.unique(y_hat).size 26 | print('带宽:', mul, band_width, '聚类簇的个数为:', n_clusters) 27 | 28 | plt.subplot(2, 2, i + 1) 29 | plt.title(u'带宽:%.2f,聚类簇的个数为:%d' % (band_width, n_clusters)) 30 | clrs = [] 31 | for c in np.linspace(16711680, 255, n_clusters): 32 | clrs.append('#%06x' % np.int(c)) 33 | # clrs = plt.cm.Spectral(np.linspace(0, 1, n_clusters)) 34 | print(clrs) 35 | for k, clr in enumerate(clrs): 36 | cur = (y_hat == k) 37 | plt.scatter(data[cur, 0], data[cur, 1], c=clr, edgecolors='none') 38 | plt.scatter(centers[:, 0], centers[:, 1], s=150, c=clrs, marker='*', edgecolors='k') 39 | plt.grid(True) 40 | plt.tight_layout(2) 41 | plt.suptitle(u'MeanShift聚类', fontsize=20) 42 
| plt.subplots_adjust(top=0.92) 43 | plt.show() 44 | -------------------------------------------------------------------------------- /7.Cluster/7.6 DBSCAN.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.colors 3 | import sklearn.datasets as ds 4 | import matplotlib.pyplot as plt 5 | from sklearn.cluster import DBSCAN 6 | from sklearn.preprocessing import StandardScaler 7 | 8 | 9 | def expand(a, b): 10 | d = (b - a) * 0.1 11 | return a - d, b + d 12 | 13 | 14 | if __name__ == "__main__": 15 | N = 1000 16 | centers = [[1, 2], [-1, -1], [1, -1], [-1, 1]] 17 | data, y = ds.make_blobs(N, n_features=2, centers=centers, cluster_std=[0.5, 0.25, 0.7, 0.5], random_state=0) 18 | data = StandardScaler().fit_transform(data) 19 | # 数据1的参数:(epsilon, min_sample) 20 | params = ((0.2, 5), (0.2, 10), (0.2, 15), (0.3, 5), (0.3, 10), (0.3, 15)) 21 | 22 | # # 数据2 23 | # t = np.arange(0, 2*np.pi, 0.1) 24 | # data1 = np.vstack((np.cos(t), np.sin(t))).T 25 | # data2 = np.vstack((2*np.cos(t), 2*np.sin(t))).T 26 | # data3 = np.vstack((3*np.cos(t), 3*np.sin(t))).T 27 | # data = np.vstack((data1, data2, data3)) 28 | # # 数据2的参数:(epsilon, min_sample) 29 | # params = ((0.5, 3), (0.5, 5), (0.5, 10), (1., 3), (1., 10), (1., 20)) 30 | 31 | matplotlib.rcParams['font.sans-serif'] = [u'SimHei'] 32 | matplotlib.rcParams['axes.unicode_minus'] = False 33 | 34 | plt.figure(figsize=(12, 8), facecolor='w') 35 | plt.suptitle(u'DBSCAN聚类', fontsize=20) 36 | 37 | for i in range(6): 38 | eps, min_samples = params[i] 39 | model = DBSCAN(eps=eps, min_samples=min_samples) 40 | model.fit(data) 41 | y_hat = model.labels_ 42 | 43 | core_indices = np.zeros_like(y_hat, dtype=bool) 44 | core_indices[model.core_sample_indices_] = True 45 | 46 | y_unique = np.unique(y_hat) 47 | n_clusters = y_unique.size - (1 if -1 in y_hat else 0) 48 | print(y_unique, '聚类簇的个数为:', n_clusters) 49 | 50 | # clrs = [] 51 | # for c in np.linspace(16711680, 255, y_unique.size): 52 | # clrs.append('#%06x' % c) 53 | plt.subplot(2, 3, i + 1) 54 | clrs = plt.cm.Spectral(np.linspace(0, 0.8, y_unique.size)) 55 | for k, clr in zip(y_unique, clrs): 56 | cur = (y_hat == k) 57 | if k == -1: 58 | plt.scatter(data[cur, 0], data[cur, 1], s=20, c='k') 59 | continue 60 | plt.scatter(data[cur, 0], data[cur, 1], s=30, c=clr, edgecolors='k') 61 | plt.scatter(data[cur & core_indices][:, 0], data[cur & core_indices][:, 1], s=60, c=clr, marker='o', 62 | edgecolors='k') 63 | x1_min, x2_min = np.min(data, axis=0) 64 | x1_max, x2_max = np.max(data, axis=0) 65 | x1_min, x1_max = expand(x1_min, x1_max) 66 | x2_min, x2_max = expand(x2_min, x2_max) 67 | plt.xlim((x1_min, x1_max)) 68 | plt.ylim((x2_min, x2_max)) 69 | plt.grid(True) 70 | plt.title(u'$\epsilon$ = %.1f m = %d,聚类数目:%d' % (eps, min_samples, n_clusters), fontsize=16) 71 | plt.tight_layout() 72 | plt.subplots_adjust(top=0.9) 73 | plt.show() 74 | -------------------------------------------------------------------------------- /7.Cluster/7.7 SC.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.colors 3 | import matplotlib.pyplot as plt 4 | from sklearn.cluster import spectral_clustering 5 | from sklearn.metrics import euclidean_distances 6 | 7 | 8 | def expand(a, b): 9 | d = (b - a) * 0.1 10 | return a - d, b + d 11 | 12 | 13 | if __name__ == "__main__": 14 | matplotlib.rcParams['font.sans-serif'] = [u'SimHei'] 15 | matplotlib.rcParams['axes.unicode_minus'] = False 16 | 17 | t = 
np.arange(0, 2 * np.pi, 0.1) 18 | data1 = np.vstack((np.cos(t), np.sin(t))).T 19 | data2 = np.vstack((2 * np.cos(t), 2 * np.sin(t))).T 20 | data3 = np.vstack((3 * np.cos(t), 3 * np.sin(t))).T 21 | data = np.vstack((data1, data2, data3)) 22 | 23 | n_clusters = 3 24 | m = euclidean_distances(data, squared=True) 25 | sigma = np.median(m) 26 | 27 | plt.figure(figsize=(12, 8), facecolor='w') 28 | plt.suptitle(u'谱聚类', fontsize=20) 29 | clrs = plt.cm.Spectral(np.linspace(0, 0.8, n_clusters)) 30 | for i, s in enumerate(np.logspace(-2, 0, 6)): 31 | print(s) 32 | af = np.exp(-m ** 2 / (s ** 2)) + 1e-6 33 | y_hat = spectral_clustering(af, n_clusters=n_clusters, assign_labels='kmeans', random_state=1) 34 | plt.subplot(2, 3, i + 1) 35 | for k, clr in enumerate(clrs): 36 | cur = (y_hat == k) 37 | plt.scatter(data[cur, 0], data[cur, 1], s=40, c=clr, edgecolors='k') 38 | x1_min, x2_min = np.min(data, axis=0) 39 | x1_max, x2_max = np.max(data, axis=0) 40 | x1_min, x1_max = expand(x1_min, x1_max) 41 | x2_min, x2_max = expand(x2_min, x2_max) 42 | plt.xlim((x1_min, x1_max)) 43 | plt.ylim((x2_min, x2_max)) 44 | plt.grid(True) 45 | plt.title(u'$\sigma$ = %.2f' % s, fontsize=16) 46 | plt.tight_layout() 47 | plt.subplots_adjust(top=0.9) 48 | plt.show() 49 | -------------------------------------------------------------------------------- /7.Cluster/7.8 SpectralClusterImage.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.colors 3 | import matplotlib.pyplot as plt 4 | from PIL import Image 5 | from sklearn.feature_extraction import image 6 | from sklearn.cluster import spectral_clustering 7 | 8 | if __name__ == "__main__": 9 | matplotlib.rcParams['font.sans-serif'] = [u'SimHei'] 10 | matplotlib.rcParams['axes.unicode_minus'] = False 11 | 12 | pic = Image.open('Chrome.png') 13 | pic = pic.convert('L') 14 | data = np.array(pic).astype(np.float) / 255 15 | 16 | plt.figure(figsize=(10, 5), facecolor='w') 17 | plt.subplot(121) 18 | plt.imshow(pic, cmap=plt.cm.gray, interpolation='nearest') 19 | plt.title(u'原始图片', fontsize=18) 20 | n_clusters = 15 21 | 22 | affinity = image.img_to_graph(data) 23 | beta = 3 24 | affinity.data = np.exp(-beta * affinity.data / affinity.data.std()) + 10e-5 25 | # a = affinity.toarray() 26 | # b = np.diag(a.diagonal()) 27 | # a -= b 28 | print('开始谱聚类') 29 | y = spectral_clustering(affinity, n_clusters=n_clusters, assign_labels='kmeans', random_state=1) 30 | print('谱聚类完成') 31 | y = y.reshape(data.shape) 32 | for n in range(n_clusters): 33 | data[y == n] = n 34 | plt.subplot(122) 35 | clrs = [] 36 | for c in np.linspace(16776960, 16711935, n_clusters): 37 | clrs.append('#%06x' % np.int(c)) 38 | cm = matplotlib.colors.ListedColormap(clrs) 39 | plt.imshow(data, cmap=cm, interpolation='nearest') 40 | plt.title(u'谱聚类:%d簇' % n_clusters, fontsize=18) 41 | plt.tight_layout() 42 | plt.show() 43 | -------------------------------------------------------------------------------- /7.Cluster/Chrome.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/littleheap/MachineLearning-Algorithms/28f6eb352b58c58a8b80603b44d998b28e1f3434/7.Cluster/Chrome.png -------------------------------------------------------------------------------- /7.Cluster/Lena.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/littleheap/MachineLearning-Algorithms/28f6eb352b58c58a8b80603b44d998b28e1f3434/7.Cluster/Lena.png 
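7.7 SC.py and 7.8 SpectralClusterImage.py above both build their affinity matrix as a Gaussian of pairwise distances and differ only in how the scale is set: 7.7 sweeps s over np.logspace(-2, 0, 6), while 7.8 divides by the data's standard deviation and a beta factor. A minimal sketch of the 7.7-style construction on synthetic points (the data and the fixed s value are illustrative):

import numpy as np
from sklearn.cluster import spectral_clustering
from sklearn.metrics import euclidean_distances

data = np.random.rand(60, 2)
m = euclidean_distances(data, squared=True)  # squared pairwise distances
s = 0.1  # kernel scale; 7.7 sweeps this instead of fixing it
affinity = np.exp(-m ** 2 / s ** 2) + 1e-6  # the small offset keeps the graph fully connected
labels = spectral_clustering(affinity, n_clusters=3, assign_labels='kmeans', random_state=1)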
-------------------------------------------------------------------------------- /7.Cluster/README.md: -------------------------------------------------------------------------------- 1 | ## Cluster 2 | ## (Clustering Algorithms) 3 | 4 | ### Project Background 5 | > Clustering takes a large set of unlabeled samples and, guided by the structure inherent in the data, partitions it into several classes so that samples within a class are highly similar while samples from different classes are not. It belongs to unsupervised learning. The heart of any clustering algorithm is computing the similarity between samples, which is sometimes expressed as the distance between samples. 6 | 7 | ### Basic Steps 8 | (1) First choose a number of classes/groups and randomly initialize a center for each. A center is a vector with the same dimensionality as each data point. This requires knowing the number of classes (i.e. the number of centers) in advance.<br>
9 | (2) Compute the distance from each data point to every center, and assign the point to the class of its nearest center.<br>
10 | (3) Recompute each class's center as the mean of all the points assigned to that class; this mean becomes the new center.<br>
11 | (4) Repeat the steps above until the centers change very little from one iteration to the next. The centers can also be initialized randomly several times, keeping the run with the best result; a minimal sketch of the whole procedure follows below.<br>
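The four steps above are exactly Lloyd's algorithm for k-means. The sketch below is a minimal NumPy rendition for reference only; the function name `kmeans`, the tolerance `tol`, and the empty-cluster guard are illustrative choices rather than part of this repository's scripts, and `sklearn.cluster.KMeans` remains the implementation to prefer in practice.

```python
import numpy as np


def kmeans(x, k, max_iter=100, tol=1e-4, seed=0):
    rng = np.random.RandomState(seed)
    # (1) randomly pick k samples as the initial centers
    centers = x[rng.choice(len(x), k, replace=False)]
    for _ in range(max_iter):
        # (2) assign every point to the class of its nearest center
        d = np.linalg.norm(x[:, None, :] - centers[None, :, :], axis=2)
        labels = d.argmin(axis=1)
        # (3) the mean of each class becomes its new center
        # (a cluster that lost all its points keeps its old center)
        new_centers = np.array([x[labels == j].mean(axis=0) if np.any(labels == j)
                                else centers[j] for j in range(k)])
        # (4) stop once the centers barely move between iterations
        if np.linalg.norm(new_centers - centers) < tol:
            break
        centers = new_centers
    return labels, centers
```

Running the function several times with different `seed` values and keeping the solution with the smallest total within-class distance implements the multiple-restart advice in step (4).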
12 | 13 | ### Project Overview 14 | |Name|Description| 15 | |:-------------|:-------------:| 16 | |7.1 kMeans|Basic clustering algorithm demo| 17 | |7.2 criteria|Computing metrics that validate clustering quality| 18 | |7.3 VectorQuantization|Image compression via vector-quantization clustering| 19 | |7.4 AP|Affinity Propagation (AP) clustering| 20 | |7.5 MeanShift|MeanShift clustering| 21 | |7.6 DBSCAN|DBSCAN clustering| 22 | |7.7 SC|Spectral clustering| 23 | |7.8 SpectralClusterImage|Spectral clustering in application| 24 | 25 | ### Results 26 | #### ·Basic clustering algorithm demo 27 | 28 | 29 | #### ·Image compression via vector quantization 30 | 31 | 32 | #### ·Vector quantization, 2-D distribution 33 | 34 | 35 | #### ·Vector quantization, 3-D distribution 36 | 37 | 38 | #### ·AP clustering 39 | 40 | 41 | #### ·MeanShift clustering 42 | 43 | 44 | #### ·DBSCAN clustering 45 | 46 | 47 | #### ·Spectral clustering 48 | 49 | 50 | #### ·Spectral clustering in application 51 | 52 | 53 | -------------------------------------------------------------------------------- /7.Cluster/figures/2D.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/littleheap/MachineLearning-Algorithms/28f6eb352b58c58a8b80603b44d998b28e1f3434/7.Cluster/figures/2D.png -------------------------------------------------------------------------------- /7.Cluster/figures/3D.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/littleheap/MachineLearning-Algorithms/28f6eb352b58c58a8b80603b44d998b28e1f3434/7.Cluster/figures/3D.png -------------------------------------------------------------------------------- /7.Cluster/figures/AP.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/littleheap/MachineLearning-Algorithms/28f6eb352b58c58a8b80603b44d998b28e1f3434/7.Cluster/figures/AP.png -------------------------------------------------------------------------------- /7.Cluster/figures/DBSCAN.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/littleheap/MachineLearning-Algorithms/28f6eb352b58c58a8b80603b44d998b28e1f3434/7.Cluster/figures/DBSCAN.png -------------------------------------------------------------------------------- /7.Cluster/figures/SC.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/littleheap/MachineLearning-Algorithms/28f6eb352b58c58a8b80603b44d998b28e1f3434/7.Cluster/figures/SC.png -------------------------------------------------------------------------------- /7.Cluster/figures/cluster.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/littleheap/MachineLearning-Algorithms/28f6eb352b58c58a8b80603b44d998b28e1f3434/7.Cluster/figures/cluster.png -------------------------------------------------------------------------------- /7.Cluster/figures/mean.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/littleheap/MachineLearning-Algorithms/28f6eb352b58c58a8b80603b44d998b28e1f3434/7.Cluster/figures/mean.png -------------------------------------------------------------------------------- /7.Cluster/figures/spectral.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/littleheap/MachineLearning-Algorithms/28f6eb352b58c58a8b80603b44d998b28e1f3434/7.Cluster/figures/spectral.png -------------------------------------------------------------------------------- /7.Cluster/figures/target.png: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/littleheap/MachineLearning-Algorithms/28f6eb352b58c58a8b80603b44d998b28e1f3434/7.Cluster/figures/target.png -------------------------------------------------------------------------------- /7.Cluster/principle.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/littleheap/MachineLearning-Algorithms/28f6eb352b58c58a8b80603b44d998b28e1f3434/7.Cluster/principle.pdf -------------------------------------------------------------------------------- /8.EM Model/8.1 EM.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib as mpl 3 | import matplotlib.pyplot as plt 4 | from mpl_toolkits.mplot3d import Axes3D 5 | from scipy.stats import multivariate_normal 6 | from sklearn.mixture import GaussianMixture 7 | from sklearn.metrics.pairwise import pairwise_distances_argmin 8 | 9 | mpl.rcParams['font.sans-serif'] = [u'SimHei'] 10 | mpl.rcParams['axes.unicode_minus'] = False 11 | 12 | if __name__ == '__main__': 13 | style = 'sklearn' # or 'others' 14 | 15 | np.random.seed(0) 16 | 17 | # First class of data 18 | # Mean at the origin (0, 0, 0) 19 | mu1_fact = (0, 0, 0) 20 | # Covariance is the 3x3 identity matrix 21 | cov_fact = np.identity(3) 22 | print(cov_fact) 23 | # Given (mean, covariance, sample count), returns a 400x3 data matrix 24 | data1 = np.random.multivariate_normal(mu1_fact, cov_fact, 400) 25 | 26 | # Second class of data 27 | # Mean at (2, 2, 1) 28 | mu2_fact = (2, 2, 1) 29 | # Covariance is the 3x3 identity matrix 30 | cov_fact = np.identity(3) 31 | # Given (mean, covariance, sample count), returns a 100x3 data matrix 32 | data2 = np.random.multivariate_normal(mu2_fact, cov_fact, 100) 33 | 34 | # Stack the two classes vertically into a 500x3 matrix 35 | data = np.vstack((data1, data2)) 36 | # Labels for the 500 samples; unused by the unsupervised fit, kept only to compute accuracy 37 | y = np.array([True] * 400 + [False] * 100) 38 | 39 | if style == 'sklearn': 40 | # n_components: number of components || covariance_type: covariance type (full, tied, diag, spherical) || tol: convergence tolerance || max_iter: maximum number of iterations 41 | g = GaussianMixture(n_components=2, covariance_type='full', tol=1e-6, max_iter=1000) 42 | g.fit(data) 43 | print('Mixture weight:\t', g.weights_[0]) 44 | print('Means:\n', g.means_, '\n') # True means are (0,0,0) and (2,2,1) 45 | print('Covariances:\n', g.covariances_, '\n') # True covariance is the 3x3 identity 46 | mu1, mu2 = g.means_ 47 | sigma1, sigma2 = g.covariances_ 48 | else: 49 | num_iter = 100 50 | n, d = data.shape 51 | # Random initialization (alternative) 52 | # mu1 = np.random.standard_normal(d) 53 | # print mu1 54 | # mu2 = np.random.standard_normal(d) 55 | # print mu2 56 | mu1 = data.min(axis=0) 57 | mu2 = data.max(axis=0) 58 | sigma1 = np.identity(d) 59 | sigma2 = np.identity(d) 60 | pi = 0.5 61 | # EM 62 | for i in range(num_iter): 63 | # E Step 64 | norm1 = multivariate_normal(mu1, sigma1) 65 | norm2 = multivariate_normal(mu2, sigma2) 66 | tau1 = pi * norm1.pdf(data) 67 | tau2 = (1 - pi) * norm2.pdf(data) 68 | gamma = tau1 / (tau1 + tau2) 69 | 70 | # M Step 71 | mu1 = np.dot(gamma, data) / np.sum(gamma) 72 | mu2 = np.dot((1 - gamma), data) / np.sum((1 - gamma)) 73 | sigma1 = np.dot(gamma * (data - mu1).T, data - mu1) / np.sum(gamma) 74 | sigma2 = np.dot((1 - gamma) * (data - mu2).T, data - mu2) / np.sum(1 - gamma) 75 | pi = np.sum(gamma) / n 76 | print(i, ":\t", mu1, mu2) 77 | print('Mixture weight:\t', pi) 78 | print('Means:\t', mu1, mu2) 79 | print('Covariances:\n', sigma1, '\n\n', sigma2, '\n') 80 | 81 | # Predict the class assignments 82 | # Plug the fitted parameters of the two Gaussians back into their densities 83 | norm1 = multivariate_normal(mu1, sigma1) 84 | norm2 = multivariate_normal(mu2, sigma2) 85 | # Probability density values 86 | tau1 = norm1.pdf(data) 87 | tau2 = norm2.pdf(data) 88 | 89 | fig = plt.figure(figsize=(13, 7), facecolor='w') 90 | ax = fig.add_subplot(121,
projection='3d') 91 | ax.scatter(data[:, 0], data[:, 1], data[:, 2], c='b', s=30, marker='o', depthshade=True) 92 | ax.set_xlabel('X') 93 | ax.set_ylabel('Y') 94 | ax.set_zlabel('Z') 95 | ax.set_title(u'Raw data', fontsize=18) 96 | ax = fig.add_subplot(122, projection='3d') 97 | # Match the fitted means to the true means in the right order 98 | order = pairwise_distances_argmin([mu1_fact, mu2_fact], [mu1, mu2], metric='euclidean') 99 | if order[0] == 0: 100 | c1 = tau1 > tau2 101 | else: 102 | c1 = tau1 < tau2 103 | c2 = ~c1 104 | acc = np.mean(y == c1) 105 | print(u'Accuracy: %.2f%%' % (100 * acc)) 106 | ax.scatter(data[c1, 0], data[c1, 1], data[c1, 2], c='r', s=30, marker='o', depthshade=True) 107 | ax.scatter(data[c2, 0], data[c2, 1], data[c2, 2], c='g', s=30, marker='^', depthshade=True) 108 | ax.set_xlabel('X') 109 | ax.set_ylabel('Y') 110 | ax.set_zlabel('Z') 111 | ax.set_title(u'EM classification', fontsize=18) 112 | plt.suptitle(u'Implementation of the EM algorithm', fontsize=20) 113 | plt.subplots_adjust(top=0.92) 114 | plt.tight_layout() 115 | plt.show() 116 | -------------------------------------------------------------------------------- /8.EM Model/8.2 GMM.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib as mpl 3 | import matplotlib.colors 4 | import matplotlib.pyplot as plt 5 | from sklearn.mixture import GaussianMixture 6 | from sklearn.model_selection import train_test_split 7 | 8 | mpl.rcParams['font.sans-serif'] = [u'SimHei'] 9 | mpl.rcParams['axes.unicode_minus'] = False 10 | 11 | 12 | # from matplotlib.font_manager import FontProperties 13 | # font_set = FontProperties(fname=r"c:\windows\fonts\simsun.ttc", size=15) 14 | # fontproperties=font_set 15 | 16 | 17 | def expand(a, b): 18 | d = (b - a) * 0.05 19 | return a - d, b + d 20 | 21 | 22 | if __name__ == '__main__': 23 | data = np.loadtxt('HeightWeight.csv', dtype=float, delimiter=',', skiprows=1) 24 | print(data.shape) 25 | y, x = np.split(data, [1, ], axis=1) 26 | x, x_test, y, y_test = train_test_split(x, y, train_size=0.6, random_state=0) 27 | gmm = GaussianMixture(n_components=2, covariance_type='full', random_state=0) 28 | gmm.fit(x) 29 | print('Means = \n', gmm.means_) 30 | print('Covariances = \n', gmm.covariances_) 31 | y_hat = gmm.predict(x) 32 | y_test_hat = gmm.predict(x_test) 33 | # Check whether the two components came out swapped 34 | change = (gmm.means_[0][0] > gmm.means_[1][0]) 35 | if change: 36 | z = y_hat == 0 37 | y_hat[z] = 1 38 | y_hat[~z] = 0 39 | z = y_test_hat == 0 40 | y_test_hat[z] = 1 41 | y_test_hat[~z] = 0 42 | acc = np.mean(y_hat.ravel() == y.ravel()) 43 | acc_test = np.mean(y_test_hat.ravel() == y_test.ravel()) 44 | acc_str = u'Training accuracy: %.2f%%' % (acc * 100) 45 | acc_test_str = u'Test accuracy: %.2f%%' % (acc_test * 100) 46 | print(acc_str) 47 | print(acc_test_str) 48 | 49 | cm_light = mpl.colors.ListedColormap(['#FF8080', '#77E0A0']) 50 | cm_dark = mpl.colors.ListedColormap(['r', 'g']) 51 | x1_min, x1_max = x[:, 0].min(), x[:, 0].max() 52 | x2_min, x2_max = x[:, 1].min(), x[:, 1].max() 53 | x1_min, x1_max = expand(x1_min, x1_max) 54 | x2_min, x2_max = expand(x2_min, x2_max) 55 | x1, x2 = np.mgrid[x1_min:x1_max:500j, x2_min:x2_max:500j] 56 | grid_test = np.stack((x1.flat, x2.flat), axis=1) 57 | grid_hat = gmm.predict(grid_test) 58 | grid_hat = grid_hat.reshape(x1.shape) 59 | if change: 60 | z = grid_hat == 0 61 | grid_hat[z] = 1 62 | grid_hat[~z] = 0 63 | plt.figure(figsize=(9, 7), facecolor='w') 64 | # Draw the two colored decision regions 65 | plt.pcolormesh(x1, x2, grid_hat, cmap=cm_light) 66 | plt.scatter(x[:, 0], x[:, 1], s=50, c=y.reshape(x[:, 0].shape), marker='o', cmap=cm_dark,
edgecolors='k') 67 | plt.scatter(x_test[:, 0], x_test[:, 1], s=60, c=y_test.reshape(x_test[:, 0].shape), marker='^', cmap=cm_dark, 68 | edgecolors='k') 69 | 70 | p = gmm.predict_proba(grid_test) 71 | p = p[:, 0].reshape(x1.shape) 72 | # Label the contour lines in place 73 | CS = plt.contour(x1, x2, p, levels=(0.2, 0.5, 0.8), colors=list('rgb'), linewidths=2) 74 | plt.clabel(CS, fontsize=15, fmt='%.1f', inline=True) 75 | # Position the text annotations 76 | ax1_min, ax1_max, ax2_min, ax2_max = plt.axis() 77 | xx = 0.9 * ax1_min + 0.1 * ax1_max 78 | yy = 0.1 * ax2_min + 0.9 * ax2_max 79 | plt.text(xx, yy, acc_str, fontsize=18) 80 | yy = 0.15 * ax2_min + 0.85 * ax2_max 81 | plt.text(xx, yy, acc_test_str, fontsize=18) 82 | plt.xlim((x1_min, x1_max)) 83 | plt.ylim((x2_min, x2_max)) 84 | plt.xlabel(u'Height (cm)', fontsize='large') 85 | plt.ylabel(u'Weight (kg)', fontsize='large') 86 | plt.title(u'Estimating GMM parameters with EM', fontsize=20) 87 | plt.grid() 88 | plt.show() 89 | -------------------------------------------------------------------------------- /8.EM Model/8.3 GMM_Parameter.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib as mpl 3 | import matplotlib.colors 4 | import matplotlib.pyplot as plt 5 | from sklearn.mixture import GaussianMixture 6 | 7 | mpl.rcParams['font.sans-serif'] = [u'SimHei'] 8 | mpl.rcParams['axes.unicode_minus'] = False 9 | 10 | 11 | def expand(a, b, rate=0.05): 12 | d = (b - a) * rate 13 | return a - d, b + d 14 | 15 | 16 | def accuracy_rate(y1, y2): 17 | acc = np.mean(y1 == y2) 18 | return acc if acc > 0.5 else 1 - acc 19 | 20 | 21 | if __name__ == '__main__': 22 | np.random.seed(0) 23 | # Diagonal covariance matrix with entries 1 and 2 24 | cov1 = np.diag((1, 2)) 25 | N1 = 500 26 | N2 = 300 27 | N = N1 + N2 28 | 29 | # First class of data 30 | x1 = np.random.multivariate_normal(mean=(1, 2), cov=cov1, size=N1) 31 | m = np.array(((1, 1), (1, 3))) 32 | # Linearly transform (rotate/stretch) the x1 distribution 33 | x1 = x1.dot(m) 34 | 35 | # Second class of data 36 | x2 = np.random.multivariate_normal(mean=(-1, 10), cov=cov1, size=N2) 37 | 38 | x = np.vstack((x1, x2)) 39 | y = np.array([0] * N1 + [1] * N2) 40 | 41 | ''' 42 | spherical: one variance per component (circular clusters) 43 | diag: diagonal covariance matrices 44 | tied: all components share the same covariance 45 | full: each component has its own full covariance 46 | ''' 47 | types = ('spherical', 'diag', 'tied', 'full') 48 | err = np.empty(len(types)) 49 | bic = np.empty(len(types)) 50 | for i, type in enumerate(types): 51 | gmm = GaussianMixture(n_components=2, covariance_type=type, random_state=0) 52 | gmm.fit(x) 53 | err[i] = 1 - accuracy_rate(gmm.predict(x), y) 54 | bic[i] = gmm.bic(x) 55 | print('Error rates:', err.ravel()) 56 | print('BIC:', bic.ravel()) 57 | 58 | # Plotting 59 | xpos = np.arange(4) 60 | ax = plt.axes() 61 | # -0.3~0 || 0.7~1 || 1.7~2 || 2.7~3 62 | b1 = ax.bar(xpos - 0.3, err, width=0.3, color='#77E0A0') 63 | # 0~0.3 || 1~1.3 || 2~2.3 || 3~3.3 64 | b2 = ax.twinx().bar(xpos, bic, width=0.3, color='#FF8080') 65 | plt.grid(True) 66 | bic_min, bic_max = expand(bic.min(), bic.max()) 67 | plt.ylim((bic_min, bic_max)) 68 | plt.xticks(xpos, types) 69 | plt.legend([b1[0], b2[0]], (u'Error rate', u'BIC')) 70 | plt.title(u'Error rate and BIC for different covariance types', fontsize=18) 71 | plt.show() 72 | 73 | optimal = bic.argmin() 74 | gmm = GaussianMixture(n_components=2, covariance_type=types[optimal], random_state=0) 75 | gmm.fit(x) 76 | print('Means = \n', gmm.means_) 77 | print('Covariances = \n', gmm.covariances_) 78 | y_hat = gmm.predict(x) 79 | 80 | cm_light = mpl.colors.ListedColormap(['#FF8080', '#77E0A0']) 81 | cm_dark = mpl.colors.ListedColormap(['r', 'g']) 82 | x1_min, x1_max = x[:, 0].min(), x[:, 0].max() 83 | x2_min, x2_max = x[:, 1].min(), x[:, 1].max() 84 | x1_min, x1_max =
expand(x1_min, x1_max) 85 | x2_min, x2_max = expand(x2_min, x2_max) 86 | x1, x2 = np.mgrid[x1_min:x1_max:500j, x2_min:x2_max:500j] 87 | grid_test = np.stack((x1.flat, x2.flat), axis=1) 88 | grid_hat = gmm.predict(grid_test) 89 | grid_hat = grid_hat.reshape(x1.shape) 90 | if gmm.means_[0][0] > gmm.means_[1][0]: 91 | z = grid_hat == 0 92 | grid_hat[z] = 1 93 | grid_hat[~z] = 0 94 | plt.figure(figsize=(9, 7), facecolor='w') 95 | plt.pcolormesh(x1, x2, grid_hat, cmap=cm_light) 96 | plt.scatter(x[:, 0], x[:, 1], s=30, c=y, marker='o', cmap=cm_dark, edgecolors='k') 97 | 98 | ax1_min, ax1_max, ax2_min, ax2_max = plt.axis() 99 | plt.xlim((x1_min, x1_max)) 100 | plt.ylim((x2_min, x2_max)) 101 | plt.title(u'GMM tuning: covariance_type=%s' % types[optimal], fontsize=20) 102 | plt.grid() 103 | plt.show() 104 | -------------------------------------------------------------------------------- /8.EM Model/8.4 GMM_Iris.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib as mpl 3 | import matplotlib.colors 4 | import matplotlib.pyplot as plt 5 | from sklearn.mixture import GaussianMixture 6 | from sklearn.metrics.pairwise import pairwise_distances_argmin 7 | 8 | mpl.rcParams['font.sans-serif'] = [u'SimHei'] 9 | mpl.rcParams['axes.unicode_minus'] = False 10 | 11 | iris_feature = u'sepal length', u'sepal width', u'petal length', u'petal width' 12 | 13 | 14 | def expand(a, b, rate=0.05): 15 | d = (b - a) * rate 16 | return a - d, b + d 17 | 18 | 19 | def iris_type(s): 20 | it = {b'Iris-setosa': 0, 21 | b'Iris-versicolor': 1, 22 | b'Iris-virginica': 2} 23 | return it[s] 24 | 25 | 26 | if __name__ == '__main__': 27 | path = '.\\iris.data' # Path to the data file 28 | data = np.loadtxt(path, dtype=float, delimiter=',', converters={4: iris_type}) 29 | # Columns 0-3 form x; column 4 gives y 30 | x_prime, y = np.split(data, (4,), axis=1) 31 | y = y.ravel() 32 | 33 | # Three classes of iris 34 | n_components = 3 35 | # Six pairwise combinations of the four features 36 | feature_pairs = [[0, 1], [0, 2], [0, 3], [1, 2], [1, 3], [2, 3]] 37 | plt.figure(figsize=(10, 9), facecolor='#FFFFFF') 38 | for k, pair in enumerate(feature_pairs): 39 | x = x_prime[:, pair] 40 | m = np.array([np.mean(x[y == i], axis=0) for i in range(3)]) # True means per class 41 | print('True means = \n', m) 42 | 43 | gmm = GaussianMixture(n_components=n_components, covariance_type='full', random_state=0) 44 | gmm.fit(x) 45 | print('Estimated means = \n', gmm.means_) 46 | print('Estimated covariances = \n', gmm.covariances_) 47 | y_hat = gmm.predict(x) 48 | order = pairwise_distances_argmin(m, gmm.means_, axis=1, metric='euclidean') 49 | # print('Order:\t', order) 50 | 51 | n_sample = y.size 52 | n_types = 3 53 | change = np.empty((n_types, n_sample), dtype=bool) 54 | for i in range(n_types): 55 | change[i] = y_hat == order[i] 56 | for i in range(n_types): 57 | y_hat[change[i]] = i 58 | acc = u'Accuracy: %.2f%%' % (100 * np.mean(y_hat == y)) 59 | print(acc) 60 | 61 | cm_light = mpl.colors.ListedColormap(['#FF8080', '#77E0A0', '#A0A0FF']) 62 | cm_dark = mpl.colors.ListedColormap(['r', 'g', '#6060FF']) 63 | x1_min, x1_max = x[:, 0].min(), x[:, 0].max() 64 | x2_min, x2_max = x[:, 1].min(), x[:, 1].max() 65 | x1_min, x1_max = expand(x1_min, x1_max) 66 | x2_min, x2_max = expand(x2_min, x2_max) 67 | x1, x2 = np.mgrid[x1_min:x1_max:500j, x2_min:x2_max:500j] 68 | grid_test = np.stack((x1.flat, x2.flat), axis=1) 69 | grid_hat = gmm.predict(grid_test) 70 | 71 | change = np.empty((n_types, grid_hat.size), dtype=bool) 72 | for i in range(n_types): 73 | change[i] = grid_hat == order[i] 74 | for i in range(n_types): 75 | grid_hat[change[i]] = i 76 | 77 |
grid_hat = grid_hat.reshape(x1.shape) 78 | plt.subplot(3, 2, k + 1) 79 | plt.pcolormesh(x1, x2, grid_hat, cmap=cm_light) 80 | plt.scatter(x[:, 0], x[:, 1], s=30, c=y, marker='o', cmap=cm_dark, edgecolors='k') 81 | xx = 0.95 * x1_min + 0.05 * x1_max 82 | yy = 0.1 * x2_min + 0.9 * x2_max 83 | plt.text(xx, yy, acc, fontsize=14) 84 | plt.xlim((x1_min, x1_max)) 85 | plt.ylim((x2_min, x2_max)) 86 | plt.xlabel(iris_feature[pair[0]], fontsize=14) 87 | plt.ylabel(iris_feature[pair[1]], fontsize=14) 88 | plt.grid() 89 | plt.tight_layout(pad=2) 90 | plt.suptitle(u'Unsupervised EM classification of the iris data', fontsize=20) 91 | plt.subplots_adjust(top=0.92) 92 | plt.show() 93 | -------------------------------------------------------------------------------- /8.EM Model/8.5 DPGMM.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import scipy as sp 3 | import matplotlib as mpl 4 | import matplotlib.colors 5 | import matplotlib.pyplot as plt 6 | from matplotlib.patches import Ellipse 7 | from sklearn.mixture import GaussianMixture, BayesianGaussianMixture 8 | 9 | 10 | def expand(a, b, rate=0.05): 11 | d = (b - a) * rate 12 | return a - d, b + d 13 | 14 | 15 | matplotlib.rcParams['font.sans-serif'] = [u'SimHei'] 16 | matplotlib.rcParams['axes.unicode_minus'] = False 17 | 18 | if __name__ == '__main__': 19 | np.random.seed(0) 20 | cov1 = np.diag((1, 2)) 21 | N1 = 500 22 | N2 = 300 23 | N = N1 + N2 24 | x1 = np.random.multivariate_normal(mean=(3, 2), cov=cov1, size=N1) 25 | m = np.array(((1, 1), (1, 3))) 26 | x1 = x1.dot(m) 27 | x2 = np.random.multivariate_normal(mean=(-1, 10), cov=cov1, size=N2) 28 | x = np.vstack((x1, x2)) 29 | y = np.array([0] * N1 + [1] * N2) 30 | # Initialize with 3 components; the true number is 2 31 | n_components = 3 32 | 33 | # For plotting 34 | colors = '#A0FFA0', '#2090E0', '#FF8080' 35 | cm = mpl.colors.ListedColormap(colors) 36 | x1_min, x1_max = x[:, 0].min(), x[:, 0].max() 37 | x2_min, x2_max = x[:, 1].min(), x[:, 1].max() 38 | x1_min, x1_max = expand(x1_min, x1_max) 39 | x2_min, x2_max = expand(x2_min, x2_max) 40 | x1, x2 = np.mgrid[x1_min:x1_max:500j, x2_min:x2_max:500j] 41 | grid_test = np.stack((x1.flat, x2.flat), axis=1) 42 | 43 | plt.figure(figsize=(9, 9), facecolor='w') 44 | plt.suptitle(u'GMM vs. DPGMM', fontsize=23) 45 | ax = plt.subplot(211) 46 | 47 | # GMM 48 | gmm = GaussianMixture(n_components=n_components, covariance_type='full', random_state=0) 49 | gmm.fit(x) 50 | centers = gmm.means_ 51 | covs = gmm.covariances_ 52 | print('GMM means = \n', centers) 53 | print('GMM covariances = \n', covs) 54 | y_hat = gmm.predict(x) 55 | 56 | grid_hat = gmm.predict(grid_test) 57 | grid_hat = grid_hat.reshape(x1.shape) 58 | plt.pcolormesh(x1, x2, grid_hat, cmap=cm) 59 | plt.scatter(x[:, 0], x[:, 1], s=30, c=y, cmap=cm, marker='o') 60 | 61 | clrs = list('rgbmy') 62 | # Zip means with covariances and enumerate the pairs 63 | for i, cc in enumerate(zip(centers, covs)): 64 | center, cov = cc 65 | # Eigenvalues and eigenvectors 66 | value, vector = sp.linalg.eigh(cov) 67 | # The eigenvalues give the ellipse's semi-axes 68 | width, height = value[0], value[1] 69 | # Normalize 70 | v = vector[0] / sp.linalg.norm(vector[0]) 71 | # Radians to degrees 72 | angle = 180 * np.arctan(v[1] / v[0]) / np.pi 73 | # Draw the ellipse 74 | e = Ellipse(xy=center, width=width, height=height, 75 | angle=angle, color=clrs[i], alpha=0.5, clip_box=ax.bbox) 76 | # Add it to the axes 77 | ax.add_artist(e) 78 | 79 | ax1_min, ax1_max, ax2_min, ax2_max = plt.axis() 80 | plt.xlim((x1_min, x1_max)) 81 | plt.ylim((x2_min, x2_max)) 82 | plt.title(u'GMM', fontsize=20) 83 | plt.grid(True) 84 | 85 | # DPGMM 86 | dpgmm = BayesianGaussianMixture(n_components=n_components,
covariance_type='full', max_iter=1000, n_init=5, 87 | weight_concentration_prior_type='dirichlet_process', weight_concentration_prior=10) 88 | dpgmm.fit(x) 89 | centers = dpgmm.means_ 90 | covs = dpgmm.covariances_ 91 | print('DPGMM means = \n', centers) 92 | print('DPGMM covariances = \n', covs) 93 | y_hat = dpgmm.predict(x) 94 | # Three components were fitted, but effectively only two Gaussians are used in prediction 95 | # print(y_hat) 96 | 97 | ax = plt.subplot(212) 98 | grid_hat = dpgmm.predict(grid_test) 99 | grid_hat = grid_hat.reshape(x1.shape) 100 | plt.pcolormesh(x1, x2, grid_hat, cmap=cm) 101 | plt.scatter(x[:, 0], x[:, 1], s=30, c=y, cmap=cm, marker='o') 102 | 103 | for i, cc in enumerate(zip(centers, covs)): 104 | # Skip components that received no samples 105 | if i not in y_hat: 106 | continue 107 | center, cov = cc 108 | value, vector = sp.linalg.eigh(cov) 109 | width, height = value[0], value[1] 110 | v = vector[0] / sp.linalg.norm(vector[0]) 111 | angle = 180 * np.arctan(v[1] / v[0]) / np.pi 112 | e = Ellipse(xy=center, width=width, height=height, 113 | angle=angle, color='m', alpha=0.5, clip_box=ax.bbox) 114 | ax.add_artist(e) 115 | 116 | plt.xlim((x1_min, x1_max)) 117 | plt.ylim((x2_min, x2_max)) 118 | plt.title('DPGMM', fontsize=20) 119 | plt.grid(True) 120 | 121 | plt.tight_layout() 122 | plt.subplots_adjust(top=0.9) 123 | plt.show() 124 | -------------------------------------------------------------------------------- /8.EM Model/8.6 GMM_pdf.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | import numpy as np 3 | import scipy as sp 4 | import matplotlib as mpl 5 | import matplotlib.colors 6 | import matplotlib.pyplot as plt 7 | from matplotlib.patches import Ellipse 8 | from sklearn.mixture import GaussianMixture 9 | 10 | 11 | # Helper to pad the plot limits 12 | def expand(a, b, rate=0.05): 13 | d = (b - a) * rate 14 | return a - d, b + d 15 | 16 | 17 | if __name__ == '__main__': 18 | # Suppress runtime warnings 19 | warnings.filterwarnings(action='ignore', category=RuntimeWarning) 20 | np.random.seed(0) 21 | cov1 = np.diag((1, 2)) 22 | N1 = 500 23 | N2 = 300 24 | N = N1 + N2 25 | x1 = np.random.multivariate_normal(mean=(3, 2), cov=cov1, size=N1) 26 | m = np.array(((1, 1), (1, 3))) 27 | x1 = x1.dot(m) 28 | x2 = np.random.multivariate_normal(mean=(-1, 10), cov=cov1, size=N2) 29 | x = np.vstack((x1, x2)) 30 | y = np.array([0] * N1 + [1] * N2) 31 | 32 | gmm = GaussianMixture(n_components=2, covariance_type='full', random_state=0) 33 | gmm.fit(x) 34 | centers = gmm.means_ 35 | covs = gmm.covariances_ 36 | print('GMM means = \n', centers) 37 | print('GMM covariances = \n', covs) 38 | y_hat = gmm.predict(x) 39 | 40 | colors = '#A0FFA0', '#FF8080', 41 | levels = 10 42 | cm = mpl.colors.ListedColormap(colors) 43 | x1_min, x1_max = x[:, 0].min(), x[:, 0].max() 44 | x2_min, x2_max = x[:, 1].min(), x[:, 1].max() 45 | x1_min, x1_max = expand(x1_min, x1_max) 46 | x2_min, x2_max = expand(x2_min, x2_max) 47 | x1, x2 = np.mgrid[x1_min:x1_max:500j, x2_min:x2_max:500j] 48 | grid_test = np.stack((x1.flat, x2.flat), axis=1) 49 | print(gmm.score_samples(grid_test)) 50 | # Negative log-likelihood: larger values mean lower probability 51 | grid_hat = -gmm.score_samples(grid_test) 52 | grid_hat = grid_hat.reshape(x1.shape) 53 | plt.figure(figsize=(9, 7), facecolor='w') 54 | ax = plt.subplot(111) 55 | cmesh = plt.pcolormesh(x1, x2, grid_hat, cmap=plt.cm.Spectral) 56 | plt.colorbar(cmesh, shrink=0.8) 57 | CS = plt.contour(x1, x2, grid_hat, levels=np.logspace(0, 2, num=levels, base=10), colors='w', linewidths=1) 58 | plt.clabel(CS, fontsize=9, inline=1, fmt='%.1f') 59 | plt.scatter(x[:, 0], x[:, 1], s=30, c=y, cmap=cm, marker='o') 60 | 61 | for
i, cc in enumerate(zip(centers, covs)): 62 | center, cov = cc 63 | value, vector = sp.linalg.eigh(cov) 64 | width, height = value[0], value[1] 65 | v = vector[0] / sp.linalg.norm(vector[0]) 66 | angle = 180 * np.arctan(v[1] / v[0]) / np.pi 67 | e = Ellipse(xy=center, width=width, height=height, 68 | angle=angle, color='m', alpha=0.5, clip_box=ax.bbox) 69 | ax.add_artist(e) 70 | 71 | plt.xlim((x1_min, x1_max)) 72 | plt.ylim((x2_min, x2_max)) 73 | mpl.rcParams['font.sans-serif'] = [u'SimHei'] 74 | mpl.rcParams['axes.unicode_minus'] = False 75 | plt.title(u'GMM likelihood values', fontsize=20) 76 | plt.grid(True) 77 | plt.show() 78 | -------------------------------------------------------------------------------- /8.EM Model/HeightWeight.csv: -------------------------------------------------------------------------------- 1 | Sex,Height(cm),Weight(kg) 2 | 0,156,50 3 | 0,160,60 4 | 0,162,54 5 | 0,162,55 6 | 0,160.5,56 7 | 0,160,53 8 | 0,158,55 9 | 0,164,60 10 | 0,165,50 11 | 0,166,55 12 | 0,158,47.5 13 | 0,161,49 14 | 0,169,55 15 | 0,161,46 16 | 0,160,45 17 | 0,167,44 18 | 0,155,49 19 | 0,154,57 20 | 0,172,52 21 | 0,155,56 22 | 0,157,55 23 | 0,165,65 24 | 0,156,52 25 | 0,155,50 26 | 0,156,56 27 | 0,160,55 28 | 0,158,55 29 | 0,162,70 30 | 0,162,65 31 | 0,155,57 32 | 0,163,70 33 | 0,160,60 34 | 0,162,55 35 | 0,165,65 36 | 0,159,60 37 | 0,147,47 38 | 0,163,53 39 | 0,157,54 40 | 0,160,55 41 | 0,162,48 42 | 0,158,60 43 | 0,155,48 44 | 0,165,60 45 | 0,161,58 46 | 0,159,45 47 | 0,163,50 48 | 0,158,49 49 | 0,155,50 50 | 0,162,55 51 | 0,157,63 52 | 0,159,49 53 | 0,152,47 54 | 0,156,51 55 | 0,165,49 56 | 0,154,47 57 | 0,156,52 58 | 0,162,48 59 | 1,162,60 60 | 1,164,62 61 | 1,168,86 62 | 1,187,75 63 | 1,167,75 64 | 1,174,64 65 | 1,175,62 66 | 1,170,65 67 | 1,176,73 68 | 1,169,58 69 | 1,178,54 70 | 1,165,66 71 | 1,183,68 72 | 1,171,61 73 | 1,179,64 74 | 1,172,60 75 | 1,173,59 76 | 1,172,58 77 | 1,175,62 78 | 1,160,60 79 | 1,160,58 80 | 1,160,60 81 | 1,175,75 82 | 1,163,60 83 | 1,181,77 84 | 1,172,80 85 | 1,175,73 86 | 1,175,60 87 | 1,167,65 88 | 1,172,60 89 | 1,169,75 90 | 1,172,65 91 | 1,175,72 92 | 1,172,60 93 | 1,170,65 94 | 1,158,59 95 | 1,167,63 96 | 1,164,61 97 | 1,176,65 98 | 1,182,95 99 | 1,173,75 100 | 1,176,67 101 | 1,163,58 102 | 1,166,67 103 | 1,162,59 104 | 1,169,56 105 | 1,163,59 106 | 1,163,56 107 | 1,176,62 108 | 1,169,57 109 | 1,173,61 110 | 1,163,59 111 | 1,167,57 112 | 1,176,63 113 | 1,168,61 114 | 1,167,60 115 | 1,170,69 116 | -------------------------------------------------------------------------------- /8.EM Model/README.md: -------------------------------------------------------------------------------- 1 | ## EM Model 2 | ## (Expectation Maximization Algorithm) 3 | 4 | ### Project Background 5 | > The EM algorithm is an iterative optimization strategy: each iteration splits into two steps, an expectation step (E-step) and a maximization step (M-step), which is why it is called the Expectation Maximization Algorithm. Shaped by the idea of missing data, it was originally proposed to estimate parameters when part of the data is unobserved; its foundations and the validity of its convergence were laid out in detail by Dempster, Laird and Rubin in their 1977 paper "Maximum likelihood from incomplete data via the EM algorithm". The basic idea is: first estimate the model parameters from the observed data; then use those parameter estimates to estimate the values of the missing data; then re-estimate the parameters from the observed data together with the freshly estimated missing data; and iterate back and forth until convergence. A toy numeric sketch of the two alternating steps is given right below. 6 |
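As a compact illustration of the E-step/M-step alternation described above, here is a one-dimensional, two-component Gaussian-mixture sketch. It is a simplified, hypothetical companion to the full three-dimensional implementation in 8.1 EM.py (whose `data.min()`/`data.max()` initialization it borrows); the sample sizes and component parameters are illustrative only.

```python
import numpy as np
from scipy.stats import norm

rng = np.random.RandomState(0)
# Observed data: two 1-D Gaussian clusters (the component labels are the "missing" data)
x = np.concatenate((rng.normal(0, 1, 400), rng.normal(4, 1, 100)))

pi, mu1, mu2, s1, s2 = 0.5, x.min(), x.max(), 1.0, 1.0
for _ in range(100):
    # E-step: responsibility of component 1 for each sample
    t1 = pi * norm.pdf(x, mu1, s1)
    t2 = (1 - pi) * norm.pdf(x, mu2, s2)
    gamma = t1 / (t1 + t2)
    # M-step: re-estimate the parameters from the soft assignments
    mu1 = np.sum(gamma * x) / np.sum(gamma)
    mu2 = np.sum((1 - gamma) * x) / np.sum(1 - gamma)
    s1 = np.sqrt(np.sum(gamma * (x - mu1) ** 2) / np.sum(gamma))
    s2 = np.sqrt(np.sum((1 - gamma) * (x - mu2) ** 2) / np.sum(1 - gamma))
    pi = gamma.mean()

print('weight=%.2f, means=(%.2f, %.2f), stds=(%.2f, %.2f)' % (pi, mu1, mu2, s1, s2))
```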
7 | ### Project Overview 8 | |Name|Description| 9 | |:-------------|:-------------:| 10 | |8.1 EM|Basic application of the EM algorithm| 11 | |8.2 GMM|Estimating the parameters of a Gaussian mixture with EM| 12 | |8.3 GMM_Parameter|Comparing error rates across covariance types| 13 | |8.4 GMM_Iris|EM applied to the iris data set| 14 | |8.5 DPGMM|Comparing the DPGMM and GMM models| 15 | |8.6 GMM_pdf|GMM maximum-likelihood values| 16 | 17 | ### Results 18 | #### ·Basic application of the EM algorithm 19 | 20 | 21 | #### ·Estimating the parameters of a Gaussian mixture with EM 22 | 23 | 24 | #### ·Comparing error rates across covariance types 25 | 26 | 27 | #### ·GMM tuning 28 | 29 | 30 | #### ·EM applied to the iris data set 31 | 32 | 33 | #### ·Comparing the DPGMM and GMM models 34 | 35 | 36 | #### ·GMM likelihood values 37 | 38 | 39 | 40 | -------------------------------------------------------------------------------- /8.EM Model/figures/DPGMM.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/littleheap/MachineLearning-Algorithms/28f6eb352b58c58a8b80603b44d998b28e1f3434/8.EM Model/figures/DPGMM.png -------------------------------------------------------------------------------- /8.EM Model/figures/EM.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/littleheap/MachineLearning-Algorithms/28f6eb352b58c58a8b80603b44d998b28e1f3434/8.EM Model/figures/EM.png -------------------------------------------------------------------------------- /8.EM Model/figures/EM_para.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/littleheap/MachineLearning-Algorithms/28f6eb352b58c58a8b80603b44d998b28e1f3434/8.EM Model/figures/EM_para.png -------------------------------------------------------------------------------- /8.EM Model/figures/EM_para_modi.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/littleheap/MachineLearning-Algorithms/28f6eb352b58c58a8b80603b44d998b28e1f3434/8.EM Model/figures/EM_para_modi.png -------------------------------------------------------------------------------- /8.EM Model/figures/GMM.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/littleheap/MachineLearning-Algorithms/28f6eb352b58c58a8b80603b44d998b28e1f3434/8.EM Model/figures/GMM.png -------------------------------------------------------------------------------- /8.EM Model/figures/GMM_sim.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/littleheap/MachineLearning-Algorithms/28f6eb352b58c58a8b80603b44d998b28e1f3434/8.EM Model/figures/GMM_sim.png -------------------------------------------------------------------------------- /8.EM Model/figures/iris.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/littleheap/MachineLearning-Algorithms/28f6eb352b58c58a8b80603b44d998b28e1f3434/8.EM Model/figures/iris.png -------------------------------------------------------------------------------- /8.EM Model/principle.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/littleheap/MachineLearning-Algorithms/28f6eb352b58c58a8b80603b44d998b28e1f3434/8.EM Model/principle.pdf -------------------------------------------------------------------------------- /9.Bayes Network/9.1 Iris_GaussianNB.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib as mpl 3 | import matplotlib.pyplot as plt 4 | from sklearn.pipeline import Pipeline 5 | from sklearn.preprocessing import StandardScaler 6 | from sklearn.neighbors import KNeighborsClassifier 7 | from sklearn.naive_bayes import GaussianNB, MultinomialNB 8 | 9 | ''' 10 | Gaussian naive Bayes 11 | ''' 12 | 13 | 14 | def iris_type(s): 15 | it = {b'Iris-setosa': 0, 16 | b'Iris-versicolor': 1, 17 | b'Iris-virginica': 2} 18 | return it[s] 19 | 20 | 21 | if __name__ == "__main__": 22 | data = np.loadtxt('.\\iris.data', dtype=float, delimiter=',', converters={4:
iris_type}) 23 | print(data) 24 | x, y = np.split(data, (4,), axis=1) 25 | # Take only the first two features; assume each class is Gaussian and the features are independent 26 | x = x[:, :2] 27 | print(x) 28 | print(y) 29 | 30 | # Pipeline: standardize the data to zero mean and unit variance, then feed it to Gaussian naive Bayes 31 | gnb = Pipeline([ 32 | ('sc', StandardScaler()), 33 | ('clf', GaussianNB())]) 34 | gnb.fit(x, y.ravel()) # fit() expects a flat label vector 35 | # gnb = MultinomialNB().fit(x, y.ravel()) # Multinomial naive Bayes variant 36 | # gnb = KNeighborsClassifier(n_neighbors=5).fit(x, y.ravel()) # k-NN variant; can overfit 37 | 38 | # Plotting 39 | N, M = 500, 500 # Number of sample points along each axis 40 | x1_min, x1_max = x[:, 0].min(), x[:, 0].max() # Range of column 0 41 | x2_min, x2_max = x[:, 1].min(), x[:, 1].max() # Range of column 1 42 | t1 = np.linspace(x1_min, x1_max, N) 43 | t2 = np.linspace(x2_min, x2_max, M) 44 | x1, x2 = np.meshgrid(t1, t2) # Build the mesh grid 45 | x_test = np.stack((x1.flat, x2.flat), axis=1) # Test points: 500*500 = 250000 in total 46 | 47 | # To include the other two features: 48 | # x3 = np.ones(x1.size) * np.average(x[:, 2]) 49 | # x4 = np.ones(x1.size) * np.average(x[:, 3]) 50 | # x_test = np.stack((x1.flat, x2.flat, x3, x4), axis=1) # Test points 51 | 52 | mpl.rcParams['font.sans-serif'] = [u'simHei'] 53 | mpl.rcParams['axes.unicode_minus'] = False 54 | cm_light = mpl.colors.ListedColormap(['#77E0A0', '#FF8080', '#A0A0FF']) # Colors for the three classes 55 | cm_dark = mpl.colors.ListedColormap(['g', 'r', 'b']) 56 | y_hat = gnb.predict(x_test) # Predict the class of all 250000 test points 57 | y_hat = y_hat.reshape(x1.shape) # Reshape to match the grid 58 | plt.figure(facecolor='w') 59 | plt.pcolormesh(x1, x2, y_hat, cmap=cm_light) # Show the predictions 60 | plt.scatter(x[:, 0], x[:, 1], c=y.reshape(x[:, 0].shape), edgecolors='k', s=50, cmap=cm_dark) # Show the samples 61 | plt.xlabel(u'sepal length', fontsize=14) 62 | plt.ylabel(u'sepal width', fontsize=14) 63 | plt.xlim(x1_min, x1_max) 64 | plt.ylim(x2_min, x2_max) 65 | plt.title(u'GaussianNB classification of the iris data', fontsize=18) 66 | plt.grid(True) 67 | plt.show() 68 | 69 | # Predictions on the training set 70 | y_hat = gnb.predict(x) 71 | y = y.reshape(-1) 72 | result = y_hat == y 73 | print(y_hat) 74 | print(result) 75 | acc = np.mean(result) 76 | print('Accuracy: %.2f%%' % (100 * acc)) 77 | -------------------------------------------------------------------------------- /9.Bayes Network/9.2 MultinomialNB_intro.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from sklearn.naive_bayes import MultinomialNB 3 | 4 | if __name__ == "__main__": 5 | 6 | np.random.seed(0) # Fix the random seed 7 | M = 20 # 20 samples 8 | N = 5 # each sample is 5-dimensional 9 | 10 | x = np.random.randint(2, size=(M, N)) # [low, high): with high=2, draws ints 0 and 1 into a 20x5 matrix 11 | print('x = \n', x) 12 | print('x.shape = ', x.shape) 13 | 14 | x = np.array(list(set([tuple(t) for t in x]))) # Deduplicate rows so identical features cannot carry different labels; converting to a set of tuples and back to a list removes duplicates 15 | print('new x = \n', x) 16 | print('new x.shape = ', x.shape) 17 | 18 | M = len(x) 19 | y = np.arange(M) # Fabricate class labels, 0 through M-1 (0-16 here) 20 | 21 | mnb = MultinomialNB(alpha=1) # Try switching to GaussianNB() 22 | # mnb = GaussianNB() # Reaches 100% once duplicates are removed 23 | mnb.fit(x, y) 24 | y_hat = mnb.predict(x) 25 | print('Predicted labels:', y_hat) 26 | print('Accuracy: %.2f%%' % (100 * np.mean(y_hat == y))) 27 | print('Score:', mnb.score(x, y)) 28 | # from sklearn import metrics 29 | # print metrics.accuracy_score(y, y_hat) # same as above 30 | err = y_hat != y 31 | print('Errors:\n', err) 32 | for i, e in enumerate(err): 33 | if e: 34 | print(y[i], ':\t', x[i], 'was taken to be in the same class as', x[y_hat[i]]) 35 | -------------------------------------------------------------------------------- /9.Bayes Network/README.md: -------------------------------------------------------------------------------- 1 | ## Bayes Network 2 | ## (Bayes Networks) 3 | 4 | ### Project Background 5 | > Thomas Bayes's detailed biography is easy to find; what follows paraphrases a summary from Wikipedia. The so-called Bayesian method stems from an essay he wrote to solve an "inverse probability" problem, published only after his death by a friend. Before that essay, people could already compute "forward probabilities", e.g. "a bag holds N white balls and M black balls; if you reach in and draw one, what is the probability that it is black?". The natural question runs the other way: "if we do not know the ratio of black to white balls in advance, but blindly draw one (or several) balls and observe their colors, what can we infer about the ratio inside the bag?". This is the so-called inverse-probability problem. Bayes's paper was really just a direct attempt at solving it, and it is unclear whether he realized the deep idea it contained. Later, however, Bayesian methods swept through probability theory and spread into every problem domain; wherever probabilistic predictions are needed you find their trace, and Bayes in particular is one of the core methods of machine learning. The deep reason is that the real world is inherently uncertain and human observation is limited: day to day we see only the surface results of things. Staying with the ball-drawing analogy, we usually learn only the colors of the balls taken out, never directly the true contents of the bag. At that point we need to offer a guess (a hypothesis, to use the stricter term). A guess is of course uncertain, since many, even infinitely many, guesses may fit the current observations, yet it is by no means blind groping. Concretely, we need to do two things: 1. compute how plausible each different guess is; 2. work out which guess is the most credible. The first is computing the posterior probability of a particular guess (for a continuous space of guesses, its probability density); the second is so-called model comparison, and model comparison without priors is exactly the maximum-likelihood method. The bag example is worked through numerically in the short sketch below. 6 |
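The bag-of-balls question above can be answered in a few lines. The sketch below is an illustrative aside, not one of the repository's scripts: it assumes a uniform prior over nine candidate proportions of black balls and an observation of 3 black and 1 white draws (with replacement), then applies Bayes' rule to get the posterior over the hypotheses. Because the prior is flat, the MAP and maximum-likelihood answers coincide here.

```python
import numpy as np

# Candidate hypotheses: the fraction of black balls in the bag, with a uniform prior
theta = np.linspace(0.1, 0.9, 9)
prior = np.full(len(theta), 1.0 / len(theta))

# Observation: 3 black balls and 1 white ball drawn with replacement
likelihood = theta ** 3 * (1 - theta) ** 1

# Bayes' rule: posterior is proportional to likelihood times prior
posterior = likelihood * prior
posterior /= posterior.sum()

for t, p in zip(theta, posterior):
    print('P(theta = %.1f | data) = %.3f' % (t, p))
print('MAP hypothesis:', theta[posterior.argmax()])   # the most credible guess
print('ML  hypothesis:', theta[likelihood.argmax()])  # model comparison without the prior
```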
7 | 8 | ### Project Overview 9 | |Name|Description| 10 | |:-------------|:-------------:| 11 | |9.1 Iris_GaussianNB|Gaussian naive Bayes classifying the iris data set| 12 | |9.2 MultinomialNB_intro|Basic use of multinomial naive Bayes| 13 | |9.3 text_classification|Classifying text with Bayes models on TF-IDF features| 14 | 15 | 16 | ### Results 17 | #### ·Gaussian naive Bayes on the iris data set 18 | 19 | 20 | #### ·Comparing text-classification results across models 21 | 22 | 23 | 24 | 25 | -------------------------------------------------------------------------------- /9.Bayes Network/figures/iris.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/littleheap/MachineLearning-Algorithms/28f6eb352b58c58a8b80603b44d998b28e1f3434/9.Bayes Network/figures/iris.png -------------------------------------------------------------------------------- /9.Bayes Network/figures/text.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/littleheap/MachineLearning-Algorithms/28f6eb352b58c58a8b80603b44d998b28e1f3434/9.Bayes Network/figures/text.png -------------------------------------------------------------------------------- /9.Bayes Network/principle.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/littleheap/MachineLearning-Algorithms/28f6eb352b58c58a8b80603b44d998b28e1f3434/9.Bayes Network/principle.pdf -------------------------------------------------------------------------------- /Figures/10-3-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/littleheap/MachineLearning-Algorithms/28f6eb352b58c58a8b80603b44d998b28e1f3434/Figures/10-3-1.png -------------------------------------------------------------------------------- /Figures/10-3-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/littleheap/MachineLearning-Algorithms/28f6eb352b58c58a8b80603b44d998b28e1f3434/Figures/10-3-2.png -------------------------------------------------------------------------------- /Figures/11-5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/littleheap/MachineLearning-Algorithms/28f6eb352b58c58a8b80603b44d998b28e1f3434/Figures/11-5.png -------------------------------------------------------------------------------- /Figures/3-1-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/littleheap/MachineLearning-Algorithms/28f6eb352b58c58a8b80603b44d998b28e1f3434/Figures/3-1-1.png -------------------------------------------------------------------------------- /Figures/3-1-2.png:
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/littleheap/MachineLearning-Algorithms/28f6eb352b58c58a8b80603b44d998b28e1f3434/Figures/3-1-2.png -------------------------------------------------------------------------------- /Figures/3-1-3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/littleheap/MachineLearning-Algorithms/28f6eb352b58c58a8b80603b44d998b28e1f3434/Figures/3-1-3.png -------------------------------------------------------------------------------- /Figures/3-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/littleheap/MachineLearning-Algorithms/28f6eb352b58c58a8b80603b44d998b28e1f3434/Figures/3-2.png -------------------------------------------------------------------------------- /Figures/3-3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/littleheap/MachineLearning-Algorithms/28f6eb352b58c58a8b80603b44d998b28e1f3434/Figures/3-3.png -------------------------------------------------------------------------------- /Figures/3-4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/littleheap/MachineLearning-Algorithms/28f6eb352b58c58a8b80603b44d998b28e1f3434/Figures/3-4.png -------------------------------------------------------------------------------- /Figures/4-1-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/littleheap/MachineLearning-Algorithms/28f6eb352b58c58a8b80603b44d998b28e1f3434/Figures/4-1-1.png -------------------------------------------------------------------------------- /Figures/4-1-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/littleheap/MachineLearning-Algorithms/28f6eb352b58c58a8b80603b44d998b28e1f3434/Figures/4-1-2.png -------------------------------------------------------------------------------- /Figures/4-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/littleheap/MachineLearning-Algorithms/28f6eb352b58c58a8b80603b44d998b28e1f3434/Figures/4-2.png -------------------------------------------------------------------------------- /Figures/4-3-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/littleheap/MachineLearning-Algorithms/28f6eb352b58c58a8b80603b44d998b28e1f3434/Figures/4-3-1.png -------------------------------------------------------------------------------- /Figures/4-3-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/littleheap/MachineLearning-Algorithms/28f6eb352b58c58a8b80603b44d998b28e1f3434/Figures/4-3-2.png -------------------------------------------------------------------------------- /Figures/4-4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/littleheap/MachineLearning-Algorithms/28f6eb352b58c58a8b80603b44d998b28e1f3434/Figures/4-4.png -------------------------------------------------------------------------------- /Figures/4-5.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/littleheap/MachineLearning-Algorithms/28f6eb352b58c58a8b80603b44d998b28e1f3434/Figures/4-5.png -------------------------------------------------------------------------------- /Figures/5-6.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/littleheap/MachineLearning-Algorithms/28f6eb352b58c58a8b80603b44d998b28e1f3434/Figures/5-6.png -------------------------------------------------------------------------------- /Figures/6-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/littleheap/MachineLearning-Algorithms/28f6eb352b58c58a8b80603b44d998b28e1f3434/Figures/6-1.png -------------------------------------------------------------------------------- /Figures/6-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/littleheap/MachineLearning-Algorithms/28f6eb352b58c58a8b80603b44d998b28e1f3434/Figures/6-2.png -------------------------------------------------------------------------------- /Figures/6-4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/littleheap/MachineLearning-Algorithms/28f6eb352b58c58a8b80603b44d998b28e1f3434/Figures/6-4.png -------------------------------------------------------------------------------- /Figures/6-5-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/littleheap/MachineLearning-Algorithms/28f6eb352b58c58a8b80603b44d998b28e1f3434/Figures/6-5-1.png -------------------------------------------------------------------------------- /Figures/6-5-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/littleheap/MachineLearning-Algorithms/28f6eb352b58c58a8b80603b44d998b28e1f3434/Figures/6-5-2.png -------------------------------------------------------------------------------- /Figures/6-6.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/littleheap/MachineLearning-Algorithms/28f6eb352b58c58a8b80603b44d998b28e1f3434/Figures/6-6.png -------------------------------------------------------------------------------- /Figures/6-7.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/littleheap/MachineLearning-Algorithms/28f6eb352b58c58a8b80603b44d998b28e1f3434/Figures/6-7.png -------------------------------------------------------------------------------- /Figures/7-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/littleheap/MachineLearning-Algorithms/28f6eb352b58c58a8b80603b44d998b28e1f3434/Figures/7-1.png -------------------------------------------------------------------------------- /Figures/7-3-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/littleheap/MachineLearning-Algorithms/28f6eb352b58c58a8b80603b44d998b28e1f3434/Figures/7-3-1.png -------------------------------------------------------------------------------- /Figures/7-3-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/littleheap/MachineLearning-Algorithms/28f6eb352b58c58a8b80603b44d998b28e1f3434/Figures/7-3-2.png 
-------------------------------------------------------------------------------- /Figures/7-4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/littleheap/MachineLearning-Algorithms/28f6eb352b58c58a8b80603b44d998b28e1f3434/Figures/7-4.png -------------------------------------------------------------------------------- /Figures/7-5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/littleheap/MachineLearning-Algorithms/28f6eb352b58c58a8b80603b44d998b28e1f3434/Figures/7-5.png -------------------------------------------------------------------------------- /Figures/7-6.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/littleheap/MachineLearning-Algorithms/28f6eb352b58c58a8b80603b44d998b28e1f3434/Figures/7-6.png -------------------------------------------------------------------------------- /Figures/7-7.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/littleheap/MachineLearning-Algorithms/28f6eb352b58c58a8b80603b44d998b28e1f3434/Figures/7-7.png -------------------------------------------------------------------------------- /Figures/8-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/littleheap/MachineLearning-Algorithms/28f6eb352b58c58a8b80603b44d998b28e1f3434/Figures/8-1.png -------------------------------------------------------------------------------- /Figures/8-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/littleheap/MachineLearning-Algorithms/28f6eb352b58c58a8b80603b44d998b28e1f3434/Figures/8-2.png -------------------------------------------------------------------------------- /Figures/8-3-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/littleheap/MachineLearning-Algorithms/28f6eb352b58c58a8b80603b44d998b28e1f3434/Figures/8-3-1.png -------------------------------------------------------------------------------- /Figures/8-3-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/littleheap/MachineLearning-Algorithms/28f6eb352b58c58a8b80603b44d998b28e1f3434/Figures/8-3-2.png -------------------------------------------------------------------------------- /Figures/8-4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/littleheap/MachineLearning-Algorithms/28f6eb352b58c58a8b80603b44d998b28e1f3434/Figures/8-4.png -------------------------------------------------------------------------------- /Figures/8-5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/littleheap/MachineLearning-Algorithms/28f6eb352b58c58a8b80603b44d998b28e1f3434/Figures/8-5.png -------------------------------------------------------------------------------- /Figures/8-6.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/littleheap/MachineLearning-Algorithms/28f6eb352b58c58a8b80603b44d998b28e1f3434/Figures/8-6.png -------------------------------------------------------------------------------- /Figures/9-1.png: 
https://raw.githubusercontent.com/littleheap/MachineLearning-Algorithms/28f6eb352b58c58a8b80603b44d998b28e1f3434/Figures/9-1.png -------------------------------------------------------------------------------- /Figures/9-3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/littleheap/MachineLearning-Algorithms/28f6eb352b58c58a8b80603b44d998b28e1f3434/Figures/9-3.png -------------------------------------------------------------------------------- /LittleElephant.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 15 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## MachineLearning-Algorithms (A Collection of Machine-Learning Algorithm Projects) 2 | 3 | ### Project Background 4 | >This project is my personal collection of hands-on implementations of fundamental algorithms, put together while studying the basics of machine learning. Each sub-project contains up-to-date code that applies the corresponding algorithm to data under Python 3.6. The theory closely follows the book 《统计学习方法》 (Statistical Learning Methods), and the practical code carries close to exhaustive comments. Everything was written after working through the theoretical derivation of each algorithm, then calling third-party libraries and frameworks to put the algorithm into practice on data and inspect the results. Each sub-project has its own README; you are welcome to explore and improve it. 5 | 6 | ### Project Overview 7 | |Name|Description| 8 | |:-------------|:-------------:| 9 | |[1.Python Foundation](https://github.com/LittleHeap/MachineLearning-Algorithms/tree/master/1.Python%20Foundation)|Review of Python essentials| 10 | |[2.Management Foundation](https://github.com/LittleHeap/MachineLearning-Algorithms/tree/master/2.Management%20Foundation)|Essentials of basic machine-learning operations| 11 | |[3.Regression](https://github.com/LittleHeap/MachineLearning-Algorithms/tree/master/3.Regression)|Regression algorithms in practice| 12 | |[4.Decision Tree & Random Forest](https://github.com/LittleHeap/MachineLearning-Algorithms/tree/master/4.Decision%20Tree%20%26%20Random%20Forest)|Decision trees & random forests in practice| 13 | |[5.Boost](https://github.com/LittleHeap/MachineLearning-Algorithms/tree/master/5.Boost)|Boosting algorithms in practice| 14 | |[6.SVM](https://github.com/LittleHeap/MachineLearning-Algorithms/tree/master/6.SVM)|Support vector machines in practice| 15 | |[7.Cluster](https://github.com/LittleHeap/MachineLearning-Algorithms/tree/master/7.Cluster)|Clustering algorithms in practice| 16 | |[8.EM Model](https://github.com/LittleHeap/MachineLearning-Algorithms/tree/master/8.EM%20Model)|The EM algorithm in practice| 17 | |[9.Bayes Network](https://github.com/LittleHeap/MachineLearning-Algorithms/tree/master/9.Bayes%20Network)|Bayes networks in practice| 18 | |[10.LDA Topic Model](https://github.com/LittleHeap/MachineLearning-Algorithms/tree/master/10.LDA%20Topic%20Model)|LDA topic models in practice| 19 | |[11.HMM](https://github.com/LittleHeap/MachineLearning-Algorithms/tree/master/11.HMM)|Hidden Markov models in practice| --------------------------------------------------------------------------------