├── .idea
│   ├── deployment.xml
│   ├── encodings.xml
│   ├── inspectionProfiles
│   │   └── Project_Default.xml
│   ├── misc.xml
│   ├── modules.xml
│   ├── tiny_ml.iml
│   ├── vcs.xml
│   └── workspace.xml
├── README.md
├── notes
│   └── linear_model
│       ├── linear_reg_closed_form.pdf
│       └── logistic_regression.pdf
├── requirements.txt
└── tinyml
    ├── __init__.py
    ├── __pycache__
    │   ├── __init__.cpython-36.pyc
    │   └── __init__.cpython-37.pyc
    ├── bayes
    │   ├── NaiveBayesClassifier.py
    │   ├── __init__.py
    │   └── __pycache__
    │       ├── NaiveBayesClassifier.cpython-37.pyc
    │       └── __init__.cpython-37.pyc
    ├── cluster
    │   ├── AGNES.py
    │   ├── DBSCAN.py
    │   ├── GaussianMixture.py
    │   ├── KMeans.py
    │   ├── LVQ.py
    │   ├── __init__.py
    │   └── __pycache__
    │       ├── AGNES.cpython-37.pyc
    │       ├── DBSCAN.cpython-37.pyc
    │       ├── GaussianMixture.cpython-37.pyc
    │       ├── KMeans.cpython-37.pyc
    │       ├── LVQ.cpython-37.pyc
    │       └── __init__.cpython-37.pyc
    ├── compare
    │   ├── __init__.py
    │   ├── cluster_result
    │   │   ├── sklearn_AGNES.jpg
    │   │   ├── sklearn_DBSCAN.jpg
    │   │   ├── sklearn_GMM.jpg
    │   │   ├── sklearn_KMeans.jpg
    │   │   ├── tinyml_AGNES.jpg
    │   │   ├── tinyml_DBSCAN.jpg
    │   │   ├── tinyml_GMM.jpg
    │   │   └── tinyml_KMeans.jpg
    │   ├── compare_classification.py
    │   ├── compare_clustering.py
    │   ├── compare_dimension_reduction.py
    │   ├── compare_regresssor.py
    │   └── dimension_reduction_result
    │       ├── sklearn_KernalPCA.jpg
    │       ├── sklearn_LLE.jpg
    │       ├── sklearn_MDS.jpg
    │       ├── sklearn_PCA.jpg
    │       ├── tinyml_KernalPCA.jpg
    │       ├── tinyml_LLE.jpg
    │       ├── tinyml_MDS.jpg
    │       └── tinyml_PCA.jpg
    ├── dimension_reduction
    │   ├── Isomap.py
    │   ├── KernelPCA.py
    │   ├── LLE.py
    │   ├── MDS.py
    │   ├── PCA.py
    │   ├── __init__.py
    │   └── __pycache__
    │       ├── Isomap.cpython-37.pyc
    │       ├── KernelPCA.cpython-37.pyc
    │       ├── LLE.cpython-37.pyc
    │       ├── MDS.cpython-36.pyc
    │       ├── MDS.cpython-37.pyc
    │       ├── PCA.cpython-37.pyc
    │       ├── __init__.cpython-36.pyc
    │       └── __init__.cpython-37.pyc
    ├── discriminant_analysis
    │   ├── GDA.py
    │   ├── LDA.py
    │   ├── __init__.py
    │   └── __pycache__
    │       ├── GDA.cpython-37.pyc
    │       ├── LDA.cpython-37.pyc
    │       └── __init__.cpython-37.pyc
    ├── ensemble
    │   ├── AdaBoostClassifier.py
    │   ├── GradientBoostingRegressor.py
    │   ├── RandomForestRegressor.py
    │   ├── XGBRegressor.py
    │   ├── __init__.py
    │   └── __pycache__
    │       ├── AdaBoostClassifier.cpython-37.pyc
    │       ├── GradientBoostingRegressor.cpython-37.pyc
    │       ├── RandomForestRegressor.cpython-37.pyc
    │       ├── XGBRegressor.cpython-37.pyc
    │       └── __init__.cpython-37.pyc
    ├── factorization_machine
    │   ├── FMClassifier.py
    │   └── __init__.py
    ├── feature_selection
    │   ├── ReliefFeatureSelection.py
    │   └── __init__.py
    ├── linear_model
    │   ├── LinearRegression.py
    │   ├── LocallyWeightedLinearRegression.py
    │   ├── LogisticRegression.py
    │   ├── SGDRegressor.py
    │   ├── __init__.py
    │   └── __pycache__
    │       ├── LinearRegression.cpython-37.pyc
    │       ├── LogisticRegression.cpython-37.pyc
    │       ├── SGDRegressor.cpython-37.pyc
    │       └── __init__.cpython-37.pyc
    ├── metrices
    │   ├── __init__.py
    │   ├── __pycache__
    │   │   ├── __init__.cpython-37.pyc
    │   │   └── curves.cpython-37.pyc
    │   └── curves.py
    ├── svm
    │   ├── SVC.py
    │   ├── __init__.py
    │   └── __pycache__
    │       ├── SVC.cpython-37.pyc
    │       └── __init__.cpython-37.pyc
    └── tree
        ├── DecisionTreeClassifier.py
        ├── DecisionTreeRegressor.py
        ├── __init__.py
        ├── __pycache__
        │   ├── DecisionTreeClassifier.cpython-37.pyc
        │   ├── DecisionTreeRegressor.cpython-36.pyc
        │   ├── DecisionTreeRegressor.cpython-37.pyc
        │   ├── __init__.cpython-36.pyc
        │   ├── __init__.cpython-37.pyc
        │   ├── treePlotter.cpython-36.pyc
        │   └── treePlotter.cpython-37.pyc
        └── treePlotter.py
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# tinyml
NumPy implementations of algorithms from Zhou Zhihua's *Machine Learning* (the "watermelon book") and Stanford's CS229 course, best read alongside the watermelon book and the CS229 lecture notes. Simplified versions of a few classic algorithms are also included selectively, such as an XGBRegressor written by following Tianqi Chen's XGBoost slides.
## Implemented algorithms
- **Linear models**
  - [LinearRegression](/tinyml/linear_model/LinearRegression.py) [closed-form solution derivation](notes/linear_model/linear_reg_closed_form.pdf)
  - [LogisticRegression](/tinyml/linear_model/LogisticRegression.py) [logistic regression derivations](/notes/linear_model/logistic_regression.pdf)
  - [SGDRegressor](/tinyml/linear_model/SGDRegressor.py)
  - [LocallyWeightedLinearRegression](/tinyml/linear_model/LocallyWeightedLinearRegression.py)
- **Discriminant analysis**
  - [LDA](/tinyml/discriminant_analysis/LDA.py)
  - [GDA](/tinyml/discriminant_analysis/GDA.py)
- **Decision trees (classification and regression)**
  - [DecisionTreeClassifier](/tinyml/tree/DecisionTreeClassifier.py)
  - [DecisionTreeRegressor](/tinyml/tree/DecisionTreeRegressor.py)
- **Support vector machines**
  - [SVC](/tinyml/svm/SVC.py)
- **Bayes**
  - [NaiveBayesClassifier](/tinyml/bayes/NaiveBayesClassifier.py)
- **Clustering**
  - [KMeans](/tinyml/cluster/KMeans.py)
  - [LVQ](/tinyml/cluster/LVQ.py)
  - [GaussianMixture](/tinyml/cluster/GaussianMixture.py)
  - [DBSCAN](/tinyml/cluster/DBSCAN.py)
  - [AGNES](/tinyml/cluster/AGNES.py)
- **Dimension reduction**
  - [MDS](/tinyml/dimension_reduction/MDS.py)
  - [PCA](/tinyml/dimension_reduction/PCA.py)
  - [KernelPCA](/tinyml/dimension_reduction/KernelPCA.py)
  - [LLE](/tinyml/dimension_reduction/LLE.py)
  - [Isomap](/tinyml/dimension_reduction/Isomap.py)
- **Ensemble learning**
  - [AdaBoostClassifier](/tinyml/ensemble/AdaBoostClassifier.py)
  - [GradientBoostingRegressor](/tinyml/ensemble/GradientBoostingRegressor.py)
  - [RandomForestRegressor](/tinyml/ensemble/RandomForestRegressor.py)
  - [XGBRegressor](/tinyml/ensemble/XGBRegressor.py)
- **Feature selection**
  - [ReliefFeatureSelection](/tinyml/feature_selection/ReliefFeatureSelection.py)
## Comparison with the sklearn implementations
- **Regression results** [code](/tinyml/compare/compare_regresssor.py)

| Algorithm | tinyml (MSE, sklearn-boston) | sklearn (MSE, sklearn-boston) |
| --- | --- | --- |
| LinearRegression | 27.196 | 27.196 |
| SGDRegressor | 27.246 | 27.231 |
| DecisionTreeRegressor | 21.887 | 21.761 |
| RandomForestRegressor | 21.142 | 21.142 |
| GradientBoostRegressor | 16.778 | 16.106 |
| XGBRegressor | 20.149 | 15.7 |
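The scores above are holdout errors on the boston housing data. A minimal sketch of the kind of head-to-head `compare_regresssor.py` runs, assuming the `X[:400]`/`X[400:]` split and `mean_squared_error` metric used throughout this repo's `__main__` blocks (the actual script's hyperparameters may differ):

```python
from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error

from tinyml.linear_model.LinearRegression import LinearRegression as TinymlLinearRegression

# Same train/test split the repo's demo blocks use for boston.
data = datasets.load_boston()
X, y = data.data, data.target
X_train, y_train = X[:400], y[:400]
X_test, y_test = X[400:], y[400:]

# Fit both implementations on identical data and compare holdout MSE.
for name, reg in [('tinyml', TinymlLinearRegression()),
                  ('sklearn', linear_model.LinearRegression())]:
    reg.fit(X_train, y_train)
    print(name, 'MSE:', mean_squared_error(y_test, reg.predict(X_test)))
```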

- **Classification results** [code](/tinyml/compare/compare_classification.py)

| Algorithm | tinyml (accuracy, sklearn-breast_cancer) | sklearn (accuracy, sklearn-breast_cancer) |
| --- | --- | --- |
| NaiveBayes | 90.64% | 90.64% |
| LogisticRegression | 92.98% | 92.98% |
| LDA | 94.15% | 92.40% |
| GDA | 92.40% | 93.57% |
| SVC | 86.55% | 92.98% |
| AdaboostClassifier | 92.40% | 92.40% |
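The accuracies are computed the same way in each case; the following condenses the demo at the bottom of `LogisticRegression.py` (first 7 features of breast_cancer, min-max scaled, 70/30 split):

```python
import numpy as np
from sklearn import linear_model
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

from tinyml.linear_model.LogisticRegression import LogisticRegression

X, y = load_breast_cancer(return_X_y=True)
X = MinMaxScaler().fit_transform(X[:, :7])   # first 7 features, as in the repo's demo
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

# tinyml's Newton-method logistic regression vs sklearn's newton-cg solver.
tinyml_clf = LogisticRegression(max_iter=100, use_matrix=True)
tinyml_clf.fit(X_train, y_train)
print('tinyml accuracy:', np.mean(tinyml_clf.predict(X_test) == y_test))

sk_clf = linear_model.LogisticRegression(max_iter=100, solver='newton-cg')
sk_clf.fit(X_train, y_train)
print('sklearn accuracy:', np.mean(sk_clf.predict(X_test) == y_test))
```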

- **Clustering comparison** [code](/tinyml/compare/compare_clustering.py)
  - KMeans

    ![tinyml KMeans](/tinyml/compare/cluster_result/tinyml_KMeans.jpg)
    ![sklearn KMeans](/tinyml/compare/cluster_result/sklearn_KMeans.jpg)
  - DBSCAN

    ![tinyml DBSCAN](/tinyml/compare/cluster_result/tinyml_DBSCAN.jpg)
    ![sklearn DBSCAN](/tinyml/compare/cluster_result/sklearn_DBSCAN.jpg)
  - GMM

    ![tinyml GMM](/tinyml/compare/cluster_result/tinyml_GMM.jpg)
    ![sklearn GMM](/tinyml/compare/cluster_result/sklearn_GMM.jpg)
  - AGNES

    ![tinyml AGNES](/tinyml/compare/cluster_result/tinyml_AGNES.jpg)
    ![sklearn AGNES](/tinyml/compare/cluster_result/sklearn_AGNES.jpg)
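Each figure pair is produced the same way: both implementations are run on the watermelon-4.0 points and the label assignments are plotted side by side. This condenses the demo at the bottom of `DBSCAN.py`:

```python
import numpy as np
import matplotlib.pyplot as plt
import sklearn.cluster as cluster

from tinyml.cluster.DBSCAN import DBSCAN

# Watermelon-4.0 points used by every clustering demo in this repo.
X = np.array([[0.697, 0.460], [0.774, 0.376], [0.634, 0.264], [0.608, 0.318], [0.556, 0.215],
              [0.403, 0.237], [0.481, 0.149], [0.437, 0.211], [0.666, 0.091], [0.243, 0.267],
              [0.245, 0.057], [0.343, 0.099], [0.639, 0.161], [0.657, 0.198], [0.360, 0.370],
              [0.593, 0.042], [0.719, 0.103], [0.359, 0.188], [0.339, 0.241], [0.282, 0.257],
              [0.748, 0.232], [0.714, 0.346], [0.483, 0.312], [0.478, 0.437], [0.525, 0.369],
              [0.751, 0.489], [0.532, 0.472], [0.473, 0.376], [0.725, 0.445], [0.446, 0.459]])

tinyml_dbscan = DBSCAN(epsilon=0.11, min_pts=5)
tinyml_dbscan.fit(X)
sklearn_dbscan = cluster.DBSCAN(eps=0.11, min_samples=5, metric='l2').fit(X)

# Color points by cluster id: tinyml on the left, sklearn on the right.
plt.subplot(121)
plt.scatter(X[:, 0], X[:, 1], c=tinyml_dbscan.labels_)
plt.title('tinyml')
plt.subplot(122)
plt.scatter(X[:, 0], X[:, 1], c=sklearn_dbscan.labels_)
plt.title('sklearn')
plt.show()
```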

- **Dimension reduction comparison** [code](/tinyml/compare/compare_dimension_reduction.py)
  - PCA

    ![tinyml PCA](/tinyml/compare/dimension_reduction_result/tinyml_PCA.jpg)
    ![sklearn PCA](/tinyml/compare/dimension_reduction_result/sklearn_PCA.jpg)
  - KernelPCA

    ![tinyml KernalPCA](/tinyml/compare/dimension_reduction_result/tinyml_KernalPCA.jpg)
    ![sklearn KernalPCA](/tinyml/compare/dimension_reduction_result/sklearn_KernalPCA.jpg)
  - LLE

    ![tinyml LLE](/tinyml/compare/dimension_reduction_result/tinyml_LLE.jpg)
    ![sklearn LLE](/tinyml/compare/dimension_reduction_result/sklearn_LLE.jpg)
  - MDS

    ![tinyml MDS](/tinyml/compare/dimension_reduction_result/tinyml_MDS.jpg)
    ![sklearn MDS](/tinyml/compare/dimension_reduction_result/sklearn_MDS.jpg)
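The figures check that the learned embeddings agree (up to sign/rotation) between implementations. tinyml's `PCA.py` source is not reproduced in this dump, so as a stand-in the sketch below verifies a from-scratch SVD projection against `sklearn.decomposition.PCA` — the same kind of agreement the plots demonstrate; tinyml may compute PCA differently (e.g., via the covariance eigendecomposition):

```python
import numpy as np
from sklearn.decomposition import PCA

rng = np.random.RandomState(0)
X = rng.randn(100, 5)

# Textbook PCA: center the data, take the top right-singular vectors, project.
Xc = X - X.mean(axis=0)
U, S, Vt = np.linalg.svd(Xc, full_matrices=False)
Z_manual = Xc @ Vt[:2].T

Z_sklearn = PCA(n_components=2).fit_transform(X)

# Principal components are defined only up to sign, so compare magnitudes.
print(np.allclose(np.abs(Z_manual), np.abs(Z_sklearn)))
```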
173 | 174 | 175 | 176 | 177 | 178 | 179 | -------------------------------------------------------------------------------- /notes/linear_model/linear_reg_closed_form.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fengyang95/tiny_ml/6a3ee55e4ee4ddc3dad1e53f511725679a99f083/notes/linear_model/linear_reg_closed_form.pdf -------------------------------------------------------------------------------- /notes/linear_model/logistic_regression.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fengyang95/tiny_ml/6a3ee55e4ee4ddc3dad1e53f511725679a99f083/notes/linear_model/logistic_regression.pdf -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | xgboost 2 | numpy 3 | matplotlib 4 | torch 5 | scipy 6 | graphviz 7 | scikit_learn 8 | skrebate 9 | -------------------------------------------------------------------------------- /tinyml/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fengyang95/tiny_ml/6a3ee55e4ee4ddc3dad1e53f511725679a99f083/tinyml/__init__.py -------------------------------------------------------------------------------- /tinyml/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fengyang95/tiny_ml/6a3ee55e4ee4ddc3dad1e53f511725679a99f083/tinyml/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /tinyml/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fengyang95/tiny_ml/6a3ee55e4ee4ddc3dad1e53f511725679a99f083/tinyml/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /tinyml/bayes/NaiveBayesClassifier.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | # 只考虑离散值 3 | class NaiveBayesClassifier: 4 | def __init__(self,n_classes=2): 5 | self.n_classes=n_classes 6 | self.priori_P={} 7 | self.conditional_P={} 8 | self.N={} 9 | pass 10 | 11 | def fit(self,X,y): 12 | for i in range(self.n_classes): 13 | # 公式 7.19 14 | self.priori_P[i]=(len(y[y==i])+1)/(len(y)+self.n_classes) 15 | for col in range(X.shape[1]): 16 | self.N[col]=len(np.unique(X[:,col])) 17 | self.conditional_P[col]={} 18 | for row in range(X.shape[0]): 19 | val=X[row,col] 20 | if val not in self.conditional_P[col].keys(): 21 | self.conditional_P[col][val]={} 22 | for i in range(self.n_classes): 23 | D_xi=np.where(X[:,col]==val) 24 | D_c=np.where(y==i) 25 | D_cxi=len(np.intersect1d(D_xi,D_c)) 26 | # 公式 7.20 27 | self.conditional_P[col][val][i]=(D_cxi+1)/(len(y[y==i])+self.N[col]) 28 | else: 29 | continue 30 | 31 | def predict(self,X): 32 | pred_y=[] 33 | for i in range(len(X)): 34 | p=np.ones((self.n_classes,)) 35 | for j in range(self.n_classes): 36 | p[j]=self.priori_P[j] 37 | for col in range(X.shape[1]): 38 | val=X[i,col] 39 | for j in range(self.n_classes): 40 | p[j]*=self.conditional_P[col][val][j] 41 | pred_y.append(np.argmax(p)) 42 | return np.array(pred_y) 43 | # 连续值 44 | class NaiveBayesClassifierContinuous: 45 | def __init__(self,n_classes=2): 46 | self.n_classes=n_classes 47 | 
self.priori_P={} 48 | 49 | def fit(self,X,y): 50 | self.mus=np.zeros((self.n_classes,X.shape[1])) 51 | self.sigmas=np.zeros((self.n_classes,X.shape[1])) 52 | 53 | for c in range(self.n_classes): 54 | # 公式 7.19 55 | self.priori_P[c]=(len(y[y==c]))/(len(y)) 56 | X_c=X[np.where(y==c)] 57 | 58 | self.mus[c]=np.mean(X_c,axis=0) 59 | self.sigmas[c]=np.std(X_c,axis=0) 60 | 61 | def predict(self,X): 62 | pred_y=[] 63 | for i in range(len(X)): 64 | p=np.ones((self.n_classes,)) 65 | for c in range(self.n_classes): 66 | p[c]=self.priori_P[c] 67 | for col in range(X.shape[1]): 68 | x=X[i,col] 69 | p[c]*=1./(np.sqrt(2*np.pi)*self.sigmas[c,col])*np.exp(-(x-self.mus[c,col])**2/(2*self.sigmas[c,col]**2)) 70 | pred_y.append(np.argmax(p)) 71 | return np.array(pred_y) 72 | 73 | if __name__=='__main__': 74 | X = np.array([[0, 0, 0, 0, 0, 0], [1, 0, 1, 0, 0, 0], 75 | [1, 0, 0, 0, 0, 0], [0, 0, 1, 0, 0, 0], 76 | [2, 0, 0, 0, 0, 0], [0, 1, 0, 0, 1, 1], 77 | [1, 1, 0, 1, 1, 1], [1, 1, 0, 0, 1, 0], 78 | [1, 1, 1, 1, 1, 0], [0, 2, 2, 0, 2, 1], 79 | [2, 2, 2, 2, 2, 0], [2, 0, 0, 2, 2, 1], 80 | [0, 1, 0, 1, 0, 0], [2, 1, 1, 1, 0, 0], 81 | [1, 1, 0, 0, 1, 1], [2, 0, 0, 2, 2, 0], 82 | [0, 0, 1, 1, 1, 0]]) 83 | y = np.array([1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]) 84 | 85 | X_test=np.array([[0, 0, 1, 0, 0, 0], [1, 0, 1, 0, 0, 0], 86 | [1, 1, 0, 1, 1, 0], [1, 0, 1, 1, 1, 0], 87 | [1, 1, 0, 0, 1, 1], [2, 0, 0, 2, 2, 0], 88 | [0, 0, 1, 1, 1, 0], 89 | [2, 0, 0, 2, 2, 0], 90 | [0, 0, 1, 1, 1, 0] 91 | ]) 92 | 93 | naive_bayes=NaiveBayesClassifier(n_classes=2) 94 | naive_bayes.fit(X,y) 95 | print('self.PrirP:',naive_bayes.priori_P) 96 | print('self.CondiP:',naive_bayes.conditional_P) 97 | pred_y=naive_bayes.predict(X_test) 98 | print('pred_y:',pred_y) 99 | 100 | 101 | -------------------------------------------------------------------------------- /tinyml/bayes/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fengyang95/tiny_ml/6a3ee55e4ee4ddc3dad1e53f511725679a99f083/tinyml/bayes/__init__.py -------------------------------------------------------------------------------- /tinyml/bayes/__pycache__/NaiveBayesClassifier.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fengyang95/tiny_ml/6a3ee55e4ee4ddc3dad1e53f511725679a99f083/tinyml/bayes/__pycache__/NaiveBayesClassifier.cpython-37.pyc -------------------------------------------------------------------------------- /tinyml/bayes/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fengyang95/tiny_ml/6a3ee55e4ee4ddc3dad1e53f511725679a99f083/tinyml/bayes/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /tinyml/cluster/AGNES.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | 4 | class AGNES: 5 | def __init__(self,k=3,dist_type='AVG'): 6 | self.k=k 7 | self.labels_=None 8 | self.C={} 9 | self.dist_func=None 10 | if dist_type=='MIN': 11 | self.dist_func=self.mindist 12 | elif dist_type=='MAX': 13 | self.dist_func=self.maxdist 14 | else: 15 | self.dist_func=self.avgdist 16 | 17 | # p215 图9.11 AGNES算法 18 | def fit(self,X): 19 | for j in range(X.shape[0]): 20 | self.C[j]=set() 21 | self.C[j].add(j) 22 | 
M=1e10*np.ones((X.shape[0],X.shape[0]),dtype=np.float32) 23 | for i in range(X.shape[0]): 24 | for j in range(i+1,X.shape[0]): 25 | M[i,j]=self.dist_func(X,self.C[i],self.C[j]) 26 | M[j,i]=M[i,j] 27 | q=X.shape[0] 28 | while q>self.k: 29 | index=np.argmin(M) 30 | i_=index//M.shape[1] 31 | j_=index%M.shape[1] 32 | self.C[i_]=set(self.C[i_].union(self.C[j_])) 33 | #print(self.C[i_]) 34 | for j in range(j_+1,q): 35 | self.C[j-1]=set(self.C[j]) 36 | del self.C[q-1] 37 | M=np.delete(M,[j_],axis=0) 38 | M=np.delete(M,[j_],axis=1) 39 | for j in range(q-1): 40 | if i_!=j: 41 | M[i_,j]=self.dist_func(X,self.C[i_],self.C[j]) 42 | M[j,i_]=M[i_,j] 43 | q-=1 44 | self.labels_=np.zeros((X.shape[0],),dtype=np.int32) 45 | for i in range(self.k): 46 | self.labels_[list(self.C[i])] = i 47 | 48 | @classmethod 49 | def mindist(cls,X,Ci,Cj): 50 | Xi=X[list(Ci)] 51 | Xj=X[list(Cj)] 52 | min=1e10 53 | for i in range(len(Xi)): 54 | d=np.sqrt(np.sum((Xi[i]-Xj)**2,axis=1)) 55 | dmin=np.min(d) 56 | if dminmax: 69 | max=dmax 70 | return max 71 | 72 | @classmethod 73 | def avgdist(cls,X,Ci,Cj): 74 | Xi=X[list(Ci)] 75 | Xj=X[list(Cj)] 76 | sum=0. 77 | for i in range(len(Xi)): 78 | d=np.sqrt(np.sum((Xi[i]-Xj)**2,axis=1)) 79 | sum+=np.sum(d) 80 | dist=sum/(len(Ci)*len(Cj)) 81 | return dist 82 | 83 | 84 | 85 | 86 | if __name__=='__main__': 87 | X=np.array([[0.697,0.460],[0.774,0.376],[0.634,0.264],[0.608,0.318],[0.556,0.215], 88 | [0.403,0.237],[0.481,0.149],[0.437,0.211],[0.666,0.091],[0.243,0.267], 89 | [0.245,0.057],[0.343,0.099],[0.639,0.161],[0.657,0.198],[0.360,0.370], 90 | [0.593,0.042],[0.719,0.103],[0.359,0.188],[0.339,0.241],[0.282,0.257], 91 | [0.748,0.232],[0.714,0.346],[0.483,0.312],[0.478,0.437],[0.525,0.369], 92 | [0.751,0.489],[0.532,0.472],[0.473,0.376],[0.725,0.445],[0.446,0.459]]) 93 | 94 | X_test=X 95 | agnes=AGNES() 96 | agnes.fit(X) 97 | print('C:', agnes.C) 98 | print(agnes.labels_) 99 | plt.figure(12) 100 | plt.subplot(121) 101 | plt.scatter(X[:, 0], X[:, 1], c=agnes.labels_) 102 | plt.title('tinyml') 103 | 104 | from sklearn.cluster.hierarchical import AgglomerativeClustering 105 | sklearn_agnes=AgglomerativeClustering(n_clusters=7,affinity='l2',linkage='average') 106 | sklearn_agnes.fit(X) 107 | print(sklearn_agnes.labels_) 108 | plt.subplot(122) 109 | plt.scatter(X[:,0],X[:,1],c=sklearn_agnes.labels_) 110 | plt.title('sklearn') 111 | plt.show() 112 | 113 | 114 | 115 | 116 | 117 | -------------------------------------------------------------------------------- /tinyml/cluster/DBSCAN.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | import random 4 | from queue import Queue 5 | random.seed(1) 6 | 7 | class DBSCAN: 8 | def __init__(self,epsilon=0.11,min_pts=5): 9 | self.epsilon=epsilon 10 | self.min_pts=min_pts 11 | self.labels_=None 12 | self.C=None 13 | self.Omega=set() 14 | self.N_epsilon={} 15 | 16 | # p213 图9.9 DBSCAN算法 17 | def fit(self,X): 18 | self.C={} 19 | for j in range(X.shape[0]): 20 | dist=np.sqrt(np.sum((X-X[j])**2,axis=1)) 21 | self.N_epsilon[j]=np.where(dist<=self.epsilon)[0] 22 | if len(self.N_epsilon[j])>=self.min_pts: 23 | self.Omega.add(j) 24 | self.k=0 25 | Gamma=set(range(X.shape[0])) 26 | while len(self.Omega)>0: 27 | Gamma_old=set(Gamma) 28 | o=random.sample(list(self.Omega),1)[0] 29 | Q=Queue() 30 | Q.put(o) 31 | Gamma.remove(o) 32 | while not Q.empty(): 33 | q=Q.get() 34 | if len(self.N_epsilon[q])>=self.min_pts: 35 | Delta=set(self.N_epsilon[q]).intersection(set(Gamma)) 36 | 
for delta in Delta: 37 | Q.put(delta) 38 | Gamma.remove(delta) 39 | self.C[self.k]=Gamma_old.difference(Gamma) 40 | self.Omega=self.Omega.difference(self.C[self.k]) 41 | self.k += 1 42 | self.labels_=np.zeros((X.shape[0],),dtype=np.int32) 43 | for i in range(self.k): 44 | self.labels_[list(self.C[i])]=i 45 | 46 | 47 | if __name__=='__main__': 48 | X=np.array([[0.697,0.460],[0.774,0.376],[0.634,0.264],[0.608,0.318],[0.556,0.215], 49 | [0.403,0.237],[0.481,0.149],[0.437,0.211],[0.666,0.091],[0.243,0.267], 50 | [0.245,0.057],[0.343,0.099],[0.639,0.161],[0.657,0.198],[0.360,0.370], 51 | [0.593,0.042],[0.719,0.103],[0.359,0.188],[0.339,0.241],[0.282,0.257], 52 | [0.748,0.232],[0.714,0.346],[0.483,0.312],[0.478,0.437],[0.525,0.369], 53 | [0.751,0.489],[0.532,0.472],[0.473,0.376],[0.725,0.445],[0.446,0.459]]) 54 | 55 | dbscan=DBSCAN() 56 | dbscan.fit(X) 57 | print('C:',dbscan.C) 58 | print(dbscan.labels_) 59 | plt.figure(12) 60 | plt.subplot(121) 61 | plt.scatter(X[:,0],X[:,1],c=dbscan.labels_) 62 | plt.title('tinyml') 63 | 64 | import sklearn.cluster as cluster 65 | sklearn_DBSCAN=cluster.DBSCAN(eps=0.11,min_samples=5,metric='l2') 66 | sklearn_DBSCAN.fit(X) 67 | print(sklearn_DBSCAN.labels_) 68 | plt.subplot(122) 69 | plt.scatter(X[:,0],X[:,1],c=sklearn_DBSCAN.labels_) 70 | plt.title('sklearn') 71 | plt.show() 72 | 73 | -------------------------------------------------------------------------------- /tinyml/cluster/GaussianMixture.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | 4 | class GaussianMixture: 5 | def __init__(self,k=3,max_iter=50): 6 | self.k=k 7 | self.max_iter=max_iter 8 | self.labels_=None 9 | self.C=None 10 | self.alpha=None 11 | self.mu=None 12 | self.cov=None 13 | self.gamma=None 14 | pass 15 | 16 | # p210 图9.6 高斯混合聚类算法 17 | def fit(self,X): 18 | # p210初始化方法 19 | self.alpha=np.zeros((self.k,)) 20 | for i in range(self.k): 21 | self.alpha[i]=1./self.k 22 | mu_indices=[5,21,26] 23 | self.mu=X[mu_indices] 24 | self.cov=np.array([[[0.1,0.],[0.0,0.1]],[[0.1,0.],[0.,0.1]],[[0.1,0.],[0.,0.1]]]) 25 | 26 | self.gamma=np.zeros((X.shape[0],self.k)) 27 | for _ in range(self.max_iter): 28 | for j in range(X.shape[0]): 29 | alpha_p=np.zeros((self.k,)) 30 | sum=0. 31 | for i in range(self.k): 32 | alpha_p[i]=self.alpha[i]*self._p(X[j],self.mu[i],self.cov[i]) 33 | sum+=alpha_p[i] 34 | self.gamma[j,:]=alpha_p/sum 35 | 36 | for i in range(self.k): 37 | sum_gamma_i=np.sum(self.gamma[:,i]) 38 | self.mu[i]=X.T.dot(self.gamma[:,i])/sum_gamma_i 39 | numerator=0. 40 | for j in range(X.shape[0]): 41 | numerator+=(self.gamma[j,i]*((X[j]-self.mu[i]).reshape(-1,1).dot((X[j]-self.mu[i]).reshape(1,-1)))) 42 | self.cov[i]=numerator/sum_gamma_i 43 | self.alpha[i]=sum_gamma_i/X.shape[0] 44 | self.labels_=np.argmax(self.gamma,axis=1) 45 | self.C={} 46 | for i in range(self.k): 47 | self.C[i]=[] 48 | for j in range(len(self.labels_)): 49 | self.C[self.labels_[j]].append(j) 50 | 51 | def predict(self,X): 52 | gamma = np.zeros((X.shape[0], self.k)) 53 | for j in range(X.shape[0]): 54 | alpha_p = np.zeros((self.k,)) 55 | sum = 0. 
56 | for i in range(self.k): 57 | alpha_p[i] = self.alpha[i] * self._p(X[j], self.mu[i], self.cov[i]) 58 | sum += alpha_p[i] 59 | gamma[j, :] = alpha_p / sum 60 | return np.argmax(gamma,axis=1) 61 | 62 | 63 | # 公式 9.28 64 | @classmethod 65 | def _p(cls,x,mu,cov): 66 | exp_coef=-0.5*((x-mu).T.dot(np.linalg.inv(cov)).dot(x-mu)) 67 | p=np.exp(exp_coef)/(np.power(2*np.pi,mu.shape[0]/2)*np.sqrt(np.linalg.det(cov))) 68 | return p 69 | 70 | if __name__=='__main__': 71 | X=np.array([[0.697,0.460],[0.774,0.376],[0.634,0.264],[0.608,0.318],[0.556,0.215], 72 | [0.403,0.237],[0.481,0.149],[0.437,0.211],[0.666,0.091],[0.243,0.267], 73 | [0.245,0.057],[0.343,0.099],[0.639,0.161],[0.657,0.198],[0.360,0.370], 74 | [0.593,0.042],[0.719,0.103],[0.359,0.188],[0.339,0.241],[0.282,0.257], 75 | [0.748,0.232],[0.714,0.346],[0.483,0.312],[0.478,0.437],[0.525,0.369], 76 | [0.751,0.489],[0.532,0.472],[0.473,0.376],[0.725,0.445],[0.446,0.459]]) 77 | 78 | X_test=X 79 | gmm=GaussianMixture(k=3,max_iter=50) 80 | gmm.fit(X) 81 | print(gmm.C) 82 | print(gmm.labels_) 83 | print(gmm.predict(X_test)) 84 | plt.scatter(X[:, 0], X[:, 1], c=gmm.labels_) 85 | plt.scatter(gmm.mu[:, 0], gmm.mu[:, 1],c=range(gmm.k), marker='+') 86 | plt.title('tinyml') 87 | plt.show() 88 | 89 | 90 | from sklearn.mixture import GaussianMixture 91 | 92 | sklearn_gmm = GaussianMixture(n_components=3, covariance_type='full', 93 | max_iter=50).fit(X) 94 | labels=sklearn_gmm.predict(X) 95 | print(labels) 96 | plt.scatter(X[:,0],X[:,1],c=labels) 97 | plt.title('sklearn') 98 | plt.show() 99 | 100 | 101 | 102 | 103 | 104 | -------------------------------------------------------------------------------- /tinyml/cluster/KMeans.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import random 3 | import matplotlib.pyplot as plt 4 | random.seed(1) 5 | 6 | class KMeans: 7 | def __init__(self,k=2): 8 | self.labels_=None 9 | self.mu=None 10 | self.k=k 11 | 12 | def init(self,X,method='kmeans++',random_state=False): 13 | if method=='kmeans++': 14 | if random_state is False: 15 | np.random.seed(0) 16 | mus=[X[np.random.randint(0,len(X))]] 17 | while len(mus) 0.5: 38 | break 39 | self.hs_.append(copy.copy(ht)) 40 | self.epsilons_.append(epsilon_t) 41 | alpha_t = 0.5 * np.log((1 - epsilon_t) / epsilon_t) 42 | self.alphas_.append(alpha_t) 43 | self.Ds_.append(self.Ds_[t] * np.exp(-alpha_t * y * y_pred)) 44 | self.Ds_[t + 1] = self.Ds_[t + 1] / np.sum(self.Ds_[t + 1]) 45 | 46 | 47 | @classmethod 48 | def calc_epsilon(clf, D, y_target, y_pred): 49 | return 1 - np.sum(D[y_target == y_pred]) 50 | 51 | def predict(self, X): 52 | H=np.zeros((X.shape[0],)) 53 | for t in range(len(self.alphas_)): 54 | H+=(self.alphas_[t]*self.hs_[t].predict(X)) 55 | return np.sign(H) 56 | 57 | 58 | if __name__ == '__main__': 59 | breast_data = datasets.load_breast_cancer() 60 | X, y = breast_data.data, breast_data.target 61 | y = 2 * y - 1 62 | X_train, y_train = X[:200], y[:200] 63 | X_test, y_test = X[200:], y[200:] 64 | base_estimator=DecisionTreeClassifier(max_depth=1,random_state=False) 65 | 66 | sklearn_decision_tree = DecisionTreeClassifier(max_depth=1) 67 | sklearn_decision_tree.fit(X_train, y_train) 68 | y_pred_decison_tree = sklearn_decision_tree.predict(X_test) 69 | print('single decision tree:', len(y_test[y_pred_decison_tree == y_test]) * 1.0 / len(y_test)) 70 | 71 | print('tinyml:') 72 | adaboost_clf = AdaBoostClassifier(n_estimators=100,base_estimator=base_estimator,method='re-weighting') 73 | adaboost_clf.fit(X_train, 
y_train) 74 | y_pred = adaboost_clf.predict(X_test) 75 | print('adaboost y_pred:', len(y_test[y_pred == y_test]) * 1. / len(y_test)) 76 | 77 | print('sklearn:') 78 | sklearn_adboost_clf = sklearnAdaBoostClassifier(n_estimators=100, random_state=False, algorithm='SAMME', 79 | base_estimator=base_estimator) 80 | sklearn_adboost_clf.fit(X_train, y_train) 81 | sklearn_y_pred = sklearn_adboost_clf.predict(X_test) 82 | print('sklearn adaboost y_pred:', len(y_test[y_test == sklearn_y_pred]) * 1. / len(y_test)) 83 | 84 | -------------------------------------------------------------------------------- /tinyml/ensemble/GradientBoostingRegressor.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from sklearn import datasets 3 | from sklearn.metrics import mean_squared_error 4 | from sklearn import ensemble 5 | import copy 6 | from sklearn import tree 7 | 8 | """ 9 | loss使用均方误差 10 | 残差为 y-y_pred 11 | 李航《统计学习方法》 p151 12 | """ 13 | class GradientBoostingRegressor: 14 | def __init__(self,base_estimator=None,n_estimators=10,lr=0.1): 15 | self.base_estimator=base_estimator 16 | self.n_esimators=n_estimators 17 | self.estimators=[] 18 | self.lr=lr 19 | self.mean=None 20 | 21 | def fit(self,X,y): 22 | F0_x=np.ones_like(y)*np.mean(y) 23 | y_pred=F0_x 24 | self.mean=np.mean(y) 25 | for i in range(self.n_esimators): 26 | hm=copy.deepcopy(self.base_estimator) 27 | hm.fit(X,y-y_pred) 28 | self.estimators.append(hm) 29 | y_pred=y_pred+self.lr*hm.predict(X) 30 | 31 | def predict(self,X): 32 | y=self.mean*np.ones((X.shape[0],)) 33 | for i in range(self.n_esimators): 34 | y=y+self.lr*self.estimators[i].predict(X) 35 | return y 36 | 37 | 38 | if __name__=='__main__': 39 | breast_data = datasets.load_boston() 40 | X, y = breast_data.data, breast_data.target 41 | print(X.shape) 42 | X_train, y_train = X[:400], y[:400] 43 | X_test, y_test = X[400:], y[400:] 44 | 45 | sklearn_decisiontree_reg=tree.DecisionTreeRegressor(min_samples_split=15, min_samples_leaf=5,random_state=False) 46 | sklearn_decisiontree_reg.fit(X_train, y_train) 47 | decisiontree_pred=sklearn_decisiontree_reg.predict(X_test) 48 | print('base estimator:',mean_squared_error(y_test,decisiontree_pred)) 49 | 50 | tinyml_gbdt_reg=GradientBoostingRegressor(n_estimators=500, base_estimator=tree.DecisionTreeRegressor(min_samples_split=15, min_samples_leaf=5, random_state=False)) 51 | tinyml_gbdt_reg.fit(X_train, y_train) 52 | y_pred=tinyml_gbdt_reg.predict(X_test) 53 | print('tinyml mse:',mean_squared_error(y_test,y_pred)) 54 | 55 | 56 | sklearn_gbdt_reg=ensemble.GradientBoostingRegressor(n_estimators=500,min_samples_leaf=5,min_samples_split=15,random_state=False) 57 | sklearn_gbdt_reg.fit(X_train,y_train) 58 | sklearn_pred=sklearn_gbdt_reg.predict(X_test) 59 | print('sklearn mse:',mean_squared_error(y_test,sklearn_pred)) 60 | -------------------------------------------------------------------------------- /tinyml/ensemble/RandomForestRegressor.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from sklearn import datasets,ensemble,tree 3 | from sklearn.metrics import mean_squared_error 4 | 5 | class RandomForestRegressor: 6 | def __init__(self,base_estimator,n_estimators=10,min_samples_leaf=5,min_samples_split=15): 7 | self.base_estimator=base_estimator 8 | self.n_estimators=n_estimators 9 | self.min_samples_split=min_samples_split 10 | self.min_samples_leaf=min_samples_leaf 11 | self.estimators_=[] 12 | 13 | def fit(self,X,y): 14 | for t in 
range(self.n_estimators): 15 | estimator_t=self.base_estimator(random_state=True,min_samples_split=self.min_samples_split,min_samples_leaf=self.min_samples_leaf) 16 | estimator_t.fit(X,y) 17 | self.estimators_.append(estimator_t) 18 | 19 | def predict(self,X): 20 | preds=[] 21 | for t in range(self.n_estimators): 22 | preds.append(self.estimators_[t].predict(X)) 23 | return np.mean(np.array(preds),axis=0) 24 | 25 | 26 | if __name__=='__main__': 27 | breast_data = datasets.load_boston() 28 | X, y = breast_data.data, breast_data.target 29 | X_train, y_train = X[:400], y[:400] 30 | X_test, y_test = X[400:], y[400:] 31 | 32 | tinyml_decisiontree_reg=tree.DecisionTreeRegressor(min_samples_split=20, min_samples_leaf=5,random_state=True) 33 | tinyml_decisiontree_reg.fit(X_train, y_train) 34 | decisiontree_pred=tinyml_decisiontree_reg.predict(X_test) 35 | print('base estimator:',mean_squared_error(y_test,decisiontree_pred)) 36 | 37 | tinyml_rf_reg=RandomForestRegressor(n_estimators=100, base_estimator=tree.DecisionTreeRegressor) 38 | tinyml_rf_reg.fit(X_train,y_train) 39 | y_pred=tinyml_rf_reg.predict(X_test) 40 | print('tinyml rf mse:',mean_squared_error(y_test,y_pred)) 41 | 42 | sklearn_rf_reg=ensemble.RandomForestRegressor(n_estimators=100, min_samples_leaf=5, min_samples_split=20, random_state=False) 43 | sklearn_rf_reg.fit(X_train, y_train) 44 | sklearn_pred=sklearn_rf_reg.predict(X_test) 45 | print('sklearn mse:',mean_squared_error(y_test,sklearn_pred)) 46 | -------------------------------------------------------------------------------- /tinyml/ensemble/XGBRegressor.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import abc 3 | from sklearn import datasets,tree 4 | from sklearn.metrics import mean_squared_error 5 | import xgboost as xgb 6 | np.random.seed(1) 7 | 8 | class LossBase(object): 9 | def __init__(self,y_target,y_pred): 10 | self.y_target=y_target 11 | self.y_pred=y_pred 12 | pass 13 | 14 | @abc.abstractmethod 15 | def forward(self): 16 | raise NotImplementedError 17 | 18 | @abc.abstractmethod 19 | def g(self): 20 | raise NotImplementedError 21 | 22 | @abc.abstractmethod 23 | def h(self): 24 | raise NotImplementedError 25 | 26 | class MSELoss(LossBase): 27 | def __init__(self,y_target,y_pred): 28 | super(MSELoss,self).__init__(y_target,y_pred) 29 | 30 | def forward(self): 31 | return (self.y_target-self.y_pred)**2 32 | 33 | def g(self): 34 | return 2*(self.y_pred-self.y_target) 35 | 36 | def h(self): 37 | return 2*np.ones_like(self.y_target) 38 | 39 | class CART: 40 | 41 | def __init__(self, reg_lambda=1, gamma=0., max_depth=3,col_sample_ratio=0.5,row_sample_ratio=1.): 42 | self.reg_lambda=reg_lambda 43 | self.gamma=gamma 44 | self.max_depth=max_depth 45 | self.tree = None 46 | self.leaf_nodes=0 47 | self.obj_val=0. 
48 | self.col_sample_ratio=col_sample_ratio 49 | self.row_sample_ratio=row_sample_ratio 50 | 51 | def fit(self, X, y,g,h): 52 | D = {} 53 | D['X'] = X 54 | D['y'] = y 55 | A = np.arange(X.shape[1]) 56 | m=len(y) 57 | self.tree = self.TreeGenerate(D,A,g,h,np.array(range(m)),0) 58 | self.obj_val=-0.5*self.obj_val+self.gamma*self.leaf_nodes 59 | 60 | def predict(self, X): 61 | if self.tree is None: 62 | raise RuntimeError('cant predict before fit') 63 | y_pred = [] 64 | for i in range(X.shape[0]): 65 | tree = self.tree 66 | x = X[i] 67 | while True: 68 | if not isinstance(tree, dict): 69 | y_pred.append(tree) 70 | break 71 | a = list(tree.keys())[0] 72 | tree = tree[a] 73 | if isinstance(tree, dict): 74 | val = x[a] 75 | split_val=float(list(tree.keys())[0][1:]) 76 | if val<=split_val: 77 | tree=tree[list(tree.keys())[0]] 78 | else: 79 | tree=tree[list(tree.keys())[1]] 80 | else: 81 | y_pred.append(tree) 82 | break 83 | return np.array(y_pred) 84 | 85 | def TreeGenerate(self, D, A,g,h,indices,depth): 86 | X = D['X'] 87 | if depth>self.max_depth: 88 | G=np.sum(g[indices]) 89 | H=np.sum(h[indices]) 90 | w=-(G/(H+self.reg_lambda)) 91 | self.obj_val+=(G**2/(H+self.reg_lambda)) 92 | self.leaf_nodes+=1 93 | return w 94 | split_j=None 95 | split_s=None 96 | max_gain=0. 97 | 98 | col_sample_indices=np.random.choice(A,size=int(len(A)*self.col_sample_ratio)) 99 | indices=np.random.choice(indices,size=int(len(indices)*self.row_sample_ratio)) 100 | 101 | for j in A: 102 | if j not in col_sample_indices: 103 | continue 104 | for s in np.unique(X[:,j]): 105 | tmp_left=np.where(X[indices,j]<=s)[0] 106 | tmp_right=np.where(X[indices,j]>s)[0] 107 | if len(tmp_left)<1 or len(tmp_right)<1: 108 | continue 109 | left_indices=indices[tmp_left] 110 | right_indices=indices[tmp_right] 111 | G_L=np.sum(g[left_indices]) 112 | G_R=np.sum(g[right_indices]) 113 | H_L=np.sum(h[left_indices]) 114 | H_R=np.sum(h[right_indices]) 115 | gain= (G_L ** 2 / (H_L + self.reg_lambda) + G_R ** 2 / (H_R + self.reg_lambda) - (G_L + G_R) ** 2 / (H_L + H_R + self.reg_lambda)) - self.gamma 116 | if gain>max_gain: 117 | split_j=j 118 | split_s=s 119 | max_gain=gain 120 | 121 | if split_j is None: 122 | G = np.sum(g[indices]) 123 | H = np.sum(h[indices]) 124 | w = -(G / (H + self.reg_lambda)) 125 | self.obj_val += (G ** 2 / (H + self.reg_lambda)) 126 | self.leaf_nodes += 1 127 | return w 128 | 129 | tree = {split_j: {}} 130 | left_indices=indices[np.where(X[indices,split_j]<=split_s)[0]] 131 | right_indices=indices[np.where(X[indices,split_j]>split_s)[0]] 132 | tree[split_j]['l'+str(split_s)]=self.TreeGenerate(D,A,g,h,left_indices,depth+1) 133 | tree[split_j]['r'+str(split_s)]=self.TreeGenerate(D,A,g,h,right_indices,depth+1) 134 | # 当前节点值 135 | tree[split_j]['val']= -(np.sum(g[indices]) / (np.sum(h[indices]) + self.reg_lambda)) 136 | return tree 137 | 138 | """ 139 | 使用MSELoss 140 | 按照陈天奇的xgboost PPT实现 141 | """ 142 | class XGBRegressor: 143 | def __init__(self, reg_lambda=1, gamma=0., max_depth=5, n_estimators=250, eta=.1): 144 | self.reg_lambda=reg_lambda 145 | self.gamma=gamma 146 | self.max_depth=max_depth 147 | self.n_estimators=n_estimators 148 | self.eta=eta 149 | self.mean=None 150 | self.estimators_=[] 151 | 152 | def fit(self,X,y): 153 | self.mean=np.mean(y) 154 | y_pred = np.ones_like(y)*self.mean 155 | loss = MSELoss(y, y_pred) 156 | g, h = loss.g(), loss.h() 157 | for t in range(self.n_estimators): 158 | estimator_t=CART(self.reg_lambda, self.gamma, self.max_depth) 159 | y_target=y-y_pred 160 | estimator_t.fit(X,y_target,g,h) 
161 | self.estimators_.append(estimator_t) 162 | y_pred+=(self.eta*estimator_t.predict(X)) 163 | loss=MSELoss(y,y_pred) 164 | g,h=loss.g(),loss.h() 165 | 166 | def predict(self,X): 167 | y_pred=np.ones((X.shape[0],))*self.mean 168 | for t in range(self.n_estimators): 169 | y_pred+=(self.eta*self.estimators_[t].predict(X)) 170 | return y_pred 171 | 172 | if __name__=='__main__': 173 | breast_data = datasets.load_boston() 174 | X, y = breast_data.data, breast_data.target 175 | 176 | X_train, y_train = X[:400], y[:400] 177 | X_test, y_test = X[400:], y[400:] 178 | 179 | sklearn_decisiontree_reg=tree.DecisionTreeRegressor(min_samples_split=15, min_samples_leaf=5,random_state=False) 180 | sklearn_decisiontree_reg.fit(X_train, y_train) 181 | decisiontree_pred=sklearn_decisiontree_reg.predict(X_test) 182 | print('base estimator:',mean_squared_error(y_test,decisiontree_pred)) 183 | 184 | tinyml_gbdt_reg=XGBRegressor(n_estimators=100,max_depth=3,gamma=0.) 185 | tinyml_gbdt_reg.fit(X_train, y_train) 186 | y_pred=tinyml_gbdt_reg.predict(X_test) 187 | print('tinyml mse:',mean_squared_error(y_test,y_pred)) 188 | 189 | xgb_reg=xgb.sklearn.XGBRegressor(max_depth=3,learning_rate=0.1,n_estimators=100,gamma=0,reg_lambda=1) 190 | xgb_reg.fit(X_train,y_train) 191 | xgb_pred=xgb_reg.predict(X_test) 192 | print('xgb mse:',mean_squared_error(y_test,xgb_pred)) 193 | -------------------------------------------------------------------------------- /tinyml/ensemble/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fengyang95/tiny_ml/6a3ee55e4ee4ddc3dad1e53f511725679a99f083/tinyml/ensemble/__init__.py -------------------------------------------------------------------------------- /tinyml/ensemble/__pycache__/AdaBoostClassifier.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fengyang95/tiny_ml/6a3ee55e4ee4ddc3dad1e53f511725679a99f083/tinyml/ensemble/__pycache__/AdaBoostClassifier.cpython-37.pyc -------------------------------------------------------------------------------- /tinyml/ensemble/__pycache__/GradientBoostingRegressor.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fengyang95/tiny_ml/6a3ee55e4ee4ddc3dad1e53f511725679a99f083/tinyml/ensemble/__pycache__/GradientBoostingRegressor.cpython-37.pyc -------------------------------------------------------------------------------- /tinyml/ensemble/__pycache__/RandomForestRegressor.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fengyang95/tiny_ml/6a3ee55e4ee4ddc3dad1e53f511725679a99f083/tinyml/ensemble/__pycache__/RandomForestRegressor.cpython-37.pyc -------------------------------------------------------------------------------- /tinyml/ensemble/__pycache__/XGBRegressor.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fengyang95/tiny_ml/6a3ee55e4ee4ddc3dad1e53f511725679a99f083/tinyml/ensemble/__pycache__/XGBRegressor.cpython-37.pyc -------------------------------------------------------------------------------- /tinyml/ensemble/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/fengyang95/tiny_ml/6a3ee55e4ee4ddc3dad1e53f511725679a99f083/tinyml/ensemble/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /tinyml/factorization_machine/FMClassifier.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from sklearn import linear_model 3 | from sklearn.datasets import load_breast_cancer 4 | from sklearn.model_selection import train_test_split 5 | from sklearn.preprocessing import MinMaxScaler 6 | import math 7 | np.random.seed(0) 8 | import torch 9 | from torch import nn,optim 10 | class SGDFMClassifier: 11 | class FMClassifier(nn.Module): 12 | def __init__(self,n_features,loss='logistic',degree=2,n_components=2): 13 | super(SGDFMClassifier.FMClassifier,self).__init__() 14 | self.loss=loss 15 | self.degree=degree 16 | self.n_components=n_components 17 | self.linear=nn.Linear(n_features,1) 18 | self.v=nn.Parameter(torch.Tensor(n_features,self.n_components)) 19 | stdev=1./math.sqrt(self.v.size(1)) 20 | self.v.data.uniform_(-stdev,stdev) 21 | self.sigmoid=nn.Sigmoid() 22 | 23 | def forward(self,X): 24 | y=self.linear(X)+0.5*torch.sum(torch.pow(torch.mm(X,self.v),2)- 25 | torch.mm(torch.pow(X,2),torch.pow(self.v,2))) 26 | return self.sigmoid(y) 27 | 28 | def __init__(self,max_iter=100000,learning_rate=0.005): 29 | self.max_iter=max_iter 30 | self.learning_rate=learning_rate 31 | self.criterion=nn.BCELoss() 32 | self.fitted=False 33 | 34 | def fit(self,X,y): 35 | n_feature=X.shape[1] 36 | self.model=self.FMClassifier(n_feature) 37 | self.optimizer=optim.SGD(self.model.parameters(),lr=self.learning_rate) 38 | X=torch.from_numpy(X.astype(np.float32)) 39 | y=torch.from_numpy(y.astype(np.float32)) 40 | for epoch in range(self.max_iter): 41 | y_predict=self.model(X)[:,0] 42 | loss=self.criterion(y_predict,y) 43 | #print('epoch:',epoch,' loss.item():',loss.item()) 44 | self.optimizer.zero_grad() 45 | loss.backward() 46 | self.optimizer.step() 47 | 48 | def predict(self,X): 49 | X = torch.from_numpy(X.astype(np.float32)) 50 | with torch.no_grad(): 51 | y_pred = self.model(X).detach().numpy() 52 | y_pred[y_pred>0.5]=1 53 | y_pred[y_pred<=0.5]=0 54 | return y_pred[:,0] 55 | 56 | if __name__=='__main__': 57 | breast_data = load_breast_cancer() 58 | X, y = breast_data.data[:, :7], breast_data.target 59 | X = MinMaxScaler().fit_transform(X) 60 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3) 61 | 62 | torch_mfclassifier = SGDFMClassifier(20000, 0.001) 63 | torch_mfclassifier.fit(X_train, y_train) 64 | torch_pred = torch_mfclassifier.predict(X_test) 65 | print('torch accuracy:', len(y_test[y_test == torch_pred]) / len(y_test)) -------------------------------------------------------------------------------- /tinyml/factorization_machine/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fengyang95/tiny_ml/6a3ee55e4ee4ddc3dad1e53f511725679a99f083/tinyml/factorization_machine/__init__.py -------------------------------------------------------------------------------- /tinyml/feature_selection/ReliefFeatureSelection.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from sklearn.preprocessing import MinMaxScaler 3 | from sklearn.datasets import load_breast_cancer 4 | import random 5 | 6 | # 处理连续型 7 | class ReliefFeatureSelection: 8 | def 
__init__(self,sample_ratio=0.5,k=5,seed=None): 9 | self.feature_importances_=None 10 | self.k=k 11 | self.sample_ratio=sample_ratio 12 | self.seed=seed 13 | random.seed(self.seed) 14 | 15 | def fit(self,X,y): 16 | m,n=X.shape 17 | self.feature_importances_=np.zeros((n,)) 18 | for t in range(self.k): 19 | indices=random.sample(range(m),int(m*self.sample_ratio)) 20 | subX,suby=X[indices],y[indices] 21 | self.feature_importances_+=self._fit(subX,suby) 22 | self.feature_importances_/=self.k 23 | 24 | 25 | def transform(self,X,k_features): 26 | choosed_indices=np.argsort(self.feature_importances_)[::-1][:k_features] 27 | return X[:,choosed_indices] 28 | 29 | def _fit(self,subX,suby): 30 | label_to_indices = {} 31 | labels = np.unique(suby) 32 | for label in labels: 33 | label_to_indices[label] = list(np.where(suby == label)[0]) 34 | m, n = subX.shape 35 | feature_scores_ = np.zeros((n,)) 36 | for j in range(n): 37 | for i in range(m): 38 | label_i = suby[i] 39 | xi_nhs = (subX[i, j] - subX[label_to_indices[label_i], j]) ** 2 40 | if len(xi_nhs) == 1: 41 | xi_nh = 0 42 | else: 43 | xi_nh = np.sort(xi_nhs)[1] 44 | feature_scores_[j] -= xi_nh 45 | for label in labels: 46 | if label == label_i: 47 | continue 48 | xi_nm = np.sort((subX[i, j] - subX[label_to_indices[label], j]) ** 2)[0] 49 | feature_scores_[j] += (xi_nm * len(label_to_indices[label]) / m) 50 | return feature_scores_ 51 | 52 | 53 | if __name__=='__main__': 54 | breast_data = load_breast_cancer() 55 | subX, suby = breast_data.data, breast_data.target 56 | scaler=MinMaxScaler() 57 | subX=scaler.fit_transform(subX) 58 | reliefF=ReliefFeatureSelection() 59 | reliefF.fit(subX, suby) 60 | print('relief feature_importances:',reliefF.feature_importances_) 61 | print('sorted:',np.argsort(reliefF.feature_importances_)) 62 | 63 | import skrebate.relieff as relieff 64 | skrebate_reliefF=relieff.ReliefF() 65 | skrebate_reliefF.fit(subX, suby) 66 | print('skrebate feature_importances_:',skrebate_reliefF.feature_importances_) 67 | print('sorted:',np.argsort(skrebate_reliefF.feature_importances_)) 68 | 69 | 70 | 71 | 72 | 73 | -------------------------------------------------------------------------------- /tinyml/feature_selection/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fengyang95/tiny_ml/6a3ee55e4ee4ddc3dad1e53f511725679a99f083/tinyml/feature_selection/__init__.py -------------------------------------------------------------------------------- /tinyml/linear_model/LinearRegression.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from sklearn import linear_model 3 | 4 | 5 | class LinearRegression: 6 | def __init__(self): 7 | self.w=None 8 | self.n_features=None 9 | 10 | def fit(self,X,y): 11 | """ 12 | w=(X^TX)^{-1}X^Ty 13 | """ 14 | assert isinstance(X,np.ndarray) and isinstance(y,np.ndarray) 15 | assert X.ndim==2 and y.ndim==1 16 | assert y.shape[0]==X.shape[0] 17 | n_samples = X.shape[0] 18 | self.n_features=X.shape[1] 19 | extra=np.ones((n_samples,)) 20 | X=np.c_[X,extra] 21 | if self.n_features0.5]=1 46 | y_pred[y_pred<=0.5]=0 47 | return y_pred[:,0] 48 | 49 | 50 | class LogisticRegression: 51 | def __init__(self,max_iter=100,use_matrix=True): 52 | self.beta=None 53 | self.n_features=None 54 | self.max_iter=max_iter 55 | self.use_Hessian=use_matrix 56 | 57 | def fit(self,X,y): 58 | n_samples=X.shape[0] 59 | self.n_features=X.shape[1] 60 | extra=np.ones((n_samples,)) 61 | X=np.c_[X,extra] 62 | 
self.beta=np.random.random((X.shape[1],)) 63 | for i in range(self.max_iter): 64 | if self.use_Hessian is not True: 65 | dldbeta=self._dldbeta(X,y,self.beta) 66 | dldldbetadbeta=self._dldldbetadbeta(X,self.beta) 67 | self.beta-=(1./dldldbetadbeta*dldbeta) 68 | else: 69 | dldbeta = self._dldbeta(X, y, self.beta) 70 | dldldbetadbeta = self._dldldbetadbeta_matrix(X, self.beta) 71 | self.beta -= (np.linalg.inv(dldldbetadbeta).dot(dldbeta)) 72 | 73 | 74 | 75 | @staticmethod 76 | def _dldbeta(X,y,beta): 77 | # 《机器学习》 公式 3.30 78 | m=X.shape[0] 79 | sum=np.zeros(X.shape[1],).T 80 | for i in range(m): 81 | sum+=X[i]*(y[i]-np.exp(X[i].dot(beta))/(1+np.exp(X[i].dot(beta)))) 82 | return -sum 83 | 84 | @staticmethod 85 | def _dldldbetadbeta_matrix(X,beta): 86 | m=X.shape[0] 87 | Hessian=np.zeros((X.shape[1],X.shape[1])) 88 | for i in range(m): 89 | p1 = np.exp(X[i].dot(beta)) / (1 + np.exp(X[i].dot(beta))) 90 | tmp=X[i].reshape((-1,1)) 91 | Hessian+=tmp.dot(tmp.T)*p1*(1-p1) 92 | return Hessian 93 | 94 | @staticmethod 95 | def _dldldbetadbeta(X,beta): 96 | # 《机器学习》公式 3.31 97 | m=X.shape[0] 98 | sum=0. 99 | for i in range(m): 100 | p1=np.exp(X[i].dot(beta))/(1+np.exp(X[i].dot(beta))) 101 | sum+=X[i].dot(X[i].T)*p1*(1-p1) 102 | return sum 103 | 104 | def predict_proba(self,X): 105 | n_samples = X.shape[0] 106 | extra = np.ones((n_samples,)) 107 | X = np.c_[X, extra] 108 | if self.beta is None: 109 | raise RuntimeError('cant predict before fit') 110 | p1 = np.exp(X.dot(self.beta)) / (1 + np.exp(X.dot(self.beta))) 111 | p0 = 1 - p1 112 | return np.c_[p0,p1] 113 | 114 | def predict(self,X): 115 | p=self.predict_proba(X) 116 | res=np.argmax(p,axis=1) 117 | return res 118 | 119 | 120 | if __name__=='__main__': 121 | breast_data = load_breast_cancer() 122 | X, y = breast_data.data[:,:7], breast_data.target 123 | X = MinMaxScaler().fit_transform(X) 124 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3) 125 | tinyml_logisticreg = LogisticRegression(max_iter=100,use_matrix=True) 126 | tinyml_logisticreg.fit(X_train, y_train) 127 | lda_prob = tinyml_logisticreg.predict_proba(X_test) 128 | 129 | 130 | lda_pred = tinyml_logisticreg.predict(X_test) 131 | # print('tinyml logistic_prob:', lda_prob) 132 | # print('tinyml logistic_pred:', lda_pred) 133 | print('tinyml accuracy:', len(y_test[y_test == lda_pred]) * 1. / len(y_test)) 134 | 135 | sklearn_logsticreg = linear_model.LogisticRegression(max_iter=100,solver='newton-cg') 136 | sklearn_logsticreg.fit(X_train, y_train) 137 | sklearn_prob = sklearn_logsticreg.predict_proba(X_test) 138 | sklearn_pred = sklearn_logsticreg.predict(X_test) 139 | # print('sklearn prob:',sklearn_prob) 140 | # print('sklearn pred:',sklearn_pred) 141 | print('sklearn accuracy:', len(y_test[y_test == sklearn_pred]) * 1. 
/ len(y_test)) 142 | 143 | torch_sgd_logisticreg=SGDLogisticRegression(100000,0.01) 144 | torch_sgd_logisticreg.fit(X_train,y_train) 145 | torch_pred=torch_sgd_logisticreg.predict(X_test) 146 | print('torch accuracy:',len(y_test[y_test==torch_pred])/len(y_test)) 147 | 148 | # expected output 149 | """ 150 | tinyml accuracy: 0.9590643274853801 151 | sklearn accuracy: 0.9298245614035088 152 | torch accuracy: 0.9532163742690059 153 | """ 154 | 155 | 156 | 157 | 158 | 159 | 160 | 161 | -------------------------------------------------------------------------------- /tinyml/linear_model/SGDRegressor.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from sklearn import linear_model 3 | 4 | # 采用MSE作为损失函数 5 | # penalty = 'l2' 则为 Ridge Regression 6 | # penalty = 'l1' 则为 Lasso Regression 7 | # penalty = 'l1l2' 则为 Elastic Net 8 | # alpha 为 正则化系数 9 | 10 | # https://wwdguu.github.io/2018/09/01/%C2%96HOMLWSLATF-ch4/ 11 | np.random.seed(1) 12 | class SGDRegressor: 13 | def __init__(self,max_iter=100,penalty=None,alpha=1e-3,l1_ratio=0.5): 14 | self.w = None 15 | self.n_features = None 16 | self.penalty=penalty 17 | self.alpha=alpha 18 | self.l1_ratio=l1_ratio 19 | self.max_iter=max_iter 20 | 21 | # 22 | def fit(self, X, y): 23 | assert isinstance(X, np.ndarray) and isinstance(y, np.ndarray) 24 | assert y.shape[0] == X.shape[0] 25 | n_samples = X.shape[0] 26 | self.n_features = X.shape[1] 27 | extra = np.ones((n_samples,1)) 28 | X = np.c_[X,extra] 29 | self.w=np.random.randn(X.shape[1],1) 30 | for iter in range(self.max_iter): 31 | for i in range(n_samples): 32 | sample_index=np.random.randint(n_samples) 33 | x_sample=X[sample_index:sample_index+1] 34 | y_sample=y[sample_index:sample_index+1] 35 | lr=SGDRegressor.learning_schedule(iter*n_samples+i) 36 | # 求导 37 | grad=2*x_sample.T.dot(x_sample.dot(self.w)-y_sample) 38 | if self.penalty is not None: 39 | # Ridge 40 | if self.penalty=='l2': 41 | grad+=self.alpha*self.w 42 | # Lasso 43 | elif self.penalty=='l1': 44 | grad+=self.alpha*np.sign(self.w) 45 | # Elastic Net 46 | elif self.penalty=='l1l2': 47 | grad+=(self.alpha*self.l1_ratio*np.sign(self.w)+ 48 | (1-self.l1_ratio)*self.alpha*self.w) 49 | 50 | self.w=self.w-lr*grad 51 | 52 | 53 | def predict(self, X): 54 | 55 | n_samples = X.shape[0] 56 | extra = np.ones((n_samples,1)) 57 | X = np.c_[X,extra] 58 | if self.w is None: 59 | raise RuntimeError('cant predict before fit') 60 | y_ = X.dot(self.w) 61 | return y_ 62 | 63 | @staticmethod 64 | def learning_schedule(t): 65 | return 5 / (t + 50) 66 | 67 | 68 | if __name__ == '__main__': 69 | X = 2 * np.random.rand(100,1) 70 | y = 4 + 3 * X + np.random.randn(100,1) 71 | y=y.ravel() 72 | print(X.shape) 73 | print(y.shape) 74 | lr = SGDRegressor(max_iter=200,penalty='l1l2',alpha=1e-3,l1_ratio=0.5) 75 | lr.fit(X, y) 76 | print('w:',lr.w) 77 | 78 | sklearn_lr = linear_model.SGDRegressor(max_iter=200,penalty='l1',alpha=1e-3) 79 | sklearn_lr.fit(X, y) 80 | print(sklearn_lr.coef_) 81 | print(sklearn_lr.intercept_) 82 | 83 | -------------------------------------------------------------------------------- /tinyml/linear_model/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fengyang95/tiny_ml/6a3ee55e4ee4ddc3dad1e53f511725679a99f083/tinyml/linear_model/__init__.py -------------------------------------------------------------------------------- /tinyml/linear_model/__pycache__/LinearRegression.cpython-37.pyc: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/fengyang95/tiny_ml/6a3ee55e4ee4ddc3dad1e53f511725679a99f083/tinyml/linear_model/__pycache__/LinearRegression.cpython-37.pyc -------------------------------------------------------------------------------- /tinyml/linear_model/__pycache__/LogisticRegression.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fengyang95/tiny_ml/6a3ee55e4ee4ddc3dad1e53f511725679a99f083/tinyml/linear_model/__pycache__/LogisticRegression.cpython-37.pyc -------------------------------------------------------------------------------- /tinyml/linear_model/__pycache__/SGDRegressor.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fengyang95/tiny_ml/6a3ee55e4ee4ddc3dad1e53f511725679a99f083/tinyml/linear_model/__pycache__/SGDRegressor.cpython-37.pyc -------------------------------------------------------------------------------- /tinyml/linear_model/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fengyang95/tiny_ml/6a3ee55e4ee4ddc3dad1e53f511725679a99f083/tinyml/linear_model/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /tinyml/metrices/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fengyang95/tiny_ml/6a3ee55e4ee4ddc3dad1e53f511725679a99f083/tinyml/metrices/__init__.py -------------------------------------------------------------------------------- /tinyml/metrices/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fengyang95/tiny_ml/6a3ee55e4ee4ddc3dad1e53f511725679a99f083/tinyml/metrices/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /tinyml/metrices/__pycache__/curves.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fengyang95/tiny_ml/6a3ee55e4ee4ddc3dad1e53f511725679a99f083/tinyml/metrices/__pycache__/curves.cpython-37.pyc -------------------------------------------------------------------------------- /tinyml/metrices/curves.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | def precision_recall_curve(y_true,pred_prob): 3 | probs=sorted(list(pred_prob),reverse=True) 4 | Rs=[] 5 | Ps=[] 6 | for i in range(1,len(probs)): 7 | thresh=probs[i] 8 | preds_p=np.where(pred_prob>=thresh)[0] 9 | preds_n=np.where(pred_prob=thresh)[0] 28 | preds_n = np.where(pred_prob 0 and y[i]*u[i]>=1: 49 | return False 50 | if (self.alpha[i]==0 or self.alpha[i]==self.C) and y[i]*u[i]==1: 51 | return False 52 | return True 53 | 54 | 55 | def fit(self,X,y): 56 | self.X=X 57 | self.y=y 58 | self.K=self.computeK(X,self.kernel) 59 | self.alpha=np.random.random((X.shape[0],)) 60 | self.omiga=np.zeros((X.shape[0],)) 61 | 62 | for _ in range(self.max_iter): 63 | u = self.compute_u(X, y) 64 | finish=True 65 | for i in range(X.shape[0]): 66 | if not self.checkKKT(u,y,i): 67 | finish=False 68 | y_indices=np.delete(np.arange(X.shape[0]),i) 69 | j=y_indices[int(np.random.random()*len(y_indices))] 70 | 
--------------------------------------------------------------------------------
/tinyml/svm/SVC.py:
--------------------------------------------------------------------------------
import numpy as np

class SVC:
    """
    Soft-margin SVM classifier trained with a simplified SMO
    (see 《统计学习方法》 ch.7 / 《机器学习》 ch.6).
    NOTE: the top of this file was lost in extraction; __init__ and the
    kernel/compute_u helpers are reconstructed from how the surviving
    methods call them, so treat them as a faithful sketch, not verbatim source.
    """
    def __init__(self, max_iter=100, kernel='linear', C=1.0):
        self.max_iter = max_iter
        self.kernel = kernel
        self.C = C
        self.b = 0.
        self.alpha = None
        self.K = None
        self.X = None
        self.y = None

    def kernel_func(self, kernel, x1, x2):
        if kernel == 'linear':
            return x1.dot(x2)
        elif kernel == 'rbf':
            # bandwidth fixed to 1 here; the original setting is unknown
            return np.exp(-np.sum((x1 - x2) ** 2) / 2.)

    def computeK(self, X, kernel):
        # Gram matrix K[i, j] = k(x_i, x_j)
        K = np.zeros((X.shape[0], X.shape[0]))
        for i in range(X.shape[0]):
            for j in range(i, X.shape[0]):
                K[i, j] = self.kernel_func(kernel, X[i], X[j])
                K[j, i] = K[i, j]
        return K

    def compute_u(self, X, y):
        # decision values u_i = sum_j alpha_j y_j K(x_j, x_i) + b
        return (self.alpha * y).dot(self.K) + self.b

    def checkKKT(self, u, y, i):
        # KKT conditions for the soft-margin dual (this check was garbled in
        # extraction and is rewritten in the standard form):
        #   alpha == 0     =>  y*u >= 1
        #   0 < alpha < C  =>  y*u == 1
        #   alpha == C     =>  y*u <= 1
        if self.alpha[i] == 0 and y[i] * u[i] >= 1:
            return True
        if 0 < self.alpha[i] < self.C and y[i] * u[i] == 1:
            return True
        if self.alpha[i] == self.C and y[i] * u[i] <= 1:
            return True
        return False

    def fit(self, X, y):
        self.X = X
        self.y = y
        self.K = self.computeK(X, self.kernel)
        self.alpha = np.random.random((X.shape[0],))
        self.omiga = np.zeros((X.shape[0],))

        for _ in range(self.max_iter):
            u = self.compute_u(X, y)
            finish = True
            for i in range(X.shape[0]):
                if not self.checkKKT(u, y, i):
                    finish = False
                    # pick the second variable j at random among the others
                    y_indices = np.delete(np.arange(X.shape[0]), i)
                    j = y_indices[int(np.random.random() * len(y_indices))]
                    E_i = np.sum(self.alpha * y * self.K[:, i]) + self.b - y[i]
                    E_j = np.sum(self.alpha * y * self.K[:, j]) + self.b - y[j]
                    if y[i] != y[j]:
                        L = max(0, self.alpha[j] - self.alpha[i])
                        H = min(self.C, self.C + self.alpha[j] - self.alpha[i])
                    else:
                        L = max(0, self.alpha[j] + self.alpha[i] - self.C)
                        H = min(self.C, self.alpha[j] + self.alpha[i])
                    eta = self.K[i, i] + self.K[j, j] - 2 * self.K[i, j]
                    if eta == 0:  # degenerate pair; skip to avoid dividing by zero
                        continue
                    alpha2_new_unc = self.alpha[j] + y[j] * (E_i - E_j) / eta
                    alpha2_old = self.alpha[j]
                    alpha1_old = self.alpha[i]
                    # clip alpha_j to the feasible box [L, H]
                    if alpha2_new_unc > H:
                        self.alpha[j] = H
                    elif alpha2_new_unc < L:
                        self.alpha[j] = L
                    else:
                        self.alpha[j] = alpha2_new_unc
                    self.alpha[i] = alpha1_old + y[i] * y[j] * (alpha2_old - self.alpha[j])
                    # b update (《统计学习方法》 eqs. 7.115-7.116); this span was
                    # also lost in extraction and is reconstructed
                    b1_new = -E_i - y[i] * self.K[i, i] * (self.alpha[i] - alpha1_old) \
                             - y[j] * self.K[j, i] * (self.alpha[j] - alpha2_old) + self.b
                    b2_new = -E_j - y[i] * self.K[i, j] * (self.alpha[i] - alpha1_old) \
                             - y[j] * self.K[j, j] * (self.alpha[j] - alpha2_old) + self.b
                    if 0 < self.alpha[i] < self.C:
                        self.b = b1_new
                    elif 0 < self.alpha[j] < self.C:
                        self.b = b2_new
                    else:
                        self.b = (b1_new + b2_new) / 2
            if finish:
                break

    def predict(self, X):
        y_preds = []
        for i in range(X.shape[0]):
            support_indices = np.where(self.alpha > 0)[0]
            K = np.zeros((self.X.shape[0],))
            for j in support_indices:
                K[j] = self.kernel_func(self.kernel, self.X[j], X[i])
            y_pred = np.sum(self.y[support_indices] * self.alpha[support_indices] * K[support_indices].T)
            y_pred += self.b
            y_preds.append(y_pred)
        return np.array(y_preds)


if __name__ == '__main__':

    # test with the linear kernel
    X = np.array([[2, -1], [3, -2], [1, 0], [0, 1], [-2, 1], [-1.3, 0.3],
                  [-0.2, -0.8], [2.3, -3.3], [-2, -4], [7, 8]])
    y = np.array([1, 1, 1, 1, -1, -1, -1, -1, -1, 1])
    svc = SVC(max_iter=100, kernel='linear', C=1)

    """
    # test with the rbf kernel
    X = np.array([[1, 0], [-1, 0], [0, -1], [0, 1], [2, np.sqrt(5)], [2, -np.sqrt(5)],
                  [-2, np.sqrt(5)], [-2, -np.sqrt(5)], [300, 400]])
    y = np.array([-1, -1, -1, -1, 1, 1, 1, 1, 1])
    svc = SVC(max_iter=100, kernel='rbf', C=1)
    """
    svc.fit(X, y)
    print('alpha:', svc.alpha)
    print('b:', svc.b)
    pred_y = svc.predict(np.array([[1, 0], [-0.2, -0.1], [0, 1]]))
    print('pred_y1:', pred_y)
    pred_y = np.sign(pred_y)
    print('pred_y:', pred_y)
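
As a sanity check on the SMO result, the same toy problem can be run through sklearn's SVC (already a dependency of this repo); on clearly separated points the sign predictions should agree (a sketch, not part of SVC.py):

import numpy as np
from sklearn import svm

X = np.array([[2, -1], [3, -2], [1, 0], [0, 1], [-2, 1], [-1.3, 0.3],
              [-0.2, -0.8], [2.3, -3.3], [-2, -4], [7, 8]])
y = np.array([1, 1, 1, 1, -1, -1, -1, -1, -1, 1])

sk_svc = svm.SVC(kernel='linear', C=1)
sk_svc.fit(X, y)
print(sk_svc.predict(np.array([[1, 0], [-0.2, -0.1], [0, 1]])))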
--------------------------------------------------------------------------------
/tinyml/svm/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fengyang95/tiny_ml/6a3ee55e4ee4ddc3dad1e53f511725679a99f083/tinyml/svm/__init__.py

--------------------------------------------------------------------------------
/tinyml/tree/DecisionTreeClassifier.py:
--------------------------------------------------------------------------------
import numpy as np
"""
A simple decision tree, following the algorithm in 周志华《机器学习》.
Handles discrete attributes only; missing values are not considered.
"""
from tinyml.tree.treePlotter import createPlot

np.random.seed(100)

class DecisionTreeClassifier:
    """
    Decision-tree classifier.
    """
    def __init__(self, tree_type='ID3', k_classes=2):
        self.tree_type = tree_type
        self.k_classes = k_classes
        if tree_type == 'ID3':
            self.gain_func = self.Gain
        elif tree_type == 'CART':
            self.gain_func = self.GiniIndex
        elif tree_type == 'C45':
            self.gain_func = self.GainRatio
        else:
            raise ValueError('must be ID3 or CART or C45')
        self.tree = None

    def fit(self, X, y):
        D = {}
        D['X'] = X
        D['y'] = y
        A = np.arange(X.shape[1])
        # aVs caches the full value set of every attribute, so branches can
        # still be created for values unseen in a subset
        aVs = {}
        for a in A:
            aVs[a] = np.unique(X[:, a])
        self.tree = self.TreeGenerate(D, A, aVs)

    def predict(self, X):
        if self.tree is None:
            raise RuntimeError('cant predict before fit')
        y_pred = []
        for i in range(X.shape[0]):
            tree = self.tree
            x = X[i]
            while True:
                if not isinstance(tree, dict):
                    y_pred.append(tree)
                    break
                a = list(tree.keys())[0]
                tree = tree[a]
                if isinstance(tree, dict):
                    val = x[a]
                    tree = tree[val]
                else:
                    y_pred.append(tree)
                    break
        return np.array(y_pred)
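    # The learned tree is a plain nested dict: an internal node is
    # {attribute_index: {attribute_value: subtree_or_leaf, ...}} and a leaf is
    # a class label. For example (illustrative values, not real output):
    #   {0: {0: 1, 1: {2: {0: 1, 1: 0}}, 2: 0}}
    # reads as: split on attribute 0; value 0 -> class 1, value 2 -> class 0,
    # value 1 -> split further on attribute 2.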
    # Fig. 4.2 on p.74 of 《机器学习》: the basic tree-growing procedure
    def TreeGenerate(self, D, A, aVs):
        X = D['X']
        y = D['y']
        # case 1: all samples belong to a single class
        unique_classes = np.unique(y)
        if len(unique_classes) == 1:
            return unique_classes[0]
        flag = True
        for a in A:
            if len(np.unique(X[:, a])) > 1:
                flag = False
                break
        # case 2: samples are identical on every remaining attribute,
        # so return the majority class
        if flag:
            return np.argmax(np.bincount(y))

        gains = np.zeros((len(A),))
        if self.tree_type == 'C45':
            gains = np.zeros((len(A), 2))
        for i in range(len(A)):
            gains[i] = self.gain_func(D, A[i])
        subA = None
        if self.tree_type == 'CART':
            # Gini index: smaller is better
            a_best = A[np.argmin(gains)]
            subA = np.delete(A, np.argmin(gains))
        elif self.tree_type == 'ID3':
            # information gain: larger is better
            a_best = A[np.argmax(gains)]
            subA = np.delete(A, np.argmax(gains))
        elif self.tree_type == 'C45':
            # C4.5 heuristic: among attributes with above-average gain,
            # pick the one with the highest gain ratio
            gain_mean = np.mean(gains[:, 0])
            higher_than_mean_indices = np.where(gains[:, 0] >= gain_mean)
            higher_than_mean = gains[higher_than_mean_indices, 1][0]
            index = higher_than_mean_indices[0][np.argmax(higher_than_mean)]
            a_best = A[index]
            subA = np.delete(A, index)

        tree = {a_best: {}}

        for av in aVs[a_best]:
            indices = np.where(X[:, a_best] == av)
            Dv = {}
            Dv['X'] = X[indices]
            Dv['y'] = y[indices]
            if len(Dv['y']) == 0:
                tree[a_best][av] = np.argmax(np.bincount(y))
            else:
                tree[a_best][av] = self.TreeGenerate(Dv, subA, aVs)
        return tree

    @classmethod
    def Ent(cls, D):
        """
        Eq. (4.1) in 《机器学习》: information entropy.
        :param D: dataset
        :return: entropy of the labels in D
        """
        y = D['y']
        bin_count = np.bincount(y)
        total = len(y)
        ent = 0.
        for k in range(len(bin_count)):
            p_k = bin_count[k] / total
            if p_k != 0:
                ent += p_k * np.log2(p_k)
        return -ent

    @classmethod
    def Gain(cls, D, a):
        """
        Eq. (4.2) in 《机器学习》: information gain.
        a is the column index of the attribute.
        """
        X = D['X']
        y = D['y']
        aV = np.unique(X[:, a])
        sum = 0.
        for v in range(len(aV)):
            Dv = {}
            indices = np.where(X[:, a] == aV[v])
            Dv['X'] = X[indices]
            Dv['y'] = y[indices]
            ent = cls.Ent(Dv)
            sum += (len(Dv['y']) / len(y) * ent)
        gain = cls.Ent(D) - sum
        return gain

    @classmethod
    def Gini(cls, D):
        """
        Eq. (4.5) in 《机器学习》: Gini impurity.
        """
        y = D['y']
        bin_count = np.bincount(y)
        total = len(y)
        ent = 0.
        for k in range(len(bin_count)):
            p_k = bin_count[k] / total
            ent += p_k ** 2
        return 1 - ent

    @classmethod
    def GiniIndex(cls, D, a):
        """
        Eq. (4.6) in 《机器学习》: Gini index of attribute a.
        """
        X = D['X']
        y = D['y']
        aV = np.unique(X[:, a])
        sum = 0.
        for v in range(len(aV)):
            Dv = {}
            indices = np.where(X[:, a] == aV[v])
            Dv['X'] = X[indices]
            Dv['y'] = y[indices]
            ent = cls.Gini(Dv)
            sum += (len(Dv['y']) / len(y) * ent)
        gain = sum
        return gain

    @classmethod
    def GainRatio(cls, D, a):
        """
        Eqs. (4.3)-(4.4) in 《机器学习》: gain ratio.
        Returns [gain, gain_ratio] so C4.5 can use both.
        """
        X = D['X']
        y = D['y']
        aV = np.unique(X[:, a])
        sum = 0.
        intrinsic_value = 0.
        for v in range(len(aV)):
            Dv = {}
            indices = np.where(X[:, a] == aV[v])
            Dv['X'] = X[indices]
            Dv['y'] = y[indices]
            ent = cls.Ent(Dv)
            sum += (len(Dv['y']) / len(y) * ent)
            intrinsic_value += (len(Dv['y']) / len(y)) * np.log2(len(Dv['y']) / len(y))
        gain = cls.Ent(D) - sum
        intrinsic_value = -intrinsic_value
        gain_ratio = gain / intrinsic_value
        return np.array([gain, gain_ratio])


if __name__ == '__main__':
    # watermelon dataset 2.0 from 《机器学习》, attributes label-encoded
    watermelon_data = np.array([[0, 0, 0, 0, 0, 0], [1, 0, 1, 0, 0, 0],
                                [1, 0, 0, 0, 0, 0], [0, 0, 1, 0, 0, 0],
                                [2, 0, 0, 0, 0, 0], [0, 1, 0, 0, 1, 1],
                                [1, 1, 0, 1, 1, 1], [1, 1, 0, 0, 1, 0],
                                [1, 1, 1, 1, 1, 0], [0, 2, 2, 0, 2, 1],
                                [2, 2, 2, 2, 2, 0], [2, 0, 0, 2, 2, 1],
                                [0, 1, 0, 1, 0, 0], [2, 1, 1, 1, 0, 0],
                                [1, 1, 0, 0, 1, 1], [2, 0, 0, 2, 2, 0],
                                [0, 0, 1, 1, 1, 0]])
    label = np.array([1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0])
    X_test = np.array([[0, 0, 1, 0, 0, 0], [1, 0, 1, 0, 0, 0],
                       [1, 1, 0, 1, 1, 0], [1, 0, 1, 1, 1, 0],
                       [1, 1, 0, 0, 1, 1], [2, 0, 0, 2, 2, 0],
                       [0, 0, 1, 1, 1, 0]])

    decision_clf = DecisionTreeClassifier(tree_type='ID3')
    decision_clf.fit(watermelon_data, label)
    print(decision_clf.tree)
    createPlot(decision_clf.tree)

    y_pred = decision_clf.predict(X_test)
    print('y_pred:', y_pred)
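
For intuition about what Ent and Gain return: the watermelon labels above have 8 positives and 9 negatives, so the root entropy is -(8/17)log2(8/17) - (9/17)log2(9/17) ≈ 0.998, and Gain subtracts the weighted child entropies from it. A direct check with the class helpers (toy snippet, not part of the repo; the dummy X column is only there because Ent reads D['y']):

import numpy as np
from tinyml.tree.DecisionTreeClassifier import DecisionTreeClassifier

label = np.array([1] * 8 + [0] * 9)
D = {'X': np.zeros((17, 1), dtype=int), 'y': label}
print(DecisionTreeClassifier.Ent(D))  # ~0.998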
--------------------------------------------------------------------------------
/tinyml/tree/DecisionTreeRegressor.py:
--------------------------------------------------------------------------------
import numpy as np
from tinyml.tree import treePlotter
import sklearn.datasets as datasets
from sklearn.metrics import mean_squared_error
import sklearn.tree as tree
import graphviz

class DecisionTreeRegressor:
    """
    Least-squares regression tree, 《统计学习方法》 p.69.
    """
    def __init__(self, min_samples_split=3, min_samples_leaf=1, random_state=False):
        self.min_samples_split = min_samples_split
        self.min_samples_leaf = min_samples_leaf
        self.random = random_state
        self.tree = None

    def fit(self, X, y):
        D = {}
        D['X'] = X
        D['y'] = y
        A = np.arange(X.shape[1])
        self.tree = self.TreeGenerate(D, A)

    def predict(self, X):
        if self.tree is None:
            raise RuntimeError('cant predict before fit')
        y_pred = []
        for i in range(X.shape[0]):
            tree = self.tree
            x = X[i]
            while True:
                if not isinstance(tree, dict):
                    y_pred.append(tree)
                    break
                a = list(tree.keys())[0]
                tree = tree[a]
                if isinstance(tree, dict):
                    val = x[a]
                    # keys look like 'l<split>' / 'r<split>'; strip the prefix
                    split_val = float(list(tree.keys())[0][1:])
                    if val <= split_val:
                        tree = tree[list(tree.keys())[0]]
                    else:
                        tree = tree[list(tree.keys())[1]]
                else:
                    y_pred.append(tree)
                    break
        return np.array(y_pred)

    def TreeGenerate(self, D, A):
        X = D['X']
        y = D['y']
        if len(y) <= self.min_samples_split:
            return np.mean(y)
        split_j = None
        split_s = None
        min_val = 1.e10
        select_A = A
        if self.random is True:
            # random-forest style: consider a random half of the attributes
            d = len(A)
            select_A = np.random.choice(A, size=int(d // 2), replace=False)
        for j in select_A:
            for s in np.unique(X[:, j]):
                left_indices = np.where(X[:, j] <= s)[0]
                right_indices = np.where(X[:, j] > s)[0]
                # (the comparisons and split-selection below were lost in
                # extraction; reconstructed from eq. 5.21 of the book)
                if len(left_indices) < self.min_samples_leaf or \
                        len(right_indices) < self.min_samples_leaf:
                    continue
                c1 = np.mean(y[left_indices])
                c2 = np.mean(y[right_indices])
                loss = np.sum((y[left_indices] - c1) ** 2) + \
                       np.sum((y[right_indices] - c2) ** 2)
                if loss < min_val:
                    min_val = loss
                    split_j = j
                    split_s = s
        if split_j is None:
            return np.mean(y)
        tree = {split_j: {}}
        left_indices = np.where(X[:, split_j] <= split_s)[0]
        right_indices = np.where(X[:, split_j] > split_s)[0]
        D_left, D_right = {}, {}
        D_left['X'], D_left['y'] = X[left_indices], y[left_indices]
        D_right['X'], D_right['y'] = X[right_indices], y[right_indices]
        tree[split_j]['l' + str(split_s)] = self.TreeGenerate(D_left, A)
        tree[split_j]['r' + str(split_s)] = self.TreeGenerate(D_right, A)
        # value of the current node
        tree[split_j]['val'] = np.mean(y)
        return tree


if __name__ == '__main__':
    boston_data = datasets.load_boston()
    X, y = boston_data.data, boston_data.target
    X_train, y_train = X[:200], y[:200]
    X_test, y_test = X[200:], y[200:]

    decisiontree_reg = DecisionTreeRegressor(min_samples_split=20, min_samples_leaf=5)
    decisiontree_reg.fit(X_train, y_train)
    print(decisiontree_reg.tree)
    treePlotter.createPlot(decisiontree_reg.tree)
    y_pred = decisiontree_reg.predict(X_test)
    print('tinyml mse:', mean_squared_error(y_test, y_pred))

    sklearn_reg = tree.DecisionTreeRegressor(min_samples_split=20, min_samples_leaf=5, random_state=False)
    sklearn_reg.fit(X_train, y_train)
    print(sklearn_reg.feature_importances_)
    sklearn_pred = sklearn_reg.predict(X_test)
    print('sklearn mse:', mean_squared_error(y_test, sklearn_pred))
    dot_data = tree.export_graphviz(sklearn_reg, out_file=None)
    graph = graphviz.Source(dot_data)
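
The regressor reuses the nested-dict tree format, but with split keys prefixed 'l'/'r' plus the threshold (which predict parses via float(key[1:])) and a 'val' entry holding the node mean. A tiny end-to-end run (illustrative; the exact tree printed depends on the split search):

import numpy as np
from tinyml.tree.DecisionTreeRegressor import DecisionTreeRegressor

X = np.array([[1.], [2.], [3.], [10.], [11.], [12.]])
y = np.array([1., 1., 1., 10., 10., 10.])
reg = DecisionTreeRegressor(min_samples_split=2, min_samples_leaf=1)
reg.fit(X, y)
print(reg.tree)   # e.g. {0: {'l3.0': ..., 'r3.0': ..., 'val': 5.5}}
print(reg.predict(np.array([[2.5], [11.5]])))  # -> [1.0, 10.0]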
--------------------------------------------------------------------------------
/tinyml/tree/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fengyang95/tiny_ml/6a3ee55e4ee4ddc3dad1e53f511725679a99f083/tinyml/tree/__init__.py

--------------------------------------------------------------------------------
/tinyml/tree/treePlotter.py:
--------------------------------------------------------------------------------
import matplotlib.pyplot as plt
from pylab import mpl

"""
Decision-tree plotting code from 《Machine Learning in Action》.
"""

mpl.rcParams['font.sans-serif'] = ['FangSong']  # default font, so CJK labels render
mpl.rcParams['axes.unicode_minus'] = False  # keep minus signs from showing as boxes
decisionNode = dict(boxstyle="sawtooth", fc="0.8")
leafNode = dict(boxstyle="round4", fc="0.8")
arrow_args = dict(arrowstyle="<-")

def plotNode(nodeTxt, centerPt, parentPt, nodeType):
    createPlot.ax1.annotate(nodeTxt, xy=parentPt, xycoords='axes fraction',
                            xytext=centerPt, textcoords='axes fraction',
                            va="center", ha="center", bbox=nodeType, arrowprops=arrow_args)

def getNumLeafs(myTree):
    numLeafs = 0
    firstStr = list(myTree.keys())[0]
    secondDict = myTree[firstStr]
    for key in secondDict.keys():
        if type(secondDict[key]).__name__ == 'dict':
            numLeafs += getNumLeafs(secondDict[key])
        else:
            numLeafs += 1
    return numLeafs

def getTreeDepth(myTree):
    maxDepth = 0
    firstStr = list(myTree.keys())[0]
    secondDict = myTree[firstStr]
    for key in secondDict.keys():
        if type(secondDict[key]).__name__ == 'dict':
            thisDepth = getTreeDepth(secondDict[key]) + 1
        else:
            thisDepth = 1
        if thisDepth > maxDepth:
            maxDepth = thisDepth
    return maxDepth

def plotMidText(cntrPt, parentPt, txtString):
    xMid = (parentPt[0] - cntrPt[0]) / 2.0 + cntrPt[0]
    yMid = (parentPt[1] - cntrPt[1]) / 2.0 + cntrPt[1]
    createPlot.ax1.text(xMid, yMid, txtString)

def plotTree(myTree, parentPt, nodeTxt):
    numLeafs = getNumLeafs(myTree)
    depth = getTreeDepth(myTree)
    firstStr = list(myTree.keys())[0]
    cntrPt = (plotTree.xOff + (1.0 + float(numLeafs)) / 2.0 / plotTree.totalw, plotTree.yOff)
    plotMidText(cntrPt, parentPt, nodeTxt)
    plotNode(firstStr, cntrPt, parentPt, decisionNode)
    secondDict = myTree[firstStr]
    plotTree.yOff = plotTree.yOff - 1.0 / plotTree.totalD
    for key in secondDict.keys():
        if type(secondDict[key]).__name__ == 'dict':
            plotTree(secondDict[key], cntrPt, str(key))
        else:
            plotTree.xOff = plotTree.xOff + 1.0 / plotTree.totalw
            plotNode(secondDict[key], (plotTree.xOff, plotTree.yOff), cntrPt, leafNode)
            plotMidText((plotTree.xOff, plotTree.yOff), cntrPt, str(key))
    plotTree.yOff = plotTree.yOff + 1.0 / plotTree.totalD

def createPlot(inTree):
    fig = plt.figure(1, facecolor='white')
    fig.clf()
    axprops = dict(xticks=[], yticks=[])
    createPlot.ax1 = plt.subplot(111, frameon=False, **axprops)
    plotTree.totalw = float(getNumLeafs(inTree))
    plotTree.totalD = float(getTreeDepth(inTree))
    plotTree.xOff = -0.5 / plotTree.totalw
    plotTree.yOff = 1.0
    plotTree(inTree, (0.5, 1.0), '')
    plt.show()

--------------------------------------------------------------------------------
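
createPlot only needs a nested dict in the shape the tree classes produce, so it can be smoke-tested without fitting anything (the toy tree below is made up):

from tinyml.tree.treePlotter import createPlot

toy_tree = {0: {0: 1, 1: {2: {0: 1, 1: 0}}, 2: 0}}
createPlot(toy_tree)  # opens a matplotlib window: one root, one inner node, four leaves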