├── .idea
│   ├── deployment.xml
│   ├── encodings.xml
│   ├── inspectionProfiles
│   │   └── Project_Default.xml
│   ├── misc.xml
│   ├── modules.xml
│   ├── tiny_ml.iml
│   ├── vcs.xml
│   └── workspace.xml
├── README.md
├── notes
│   └── linear_model
│       ├── linear_reg_closed_form.pdf
│       └── logistic_regression.pdf
├── requirements.txt
└── tinyml
    ├── __init__.py
    ├── __pycache__
    │   ├── __init__.cpython-36.pyc
    │   └── __init__.cpython-37.pyc
    ├── bayes
    │   ├── NaiveBayesClassifier.py
    │   ├── __init__.py
    │   └── __pycache__
    │       ├── NaiveBayesClassifier.cpython-37.pyc
    │       └── __init__.cpython-37.pyc
    ├── cluster
    │   ├── AGNES.py
    │   ├── DBSCAN.py
    │   ├── GaussianMixture.py
    │   ├── KMeans.py
    │   ├── LVQ.py
    │   ├── __init__.py
    │   └── __pycache__
    │       ├── AGNES.cpython-37.pyc
    │       ├── DBSCAN.cpython-37.pyc
    │       ├── GaussianMixture.cpython-37.pyc
    │       ├── KMeans.cpython-37.pyc
    │       ├── LVQ.cpython-37.pyc
    │       └── __init__.cpython-37.pyc
    ├── compare
    │   ├── __init__.py
    │   ├── cluster_result
    │   │   ├── sklearn_AGNES.jpg
    │   │   ├── sklearn_DBSCAN.jpg
    │   │   ├── sklearn_GMM.jpg
    │   │   ├── sklearn_KMeans.jpg
    │   │   ├── tinyml_AGNES.jpg
    │   │   ├── tinyml_DBSCAN.jpg
    │   │   ├── tinyml_GMM.jpg
    │   │   └── tinyml_KMeans.jpg
    │   ├── compare_classification.py
    │   ├── compare_clustering.py
    │   ├── compare_dimension_reduction.py
    │   ├── compare_regresssor.py
    │   └── dimension_reduction_result
    │       ├── sklearn_KernalPCA.jpg
    │       ├── sklearn_LLE.jpg
    │       ├── sklearn_MDS.jpg
    │       ├── sklearn_PCA.jpg
    │       ├── tinyml_KernalPCA.jpg
    │       ├── tinyml_LLE.jpg
    │       ├── tinyml_MDS.jpg
    │       └── tinyml_PCA.jpg
    ├── dimension_reduction
    │   ├── Isomap.py
    │   ├── KernelPCA.py
    │   ├── LLE.py
    │   ├── MDS.py
    │   ├── PCA.py
    │   ├── __init__.py
    │   └── __pycache__
    │       ├── Isomap.cpython-37.pyc
    │       ├── KernelPCA.cpython-37.pyc
    │       ├── LLE.cpython-37.pyc
    │       ├── MDS.cpython-36.pyc
    │       ├── MDS.cpython-37.pyc
    │       ├── PCA.cpython-37.pyc
    │       ├── __init__.cpython-36.pyc
    │       └── __init__.cpython-37.pyc
    ├── discriminant_analysis
    │   ├── GDA.py
    │   ├── LDA.py
    │   ├── __init__.py
    │   └── __pycache__
    │       ├── GDA.cpython-37.pyc
    │       ├── LDA.cpython-37.pyc
    │       └── __init__.cpython-37.pyc
    ├── ensemble
    │   ├── AdaBoostClassifier.py
    │   ├── GradientBoostingRegressor.py
    │   ├── RandomForestRegressor.py
    │   ├── XGBRegressor.py
    │   ├── __init__.py
    │   └── __pycache__
    │       ├── AdaBoostClassifier.cpython-37.pyc
    │       ├── GradientBoostingRegressor.cpython-37.pyc
    │       ├── RandomForestRegressor.cpython-37.pyc
    │       ├── XGBRegressor.cpython-37.pyc
    │       └── __init__.cpython-37.pyc
    ├── factorization_machine
    │   ├── FMClassifier.py
    │   └── __init__.py
    ├── feature_selection
    │   ├── ReliefFeatureSelection.py
    │   └── __init__.py
    ├── linear_model
    │   ├── LinearRegression.py
    │   ├── LocallyWeightedLinearRegression.py
    │   ├── LogisticRegression.py
    │   ├── SGDRegressor.py
    │   ├── __init__.py
    │   └── __pycache__
    │       ├── LinearRegression.cpython-37.pyc
    │       ├── LogisticRegression.cpython-37.pyc
    │       ├── SGDRegressor.cpython-37.pyc
    │       └── __init__.cpython-37.pyc
    ├── metrices
    │   ├── __init__.py
    │   ├── __pycache__
    │   │   ├── __init__.cpython-37.pyc
    │   │   └── curves.cpython-37.pyc
    │   └── curves.py
    ├── svm
    │   ├── SVC.py
    │   ├── __init__.py
    │   └── __pycache__
    │       ├── SVC.cpython-37.pyc
    │       └── __init__.cpython-37.pyc
    └── tree
        ├── DecisionTreeClassifier.py
        ├── DecisionTreeRegressor.py
        ├── __init__.py
        ├── __pycache__
        │   ├── DecisionTreeClassifier.cpython-37.pyc
        │   ├── DecisionTreeRegressor.cpython-36.pyc
        │   ├── DecisionTreeRegressor.cpython-37.pyc
        │   ├── __init__.cpython-36.pyc
        │   ├── __init__.cpython-37.pyc
        │   ├── treePlotter.cpython-36.pyc
        │   └── treePlotter.cpython-37.pyc
        └── treePlotter.py
/README.md:
--------------------------------------------------------------------------------
# tinyml
numpy implementations of algorithms from Zhou Zhihua's *Machine Learning* (the "watermelon book") and Stanford's CS229 course, best read alongside the watermelon book and the CS229 slides. A few classic algorithms are also implemented in simplified form, such as an XGBRegressor that follows Tianqi Chen's slides. The estimators mimic sklearn's `fit`/`predict` interface (see the usage sketch below).
## Implemented algorithms
- **Linear models**
  - [LinearRegression](/tinyml/linear_model/LinearRegression.py) [derivation of the closed-form solution](notes/linear_model/linear_reg_closed_form.pdf)
  - [LogisticRegression](/tinyml/linear_model/LogisticRegression.py) [logistic regression derivations](/notes/linear_model/logistic_regression.pdf)
  - [SGDRegressor](/tinyml/linear_model/SGDRegressor.py)
  - [LocallyWeightedLinearRegression](/tinyml/linear_model/LocallyWeightedLinearRegression.py)
- **Discriminant analysis**
  - [LDA](/tinyml/discriminant_analysis/LDA.py)
  - [GDA](/tinyml/discriminant_analysis/GDA.py)
- **Decision trees (classification and regression)**
  - [DecisionTreeClassifier](/tinyml/tree/DecisionTreeClassifier.py)
  - [DecisionTreeRegressor](/tinyml/tree/DecisionTreeRegressor.py)
- **Support vector machines**
  - [SVC](/tinyml/svm/SVC.py)
- **Bayesian classifiers**
  - [NaiveBayesClassifier](/tinyml/bayes/NaiveBayesClassifier.py)
- **Clustering**
  - [KMeans](/tinyml/cluster/KMeans.py)
  - [LVQ](/tinyml/cluster/LVQ.py)
  - [GaussianMixture](/tinyml/cluster/GaussianMixture.py)
  - [DBSCAN](/tinyml/cluster/DBSCAN.py)
  - [AGNES](/tinyml/cluster/AGNES.py)
- **Dimensionality reduction**
  - [MDS](/tinyml/dimension_reduction/MDS.py)
  - [PCA](/tinyml/dimension_reduction/PCA.py)
  - [KernelPCA](/tinyml/dimension_reduction/KernelPCA.py)
  - [LLE](/tinyml/dimension_reduction/LLE.py)
  - [Isomap](/tinyml/dimension_reduction/Isomap.py)
- **Ensemble learning**
  - [AdaBoostClassifier](/tinyml/ensemble/AdaBoostClassifier.py)
  - [GradientBoostingRegressor](/tinyml/ensemble/GradientBoostingRegressor.py)
  - [RandomForestRegressor](/tinyml/ensemble/RandomForestRegressor.py)
  - [XGBRegressor](/tinyml/ensemble/XGBRegressor.py)
- **Feature selection**
  - [ReliefFeatureSelection](/tinyml/feature_selection/ReliefFeatureSelection.py)
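A minimal usage sketch (assuming the repo root is on `PYTHONPATH`; the data here is made up for illustration):

```python
import numpy as np
from tinyml.linear_model.LinearRegression import LinearRegression

X = np.array([[1.0], [2.0], [3.0], [4.0]])
y = np.array([3.1, 5.0, 6.9, 9.2])        # roughly y = 2x + 1

reg = LinearRegression()
reg.fit(X, y)                             # closed-form solve: w = (X^T X)^{-1} X^T y
print(reg.predict(np.array([[5.0]])))     # ~ [11.1]
```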
## Comparison with the sklearn implementations
- **Regression results** [code](/tinyml/compare/compare_regresssor.py)

  MSE on sklearn's boston dataset:

  | Algorithm              | tinyml | sklearn |
  | ---------------------- | ------ | ------- |
  | LinearRegression       | 27.196 | 27.196  |
  | SGDRegressor           | 27.246 | 27.231  |
  | DecisionTreeRegressor  | 21.887 | 21.761  |
  | RandomForestRegressor  | 21.142 | 21.142  |
  | GradientBoostRegressor | 16.778 | 16.106  |
  | XGBRegressor           | 20.149 | 15.7    |

- **Classification results** [code](/tinyml/compare/compare_classification.py)

  Accuracy on sklearn's breast_cancer dataset:

  | Algorithm          | tinyml | sklearn |
  | ------------------ | ------ | ------- |
  | NaiveBayes         | 90.64% | 90.64%  |
  | LogisticRegression | 92.98% | 92.98%  |
  | LDA                | 94.15% | 92.40%  |
  | GDA                | 92.40% | 93.57%  |
  | SVC                | 86.55% | 92.98%  |
  | AdaboostClassifier | 92.40% | 92.40%  |
- **Clustering comparison** [code](/tinyml/compare/compare_clustering.py)
  - KMeans

    ![tinyml KMeans](/tinyml/compare/cluster_result/tinyml_KMeans.jpg)
    ![sklearn KMeans](/tinyml/compare/cluster_result/sklearn_KMeans.jpg)

  - DBSCAN

    ![tinyml DBSCAN](/tinyml/compare/cluster_result/tinyml_DBSCAN.jpg)
    ![sklearn DBSCAN](/tinyml/compare/cluster_result/sklearn_DBSCAN.jpg)

  - GMM

    ![tinyml GMM](/tinyml/compare/cluster_result/tinyml_GMM.jpg)
    ![sklearn GMM](/tinyml/compare/cluster_result/sklearn_GMM.jpg)

  - AGNES

    ![tinyml AGNES](/tinyml/compare/cluster_result/tinyml_AGNES.jpg)
    ![sklearn AGNES](/tinyml/compare/cluster_result/sklearn_AGNES.jpg)

- **Dimensionality-reduction comparison** [code](/tinyml/compare/compare_dimension_reduction.py)
  - PCA

    ![tinyml PCA](/tinyml/compare/dimension_reduction_result/tinyml_PCA.jpg)
    ![sklearn PCA](/tinyml/compare/dimension_reduction_result/sklearn_PCA.jpg)

  - KernalPCA

    ![tinyml KernalPCA](/tinyml/compare/dimension_reduction_result/tinyml_KernalPCA.jpg)
    ![sklearn KernalPCA](/tinyml/compare/dimension_reduction_result/sklearn_KernalPCA.jpg)

  - LLE

    ![tinyml LLE](/tinyml/compare/dimension_reduction_result/tinyml_LLE.jpg)
    ![sklearn LLE](/tinyml/compare/dimension_reduction_result/sklearn_LLE.jpg)

  - MDS

    ![tinyml MDS](/tinyml/compare/dimension_reduction_result/tinyml_MDS.jpg)
    ![sklearn MDS](/tinyml/compare/dimension_reduction_result/sklearn_MDS.jpg)
--------------------------------------------------------------------------------
/notes/linear_model/linear_reg_closed_form.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fengyang95/tiny_ml/6a3ee55e4ee4ddc3dad1e53f511725679a99f083/notes/linear_model/linear_reg_closed_form.pdf
--------------------------------------------------------------------------------
/notes/linear_model/logistic_regression.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fengyang95/tiny_ml/6a3ee55e4ee4ddc3dad1e53f511725679a99f083/notes/linear_model/logistic_regression.pdf
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | xgboost
2 | numpy
3 | matplotlib
4 | torch
5 | scipy
6 | graphviz
7 | scikit_learn
8 | skrebate
9 |
--------------------------------------------------------------------------------
/tinyml/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fengyang95/tiny_ml/6a3ee55e4ee4ddc3dad1e53f511725679a99f083/tinyml/__init__.py
--------------------------------------------------------------------------------
/tinyml/__pycache__/__init__.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fengyang95/tiny_ml/6a3ee55e4ee4ddc3dad1e53f511725679a99f083/tinyml/__pycache__/__init__.cpython-36.pyc
--------------------------------------------------------------------------------
/tinyml/__pycache__/__init__.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fengyang95/tiny_ml/6a3ee55e4ee4ddc3dad1e53f511725679a99f083/tinyml/__pycache__/__init__.cpython-37.pyc
--------------------------------------------------------------------------------
/tinyml/bayes/NaiveBayesClassifier.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | # handles discrete-valued features only
3 | class NaiveBayesClassifier:
4 | def __init__(self,n_classes=2):
5 | self.n_classes=n_classes
6 | self.priori_P={}
7 | self.conditional_P={}
8 | self.N={}
9 | pass
10 |
11 | def fit(self,X,y):
12 | for i in range(self.n_classes):
13 |             # Eq. 7.19: Laplace-smoothed class prior
14 | self.priori_P[i]=(len(y[y==i])+1)/(len(y)+self.n_classes)
15 | for col in range(X.shape[1]):
16 | self.N[col]=len(np.unique(X[:,col]))
17 | self.conditional_P[col]={}
18 | for row in range(X.shape[0]):
19 | val=X[row,col]
20 | if val not in self.conditional_P[col].keys():
21 | self.conditional_P[col][val]={}
22 | for i in range(self.n_classes):
23 | D_xi=np.where(X[:,col]==val)
24 | D_c=np.where(y==i)
25 | D_cxi=len(np.intersect1d(D_xi,D_c))
26 |                         # Eq. 7.20: Laplace-smoothed class-conditional probability
27 | self.conditional_P[col][val][i]=(D_cxi+1)/(len(y[y==i])+self.N[col])
28 | else:
29 | continue
30 |
31 | def predict(self,X):
32 | pred_y=[]
33 | for i in range(len(X)):
34 | p=np.ones((self.n_classes,))
35 | for j in range(self.n_classes):
36 | p[j]=self.priori_P[j]
37 | for col in range(X.shape[1]):
38 | val=X[i,col]
39 | for j in range(self.n_classes):
40 | p[j]*=self.conditional_P[col][val][j]
41 | pred_y.append(np.argmax(p))
42 | return np.array(pred_y)
43 | # continuous-valued features (Gaussian naive Bayes)
44 | class NaiveBayesClassifierContinuous:
45 | def __init__(self,n_classes=2):
46 | self.n_classes=n_classes
47 | self.priori_P={}
48 |
49 | def fit(self,X,y):
50 | self.mus=np.zeros((self.n_classes,X.shape[1]))
51 | self.sigmas=np.zeros((self.n_classes,X.shape[1]))
52 |
53 | for c in range(self.n_classes):
54 |             # class prior (cf. Eq. 7.19, here without Laplace smoothing)
55 | self.priori_P[c]=(len(y[y==c]))/(len(y))
56 | X_c=X[np.where(y==c)]
57 |
58 | self.mus[c]=np.mean(X_c,axis=0)
59 | self.sigmas[c]=np.std(X_c,axis=0)
60 |
61 | def predict(self,X):
62 | pred_y=[]
63 | for i in range(len(X)):
64 | p=np.ones((self.n_classes,))
65 | for c in range(self.n_classes):
66 | p[c]=self.priori_P[c]
67 | for col in range(X.shape[1]):
68 | x=X[i,col]
69 | p[c]*=1./(np.sqrt(2*np.pi)*self.sigmas[c,col])*np.exp(-(x-self.mus[c,col])**2/(2*self.sigmas[c,col]**2))
70 | pred_y.append(np.argmax(p))
71 | return np.array(pred_y)
72 |
73 | if __name__=='__main__':
74 | X = np.array([[0, 0, 0, 0, 0, 0], [1, 0, 1, 0, 0, 0],
75 | [1, 0, 0, 0, 0, 0], [0, 0, 1, 0, 0, 0],
76 | [2, 0, 0, 0, 0, 0], [0, 1, 0, 0, 1, 1],
77 | [1, 1, 0, 1, 1, 1], [1, 1, 0, 0, 1, 0],
78 | [1, 1, 1, 1, 1, 0], [0, 2, 2, 0, 2, 1],
79 | [2, 2, 2, 2, 2, 0], [2, 0, 0, 2, 2, 1],
80 | [0, 1, 0, 1, 0, 0], [2, 1, 1, 1, 0, 0],
81 | [1, 1, 0, 0, 1, 1], [2, 0, 0, 2, 2, 0],
82 | [0, 0, 1, 1, 1, 0]])
83 | y = np.array([1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0])
84 |
85 | X_test=np.array([[0, 0, 1, 0, 0, 0], [1, 0, 1, 0, 0, 0],
86 | [1, 1, 0, 1, 1, 0], [1, 0, 1, 1, 1, 0],
87 | [1, 1, 0, 0, 1, 1], [2, 0, 0, 2, 2, 0],
88 | [0, 0, 1, 1, 1, 0],
89 | [2, 0, 0, 2, 2, 0],
90 | [0, 0, 1, 1, 1, 0]
91 | ])
92 |
93 | naive_bayes=NaiveBayesClassifier(n_classes=2)
94 | naive_bayes.fit(X,y)
95 |     print('priori_P:',naive_bayes.priori_P)
96 |     print('conditional_P:',naive_bayes.conditional_P)
97 | pred_y=naive_bayes.predict(X_test)
98 | print('pred_y:',pred_y)
99 |
100 |
101 |
--------------------------------------------------------------------------------
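A quick sanity check of the Laplace-smoothed prior (Eq. 7.19) on the demo data above, not part of the original file: `y` contains 8 ones and 9 zeros out of 17 samples, with 2 classes, so the fitted priors should be

```python
# priors fitted by NaiveBayesClassifier.fit on the demo X, y above
print((8 + 1) / (17 + 2))   # priori_P[1] ~ 0.4737
print((9 + 1) / (17 + 2))   # priori_P[0] ~ 0.5263
```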
/tinyml/bayes/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fengyang95/tiny_ml/6a3ee55e4ee4ddc3dad1e53f511725679a99f083/tinyml/bayes/__init__.py
--------------------------------------------------------------------------------
/tinyml/bayes/__pycache__/NaiveBayesClassifier.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fengyang95/tiny_ml/6a3ee55e4ee4ddc3dad1e53f511725679a99f083/tinyml/bayes/__pycache__/NaiveBayesClassifier.cpython-37.pyc
--------------------------------------------------------------------------------
/tinyml/bayes/__pycache__/__init__.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fengyang95/tiny_ml/6a3ee55e4ee4ddc3dad1e53f511725679a99f083/tinyml/bayes/__pycache__/__init__.cpython-37.pyc
--------------------------------------------------------------------------------
/tinyml/cluster/AGNES.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import matplotlib.pyplot as plt
3 |
4 | class AGNES:
5 | def __init__(self,k=3,dist_type='AVG'):
6 | self.k=k
7 | self.labels_=None
8 | self.C={}
9 | self.dist_func=None
10 | if dist_type=='MIN':
11 | self.dist_func=self.mindist
12 | elif dist_type=='MAX':
13 | self.dist_func=self.maxdist
14 | else:
15 | self.dist_func=self.avgdist
16 |
17 |     # AGNES algorithm: Fig. 9.11, p. 215 of Zhou's "Machine Learning"
18 | def fit(self,X):
19 | for j in range(X.shape[0]):
20 | self.C[j]=set()
21 | self.C[j].add(j)
22 | M=1e10*np.ones((X.shape[0],X.shape[0]),dtype=np.float32)
23 | for i in range(X.shape[0]):
24 | for j in range(i+1,X.shape[0]):
25 | M[i,j]=self.dist_func(X,self.C[i],self.C[j])
26 | M[j,i]=M[i,j]
27 | q=X.shape[0]
28 | while q>self.k:
29 | index=np.argmin(M)
30 | i_=index//M.shape[1]
31 | j_=index%M.shape[1]
32 | self.C[i_]=set(self.C[i_].union(self.C[j_]))
33 | #print(self.C[i_])
34 | for j in range(j_+1,q):
35 | self.C[j-1]=set(self.C[j])
36 | del self.C[q-1]
37 | M=np.delete(M,[j_],axis=0)
38 | M=np.delete(M,[j_],axis=1)
39 | for j in range(q-1):
40 | if i_!=j:
41 | M[i_,j]=self.dist_func(X,self.C[i_],self.C[j])
42 | M[j,i_]=M[i_,j]
43 | q-=1
44 | self.labels_=np.zeros((X.shape[0],),dtype=np.int32)
45 | for i in range(self.k):
46 | self.labels_[list(self.C[i])] = i
47 |
48 | @classmethod
49 | def mindist(cls,X,Ci,Cj):
50 | Xi=X[list(Ci)]
51 | Xj=X[list(Cj)]
52 | min=1e10
53 | for i in range(len(Xi)):
54 | d=np.sqrt(np.sum((Xi[i]-Xj)**2,axis=1))
55 | dmin=np.min(d)
56 |             if dmin<min:
57 |                 min=dmin
58 |         return min
59 |
60 |     @classmethod
61 |     def maxdist(cls,X,Ci,Cj):
62 |         Xi=X[list(Ci)]
63 |         Xj=X[list(Cj)]
64 |         max=-1e10
65 |         for i in range(len(Xi)):
66 |             d=np.sqrt(np.sum((Xi[i]-Xj)**2,axis=1))
67 |             dmax=np.max(d)
68 |             if dmax>max:
69 | max=dmax
70 | return max
71 |
72 | @classmethod
73 | def avgdist(cls,X,Ci,Cj):
74 | Xi=X[list(Ci)]
75 | Xj=X[list(Cj)]
76 | sum=0.
77 | for i in range(len(Xi)):
78 | d=np.sqrt(np.sum((Xi[i]-Xj)**2,axis=1))
79 | sum+=np.sum(d)
80 | dist=sum/(len(Ci)*len(Cj))
81 | return dist
82 |
83 |
84 |
85 |
86 | if __name__=='__main__':
87 | X=np.array([[0.697,0.460],[0.774,0.376],[0.634,0.264],[0.608,0.318],[0.556,0.215],
88 | [0.403,0.237],[0.481,0.149],[0.437,0.211],[0.666,0.091],[0.243,0.267],
89 | [0.245,0.057],[0.343,0.099],[0.639,0.161],[0.657,0.198],[0.360,0.370],
90 | [0.593,0.042],[0.719,0.103],[0.359,0.188],[0.339,0.241],[0.282,0.257],
91 | [0.748,0.232],[0.714,0.346],[0.483,0.312],[0.478,0.437],[0.525,0.369],
92 | [0.751,0.489],[0.532,0.472],[0.473,0.376],[0.725,0.445],[0.446,0.459]])
93 |
94 | X_test=X
95 | agnes=AGNES()
96 | agnes.fit(X)
97 | print('C:', agnes.C)
98 | print(agnes.labels_)
99 | plt.figure(12)
100 | plt.subplot(121)
101 | plt.scatter(X[:, 0], X[:, 1], c=agnes.labels_)
102 | plt.title('tinyml')
103 |
104 |     from sklearn.cluster import AgglomerativeClustering
105 | sklearn_agnes=AgglomerativeClustering(n_clusters=7,affinity='l2',linkage='average')
106 | sklearn_agnes.fit(X)
107 | print(sklearn_agnes.labels_)
108 | plt.subplot(122)
109 | plt.scatter(X[:,0],X[:,1],c=sklearn_agnes.labels_)
110 | plt.title('sklearn')
111 | plt.show()
112 |
113 |
114 |
115 |
116 |
117 |
--------------------------------------------------------------------------------
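For reference, the three `dist_type` options above correspond to the single-, complete-, and average-linkage cluster distances of the book (Eqs. 9.41-9.43):

$$
d_{\min}(C_i,C_j)=\min_{x\in C_i,\,z\in C_j}\operatorname{dist}(x,z),\qquad
d_{\max}(C_i,C_j)=\max_{x\in C_i,\,z\in C_j}\operatorname{dist}(x,z),\qquad
d_{\mathrm{avg}}(C_i,C_j)=\frac{1}{|C_i|\,|C_j|}\sum_{x\in C_i}\sum_{z\in C_j}\operatorname{dist}(x,z)
$$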
/tinyml/cluster/DBSCAN.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import matplotlib.pyplot as plt
3 | import random
4 | from queue import Queue
5 | random.seed(1)
6 |
7 | class DBSCAN:
8 | def __init__(self,epsilon=0.11,min_pts=5):
9 | self.epsilon=epsilon
10 | self.min_pts=min_pts
11 | self.labels_=None
12 | self.C=None
13 | self.Omega=set()
14 | self.N_epsilon={}
15 |
16 |     # DBSCAN algorithm: Fig. 9.9, p. 213 of Zhou's "Machine Learning"
17 | def fit(self,X):
18 | self.C={}
19 | for j in range(X.shape[0]):
20 | dist=np.sqrt(np.sum((X-X[j])**2,axis=1))
21 | self.N_epsilon[j]=np.where(dist<=self.epsilon)[0]
22 | if len(self.N_epsilon[j])>=self.min_pts:
23 | self.Omega.add(j)
24 | self.k=0
25 | Gamma=set(range(X.shape[0]))
26 | while len(self.Omega)>0:
27 | Gamma_old=set(Gamma)
28 | o=random.sample(list(self.Omega),1)[0]
29 | Q=Queue()
30 | Q.put(o)
31 | Gamma.remove(o)
32 | while not Q.empty():
33 | q=Q.get()
34 | if len(self.N_epsilon[q])>=self.min_pts:
35 | Delta=set(self.N_epsilon[q]).intersection(set(Gamma))
36 | for delta in Delta:
37 | Q.put(delta)
38 | Gamma.remove(delta)
39 | self.C[self.k]=Gamma_old.difference(Gamma)
40 | self.Omega=self.Omega.difference(self.C[self.k])
41 | self.k += 1
42 | self.labels_=np.zeros((X.shape[0],),dtype=np.int32)
43 | for i in range(self.k):
44 | self.labels_[list(self.C[i])]=i
45 |
46 |
47 | if __name__=='__main__':
48 | X=np.array([[0.697,0.460],[0.774,0.376],[0.634,0.264],[0.608,0.318],[0.556,0.215],
49 | [0.403,0.237],[0.481,0.149],[0.437,0.211],[0.666,0.091],[0.243,0.267],
50 | [0.245,0.057],[0.343,0.099],[0.639,0.161],[0.657,0.198],[0.360,0.370],
51 | [0.593,0.042],[0.719,0.103],[0.359,0.188],[0.339,0.241],[0.282,0.257],
52 | [0.748,0.232],[0.714,0.346],[0.483,0.312],[0.478,0.437],[0.525,0.369],
53 | [0.751,0.489],[0.532,0.472],[0.473,0.376],[0.725,0.445],[0.446,0.459]])
54 |
55 | dbscan=DBSCAN()
56 | dbscan.fit(X)
57 | print('C:',dbscan.C)
58 | print(dbscan.labels_)
59 | plt.figure(12)
60 | plt.subplot(121)
61 | plt.scatter(X[:,0],X[:,1],c=dbscan.labels_)
62 | plt.title('tinyml')
63 |
64 | import sklearn.cluster as cluster
65 | sklearn_DBSCAN=cluster.DBSCAN(eps=0.11,min_samples=5,metric='l2')
66 | sklearn_DBSCAN.fit(X)
67 | print(sklearn_DBSCAN.labels_)
68 | plt.subplot(122)
69 | plt.scatter(X[:,0],X[:,1],c=sklearn_DBSCAN.labels_)
70 | plt.title('sklearn')
71 | plt.show()
72 |
73 |
--------------------------------------------------------------------------------
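For reference, `fit` first builds the epsilon-neighborhoods and the core-object set Omega used by Fig. 9.9:

$$
N_{\epsilon}(x_j)=\{x_i\in D:\operatorname{dist}(x_i,x_j)\le\epsilon\},\qquad
\Omega=\{x_j:|N_{\epsilon}(x_j)|\ge \mathit{MinPts}\}
$$

with epsilon = `epsilon` (0.11) and MinPts = `min_pts` (5) above.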
/tinyml/cluster/GaussianMixture.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import matplotlib.pyplot as plt
3 |
4 | class GaussianMixture:
5 | def __init__(self,k=3,max_iter=50):
6 | self.k=k
7 | self.max_iter=max_iter
8 | self.labels_=None
9 | self.C=None
10 | self.alpha=None
11 | self.mu=None
12 | self.cov=None
13 | self.gamma=None
14 | pass
15 |
16 |     # Gaussian mixture clustering: Fig. 9.6, p. 210 of Zhou's "Machine Learning"
17 | def fit(self,X):
18 |         # initialization as described on p. 210
19 | self.alpha=np.zeros((self.k,))
20 | for i in range(self.k):
21 | self.alpha[i]=1./self.k
22 | mu_indices=[5,21,26]
23 | self.mu=X[mu_indices]
24 | self.cov=np.array([[[0.1,0.],[0.0,0.1]],[[0.1,0.],[0.,0.1]],[[0.1,0.],[0.,0.1]]])
25 |
26 | self.gamma=np.zeros((X.shape[0],self.k))
27 | for _ in range(self.max_iter):
28 | for j in range(X.shape[0]):
29 | alpha_p=np.zeros((self.k,))
30 | sum=0.
31 | for i in range(self.k):
32 | alpha_p[i]=self.alpha[i]*self._p(X[j],self.mu[i],self.cov[i])
33 | sum+=alpha_p[i]
34 | self.gamma[j,:]=alpha_p/sum
35 |
36 | for i in range(self.k):
37 | sum_gamma_i=np.sum(self.gamma[:,i])
38 | self.mu[i]=X.T.dot(self.gamma[:,i])/sum_gamma_i
39 | numerator=0.
40 | for j in range(X.shape[0]):
41 | numerator+=(self.gamma[j,i]*((X[j]-self.mu[i]).reshape(-1,1).dot((X[j]-self.mu[i]).reshape(1,-1))))
42 | self.cov[i]=numerator/sum_gamma_i
43 | self.alpha[i]=sum_gamma_i/X.shape[0]
44 | self.labels_=np.argmax(self.gamma,axis=1)
45 | self.C={}
46 | for i in range(self.k):
47 | self.C[i]=[]
48 | for j in range(len(self.labels_)):
49 | self.C[self.labels_[j]].append(j)
50 |
51 | def predict(self,X):
52 | gamma = np.zeros((X.shape[0], self.k))
53 | for j in range(X.shape[0]):
54 | alpha_p = np.zeros((self.k,))
55 | sum = 0.
56 | for i in range(self.k):
57 | alpha_p[i] = self.alpha[i] * self._p(X[j], self.mu[i], self.cov[i])
58 | sum += alpha_p[i]
59 | gamma[j, :] = alpha_p / sum
60 | return np.argmax(gamma,axis=1)
61 |
62 |
63 |     # Eq. 9.28: multivariate Gaussian density
64 | @classmethod
65 | def _p(cls,x,mu,cov):
66 | exp_coef=-0.5*((x-mu).T.dot(np.linalg.inv(cov)).dot(x-mu))
67 | p=np.exp(exp_coef)/(np.power(2*np.pi,mu.shape[0]/2)*np.sqrt(np.linalg.det(cov)))
68 | return p
69 |
70 | if __name__=='__main__':
71 | X=np.array([[0.697,0.460],[0.774,0.376],[0.634,0.264],[0.608,0.318],[0.556,0.215],
72 | [0.403,0.237],[0.481,0.149],[0.437,0.211],[0.666,0.091],[0.243,0.267],
73 | [0.245,0.057],[0.343,0.099],[0.639,0.161],[0.657,0.198],[0.360,0.370],
74 | [0.593,0.042],[0.719,0.103],[0.359,0.188],[0.339,0.241],[0.282,0.257],
75 | [0.748,0.232],[0.714,0.346],[0.483,0.312],[0.478,0.437],[0.525,0.369],
76 | [0.751,0.489],[0.532,0.472],[0.473,0.376],[0.725,0.445],[0.446,0.459]])
77 |
78 | X_test=X
79 | gmm=GaussianMixture(k=3,max_iter=50)
80 | gmm.fit(X)
81 | print(gmm.C)
82 | print(gmm.labels_)
83 | print(gmm.predict(X_test))
84 | plt.scatter(X[:, 0], X[:, 1], c=gmm.labels_)
85 | plt.scatter(gmm.mu[:, 0], gmm.mu[:, 1],c=range(gmm.k), marker='+')
86 | plt.title('tinyml')
87 | plt.show()
88 |
89 |
90 | from sklearn.mixture import GaussianMixture
91 |
92 | sklearn_gmm = GaussianMixture(n_components=3, covariance_type='full',
93 | max_iter=50).fit(X)
94 | labels=sklearn_gmm.predict(X)
95 | print(labels)
96 | plt.scatter(X[:,0],X[:,1],c=labels)
97 | plt.title('sklearn')
98 | plt.show()
99 |
100 |
101 |
102 |
103 |
104 |
--------------------------------------------------------------------------------
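For reference, the E-step in `fit` computes the responsibilities from the Gaussian density of Eq. 9.28:

$$
p(x\mid\mu_i,\Sigma_i)=\frac{\exp\!\left(-\tfrac{1}{2}(x-\mu_i)^{\top}\Sigma_i^{-1}(x-\mu_i)\right)}{(2\pi)^{n/2}\,|\Sigma_i|^{1/2}},\qquad
\gamma_{ji}=\frac{\alpha_i\,p(x_j\mid\mu_i,\Sigma_i)}{\sum_{l=1}^{k}\alpha_l\,p(x_j\mid\mu_l,\Sigma_l)}
$$

and the M-step re-estimates each alpha_i, mu_i, Sigma_i from the gamma-weighted samples.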
/tinyml/cluster/KMeans.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import random
3 | import matplotlib.pyplot as plt
4 | random.seed(1)
5 |
6 | class KMeans:
7 | def __init__(self,k=2):
8 | self.labels_=None
9 | self.mu=None
10 | self.k=k
11 |
12 | def init(self,X,method='kmeans++',random_state=False):
13 | if method=='kmeans++':
14 | if random_state is False:
15 | np.random.seed(0)
16 | mus=[X[np.random.randint(0,len(X))]]
17 |             while len(mus)<self.k:
--------------------------------------------------------------------------------
/tinyml/ensemble/AdaBoostClassifier.py:
--------------------------------------------------------------------------------
37 |             if epsilon_t > 0.5:
38 | break
39 | self.hs_.append(copy.copy(ht))
40 | self.epsilons_.append(epsilon_t)
41 | alpha_t = 0.5 * np.log((1 - epsilon_t) / epsilon_t)
42 | self.alphas_.append(alpha_t)
43 | self.Ds_.append(self.Ds_[t] * np.exp(-alpha_t * y * y_pred))
44 | self.Ds_[t + 1] = self.Ds_[t + 1] / np.sum(self.Ds_[t + 1])
45 |
46 |
47 | @classmethod
48 |     def calc_epsilon(cls, D, y_target, y_pred):
49 | return 1 - np.sum(D[y_target == y_pred])
50 |
51 | def predict(self, X):
52 | H=np.zeros((X.shape[0],))
53 | for t in range(len(self.alphas_)):
54 | H+=(self.alphas_[t]*self.hs_[t].predict(X))
55 | return np.sign(H)
56 |
57 |
58 | if __name__ == '__main__':
59 | breast_data = datasets.load_breast_cancer()
60 | X, y = breast_data.data, breast_data.target
61 | y = 2 * y - 1
62 | X_train, y_train = X[:200], y[:200]
63 | X_test, y_test = X[200:], y[200:]
64 | base_estimator=DecisionTreeClassifier(max_depth=1,random_state=False)
65 |
66 | sklearn_decision_tree = DecisionTreeClassifier(max_depth=1)
67 | sklearn_decision_tree.fit(X_train, y_train)
68 | y_pred_decison_tree = sklearn_decision_tree.predict(X_test)
69 | print('single decision tree:', len(y_test[y_pred_decison_tree == y_test]) * 1.0 / len(y_test))
70 |
71 | print('tinyml:')
72 | adaboost_clf = AdaBoostClassifier(n_estimators=100,base_estimator=base_estimator,method='re-weighting')
73 | adaboost_clf.fit(X_train, y_train)
74 | y_pred = adaboost_clf.predict(X_test)
75 | print('adaboost y_pred:', len(y_test[y_pred == y_test]) * 1. / len(y_test))
76 |
77 | print('sklearn:')
78 | sklearn_adboost_clf = sklearnAdaBoostClassifier(n_estimators=100, random_state=False, algorithm='SAMME',
79 | base_estimator=base_estimator)
80 | sklearn_adboost_clf.fit(X_train, y_train)
81 | sklearn_y_pred = sklearn_adboost_clf.predict(X_test)
82 | print('sklearn adaboost y_pred:', len(y_test[y_test == sklearn_y_pred]) * 1. / len(y_test))
83 |
84 |
--------------------------------------------------------------------------------
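For reference, the visible `fit` loop implements the standard re-weighting updates, where epsilon_t is the weighted error of the base learner h_t:

$$
\alpha_t=\frac{1}{2}\ln\frac{1-\epsilon_t}{\epsilon_t},\qquad
D_{t+1}(i)\;\propto\;D_t(i)\,\exp\!\big(-\alpha_t\,y_i\,h_t(x_i)\big),\qquad
H(x)=\operatorname{sign}\Big(\sum_t \alpha_t\,h_t(x)\Big)
$$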
/tinyml/ensemble/GradientBoostingRegressor.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from sklearn import datasets
3 | from sklearn.metrics import mean_squared_error
4 | from sklearn import ensemble
5 | import copy
6 | from sklearn import tree
7 |
8 | """
9 | Uses mean squared error as the loss,
10 | so each stage fits the residual y - y_pred.
11 | Li Hang, "Statistical Learning Methods", p. 151
12 | """
13 | class GradientBoostingRegressor:
14 | def __init__(self,base_estimator=None,n_estimators=10,lr=0.1):
15 | self.base_estimator=base_estimator
16 |         self.n_estimators=n_estimators
17 | self.estimators=[]
18 | self.lr=lr
19 | self.mean=None
20 |
21 | def fit(self,X,y):
22 | F0_x=np.ones_like(y)*np.mean(y)
23 | y_pred=F0_x
24 | self.mean=np.mean(y)
25 |         for i in range(self.n_estimators):
26 | hm=copy.deepcopy(self.base_estimator)
27 | hm.fit(X,y-y_pred)
28 | self.estimators.append(hm)
29 | y_pred=y_pred+self.lr*hm.predict(X)
30 |
31 | def predict(self,X):
32 | y=self.mean*np.ones((X.shape[0],))
33 |         for i in range(self.n_estimators):
34 | y=y+self.lr*self.estimators[i].predict(X)
35 | return y
36 |
37 |
38 | if __name__=='__main__':
39 | breast_data = datasets.load_boston()
40 | X, y = breast_data.data, breast_data.target
41 | print(X.shape)
42 | X_train, y_train = X[:400], y[:400]
43 | X_test, y_test = X[400:], y[400:]
44 |
45 | sklearn_decisiontree_reg=tree.DecisionTreeRegressor(min_samples_split=15, min_samples_leaf=5,random_state=False)
46 | sklearn_decisiontree_reg.fit(X_train, y_train)
47 | decisiontree_pred=sklearn_decisiontree_reg.predict(X_test)
48 | print('base estimator:',mean_squared_error(y_test,decisiontree_pred))
49 |
50 | tinyml_gbdt_reg=GradientBoostingRegressor(n_estimators=500, base_estimator=tree.DecisionTreeRegressor(min_samples_split=15, min_samples_leaf=5, random_state=False))
51 | tinyml_gbdt_reg.fit(X_train, y_train)
52 | y_pred=tinyml_gbdt_reg.predict(X_test)
53 | print('tinyml mse:',mean_squared_error(y_test,y_pred))
54 |
55 |
56 | sklearn_gbdt_reg=ensemble.GradientBoostingRegressor(n_estimators=500,min_samples_leaf=5,min_samples_split=15,random_state=False)
57 | sklearn_gbdt_reg.fit(X_train,y_train)
58 | sklearn_pred=sklearn_gbdt_reg.predict(X_test)
59 | print('sklearn mse:',mean_squared_error(y_test,sklearn_pred))
60 |
--------------------------------------------------------------------------------
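For reference, `fit` is the least-squares gradient-boosting recurrence (nu is the `lr` shrinkage, 0.1 by default):

$$
F_0(x)=\bar{y},\qquad
h_m\ \text{fit to the residuals}\ y-F_{m-1}(x),\qquad
F_m(x)=F_{m-1}(x)+\nu\,h_m(x)
$$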
/tinyml/ensemble/RandomForestRegressor.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from sklearn import datasets,ensemble,tree
3 | from sklearn.metrics import mean_squared_error
4 |
5 | class RandomForestRegressor:
6 | def __init__(self,base_estimator,n_estimators=10,min_samples_leaf=5,min_samples_split=15):
7 | self.base_estimator=base_estimator
8 | self.n_estimators=n_estimators
9 | self.min_samples_split=min_samples_split
10 | self.min_samples_leaf=min_samples_leaf
11 | self.estimators_=[]
12 |
13 | def fit(self,X,y):
14 | for t in range(self.n_estimators):
15 |             estimator_t=self.base_estimator(random_state=t,min_samples_split=self.min_samples_split,min_samples_leaf=self.min_samples_leaf)
16 |             indices=np.random.choice(len(X),size=len(X),replace=True)  # bootstrap sample, so the trees actually differ
17 |             estimator_t.fit(X[indices],y[indices]); self.estimators_.append(estimator_t)
18 |
19 | def predict(self,X):
20 | preds=[]
21 | for t in range(self.n_estimators):
22 | preds.append(self.estimators_[t].predict(X))
23 | return np.mean(np.array(preds),axis=0)
24 |
25 |
26 | if __name__=='__main__':
27 | breast_data = datasets.load_boston()
28 | X, y = breast_data.data, breast_data.target
29 | X_train, y_train = X[:400], y[:400]
30 | X_test, y_test = X[400:], y[400:]
31 |
32 | tinyml_decisiontree_reg=tree.DecisionTreeRegressor(min_samples_split=20, min_samples_leaf=5,random_state=True)
33 | tinyml_decisiontree_reg.fit(X_train, y_train)
34 | decisiontree_pred=tinyml_decisiontree_reg.predict(X_test)
35 | print('base estimator:',mean_squared_error(y_test,decisiontree_pred))
36 |
37 | tinyml_rf_reg=RandomForestRegressor(n_estimators=100, base_estimator=tree.DecisionTreeRegressor)
38 | tinyml_rf_reg.fit(X_train,y_train)
39 | y_pred=tinyml_rf_reg.predict(X_test)
40 | print('tinyml rf mse:',mean_squared_error(y_test,y_pred))
41 |
42 | sklearn_rf_reg=ensemble.RandomForestRegressor(n_estimators=100, min_samples_leaf=5, min_samples_split=20, random_state=False)
43 | sklearn_rf_reg.fit(X_train, y_train)
44 | sklearn_pred=sklearn_rf_reg.predict(X_test)
45 | print('sklearn mse:',mean_squared_error(y_test,sklearn_pred))
46 |
--------------------------------------------------------------------------------
/tinyml/ensemble/XGBRegressor.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import abc
3 | from sklearn import datasets,tree
4 | from sklearn.metrics import mean_squared_error
5 | import xgboost as xgb
6 | np.random.seed(1)
7 |
8 | class LossBase(object):
9 | def __init__(self,y_target,y_pred):
10 | self.y_target=y_target
11 | self.y_pred=y_pred
12 | pass
13 |
14 | @abc.abstractmethod
15 | def forward(self):
16 | raise NotImplementedError
17 |
18 | @abc.abstractmethod
19 | def g(self):
20 | raise NotImplementedError
21 |
22 | @abc.abstractmethod
23 | def h(self):
24 | raise NotImplementedError
25 |
26 | class MSELoss(LossBase):
27 | def __init__(self,y_target,y_pred):
28 | super(MSELoss,self).__init__(y_target,y_pred)
29 |
30 | def forward(self):
31 | return (self.y_target-self.y_pred)**2
32 |
33 | def g(self):
34 | return 2*(self.y_pred-self.y_target)
35 |
36 | def h(self):
37 | return 2*np.ones_like(self.y_target)
38 |
39 | class CART:
40 |
41 | def __init__(self, reg_lambda=1, gamma=0., max_depth=3,col_sample_ratio=0.5,row_sample_ratio=1.):
42 | self.reg_lambda=reg_lambda
43 | self.gamma=gamma
44 | self.max_depth=max_depth
45 | self.tree = None
46 | self.leaf_nodes=0
47 | self.obj_val=0.
48 | self.col_sample_ratio=col_sample_ratio
49 | self.row_sample_ratio=row_sample_ratio
50 |
51 | def fit(self, X, y,g,h):
52 | D = {}
53 | D['X'] = X
54 | D['y'] = y
55 | A = np.arange(X.shape[1])
56 | m=len(y)
57 | self.tree = self.TreeGenerate(D,A,g,h,np.array(range(m)),0)
58 | self.obj_val=-0.5*self.obj_val+self.gamma*self.leaf_nodes
59 |
60 | def predict(self, X):
61 | if self.tree is None:
62 | raise RuntimeError('cant predict before fit')
63 | y_pred = []
64 | for i in range(X.shape[0]):
65 | tree = self.tree
66 | x = X[i]
67 | while True:
68 | if not isinstance(tree, dict):
69 | y_pred.append(tree)
70 | break
71 | a = list(tree.keys())[0]
72 | tree = tree[a]
73 | if isinstance(tree, dict):
74 | val = x[a]
75 | split_val=float(list(tree.keys())[0][1:])
76 | if val<=split_val:
77 | tree=tree[list(tree.keys())[0]]
78 | else:
79 | tree=tree[list(tree.keys())[1]]
80 | else:
81 | y_pred.append(tree)
82 | break
83 | return np.array(y_pred)
84 |
85 | def TreeGenerate(self, D, A,g,h,indices,depth):
86 | X = D['X']
87 | if depth>self.max_depth:
88 | G=np.sum(g[indices])
89 | H=np.sum(h[indices])
90 | w=-(G/(H+self.reg_lambda))
91 | self.obj_val+=(G**2/(H+self.reg_lambda))
92 | self.leaf_nodes+=1
93 | return w
94 | split_j=None
95 | split_s=None
96 | max_gain=0.
97 |
98 | col_sample_indices=np.random.choice(A,size=int(len(A)*self.col_sample_ratio))
99 | indices=np.random.choice(indices,size=int(len(indices)*self.row_sample_ratio))
100 |
101 | for j in A:
102 | if j not in col_sample_indices:
103 | continue
104 | for s in np.unique(X[:,j]):
105 | tmp_left=np.where(X[indices,j]<=s)[0]
106 | tmp_right=np.where(X[indices,j]>s)[0]
107 | if len(tmp_left)<1 or len(tmp_right)<1:
108 | continue
109 | left_indices=indices[tmp_left]
110 | right_indices=indices[tmp_right]
111 | G_L=np.sum(g[left_indices])
112 | G_R=np.sum(g[right_indices])
113 | H_L=np.sum(h[left_indices])
114 | H_R=np.sum(h[right_indices])
115 | gain= (G_L ** 2 / (H_L + self.reg_lambda) + G_R ** 2 / (H_R + self.reg_lambda) - (G_L + G_R) ** 2 / (H_L + H_R + self.reg_lambda)) - self.gamma
116 | if gain>max_gain:
117 | split_j=j
118 | split_s=s
119 | max_gain=gain
120 |
121 | if split_j is None:
122 | G = np.sum(g[indices])
123 | H = np.sum(h[indices])
124 | w = -(G / (H + self.reg_lambda))
125 | self.obj_val += (G ** 2 / (H + self.reg_lambda))
126 | self.leaf_nodes += 1
127 | return w
128 |
129 | tree = {split_j: {}}
130 | left_indices=indices[np.where(X[indices,split_j]<=split_s)[0]]
131 | right_indices=indices[np.where(X[indices,split_j]>split_s)[0]]
132 | tree[split_j]['l'+str(split_s)]=self.TreeGenerate(D,A,g,h,left_indices,depth+1)
133 | tree[split_j]['r'+str(split_s)]=self.TreeGenerate(D,A,g,h,right_indices,depth+1)
134 |         # value stored at the current (internal) node
135 | tree[split_j]['val']= -(np.sum(g[indices]) / (np.sum(h[indices]) + self.reg_lambda))
136 | return tree
137 |
138 | """
139 | Uses MSELoss.
140 | Implemented following Tianqi Chen's XGBoost slides.
141 | """
142 | class XGBRegressor:
143 | def __init__(self, reg_lambda=1, gamma=0., max_depth=5, n_estimators=250, eta=.1):
144 | self.reg_lambda=reg_lambda
145 | self.gamma=gamma
146 | self.max_depth=max_depth
147 | self.n_estimators=n_estimators
148 | self.eta=eta
149 | self.mean=None
150 | self.estimators_=[]
151 |
152 | def fit(self,X,y):
153 | self.mean=np.mean(y)
154 | y_pred = np.ones_like(y)*self.mean
155 | loss = MSELoss(y, y_pred)
156 | g, h = loss.g(), loss.h()
157 | for t in range(self.n_estimators):
158 | estimator_t=CART(self.reg_lambda, self.gamma, self.max_depth)
159 | y_target=y-y_pred
160 | estimator_t.fit(X,y_target,g,h)
161 | self.estimators_.append(estimator_t)
162 | y_pred+=(self.eta*estimator_t.predict(X))
163 | loss=MSELoss(y,y_pred)
164 | g,h=loss.g(),loss.h()
165 |
166 | def predict(self,X):
167 | y_pred=np.ones((X.shape[0],))*self.mean
168 | for t in range(self.n_estimators):
169 | y_pred+=(self.eta*self.estimators_[t].predict(X))
170 | return y_pred
171 |
172 | if __name__=='__main__':
173 | breast_data = datasets.load_boston()
174 | X, y = breast_data.data, breast_data.target
175 |
176 | X_train, y_train = X[:400], y[:400]
177 | X_test, y_test = X[400:], y[400:]
178 |
179 | sklearn_decisiontree_reg=tree.DecisionTreeRegressor(min_samples_split=15, min_samples_leaf=5,random_state=False)
180 | sklearn_decisiontree_reg.fit(X_train, y_train)
181 | decisiontree_pred=sklearn_decisiontree_reg.predict(X_test)
182 | print('base estimator:',mean_squared_error(y_test,decisiontree_pred))
183 |
184 | tinyml_gbdt_reg=XGBRegressor(n_estimators=100,max_depth=3,gamma=0.)
185 | tinyml_gbdt_reg.fit(X_train, y_train)
186 | y_pred=tinyml_gbdt_reg.predict(X_test)
187 | print('tinyml mse:',mean_squared_error(y_test,y_pred))
188 |
189 | xgb_reg=xgb.sklearn.XGBRegressor(max_depth=3,learning_rate=0.1,n_estimators=100,gamma=0,reg_lambda=1)
190 | xgb_reg.fit(X_train,y_train)
191 | xgb_pred=xgb_reg.predict(X_test)
192 | print('xgb mse:',mean_squared_error(y_test,xgb_pred))
193 |
--------------------------------------------------------------------------------
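For reference, `TreeGenerate` scores leaves and splits with the structure-score formulas from Tianqi Chen's slides, where G and H are the sums of the first and second loss derivatives over the node's samples (the code drops the constant 1/2, which only rescales the gain relative to gamma):

$$
w^{*}=-\frac{G}{H+\lambda},\qquad
\text{gain}=\frac{1}{2}\left[\frac{G_L^{2}}{H_L+\lambda}+\frac{G_R^{2}}{H_R+\lambda}-\frac{(G_L+G_R)^{2}}{H_L+H_R+\lambda}\right]-\gamma
$$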
/tinyml/ensemble/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fengyang95/tiny_ml/6a3ee55e4ee4ddc3dad1e53f511725679a99f083/tinyml/ensemble/__init__.py
--------------------------------------------------------------------------------
/tinyml/ensemble/__pycache__/AdaBoostClassifier.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fengyang95/tiny_ml/6a3ee55e4ee4ddc3dad1e53f511725679a99f083/tinyml/ensemble/__pycache__/AdaBoostClassifier.cpython-37.pyc
--------------------------------------------------------------------------------
/tinyml/ensemble/__pycache__/GradientBoostingRegressor.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fengyang95/tiny_ml/6a3ee55e4ee4ddc3dad1e53f511725679a99f083/tinyml/ensemble/__pycache__/GradientBoostingRegressor.cpython-37.pyc
--------------------------------------------------------------------------------
/tinyml/ensemble/__pycache__/RandomForestRegressor.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fengyang95/tiny_ml/6a3ee55e4ee4ddc3dad1e53f511725679a99f083/tinyml/ensemble/__pycache__/RandomForestRegressor.cpython-37.pyc
--------------------------------------------------------------------------------
/tinyml/ensemble/__pycache__/XGBRegressor.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fengyang95/tiny_ml/6a3ee55e4ee4ddc3dad1e53f511725679a99f083/tinyml/ensemble/__pycache__/XGBRegressor.cpython-37.pyc
--------------------------------------------------------------------------------
/tinyml/ensemble/__pycache__/__init__.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fengyang95/tiny_ml/6a3ee55e4ee4ddc3dad1e53f511725679a99f083/tinyml/ensemble/__pycache__/__init__.cpython-37.pyc
--------------------------------------------------------------------------------
/tinyml/factorization_machine/FMClassifier.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from sklearn import linear_model
3 | from sklearn.datasets import load_breast_cancer
4 | from sklearn.model_selection import train_test_split
5 | from sklearn.preprocessing import MinMaxScaler
6 | import math
7 | np.random.seed(0)
8 | import torch
9 | from torch import nn,optim
10 | class SGDFMClassifier:
11 | class FMClassifier(nn.Module):
12 | def __init__(self,n_features,loss='logistic',degree=2,n_components=2):
13 | super(SGDFMClassifier.FMClassifier,self).__init__()
14 | self.loss=loss
15 | self.degree=degree
16 | self.n_components=n_components
17 | self.linear=nn.Linear(n_features,1)
18 | self.v=nn.Parameter(torch.Tensor(n_features,self.n_components))
19 | stdev=1./math.sqrt(self.v.size(1))
20 | self.v.data.uniform_(-stdev,stdev)
21 | self.sigmoid=nn.Sigmoid()
22 |
23 | def forward(self,X):
24 |             y=self.linear(X)+0.5*torch.sum(torch.pow(torch.mm(X,self.v),2)-
25 |                                            torch.mm(torch.pow(X,2),torch.pow(self.v,2)),dim=1,keepdim=True)  # sum per sample, not over the whole batch
26 | return self.sigmoid(y)
27 |
28 | def __init__(self,max_iter=100000,learning_rate=0.005):
29 | self.max_iter=max_iter
30 | self.learning_rate=learning_rate
31 | self.criterion=nn.BCELoss()
32 | self.fitted=False
33 |
34 | def fit(self,X,y):
35 | n_feature=X.shape[1]
36 | self.model=self.FMClassifier(n_feature)
37 | self.optimizer=optim.SGD(self.model.parameters(),lr=self.learning_rate)
38 | X=torch.from_numpy(X.astype(np.float32))
39 | y=torch.from_numpy(y.astype(np.float32))
40 | for epoch in range(self.max_iter):
41 | y_predict=self.model(X)[:,0]
42 | loss=self.criterion(y_predict,y)
43 | #print('epoch:',epoch,' loss.item():',loss.item())
44 | self.optimizer.zero_grad()
45 | loss.backward()
46 | self.optimizer.step()
47 |
48 | def predict(self,X):
49 | X = torch.from_numpy(X.astype(np.float32))
50 | with torch.no_grad():
51 | y_pred = self.model(X).detach().numpy()
52 | y_pred[y_pred>0.5]=1
53 | y_pred[y_pred<=0.5]=0
54 | return y_pred[:,0]
55 |
56 | if __name__=='__main__':
57 | breast_data = load_breast_cancer()
58 | X, y = breast_data.data[:, :7], breast_data.target
59 | X = MinMaxScaler().fit_transform(X)
60 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
61 |
62 | torch_mfclassifier = SGDFMClassifier(20000, 0.001)
63 | torch_mfclassifier.fit(X_train, y_train)
64 | torch_pred = torch_mfclassifier.predict(X_test)
65 | print('torch accuracy:', len(y_test[y_test == torch_pred]) / len(y_test))
--------------------------------------------------------------------------------
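For reference, the `forward` pass computes the degree-2 factorization-machine interaction term per sample via Rendle's identity:

$$
\sum_{i<j}\langle v_i,v_j\rangle x_i x_j=\frac{1}{2}\sum_{f=1}^{k}\left[\Big(\sum_{i}v_{i,f}x_i\Big)^{2}-\sum_{i}v_{i,f}^{2}x_i^{2}\right]
$$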
/tinyml/factorization_machine/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fengyang95/tiny_ml/6a3ee55e4ee4ddc3dad1e53f511725679a99f083/tinyml/factorization_machine/__init__.py
--------------------------------------------------------------------------------
/tinyml/feature_selection/ReliefFeatureSelection.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from sklearn.preprocessing import MinMaxScaler
3 | from sklearn.datasets import load_breast_cancer
4 | import random
5 |
6 | # handles continuous-valued features
7 | class ReliefFeatureSelection:
8 | def __init__(self,sample_ratio=0.5,k=5,seed=None):
9 | self.feature_importances_=None
10 | self.k=k
11 | self.sample_ratio=sample_ratio
12 | self.seed=seed
13 | random.seed(self.seed)
14 |
15 | def fit(self,X,y):
16 | m,n=X.shape
17 | self.feature_importances_=np.zeros((n,))
18 | for t in range(self.k):
19 | indices=random.sample(range(m),int(m*self.sample_ratio))
20 | subX,suby=X[indices],y[indices]
21 | self.feature_importances_+=self._fit(subX,suby)
22 | self.feature_importances_/=self.k
23 |
24 |
25 | def transform(self,X,k_features):
26 |         chosen_indices=np.argsort(self.feature_importances_)[::-1][:k_features]
27 |         return X[:,chosen_indices]
28 |
29 | def _fit(self,subX,suby):
30 | label_to_indices = {}
31 | labels = np.unique(suby)
32 | for label in labels:
33 | label_to_indices[label] = list(np.where(suby == label)[0])
34 | m, n = subX.shape
35 | feature_scores_ = np.zeros((n,))
36 | for j in range(n):
37 | for i in range(m):
38 | label_i = suby[i]
39 | xi_nhs = (subX[i, j] - subX[label_to_indices[label_i], j]) ** 2
40 | if len(xi_nhs) == 1:
41 | xi_nh = 0
42 | else:
43 | xi_nh = np.sort(xi_nhs)[1]
44 | feature_scores_[j] -= xi_nh
45 | for label in labels:
46 | if label == label_i:
47 | continue
48 | xi_nm = np.sort((subX[i, j] - subX[label_to_indices[label], j]) ** 2)[0]
49 | feature_scores_[j] += (xi_nm * len(label_to_indices[label]) / m)
50 | return feature_scores_
51 |
52 |
53 | if __name__=='__main__':
54 | breast_data = load_breast_cancer()
55 | subX, suby = breast_data.data, breast_data.target
56 | scaler=MinMaxScaler()
57 | subX=scaler.fit_transform(subX)
58 | reliefF=ReliefFeatureSelection()
59 | reliefF.fit(subX, suby)
60 | print('relief feature_importances:',reliefF.feature_importances_)
61 | print('sorted:',np.argsort(reliefF.feature_importances_))
62 |
63 | import skrebate.relieff as relieff
64 | skrebate_reliefF=relieff.ReliefF()
65 | skrebate_reliefF.fit(subX, suby)
66 | print('skrebate feature_importances_:',skrebate_reliefF.feature_importances_)
67 | print('sorted:',np.argsort(skrebate_reliefF.feature_importances_))
68 |
69 |
70 |
71 |
72 |
73 |
--------------------------------------------------------------------------------
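For reference, `_fit` accumulates, per sampled instance x with label y and per feature j, the near-hit/near-miss update of the multi-class Relief statistic, weighting each wrong class l by its frequency |D_l|/m in the subsample:

$$
\delta^{(j)}\;\leftarrow\;\delta^{(j)}-\operatorname{diff}\big(x^{(j)},x_{\mathrm{nh}}^{(j)}\big)^{2}+\sum_{l\neq y}\frac{|D_l|}{m}\,\operatorname{diff}\big(x^{(j)},x_{l,\mathrm{nm}}^{(j)}\big)^{2}
$$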
/tinyml/feature_selection/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fengyang95/tiny_ml/6a3ee55e4ee4ddc3dad1e53f511725679a99f083/tinyml/feature_selection/__init__.py
--------------------------------------------------------------------------------
/tinyml/linear_model/LinearRegression.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from sklearn import linear_model
3 |
4 |
5 | class LinearRegression:
6 | def __init__(self):
7 | self.w=None
8 | self.n_features=None
9 |
10 | def fit(self,X,y):
11 | """
12 | w=(X^TX)^{-1}X^Ty
13 | """
14 | assert isinstance(X,np.ndarray) and isinstance(y,np.ndarray)
15 | assert X.ndim==2 and y.ndim==1
16 | assert y.shape[0]==X.shape[0]
17 | n_samples = X.shape[0]
18 | self.n_features=X.shape[1]
19 | extra=np.ones((n_samples,))
20 | X=np.c_[X,extra]
21 |         if self.n_features < n_samples:
22 |             self.w=np.linalg.inv(X.T.dot(X)).dot(X.T).dot(y)
--------------------------------------------------------------------------------
/tinyml/linear_model/LogisticRegression.py:
--------------------------------------------------------------------------------
45 |         y_pred[y_pred>0.5]=1
46 | y_pred[y_pred<=0.5]=0
47 | return y_pred[:,0]
48 |
49 |
50 | class LogisticRegression:
51 | def __init__(self,max_iter=100,use_matrix=True):
52 | self.beta=None
53 | self.n_features=None
54 | self.max_iter=max_iter
55 | self.use_Hessian=use_matrix
56 |
57 | def fit(self,X,y):
58 | n_samples=X.shape[0]
59 | self.n_features=X.shape[1]
60 | extra=np.ones((n_samples,))
61 | X=np.c_[X,extra]
62 | self.beta=np.random.random((X.shape[1],))
63 | for i in range(self.max_iter):
64 | if self.use_Hessian is not True:
65 | dldbeta=self._dldbeta(X,y,self.beta)
66 | dldldbetadbeta=self._dldldbetadbeta(X,self.beta)
67 | self.beta-=(1./dldldbetadbeta*dldbeta)
68 | else:
69 | dldbeta = self._dldbeta(X, y, self.beta)
70 | dldldbetadbeta = self._dldldbetadbeta_matrix(X, self.beta)
71 | self.beta -= (np.linalg.inv(dldldbetadbeta).dot(dldbeta))
72 |
73 |
74 |
75 | @staticmethod
76 | def _dldbeta(X,y,beta):
77 |         # Zhou, "Machine Learning", Eq. 3.30
78 | m=X.shape[0]
79 | sum=np.zeros(X.shape[1],).T
80 | for i in range(m):
81 | sum+=X[i]*(y[i]-np.exp(X[i].dot(beta))/(1+np.exp(X[i].dot(beta))))
82 | return -sum
83 |
84 | @staticmethod
85 | def _dldldbetadbeta_matrix(X,beta):
86 | m=X.shape[0]
87 | Hessian=np.zeros((X.shape[1],X.shape[1]))
88 | for i in range(m):
89 | p1 = np.exp(X[i].dot(beta)) / (1 + np.exp(X[i].dot(beta)))
90 | tmp=X[i].reshape((-1,1))
91 | Hessian+=tmp.dot(tmp.T)*p1*(1-p1)
92 | return Hessian
93 |
94 | @staticmethod
95 | def _dldldbetadbeta(X,beta):
96 |         # Zhou, "Machine Learning", Eq. 3.31
97 | m=X.shape[0]
98 | sum=0.
99 | for i in range(m):
100 | p1=np.exp(X[i].dot(beta))/(1+np.exp(X[i].dot(beta)))
101 | sum+=X[i].dot(X[i].T)*p1*(1-p1)
102 | return sum
103 |
104 | def predict_proba(self,X):
105 | n_samples = X.shape[0]
106 | extra = np.ones((n_samples,))
107 | X = np.c_[X, extra]
108 | if self.beta is None:
109 | raise RuntimeError('cant predict before fit')
110 | p1 = np.exp(X.dot(self.beta)) / (1 + np.exp(X.dot(self.beta)))
111 | p0 = 1 - p1
112 | return np.c_[p0,p1]
113 |
114 | def predict(self,X):
115 | p=self.predict_proba(X)
116 | res=np.argmax(p,axis=1)
117 | return res
118 |
119 |
120 | if __name__=='__main__':
121 | breast_data = load_breast_cancer()
122 | X, y = breast_data.data[:,:7], breast_data.target
123 | X = MinMaxScaler().fit_transform(X)
124 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
125 | tinyml_logisticreg = LogisticRegression(max_iter=100,use_matrix=True)
126 | tinyml_logisticreg.fit(X_train, y_train)
127 | lda_prob = tinyml_logisticreg.predict_proba(X_test)
128 |
129 |
130 | lda_pred = tinyml_logisticreg.predict(X_test)
131 | # print('tinyml logistic_prob:', lda_prob)
132 | # print('tinyml logistic_pred:', lda_pred)
133 | print('tinyml accuracy:', len(y_test[y_test == lda_pred]) * 1. / len(y_test))
134 |
135 | sklearn_logsticreg = linear_model.LogisticRegression(max_iter=100,solver='newton-cg')
136 | sklearn_logsticreg.fit(X_train, y_train)
137 | sklearn_prob = sklearn_logsticreg.predict_proba(X_test)
138 | sklearn_pred = sklearn_logsticreg.predict(X_test)
139 | # print('sklearn prob:',sklearn_prob)
140 | # print('sklearn pred:',sklearn_pred)
141 | print('sklearn accuracy:', len(y_test[y_test == sklearn_pred]) * 1. / len(y_test))
142 |
143 | torch_sgd_logisticreg=SGDLogisticRegression(100000,0.01)
144 | torch_sgd_logisticreg.fit(X_train,y_train)
145 | torch_pred=torch_sgd_logisticreg.predict(X_test)
146 | print('torch accuracy:',len(y_test[y_test==torch_pred])/len(y_test))
147 |
148 | # expected output
149 | """
150 | tinyml accuracy: 0.9590643274853801
151 | sklearn accuracy: 0.9298245614035088
152 | torch accuracy: 0.9532163742690059
153 | """
154 |
155 |
156 |
157 |
158 |
159 |
160 |
161 |
--------------------------------------------------------------------------------
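For reference, `fit` runs Newton's method on the log-likelihood of the book's section 3.3, with x-hat the input extended by a constant 1:

$$
\frac{\partial\ell}{\partial\beta}=-\sum_{i=1}^{m}\hat{x}_i\big(y_i-p_1(\hat{x}_i;\beta)\big)\ \text{(Eq. 3.30)},\qquad
\beta\;\leftarrow\;\beta-\left(\frac{\partial^{2}\ell}{\partial\beta\,\partial\beta^{\top}}\right)^{-1}\frac{\partial\ell}{\partial\beta}
$$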
/tinyml/linear_model/SGDRegressor.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from sklearn import linear_model
3 |
4 | # Uses MSE as the loss function.
5 | # penalty='l2'   -> Ridge regression
6 | # penalty='l1'   -> Lasso regression
7 | # penalty='l1l2' -> Elastic Net
8 | # alpha is the regularization coefficient
9 |
10 | # https://wwdguu.github.io/2018/09/01/%C2%96HOMLWSLATF-ch4/
11 | np.random.seed(1)
12 | class SGDRegressor:
13 | def __init__(self,max_iter=100,penalty=None,alpha=1e-3,l1_ratio=0.5):
14 | self.w = None
15 | self.n_features = None
16 | self.penalty=penalty
17 | self.alpha=alpha
18 | self.l1_ratio=l1_ratio
19 | self.max_iter=max_iter
20 |
21 | #
22 | def fit(self, X, y):
23 | assert isinstance(X, np.ndarray) and isinstance(y, np.ndarray)
24 | assert y.shape[0] == X.shape[0]
25 | n_samples = X.shape[0]
26 | self.n_features = X.shape[1]
27 | extra = np.ones((n_samples,1))
28 | X = np.c_[X,extra]
29 | self.w=np.random.randn(X.shape[1],1)
30 | for iter in range(self.max_iter):
31 | for i in range(n_samples):
32 | sample_index=np.random.randint(n_samples)
33 | x_sample=X[sample_index:sample_index+1]
34 | y_sample=y[sample_index:sample_index+1]
35 | lr=SGDRegressor.learning_schedule(iter*n_samples+i)
36 |                 # gradient of the squared error for this sample
37 | grad=2*x_sample.T.dot(x_sample.dot(self.w)-y_sample)
38 | if self.penalty is not None:
39 | # Ridge
40 | if self.penalty=='l2':
41 | grad+=self.alpha*self.w
42 | # Lasso
43 | elif self.penalty=='l1':
44 | grad+=self.alpha*np.sign(self.w)
45 | # Elastic Net
46 | elif self.penalty=='l1l2':
47 | grad+=(self.alpha*self.l1_ratio*np.sign(self.w)+
48 | (1-self.l1_ratio)*self.alpha*self.w)
49 |
50 | self.w=self.w-lr*grad
51 |
52 |
53 | def predict(self, X):
54 |
55 | n_samples = X.shape[0]
56 | extra = np.ones((n_samples,1))
57 | X = np.c_[X,extra]
58 | if self.w is None:
59 | raise RuntimeError('cant predict before fit')
60 | y_ = X.dot(self.w)
61 | return y_
62 |
63 | @staticmethod
64 | def learning_schedule(t):
65 | return 5 / (t + 50)
66 |
67 |
68 | if __name__ == '__main__':
69 | X = 2 * np.random.rand(100,1)
70 | y = 4 + 3 * X + np.random.randn(100,1)
71 | y=y.ravel()
72 | print(X.shape)
73 | print(y.shape)
74 | lr = SGDRegressor(max_iter=200,penalty='l1l2',alpha=1e-3,l1_ratio=0.5)
75 | lr.fit(X, y)
76 | print('w:',lr.w)
77 |
78 | sklearn_lr = linear_model.SGDRegressor(max_iter=200,penalty='l1',alpha=1e-3)
79 | sklearn_lr.fit(X, y)
80 | print(sklearn_lr.coef_)
81 | print(sklearn_lr.intercept_)
82 |
83 |
--------------------------------------------------------------------------------
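For reference, each step uses the single-sample gradient of the squared error plus the penalty gradient (rho is `l1_ratio`; the step size comes from the `learning_schedule` above):

$$
\nabla_{w}=2\,x\,(x^{\top}w-y)+\alpha\,\partial R(w),\qquad
\partial R(w)=\begin{cases}w & \text{l2}\\ \operatorname{sign}(w) & \text{l1}\\ \rho\operatorname{sign}(w)+(1-\rho)\,w & \text{l1l2}\end{cases}
$$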
/tinyml/linear_model/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fengyang95/tiny_ml/6a3ee55e4ee4ddc3dad1e53f511725679a99f083/tinyml/linear_model/__init__.py
--------------------------------------------------------------------------------
/tinyml/linear_model/__pycache__/LinearRegression.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fengyang95/tiny_ml/6a3ee55e4ee4ddc3dad1e53f511725679a99f083/tinyml/linear_model/__pycache__/LinearRegression.cpython-37.pyc
--------------------------------------------------------------------------------
/tinyml/linear_model/__pycache__/LogisticRegression.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fengyang95/tiny_ml/6a3ee55e4ee4ddc3dad1e53f511725679a99f083/tinyml/linear_model/__pycache__/LogisticRegression.cpython-37.pyc
--------------------------------------------------------------------------------
/tinyml/linear_model/__pycache__/SGDRegressor.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fengyang95/tiny_ml/6a3ee55e4ee4ddc3dad1e53f511725679a99f083/tinyml/linear_model/__pycache__/SGDRegressor.cpython-37.pyc
--------------------------------------------------------------------------------
/tinyml/linear_model/__pycache__/__init__.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fengyang95/tiny_ml/6a3ee55e4ee4ddc3dad1e53f511725679a99f083/tinyml/linear_model/__pycache__/__init__.cpython-37.pyc
--------------------------------------------------------------------------------
/tinyml/metrices/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fengyang95/tiny_ml/6a3ee55e4ee4ddc3dad1e53f511725679a99f083/tinyml/metrices/__init__.py
--------------------------------------------------------------------------------
/tinyml/metrices/__pycache__/__init__.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fengyang95/tiny_ml/6a3ee55e4ee4ddc3dad1e53f511725679a99f083/tinyml/metrices/__pycache__/__init__.cpython-37.pyc
--------------------------------------------------------------------------------
/tinyml/metrices/__pycache__/curves.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fengyang95/tiny_ml/6a3ee55e4ee4ddc3dad1e53f511725679a99f083/tinyml/metrices/__pycache__/curves.cpython-37.pyc
--------------------------------------------------------------------------------
/tinyml/metrices/curves.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | def precision_recall_curve(y_true,pred_prob):
3 | probs=sorted(list(pred_prob),reverse=True)
4 | Rs=[]
5 | Ps=[]
6 | for i in range(1,len(probs)):
7 | thresh=probs[i]
8 | preds_p=np.where(pred_prob>=thresh)[0]
9 | preds_n=np.where(pred_prob<thresh)[0]
[curves.py lines 10-27 lost in extraction: the TP/FP/FN counts that fill Ps and Rs, plus the head of a second curve function up to its preds_p line]
28 | preds_n = np.where(pred_prob<thresh)[0]
[remainder of curves.py lost in extraction]
--------------------------------------------------------------------------------
/tinyml/svm/SVC.py:
--------------------------------------------------------------------------------
[SVC.py lines 1-47 lost in extraction: imports, the class definition, __init__, the kernel helpers, and the opening of the KKT check; line 48's condition is partly reconstructed]
48 | if self.alpha[i]>0 and y[i]*u[i]>=1:
49 | return False
50 | if (self.alpha[i]==0 or self.alpha[i]==self.C) and y[i]*u[i]==1:
51 | return False
52 | return True
53 |
54 |
55 | def fit(self,X,y):
56 | self.X=X
57 | self.y=y
58 | self.K=self.computeK(X,self.kernel)
59 | self.alpha=np.random.random((X.shape[0],))
60 | self.omiga=np.zeros((X.shape[0],))
61 |
62 | for _ in range(self.max_iter):
63 | u = self.compute_u(X, y)
64 | finish=True
65 | for i in range(X.shape[0]):
66 | if not self.checkKKT(u,y,i):
67 | finish=False
68 | y_indices=np.delete(np.arange(X.shape[0]),i)
69 | j=y_indices[int(np.random.random()*len(y_indices))]  # pick a second index j != i at random
70 | E_i=np.sum(self.alpha*y*self.K[:,i])+self.b-y[i]  # prediction error on sample i
71 | E_j=np.sum(self.alpha*y*self.K[:,j])+self.b-y[j]  # prediction error on sample j
72 | if y[i]!=y[j]:  # box constraints give the feasible segment [L,H] for alpha_j
73 | L=max(0,self.alpha[j]-self.alpha[i])
74 | H=min(self.C,self.C+self.alpha[j]-self.alpha[i])
75 | else:
76 | L=max(0,self.alpha[j]+self.alpha[i]-self.C)
77 | H=min(self.C,self.alpha[j]+self.alpha[i])
78 | eta=self.K[i,i]+self.K[j,j]-2*self.K[i,j]  # curvature of the two-variable subproblem
79 | alpha2_new_unc=self.alpha[j]+y[j]*(E_i-E_j)/eta  # unclipped optimum for alpha_j
80 | alpha2_old=self.alpha[j]
81 | alpha1_old=self.alpha[i]
82 | if alpha2_new_unc>H:
83 | self.alpha[j]=H
84 | elif alpha2_new_unc<L:
[SVC.py lines 85-105 lost in extraction: clipping alpha_j into [L,H], the paired update of alpha_i, the b1/b2 threshold update (b taken from whichever of 0<self.alpha[i]<self.C or 0<self.alpha[j]<self.C holds), the end of fit, and the opening of predict]
106 | support_indices=np.where(self.alpha>0)[0]
107 | for j in support_indices:
108 | K[j]=self.kernel_func(self.kernel,self.X[j],X[i])  # kernel value between support vector j and the query point
109 | y_pred=np.sum(self.y[support_indices]*self.alpha[support_indices]*K[support_indices].T)  # sum_j alpha_j*y_j*K(x_j,x)
110 | y_pred+=self.b
111 | y_preds.append(y_pred)
112 | return np.array(y_preds)
113 |
114 |
115 | if __name__=='__main__':
116 |
117 | # test with the linear kernel
118 | X = np.array([[2, -1], [3, -2], [1, 0], [0,1],[-2,1],[-1.3,0.3],[-0.2,-0.8],[2.3,-3.3],[-2,-4],[7,8]])
119 | y = np.array([1, 1, 1, 1,-1,-1,-1,-1,-1,1])
120 | svc=SVC(max_iter=100,kernel='linear',C=1)
121 |
122 | """
123 | # test with the RBF kernel
124 | X=np.array([[1,0],[-1,0],[0,-1],[0,1],[2,np.sqrt(5)],[2,-np.sqrt(5)],[-2,np.sqrt(5)],[-2,-np.sqrt(5)],[300,400]])
125 | y=np.array([-1,-1,-1,-1,1,1,1,1,1])
126 | svc=SVC(max_iter=100,kernel='rbf',C=1)
127 | """
128 | svc.fit(X,y)
129 | print('alpha:',svc.alpha)
130 | print('b:',svc.b)
131 | pred_y=svc.predict(np.array([[1,0],[-0.2,-0.1],[0,1]]))
132 | print('pred_y1:',pred_y)
133 | pred_y=np.sign(pred_y)
134 | print('pred_y:',pred_y)
135 |
136 |
137 |
138 |
139 |
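
Since the opening of checkKKT was lost above, for reference: SMO scans for a multiplier alpha_i that violates the KKT conditions, pairs it with a randomly chosen alpha_j, solves the two-variable subproblem in closed form, and clips the result to [L, H]. A textbook version of the KKT test (Li Hang, "Statistical Learning Methods", eqs. 7.111-7.113) is sketched below with an explicit tolerance; it is a reference implementation, not the author's exact code.

def satisfies_kkt(alpha_i, y_i, u_i, C, eps=1e-3):
    # u_i is the decision value g(x_i); r = y_i * g(x_i)
    r = y_i * u_i
    if alpha_i < eps:            # alpha_i == 0      requires y_i*g(x_i) >= 1
        return r >= 1 - eps
    if alpha_i > C - eps:        # alpha_i == C      requires y_i*g(x_i) <= 1
        return r <= 1 + eps
    return abs(r - 1) <= eps     # 0 < alpha_i < C   requires y_i*g(x_i) == 1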
--------------------------------------------------------------------------------
/tinyml/svm/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fengyang95/tiny_ml/6a3ee55e4ee4ddc3dad1e53f511725679a99f083/tinyml/svm/__init__.py
--------------------------------------------------------------------------------
/tinyml/svm/__pycache__/SVC.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fengyang95/tiny_ml/6a3ee55e4ee4ddc3dad1e53f511725679a99f083/tinyml/svm/__pycache__/SVC.cpython-37.pyc
--------------------------------------------------------------------------------
/tinyml/svm/__pycache__/__init__.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fengyang95/tiny_ml/6a3ee55e4ee4ddc3dad1e53f511725679a99f083/tinyml/svm/__pycache__/__init__.cpython-37.pyc
--------------------------------------------------------------------------------
/tinyml/tree/DecisionTreeClassifier.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | """
3 | A simple decision tree implementation, following Zhou Zhihua's "Machine Learning".
4 | Handles discrete attributes only; missing values are not supported.
5 | """
6 | from tinyml.tree.treePlotter import createPlot
7 | np.random.seed(100)
8 | class DecisionTreeClassifier:
9 | """
10 | Decision tree classifier.
11 | """
12 | def __init__(self,tree_type='ID3',k_classes=2):
13 | self.tree_type=tree_type
14 | self.k_classes=k_classes
15 | if tree_type=='ID3':
16 | self.gain_func=self.Gain
17 | elif tree_type=='CART':
18 | self.gain_func=self.GiniIndex
19 | elif tree_type=='C45':
20 | self.gain_func=self.GainRatio
21 | else:
22 | raise ValueError('must be ID3 or CART or C45')
23 | self.tree=None
24 |
25 | def fit(self,X,y):
26 | D={}
27 | D['X']=X
28 | D['y']=y
29 | A=np.arange(X.shape[1])
30 | aVs={}
31 | for a in A:
32 | aVs[a]=np.unique(X[:,a])
33 | self.tree=self.TreeGenerate(D,A,aVs)
34 |
35 | def predict(self,X):
36 | if self.tree is None:
37 | raise RuntimeError('cant predict before fit')
38 | y_pred=[]
39 | for i in range(X.shape[0]):
40 | tree = self.tree
41 | x=X[i]
42 | while True:
43 | if not isinstance(tree,dict):
44 | y_pred.append(tree)
45 | break
46 | a=list(tree.keys())[0]
47 | tree=tree[a]
48 | if isinstance(tree,dict):
49 | val = x[a]
50 | tree = tree[val]
51 | else:
52 | y_pred.append(tree)
53 | break
54 | return np.array(y_pred)
55 |
56 |
57 | # the TreeGenerate algorithm of Fig. 4.2, p.74 of "Machine Learning"
58 | def TreeGenerate(self,D,A,aVs):
59 | X=D['X']
60 | y=D['y']
61 | # case 1: all samples belong to the same class; return that class
62 | unique_classes=np.unique(y)
63 | if len(unique_classes)==1:
64 | return unique_classes[0]
65 | flag=True
66 | for a in A:
67 | if(len(np.unique(X[:,a]))>1):
68 | flag=False
69 | break
70 | # case 2: samples take a single value on every attribute in A; return the majority class
71 | if flag:
72 | return np.argmax(np.bincount(y))
73 |
74 | gains=np.zeros((len(A),))
75 | if self.tree_type=='C45':
76 | gains=np.zeros((len(A),2))
77 | for i in range(len(A)):
78 | gains[i]=self.gain_func(D,A[i])
79 | #print(gains)
80 | subA=None
81 | if self.tree_type=='CART':
82 | a_best=A[np.argmin(gains)]
83 | subA=np.delete(A,np.argmin(gains))
84 | elif self.tree_type=='ID3':
85 | a_best=A[np.argmax(gains)]
86 | subA=np.delete(A,np.argmax(gains))
87 | elif self.tree_type=='C45':  # C4.5 heuristic: among attributes with above-average gain, pick the highest gain ratio
88 | gain_mean=np.mean(gains[:,0])
89 | higher_than_mean_indices=np.where(gains[:,0]>=gain_mean)
90 | higher_than_mean=gains[higher_than_mean_indices,1][0]
91 | index=higher_than_mean_indices[0][np.argmax(higher_than_mean)]
92 | a_best=A[index]
93 | subA=np.delete(A,index)
94 |
95 | tree={a_best:{}}
96 |
97 | for av in aVs[a_best]:
98 | indices=np.where(X[:,a_best]==av)
99 | Dv={}
100 | Dv['X']=X[indices]
101 | Dv['y']=y[indices]
102 | if len(Dv['y'])==0:
103 | tree[a_best][av]=np.argmax(np.bincount(y))
104 | else:
105 | tree[a_best][av]=self.TreeGenerate(Dv,subA,aVs)
106 | return tree
107 |
108 |
109 |
110 | @classmethod
111 | def Ent(cls,D):
112 | """
113 | Information entropy, Eq. 4.1 in "Machine Learning".
114 | :param D: dataset dict with keys 'X' and 'y'
115 | :return: information entropy of the labels
116 | """
117 | y=D['y']
118 | bin_count=np.bincount(y)
119 | total=len(y)
120 | ent=0.
121 | for k in range(len(bin_count)):
122 | p_k=bin_count[k]/total
123 | if p_k!=0:
124 | ent+=p_k*np.log2(p_k)
125 | return -ent
126 |
127 | @classmethod
128 | def Gain(cls,D,a):
129 | """
130 | Information gain, Eq. 4.2 in "Machine Learning".
131 | a is the attribute (column) index.
132 | """
133 | X=D['X']
134 | y=D['y']
135 | aV=np.unique(X[:,a])
136 | sum=0.
137 | for v in range(len(aV)):
138 | Dv={}
139 | indices=np.where(X[:,a]==aV[v])
140 | Dv['X']=X[indices]
141 | Dv['y']=y[indices]
142 | ent=cls.Ent(Dv)
143 | sum+=(len(Dv['y'])/len(y)*ent)
144 | gain=cls.Ent(D)-sum
145 | return gain
146 |
147 | @classmethod
148 | def Gini(cls,D):
149 | """
150 | Gini value of D, Eq. 4.5 in "Machine Learning".
151 | """
152 | y = D['y']
153 | bin_count = np.bincount(y)
154 | total = len(y)
155 | ent = 0.
156 | for k in range(len(bin_count)):
157 | p_k = bin_count[k] / total
158 | ent+=p_k**2
159 | return 1-ent
160 |
161 | @classmethod
162 | def GiniIndex(cls,D,a):
163 | """
164 | Gini index of attribute a, Eq. 4.6.
165 | """
166 | X = D['X']
167 | y = D['y']
168 | aV = np.unique(X[:, a])
169 | sum = 0.
170 | for v in range(len(aV)):
171 | Dv = {}
172 | indices = np.where(X[:, a] == aV[v])
173 | Dv['X'] = X[indices]
174 | Dv['y'] = y[indices]
175 | ent = cls.Gini(Dv)
176 | sum += (len(Dv['y']) / len(y) * ent)
177 | gain = sum
178 | return gain
179 |
180 | @classmethod
181 | def GainRatio(cls,D,a):
182 | """
183 | Gain ratio, Eqs. 4.3 and 4.4; returns np.array([gain, gain_ratio]).
184 | """
185 | X = D['X']
186 | y = D['y']
187 | aV = np.unique(X[:, a])
188 | sum = 0.
189 | intrinsic_value=0.
190 | for v in range(len(aV)):
191 | Dv = {}
192 | indices = np.where(X[:, a] == aV[v])
193 | Dv['X'] = X[indices]
194 | Dv['y'] = y[indices]
195 | ent = cls.Ent(Dv)
196 | sum += (len(Dv['y']) / len(y) * ent)
197 | intrinsic_value+=(len(Dv['y'])/len(y))*np.log2(len(Dv['y'])/len(y))
198 | gain = cls.Ent(D) - sum
199 | intrinsic_value=-intrinsic_value
200 | gain_ratio=gain/intrinsic_value
201 | return np.array([gain,gain_ratio])
202 |
203 | if __name__=='__main__':
204 | watermelon_data = np.array([[0, 0, 0, 0, 0, 0], [1, 0, 1, 0, 0, 0],
205 | [1, 0, 0, 0, 0, 0], [0, 0, 1, 0, 0, 0],
206 | [2, 0, 0, 0, 0, 0], [0, 1, 0, 0, 1, 1],
207 | [1, 1, 0, 1, 1, 1], [1, 1, 0, 0, 1, 0],
208 | [1, 1, 1, 1, 1, 0], [0, 2, 2, 0, 2, 1],
209 | [2, 2, 2, 2, 2, 0], [2, 0, 0, 2, 2, 1],
210 | [0, 1, 0, 1, 0, 0], [2, 1, 1, 1, 0, 0],
211 | [1, 1, 0, 0, 1, 1], [2, 0, 0, 2, 2, 0],
212 | [0, 0, 1, 1, 1, 0]])
213 | label = np.array([1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0])
214 | X_test=np.array([[0, 0, 1, 0, 0, 0], [1, 0, 1, 0, 0, 0],
215 | [1, 1, 0, 1, 1, 0], [1, 0, 1, 1, 1, 0],
216 | [1, 1, 0, 0, 1, 1], [2, 0, 0, 2, 2, 0],
217 | [0, 0, 1, 1, 1, 0]])
218 |
219 | decision_clf=DecisionTreeClassifier(tree_type='ID3')
220 | decision_clf.fit(watermelon_data,label)
221 | print(decision_clf.tree)
222 | createPlot(decision_clf.tree)
223 |
224 | y_pred=decision_clf.predict(X_test)
225 | print('y_pred:',y_pred)
226 |
227 |
228 |
229 |
230 |
231 |
232 |
233 |
234 |
235 |
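
A quick sanity check on Ent() against the watermelon data above: the labels split 8 positive / 9 negative, so Ent(D) = -(8/17)log2(8/17) - (9/17)log2(9/17), roughly 0.998, matching the worked example in "Machine Learning".

import numpy as np

p = np.array([8/17, 9/17])        # class proportions over the 17 samples
print(-(p * np.log2(p)).sum())    # -> 0.9975...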
--------------------------------------------------------------------------------
/tinyml/tree/DecisionTreeRegressor.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from tinyml.tree import treePlotter
3 | import sklearn.datasets as datasets
4 | from sklearn.metrics import mean_squared_error
5 | import sklearn.tree as tree
6 | import graphviz
7 |
8 | class DecisionTreeRegressor:
9 | """
10 | Least-squares regression tree, p.69 of Li Hang's "Statistical Learning Methods".
11 | """
12 | def __init__(self, min_samples_split=3,min_samples_leaf=1,random_state=False):
13 | self.min_samples_split=min_samples_split
14 | self.min_samples_leaf=min_samples_leaf
15 | self.random=random_state
16 | self.tree = None
17 |
18 | def fit(self, X, y):
19 | D = {}
20 | D['X'] = X
21 | D['y'] = y
22 | A = np.arange(X.shape[1])
23 | self.tree = self.TreeGenerate(D, A)
24 |
25 | def predict(self, X):
26 | if self.tree is None:
27 | raise RuntimeError('cant predict before fit')
28 | y_pred = []
29 | for i in range(X.shape[0]):
30 | tree = self.tree
31 | x = X[i]
32 | while True:
33 | if not isinstance(tree, dict):
34 | y_pred.append(tree)
35 | break
36 | a = list(tree.keys())[0]
37 | tree = tree[a]
38 | if isinstance(tree, dict):
39 | val = x[a]
40 | split_val=float(list(tree.keys())[0][1:])
41 | if val<=split_val:
42 | tree=tree[list(tree.keys())[0]]
43 | else:
44 | tree=tree[list(tree.keys())[1]]
45 | else:
46 | y_pred.append(tree)
47 | break
48 | return np.array(y_pred)
49 |
50 | def TreeGenerate(self, D, A):
51 | X = D['X']
52 | y = D['y']
53 | if len(y)<=self.min_samples_split:
54 | return np.mean(y)
55 | split_j=None
56 | split_s=None
57 | min_val=1.e10
58 | select_A=A
59 | if self.random is True:
60 | d=len(A)
61 | select_A=np.random.choice(A,size=int(d//2),replace=False)
62 | for j in select_A:
63 | for s in np.unique(X[:,j]):
64 | left_indices=np.where(X[:,j]<=s)[0]
65 | right_indices=np.where(X[:,j]>s)[0]
66 | if len(left_indices)<self.min_samples_leaf or len(right_indices)<self.min_samples_leaf:
[DecisionTreeRegressor.py lines 67-77 lost in extraction: scoring each candidate split (j,s) by its total squared error, tracking the best split_j/split_s, then building tree={split_j:{}} and recomputing left_indices; line 66's condition is partly reconstructed]
78 | right_indices=np.where(X[:,split_j]>split_s)[0]
79 | D_left, D_right = {},{}
80 | D_left['X'],D_left['y'] = X[left_indices],y[left_indices]
81 | D_right['X'],D_right['y']=X[right_indices],y[right_indices]
82 | tree[split_j]['l'+str(split_s)]=self.TreeGenerate(D_left,A)
83 | tree[split_j]['r'+str(split_s)]=self.TreeGenerate(D_right,A)
84 | # value stored at the current node (mean of the y values reaching it)
85 | tree[split_j]['val']=np.mean(y)
86 | return tree
87 |
88 |
89 | if __name__=='__main__':
90 | boston_data = datasets.load_boston()  # removed in scikit-learn 1.2; see the note after this file
91 | X, y = boston_data.data, boston_data.target
92 | X_train, y_train = X[:200], y[:200]
93 | X_test, y_test = X[200:], y[200:]
94 |
95 |
96 | decisiontree_reg=DecisionTreeRegressor(min_samples_split=20,min_samples_leaf=5)
97 | decisiontree_reg.fit(X_train,y_train)
98 | print(decisiontree_reg.tree)
99 | treePlotter.createPlot(decisiontree_reg.tree)
100 | y_pred=decisiontree_reg.predict(X_test)
101 | print('tinyml mse:',mean_squared_error(y_test,y_pred))
102 |
103 |
104 | sklearn_reg=tree.DecisionTreeRegressor(min_samples_split=20,min_samples_leaf=5,random_state=False)
105 | sklearn_reg.fit(X_train,y_train)
106 | print(sklearn_reg.feature_importances_)
107 | sklearn_pred=sklearn_reg.predict(X_test)
108 | print('sklearn mse:',mean_squared_error(y_test,sklearn_pred))
109 | dot_data=tree.export_graphviz(sklearn_reg,out_file=None)
110 | graph=graphviz.Source(dot_data)  # not rendered here; call graph.render() to write the tree to a file
111 |
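
A note on the demo above: datasets.load_boston() was deprecated in scikit-learn 1.0 and removed in 1.2. On newer versions, the replacement suggested in scikit-learn's own deprecation notice (fetching the original source; network access assumed) is a drop-in substitute for the first two lines of __main__:

import numpy as np
import pandas as pd

# fetch the Boston housing data directly from the original source
data_url = "http://lib.stat.cmu.edu/datasets/boston"
raw_df = pd.read_csv(data_url, sep=r"\s+", skiprows=22, header=None)
X = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
y = raw_df.values[1::2, 2]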
--------------------------------------------------------------------------------
/tinyml/tree/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fengyang95/tiny_ml/6a3ee55e4ee4ddc3dad1e53f511725679a99f083/tinyml/tree/__init__.py
--------------------------------------------------------------------------------
/tinyml/tree/__pycache__/DecisionTreeClassifier.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fengyang95/tiny_ml/6a3ee55e4ee4ddc3dad1e53f511725679a99f083/tinyml/tree/__pycache__/DecisionTreeClassifier.cpython-37.pyc
--------------------------------------------------------------------------------
/tinyml/tree/__pycache__/DecisionTreeRegressor.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fengyang95/tiny_ml/6a3ee55e4ee4ddc3dad1e53f511725679a99f083/tinyml/tree/__pycache__/DecisionTreeRegressor.cpython-36.pyc
--------------------------------------------------------------------------------
/tinyml/tree/__pycache__/DecisionTreeRegressor.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fengyang95/tiny_ml/6a3ee55e4ee4ddc3dad1e53f511725679a99f083/tinyml/tree/__pycache__/DecisionTreeRegressor.cpython-37.pyc
--------------------------------------------------------------------------------
/tinyml/tree/__pycache__/__init__.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fengyang95/tiny_ml/6a3ee55e4ee4ddc3dad1e53f511725679a99f083/tinyml/tree/__pycache__/__init__.cpython-36.pyc
--------------------------------------------------------------------------------
/tinyml/tree/__pycache__/__init__.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fengyang95/tiny_ml/6a3ee55e4ee4ddc3dad1e53f511725679a99f083/tinyml/tree/__pycache__/__init__.cpython-37.pyc
--------------------------------------------------------------------------------
/tinyml/tree/__pycache__/treePlotter.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fengyang95/tiny_ml/6a3ee55e4ee4ddc3dad1e53f511725679a99f083/tinyml/tree/__pycache__/treePlotter.cpython-36.pyc
--------------------------------------------------------------------------------
/tinyml/tree/__pycache__/treePlotter.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fengyang95/tiny_ml/6a3ee55e4ee4ddc3dad1e53f511725679a99f083/tinyml/tree/__pycache__/treePlotter.cpython-37.pyc
--------------------------------------------------------------------------------
/tinyml/tree/treePlotter.py:
--------------------------------------------------------------------------------
1 | import matplotlib.pyplot as plt
2 | from pylab import mpl
3 |
4 | """
5 | Decision-tree visualization code from "Machine Learning in Action".
6 | """
7 |
8 | mpl.rcParams['font.sans-serif'] = ['FangSong'] # default font, so CJK node labels render correctly
9 | mpl.rcParams['axes.unicode_minus'] = False # keep minus signs from rendering as boxes in saved figures
10 | decisionNode = dict(boxstyle="sawtooth", fc="0.8")
11 | leafNode = dict(boxstyle="round4", fc="0.8")
12 | arrow_args = dict(arrowstyle="<-")
13 |
14 | def plotNode(nodeTxt, centerPt, parentPt, nodeType):
15 | createPlot.ax1.annotate(nodeTxt, xy=parentPt, xycoords='axes fraction', \
16 | xytext=centerPt, textcoords='axes fraction', \
17 | va="center", ha="center", bbox=nodeType, arrowprops=arrow_args)
18 |
19 | def getNumLeafs(myTree):
20 | numLeafs = 0
21 | firstStr = list(myTree.keys())[0]
22 | secondDict = myTree[firstStr]
23 | for key in secondDict.keys():
24 | if type(secondDict[key]).__name__ == 'dict':
25 | numLeafs += getNumLeafs(secondDict[key])
26 | else:
27 | numLeafs += 1
28 | return numLeafs
29 |
30 | def getTreeDepth(myTree):
31 | maxDepth = 0
32 | firstStr = list(myTree.keys())[0]
33 | secondDict = myTree[firstStr]
34 | for key in secondDict.keys():
35 | if type(secondDict[key]).__name__ == 'dict':
36 | thisDepth = getTreeDepth(secondDict[key]) + 1
37 | else:
38 | thisDepth = 1
39 | if thisDepth > maxDepth:
40 | maxDepth = thisDepth
41 | return maxDepth
42 |
43 | def plotMidText(cntrPt, parentPt, txtString):
44 | xMid = (parentPt[0] - cntrPt[0]) / 2.0 + cntrPt[0]
45 | yMid = (parentPt[1] - cntrPt[1]) / 2.0 + cntrPt[1]
46 | createPlot.ax1.text(xMid, yMid, txtString)
47 |
48 | def plotTree(myTree, parentPt, nodeTxt):
49 | numLeafs = getNumLeafs(myTree)
50 | depth = getTreeDepth(myTree)
51 | firstStr = list(myTree.keys())[0]
52 | cntrPt = (plotTree.xOff + (1.0 + float(numLeafs)) / 2.0 / plotTree.totalw, plotTree.yOff)
53 | plotMidText(cntrPt, parentPt, nodeTxt)
54 | plotNode(firstStr, cntrPt, parentPt, decisionNode)
55 | secondDict = myTree[firstStr]
56 | plotTree.yOff = plotTree.yOff - 1.0 / plotTree.totalD
57 | for key in secondDict.keys():
58 | if type(secondDict[key]).__name__ == 'dict':
59 | plotTree(secondDict[key], cntrPt, str(key))
60 | else:
61 | plotTree.xOff = plotTree.xOff + 1.0 / plotTree.totalw
62 | plotNode(secondDict[key], (plotTree.xOff, plotTree.yOff), cntrPt, leafNode)
63 | plotMidText((plotTree.xOff, plotTree.yOff), cntrPt, str(key))
64 | plotTree.yOff = plotTree.yOff + 1.0 / plotTree.totalD
65 |
66 | def createPlot(inTree):
67 | fig = plt.figure(1, facecolor='white')
68 | fig.clf()
69 | axprops = dict(xticks=[], yticks=[])
70 | createPlot.ax1 = plt.subplot(111, frameon=False, **axprops)
71 | plotTree.totalw = float(getNumLeafs(inTree))
72 | plotTree.totalD = float(getTreeDepth(inTree))
73 | plotTree.xOff = -0.5 / plotTree.totalw
74 | plotTree.yOff = 1.0
75 | plotTree(inTree, (0.5, 1.0), '')
76 | plt.show()
77 |
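
The helpers above expect trees in the nested-dict form produced by the tree classes in this package, i.e. {attribute: {attribute_value: subtree-or-leaf-label}}. A minimal self-contained usage sketch (the demo tree here is made up):

from tinyml.tree.treePlotter import createPlot

demo_tree = {'texture': {'blurry': 'no',
                         'clear': {'root': {'curled': 'yes', 'straight': 'no'}}}}
createPlot(demo_tree)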
--------------------------------------------------------------------------------