z_alpha_upper else zorder)
153 | plt.fill_between(x, 0, y, where=x>=z_upper, facecolor="red", alpha=0.5, zorder=zorder+1 if z_upper > z_alpha_upper else zorder)
154 |
155 | plt.fill_between(x, 0, y, where=x<=z_alpha_lower, facecolor="green", alpha=1, zorder=zorder if z_upper > z_alpha_upper else zorder+1)
156 | plt.fill_between(x, 0, y, where=x>=z_alpha_upper, facecolor="green", alpha=1, zorder=zorder if z_upper > z_alpha_upper else zorder+1)
157 |
158 | zorder += 2
159 | plt.axvline(z_alpha_lower, color="g", linestyle="--", zorder=zorder)
160 | plt.axvline(z_alpha_upper, color="g", linestyle="--", zorder=zorder)
161 | plt.axvline(z_lower, color="r", linestyle="--", zorder=zorder)
162 | plt.axvline(z_upper, color="r", linestyle="--", zorder=zorder)
163 |
164 | zorder += 1
165 | plt.annotate("$\\alpha$", fontsize=14, xy=(z_alpha_lower, 0.004), xycoords="data", xytext=(-0.1, 0.2),
166 | arrowprops=dict(arrowstyle="->", connectionstyle="arc3"), zorder=zorder)
167 | plt.annotate("", fontsize=14, xy=(z_alpha_upper, 0.004), xycoords="data",
168 | xytext=(0.1, 0.19), arrowprops=dict(arrowstyle="->", connectionstyle="arc3"), zorder=zorder)
169 |
170 | plt.annotate("p-value", fontsize=14, xy=(z_lower, 0.004), xycoords="data", xytext=(-0.7, 0.1),
171 | arrowprops=dict(arrowstyle="->", connectionstyle="arc3"), zorder=zorder)
172 | plt.annotate("", fontsize=14, xy=(z_upper, 0.004), xycoords="data",
173 | xytext=(0.1, 0.09), arrowprops=dict(arrowstyle="->", connectionstyle="arc3"), zorder=zorder)
174 |
175 | plt.annotate("$z$", fontsize=14, xy=(z, 0), xycoords="data",
176 | xytext=(z, -0.08), zorder=zorder)
177 |
178 | plt.annotate("$z_{\\alpha}$", fontsize=14, xy=(z_alpha_upper if z > 0 else z_alpha_lower, 0), xycoords="data",
179 | xytext=(z_alpha_upper if z > 0 else z_alpha_lower, -0.08), zorder=zorder)
180 |
181 | zorder += 1
182 | plt.annotate("$\\alpha={:0.2f}$".format(alpha), (0.67,0.9), fontsize=14, xycoords="axes fraction", zorder=zorder)
183 | plt.annotate("p-value=${:0.4f}$".format(pvalue), (0.67,0.8), fontsize=14, xycoords="axes fraction", zorder=zorder)
184 | plt.annotate("$z={:0.2f}$".format(z), (0.67,0.7), fontsize=14, xycoords="axes fraction", zorder=zorder)
185 | plt.annotate("$z_{\\alpha}=%0.2f$"%(z_alpha_upper if z > 0 else z_alpha_lower), (0.67,0.6), fontsize=14, xycoords="axes fraction", zorder=zorder)
186 |
187 | plt.grid(True)
188 |
189 |
def plot_two_tailed_pvalue_for_norm(x_bar, mu=0, se=1, alpha=0.05, xlim=(-4,4)):
    """Plot a two-tailed hypothesis test on a normal sampling distribution.

    Draws the N(mu, se) density, shades the rejection region for the given
    significance level (green) and the p-value area for the observed sample
    mean (red), then annotates mu, x_bar and the critical value.

    Parameters:
        x_bar: observed sample mean.
        mu: hypothesized population mean under H0 (default 0).
        se: standard error of the mean (default 1).
        alpha: significance level (default 0.05).
        xlim: x-axis limits as a (min, max) tuple.
    """
    x = np.linspace(xlim[0], xlim[1], 1000)
    y = stats.norm.pdf(x, loc=mu, scale=se)

    # Two-tailed p-value of the observed mean and the critical z-score
    pvalue = get_pvalue_for_two_tails_norm(get_z(x_bar, mu, se))
    z_alpha = abs(get_z_by_alpha_for_two_tailed(alpha))

    # Mirror x_bar around mu so both tails are shaded symmetrically
    upper_bound = x_bar if x_bar > mu else 2*mu - x_bar
    lower_bound = 2*mu - x_bar if x_bar > mu else x_bar

    # Critical values expressed on the original x scale
    x_alpha_upper = mu + z_alpha*se
    x_alpha_lower = mu - z_alpha*se

    # Fixed invalid escape sequence "\m" (runtime string is unchanged)
    plt.title("Normal Distribution with $\\mu$ and $SE$")

    zorder = 1
    plt.plot(x, y, lw=2, color="green", zorder=zorder)
    plt.xlim(xlim)

    # Draw the smaller of the two shaded areas on top so both stay visible
    zorder += 1
    plt.fill_between(x, 0, y, where=x<=lower_bound, facecolor="red", alpha=0.5, zorder=zorder+1 if alpha > pvalue else zorder)
    plt.fill_between(x, 0, y, where=x>=upper_bound, facecolor="red", alpha=0.5, zorder=zorder+1 if alpha > pvalue else zorder)

    plt.fill_between(x, 0, y, where=x<=x_alpha_lower, facecolor="green", alpha=1, zorder=zorder if alpha > pvalue else zorder+1)
    plt.fill_between(x, 0, y, where=x>=x_alpha_upper, facecolor="green", alpha=1, zorder=zorder if alpha > pvalue else zorder+1)

    # Dashed guide lines at the critical values (green) and the bounds (red)
    zorder += 2
    plt.axvline(x_alpha_upper, color="g", linestyle="--", zorder=zorder)
    plt.axvline(x_alpha_lower, color="g", linestyle="--", zorder=zorder)

    plt.axvline(lower_bound, color="r", linestyle="--", zorder=zorder)
    plt.axvline(upper_bound, color="r", linestyle="--", zorder=zorder)

    zorder += 1

    # TODO: add p-value and alpha arrow annotations (see the z-score variant)

    plt.annotate("$\\mu$", fontsize=14, xy=(mu, 0), xycoords="data",
                 xytext=(mu, 0), zorder=zorder)

    plt.annotate("$\\bar{x}$", fontsize=14, xy=(x_bar, 0), xycoords="data",
                 xytext=(x_bar, 0), zorder=zorder)

    plt.annotate("$\\bar{x}_{\\alpha}$", fontsize=14, xy=(x_alpha_upper if x_bar > mu else x_alpha_lower, 0), xycoords="data",
                 xytext=(x_alpha_upper if x_bar > mu else x_alpha_lower, 0), zorder=zorder)

    # Numeric summary in the upper-right corner of the axes
    zorder += 1
    plt.annotate("$\\alpha={:0.2f}$".format(alpha), (0.67,0.9), fontsize=14, xycoords="axes fraction", zorder=zorder)
    plt.annotate("p-value=${:0.4f}$".format(pvalue), (0.67,0.8), fontsize=14, xycoords="axes fraction", zorder=zorder)
    plt.annotate("$\\bar{x}=%0.2f$" % (x_bar), (0.67,0.7), fontsize=14, xycoords="axes fraction", zorder=zorder)
    plt.annotate("$\\bar{x}_{\\alpha}=%0.2f$" % (x_alpha_upper if x_bar > mu else x_alpha_lower), (0.67,0.6), fontsize=14, xycoords="axes fraction", zorder=zorder)

    plt.grid(True)
253 |
254 |
def plot_two_tailed_pvalue_for_tdistribution(t, df, alpha=0.05, xlim=(-4,4)):
    """Plot a two-tailed hypothesis test on Student's t-distribution.

    Draws the t density with `df` degrees of freedom, shades the rejection
    region (green) and the p-value area (red), and annotates t and t_alpha.

    Parameters:
        t: observed t-statistic.
        df: degrees of freedom.
        alpha: significance level (default 0.05).
        xlim: x-axis limits as a (min, max) tuple.
    """
    # Critical t-values for the chosen significance level
    t_alpha_upper = get_t_by_alpha_for_two_tailed(alpha, df)
    t_alpha_lower = -t_alpha_upper

    # Observed statistic mirrored into both tails
    t_upper = abs(t)
    t_lower = -t_upper

    x = np.linspace(xlim[0], xlim[1], 1000)
    # BUG FIX: plot the t-distribution density; the original used
    # stats.norm.pdf, so the curve did not match the p-value/critical values.
    y = stats.t.pdf(x, df)
    pvalue = get_pvalue_for_two_tails_tdistribtion(t, df)

    plt.title("t-Distribution")

    zorder = 1
    plt.plot(x, y, lw=2, color="green", zorder=zorder)
    plt.xlim(xlim)

    # Draw the smaller of the two shaded areas on top so both stay visible
    zorder += 1
    plt.fill_between(x, 0, y, where=x<=t_lower, facecolor="red", alpha=0.5, zorder=zorder+1 if t_upper > t_alpha_upper else zorder)
    plt.fill_between(x, 0, y, where=x>=t_upper, facecolor="red", alpha=0.5, zorder=zorder+1 if t_upper > t_alpha_upper else zorder)

    plt.fill_between(x, 0, y, where=x<=t_alpha_lower, facecolor="green", alpha=1, zorder=zorder if t_upper > t_alpha_upper else zorder+1)
    plt.fill_between(x, 0, y, where=x>=t_alpha_upper, facecolor="green", alpha=1, zorder=zorder if t_upper > t_alpha_upper else zorder+1)

    # Dashed guide lines at the critical values (green) and the statistic (red)
    zorder += 2
    plt.axvline(t_alpha_lower, color="g", linestyle="--", zorder=zorder)
    plt.axvline(t_alpha_upper, color="g", linestyle="--", zorder=zorder)
    plt.axvline(t_lower, color="r", linestyle="--", zorder=zorder)
    plt.axvline(t_upper, color="r", linestyle="--", zorder=zorder)

    zorder += 1
    plt.annotate("$t$", fontsize=14, xy=(t, 0), xycoords="data",
                 xytext=(t, -0.08), zorder=zorder)

    plt.annotate("$t_{\\alpha}$", fontsize=14, xy=(t_alpha_upper if t > 0 else t_alpha_lower, 0), xycoords="data",
                 xytext=(t_alpha_upper if t > 0 else t_alpha_lower, -0.08), zorder=zorder)

    # Numeric summary in the upper-right corner of the axes
    zorder += 1
    plt.annotate("$\\alpha={:0.2f}$".format(alpha), (0.67,0.9), fontsize=14, xycoords="axes fraction", zorder=zorder)
    plt.annotate("p-value=${:0.4f}$".format(pvalue), (0.67,0.8), fontsize=14, xycoords="axes fraction", zorder=zorder)
    plt.annotate("$t={:0.2f}$".format(t), (0.67,0.7), fontsize=14, xycoords="axes fraction", zorder=zorder)
    plt.annotate("$t_{\\alpha}=%0.2f$"%(t_alpha_upper if t > 0 else t_alpha_lower), (0.67,0.6), fontsize=14, xycoords="axes fraction", zorder=zorder)

    plt.grid(True)
300 |
if __name__ == "__main__":
    # Module is intended for import only; no CLI behavior.
    pass
303 |
--------------------------------------------------------------------------------
/lib/plot_utils.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import matplotlib.pyplot as plt
3 | from matplotlib import cm
4 |
5 |
class CPlot:
    """Classification plot class with static methods.

    All methods draw with matplotlib's implicit pyplot state and expect a
    2-feature design matrix X of shape (n_samples, 2).
    """

    @staticmethod
    def _padded_bounds(*arrays):
        """Return (x1_min, x1_max, x2_min, x2_max) over all arrays, padded by 10%.

        Fixes the copy-paste residue in show_prediction_plot where the same
        array appeared twice inside np.min/np.max.
        """
        x1_min = np.min([X[:, 0].min() for X in arrays])
        x1_max = np.max([X[:, 0].max() for X in arrays])
        x2_min = np.min([X[:, 1].min() for X in arrays])
        x2_max = np.max([X[:, 1].max() for X in arrays])

        # Pad outward by 10% of the magnitude so points are not on the border
        x1_min = x1_min - (0.1*np.abs(x1_min))
        x1_max = x1_max + (0.1*np.abs(x1_max))
        x2_min = x2_min - (0.1*np.abs(x2_min))
        x2_max = x2_max + (0.1*np.abs(x2_max))
        return x1_min, x1_max, x2_min, x2_max

    @staticmethod
    def _decision_surface(model, x1_min, x1_max, x2_min, x2_max, cmap, proba, step=0.01):
        """Compute a grid and the model output Z over it.

        Returns (xx, yy, Z, cmap); cmap switches to cm.bwr when a binary
        probability/decision surface is requested and the model supports it.
        """
        xx, yy = np.meshgrid(np.arange(x1_min, x1_max, step),
                             np.arange(x2_min, x2_max, step))
        points = np.c_[xx.ravel(), yy.ravel()]

        if proba is True and hasattr(model, "predict_proba") and len(model.classes_) == 2:
            cmap = cm.bwr
            Z = model.predict_proba(points)[:, 1]
        elif proba is True and hasattr(model, "decision_function") and len(model.classes_) == 2:
            cmap = cm.bwr
            Z = model.decision_function(points)
        else:
            Z = model.predict(points)

        return xx, yy, Z.reshape(xx.shape), cmap

    @staticmethod
    def show_init_data_plot(X, y, cmap="tab10"):
        """Scatter the raw classification data colored by class label."""
        plt.title("Initial Data")
        scatter = plt.scatter(X[:,0], X[:,1], c=y, cmap=cmap)
        plt.grid(True)
        plt.xlabel("X1")
        plt.ylabel("X2")
        # FIXME: version 0.20
        # plt.legend(*scatter.legend_elements(), title="Class:")
        plt.show()

    @staticmethod
    def show_train_test_plots(model, X_train, y_train, X_test, y_test,
                              title=None, cmap="tab10", proba=False,
                              show_colorbar=True):
        """Plot the decision surface with train and test data side by side.

        Large markers show the true labels, small overlaid markers the
        model predictions; `proba=True` shades a binary probability or
        decision-function surface instead of hard class regions.
        """
        x1_min, x1_max, x2_min, x2_max = CPlot._padded_bounds(X_train, X_test)
        xx, yy, Z, cmap = CPlot._decision_surface(
            model, x1_min, x1_max, x2_min, x2_max, cmap, proba)

        plt.figure(1, figsize=[12, 4])

        if title:
            plt.suptitle(title, fontsize=16)

        plt.subplot(1,2,1)
        plt.title("Train data")
        plt.contourf(xx, yy, Z, cmap=cmap, alpha=.5)
        scatter = plt.scatter(X_train[:,0], X_train[:,1], c=y_train, s=80, cmap=cmap, alpha=0.5, label="True")
        plt.scatter(X_train[:,0], X_train[:,1], c=model.predict(X_train), s=20, cmap=cmap, label="Predicted")
        if show_colorbar:
            plt.colorbar()
        plt.xlabel("X1")
        plt.ylabel("X2")
        plt.xlim(x1_min, x1_max)
        plt.ylim(x2_min, x2_max)
        # FIXME: legend_elements is not supported in matplotlib 3.0.3
        # plt.legend(*scatter.legend_elements(), title="Class:")
        plt.legend()
        plt.grid(True)

        plt.subplot(1,2,2)
        plt.title("Test data")
        plt.contourf(xx, yy, Z, cmap=cmap, alpha=.5)
        scatter = plt.scatter(X_test[:,0], X_test[:,1], c=y_test, s=80, cmap=cmap, alpha=0.5, label="True")
        plt.scatter(X_test[:,0], X_test[:,1], c=model.predict(X_test), s=20, cmap=cmap, label="Predicted")
        if show_colorbar:
            plt.colorbar()
        plt.xlabel("X1")
        plt.ylabel("X2")
        plt.xlim(x1_min, x1_max)
        plt.ylim(x2_min, x2_max)
        # FIXME: legend_elements is not supported in matplotlib 3.0.3
        # plt.legend(*scatter.legend_elements(), title="Class:")
        plt.legend()
        plt.grid(True)

        plt.show()

    @staticmethod
    def show_prediction_plot(model, X, y, title=None, cmap="tab10", proba=False):
        """Plot the decision surface with one data set (true vs predicted).

        NOTE(review): the subplot title says "Train data" although any data
        may be passed — kept as-is to preserve output; confirm intent.
        """
        x1_min, x1_max, x2_min, x2_max = CPlot._padded_bounds(X)
        xx, yy, Z, cmap = CPlot._decision_surface(
            model, x1_min, x1_max, x2_min, x2_max, cmap, proba)

        plt.figure(1, figsize=[6, 4])

        if title:
            plt.suptitle(title, fontsize=16)

        plt.subplot(1,1,1)
        plt.title("Train data")
        plt.contourf(xx, yy, Z, cmap=cmap, alpha=.5)
        scatter = plt.scatter(X[:,0], X[:,1], c=y, s=80, cmap=cmap, alpha=0.5)
        plt.scatter(X[:,0], X[:,1], c=model.predict(X), s=20, cmap=cmap)
        plt.xlabel("X1")
        plt.ylabel("X2")
        plt.xlim(x1_min, x1_max)
        plt.ylim(x2_min, x2_max)
        # plt.legend(*scatter.legend_elements(), title="Class:")
        plt.grid(True)

        plt.show()
142 |
class RPlot:
    """Regression plot class with static methods (1-D feature)."""

    @staticmethod
    def show_init_data_plot(x, y):
        """Scatter the raw regression data."""
        plt.title("Initial Data")
        plt.plot(x, y, "o", c="g")
        plt.xlabel("X")
        plt.ylabel("Y")
        plt.grid(True)
        plt.show()

    @staticmethod
    def show_train_test_plots(model, X_train, y_train, X_test, y_test, title=None):
        """Plot train/test data with the fitted curve and residual lines.

        Green dots are true targets, red dots model predictions, dotted
        vertical lines the residuals between them.
        """
        plt.figure(1, figsize=[12, 4])

        if title:
            plt.suptitle(title, fontsize=16)

        x_min = np.min([X_train.min(), X_test.min()])
        x_max = np.max([X_train.max(), X_test.max()])

        # BUG FIX: pad outward with abs(); the old "x_min + 0.1*x_min"
        # moved the lower bound inward (clipping data) for positive x_min.
        # Now consistent with CPlot's bound padding.
        x_min = x_min - 0.1*np.abs(x_min)
        x_max = x_max + 0.1*np.abs(x_max)

        # Dense grid for drawing the model curve; column vector for predict()
        xx = np.arange(x_min, x_max, 0.01)[:, np.newaxis]

        plt.subplot(1,2,1)
        plt.title("Train data")
        plt.plot(X_train, y_train, "o", c="g")
        plt.plot(xx, model.predict(xx), c="g", linewidth=2)
        plt.plot(X_train, model.predict(X_train), "o", color="red", lw=2)
        plt.vlines(X_train, ymin=y_train, ymax=model.predict(X_train), colors="black", linestyles="dotted")
        plt.xlabel("X")
        plt.ylabel("Y")
        plt.grid(True)

        plt.subplot(1,2,2)
        plt.title("Test data")
        plt.plot(X_test, y_test, "o", c="g")
        # Removed leftover hard-coded label="max_depth=5": no legend is
        # drawn in this figure, so the label was never shown.
        plt.plot(xx, model.predict(xx), c="green", linewidth=2)
        plt.plot(X_test, model.predict(X_test), "o", color="red", lw=2)
        plt.vlines(X_test, ymin=y_test, ymax=model.predict(X_test), colors="black", linestyles="dotted")
        plt.xlabel("X")
        plt.ylabel("Y")
        plt.grid(True)

        plt.show()
192 |
193 |
# Convenience module-level aliases for the static plotting helpers.
show_cplots = CPlot.show_train_test_plots
show_init_cplots = CPlot.show_init_data_plot
show_prediction_cplots = CPlot.show_prediction_plot

show_rplots = RPlot.show_train_test_plots
show_init_rplots = RPlot.show_init_data_plot
200 |
--------------------------------------------------------------------------------
/notebooks/C3_GD.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "СЕМИНАР. Безусловная оптимизация. Градиентный спуск
\n",
8 | "Папулин С.Ю. (papulin.study@yandex.ru)
"
9 | ]
10 | },
11 | {
12 | "cell_type": "markdown",
13 | "metadata": {},
14 | "source": [
15 | "\n",
16 | ""
22 | ]
23 | },
24 | {
25 | "cell_type": "code",
26 | "execution_count": null,
27 | "metadata": {},
28 | "outputs": [],
29 | "source": [
30 | "import numpy as np\n",
31 | "import pandas as pnd\n",
32 | "import matplotlib.pyplot as plt\n",
33 | "from mpl_toolkits.mplot3d import Axes3D\n",
34 | "from matplotlib import cm\n",
35 | "%matplotlib inline"
36 | ]
37 | },
38 | {
39 | "cell_type": "markdown",
40 | "metadata": {},
41 | "source": [
42 | "\n",
43 | "\n",
44 | "
\n",
45 | "
1. Производная
\n",
46 | " \t
\n",
47 | "
\n",
48 | "
"
49 | ]
50 | },
51 | {
52 | "cell_type": "code",
53 | "execution_count": null,
54 | "metadata": {},
55 | "outputs": [],
56 | "source": [
57 | "from scipy.misc import derivative"
58 | ]
59 | },
60 | {
61 | "cell_type": "markdown",
62 | "metadata": {},
63 | "source": [
64 | "Производная в точке"
65 | ]
66 | },
67 | {
68 | "cell_type": "code",
69 | "execution_count": null,
70 | "metadata": {},
71 | "outputs": [],
72 | "source": [
73 | "x0 = -4\n",
74 | "f = lambda x: x**2\n",
75 | "\n",
76 | "\n",
77 | "# Производная в точке x0\n",
78 | "df_x0 = derivative(f, x0, n=1)\n",
79 | "print(\"f'(x0) =\", df_x0)\n",
80 | "\n",
81 | "# Вторая производная в точке x0\n",
82 | "ddf_x0 = derivative(f, x0, n=2)\n",
83 | "print(\"f''(x0) =\", ddf_x0)"
84 | ]
85 | },
86 | {
87 | "cell_type": "markdown",
88 | "metadata": {},
89 | "source": [
90 | "Производные на интервале значений"
91 | ]
92 | },
93 | {
94 | "cell_type": "code",
95 | "execution_count": null,
96 | "metadata": {},
97 | "outputs": [],
98 | "source": [
99 | "x_start = -4\n",
100 | "x_end = 5\n",
101 | "step = 1\n",
102 | "\n",
103 | "# Набор значений от x_start до x_end с шагом step\n",
104 | "x = np.arange(x_start, x_end, step)\n",
105 | "print(\"Значения:\", x)\n",
106 | "\n",
107 | "# Производные\n",
108 | "df = derivative(f, x, n=1)\n",
109 | "print(\"Производные:\", df)\n",
110 | "\n",
111 | "# Вторые производные\n",
112 | "ddf = derivative(f, x, n=2)\n",
113 | "print(\"Вторые производные:\", ddf)"
114 | ]
115 | },
116 | {
117 | "cell_type": "markdown",
118 | "metadata": {},
119 | "source": [
120 | "Отрицательная функция"
121 | ]
122 | },
123 | {
124 | "cell_type": "code",
125 | "execution_count": null,
126 | "metadata": {},
127 | "outputs": [],
128 | "source": [
129 | "f_neg = lambda x: - x**2"
130 | ]
131 | },
132 | {
133 | "cell_type": "code",
134 | "execution_count": null,
135 | "metadata": {},
136 | "outputs": [],
137 | "source": [
138 | "# Производные\n",
139 | "df_neg = derivative(f_neg, x, n=1)\n",
140 | "print(\"Производные:\", df_neg)\n",
141 | "\n",
142 | "# Вторые производные\n",
143 | "ddf_neg = derivative(f_neg, x, n=2)\n",
144 | "print(\"Вторые производные:\", ddf_neg)"
145 | ]
146 | },
147 | {
148 | "cell_type": "markdown",
149 | "metadata": {},
150 | "source": [
151 | "Графики"
152 | ]
153 | },
154 | {
155 | "cell_type": "code",
156 | "execution_count": null,
157 | "metadata": {},
158 | "outputs": [],
159 | "source": [
160 | "plt.figure(\"4\", figsize=[15,6])\n",
161 | "\n",
162 | "ax1 = plt.subplot(1,2,1)\n",
163 | "\n",
164 | "plt.plot(x, f(x), \"-o\", label=\"$f(x)=x^2$\")\n",
165 | "plt.plot(x, df, \"-o\", label=\"$f'(x)$\")\n",
166 | "plt.plot(x, ddf, \"-o\", label=\"$f''(x)$\")\n",
167 | "\n",
168 | "plt.title(\"$f(x)=x^2$\")\n",
169 | "\n",
170 | "plt.xlabel(\"x\")\n",
171 | "plt.ylabel(\"y\")\n",
172 | "\n",
173 | "plt.grid(True)\n",
174 | "\n",
175 | "plt.legend()\n",
176 | "\n",
177 | "ax2 = plt.subplot(1,2,2)\n",
178 | "\n",
179 | "plt.plot(x, f_neg(x), \"-o\", label=\"$f_{neg}(x)=-x^2$\")\n",
180 | "plt.plot(x, df_neg, \"-o\", label=\"$f'_{neg}(x)$\")\n",
181 | "plt.plot(x, ddf_neg, \"-o\", label=\"$f''_{neg}(x)$\")\n",
182 | "\n",
183 | "plt.title(\"$f(x)=-x^2$\")\n",
184 | "\n",
185 | "plt.xlabel(\"x\")\n",
186 | "plt.ylabel(\"y\")\n",
187 | "\n",
188 | "plt.grid(True)\n",
189 | "\n",
190 | "plt.legend()\n",
191 | "\n",
192 | "plt.show()"
193 | ]
194 | },
195 | {
196 | "cell_type": "markdown",
197 | "metadata": {},
198 | "source": [
199 | "Функция:\n",
200 | "\n",
201 | "$$f(x) = x^2 + 10 \\cdot \\sin(x)$$"
202 | ]
203 | },
204 | {
205 | "cell_type": "code",
206 | "execution_count": null,
207 | "metadata": {},
208 | "outputs": [],
209 | "source": [
210 | "# Исходные данные\n",
211 | "x = np.arange(-10, 10, 0.1)\n",
212 | "f = lambda x: x**2 + 10 * np.sin(x)\n",
213 | "\n",
214 | "# Производные\n",
215 | "df = derivative(f, x, n=1)\n",
216 | "ddf = derivative(f, x, n=2)"
217 | ]
218 | },
219 | {
220 | "cell_type": "code",
221 | "execution_count": null,
222 | "metadata": {},
223 | "outputs": [],
224 | "source": [
225 | "# Поиск экстремумов (brute force)\n",
226 | "indx = np.where(np.logical_and(df >= -0.35, df <= 0.25))\n",
227 | "indx"
228 | ]
229 | },
230 | {
231 | "cell_type": "markdown",
232 | "metadata": {},
233 | "source": [
234 | "Графики"
235 | ]
236 | },
237 | {
238 | "cell_type": "code",
239 | "execution_count": null,
240 | "metadata": {},
241 | "outputs": [],
242 | "source": [
243 | "plt.figure(\"4\", figsize=[8,6])\n",
244 | "\n",
245 | "ax1 = plt.subplot(1,1,1)\n",
246 | "\n",
247 | "plt.plot(x, f(x), \"-\", label=\"$f(x)$\")\n",
248 | "plt.plot(x, df, \"-\", label=\"$f'(x)$\")\n",
249 | "plt.plot(x, ddf, \"-\", label=\"$f''(x)$\")\n",
250 | "\n",
251 | "plt.plot(x[indx], f(x[indx]), \"o\", color=\"darkblue\")\n",
252 | "\n",
253 | "for xx in x[indx]:\n",
254 | " plt.axvline(x=xx, color=\"grey\", linestyle=\"dashed\", linewidth=1)\n",
255 | "\n",
256 | "plt.title(\"$f(x)=x^2 + 10 \\cdot \\sin(x)$\")\n",
257 | "\n",
258 | "plt.xlabel(\"x\")\n",
259 | "plt.ylabel(\"y\")\n",
260 | "\n",
261 | "plt.grid(True)\n",
262 | "\n",
263 | "plt.legend()\n",
264 | "\n",
265 | "plt.show()"
266 | ]
267 | },
268 | {
269 | "cell_type": "markdown",
270 | "metadata": {},
271 | "source": [
272 | "\n",
273 | "\n",
274 | "
\n",
275 | "
2. Градиентный спуск
\n",
276 | " \t
\n",
277 | "
\n",
278 | "
"
279 | ]
280 | },
281 | {
282 | "cell_type": "markdown",
283 | "metadata": {},
284 | "source": [
285 | "$$ \\mathbf{x}^{(i+1)} = \\mathbf{x}^{(i)}-\\alpha \\cdot \\bigtriangledown f \\left( \\mathbf{x}\\right)$$"
286 | ]
287 | },
288 | {
289 | "cell_type": "markdown",
290 | "metadata": {},
291 | "source": [
292 | "### Функция с одной переменной"
293 | ]
294 | },
295 | {
296 | "cell_type": "markdown",
297 | "metadata": {},
298 | "source": [
299 | "$$f(x) = x^2 + 10 \\sin(x)$$\n",
300 | "$$f^{'}(x) = 2x + 10 \\cos(x)$$"
301 | ]
302 | },
303 | {
304 | "cell_type": "code",
305 | "execution_count": null,
306 | "metadata": {},
307 | "outputs": [],
308 | "source": [
309 | "# Функция\n",
310 | "def f(x):\n",
311 | " return x**2 + 10 * np.sin(x)\n",
312 | "\n",
313 | "# Производная\n",
314 | "def df(x):\n",
315 | " return 2*x + 10 * np.cos(x)\n",
316 | "\n",
317 | "# Значения аргумента\n",
318 | "x = np.arange(-10, 10, 0.1)"
319 | ]
320 | },
321 | {
322 | "cell_type": "code",
323 | "execution_count": null,
324 | "metadata": {},
325 | "outputs": [],
326 | "source": [
327 | "plt.figure(\"1\")\n",
328 | "\n",
329 | "plt.grid(True)\n",
330 | "plt.plot(x, f(x))\n",
331 | "plt.plot(x, df(x))\n",
332 | "plt.title(\"$f(x) = x^2+10\\sin(x)$\")\n",
333 | "plt.xlabel(\"x\")\n",
334 | "plt.ylabel(\"f(x)\")\n",
335 | "plt.legend((\"$f(x)$\", \"$f^{\\prime}(x)$\"), loc=\"lower right\")\n",
336 | "plt.grid(True)\n",
337 | "\n",
338 | "plt.show()"
339 | ]
340 | },
341 | {
342 | "cell_type": "markdown",
343 | "metadata": {},
344 | "source": [
345 | "Исследование влияния значения коэффициента альфа
"
346 | ]
347 | },
348 | {
349 | "cell_type": "markdown",
350 | "metadata": {},
351 | "source": [
352 | "Начальная точка 1
"
353 | ]
354 | },
355 | {
356 | "cell_type": "markdown",
357 | "metadata": {},
358 | "source": [
359 | "$$x_0 = -8$$\n",
360 | "$$\\alpha \\in \\{ 0.02, 0.05, 0.1, 0.2, 0.4, 0.6\\}$$\n",
361 | "$$err_{min} = 10^{-3}$$\n",
362 | "$$iteration_{max} = 20$$"
363 | ]
364 | },
365 | {
366 | "cell_type": "markdown",
367 | "metadata": {},
368 | "source": [
369 | "
"
370 | ]
371 | },
372 | {
373 | "cell_type": "markdown",
374 | "metadata": {},
375 | "source": [
376 | "Начальная точка 2
"
377 | ]
378 | },
379 | {
380 | "cell_type": "markdown",
381 | "metadata": {},
382 | "source": [
383 | "$$x_0 = 8$$\n",
384 | "$$\\alpha \\in \\{ 0.02, 0.05, 0.1, 0.2, 0.4, 0.6\\}$$\n",
385 | "$$err_{min} = 10^{-3}$$\n",
386 | "$$iteration_{max} = 20$$"
387 | ]
388 | },
389 | {
390 | "cell_type": "markdown",
391 | "metadata": {},
392 | "source": [
393 | "
"
394 | ]
395 | },
396 | {
397 | "cell_type": "markdown",
398 | "metadata": {},
399 | "source": [
400 | "### Функция с двумя переменными"
401 | ]
402 | },
403 | {
404 | "cell_type": "markdown",
405 | "metadata": {},
406 | "source": [
407 | "Исходная функция:\n",
408 | "\n",
409 | "$$f(x_1, x_2) = 2x_1^2 + x_2^2 + x_1x_2$$\n",
410 | "\n",
411 | "Частная производная по $x_1$:\n",
412 | "\n",
413 | "$$\\frac {\\partial f(x_1, x_2)}{\\partial x_1} = 4x_1 + x_2$$\n",
414 | "\n",
415 | "Частная производная по $x_2$:\n",
416 | "$$\\frac {\\partial f(x_1, x_2)}{\\partial x_2} = 2x_2 + x_1$$"
417 | ]
418 | },
419 | {
420 | "cell_type": "code",
421 | "execution_count": null,
422 | "metadata": {},
423 | "outputs": [],
424 | "source": [
425 | "f = lambda x1, x2: 2*x1**2 + x2**2 +x1*x2 # функция\n",
426 | "dfx1 = lambda x1, x2: 4*x1 + x2 # частная производная по x1\n",
427 | "dfx2 = lambda x1, x2: 2*x2 + x1 # частная производная по x2\n",
428 | "\n",
429 | "coord_x1 = np.arange(-4, 5, 0.1) # значения x c шагом 1\n",
430 | "coord_x2 = np.arange(-4, 5, 0.1) # значения x c шагом 1\n",
431 | "\n",
432 | "x1, x2 = np.meshgrid(coord_x1, coord_x2)"
433 | ]
434 | },
435 | {
436 | "cell_type": "code",
437 | "execution_count": null,
438 | "metadata": {},
439 | "outputs": [],
440 | "source": [
441 | "fig = plt.figure(1, figsize=(10, 10))\n",
442 | "\n",
443 | "ax0 = fig.add_subplot(2, 2, 1, projection=\"3d\")\n",
444 | "ax0.set_title(\"$f(x_1,x_2)=2x^2_{1}+x^2_{2}+x_{1}x_{2}$\")\n",
445 | "ax0.plot_surface(x1, x2, f(x1,x2), rstride=1, cstride=1, cmap=cm.coolwarm,\n",
446 | " linewidth=0, antialiased=True)\n",
447 | "ax0.set_xlabel(\"$x_1$\")\n",
448 | "ax0.set_ylabel(\"$x_2$\")\n",
449 | "ax0.set_zlabel(\"$f(x_1,x_2)$\")\n",
450 | "\n",
451 | "ax1 = plt.subplot(2,2,2)\n",
452 | "ax1.set_title(\"$f(x_1,x_2)=2x^2_{1}+x^2_{2}+x_{1}x_{2}$\")\n",
453 | "cf = ax1.contourf(x1, x2, f(x1,x2), 20, alpha=0.5, cmap=cm.coolwarm)\n",
454 | "plt.colorbar(cf)\n",
455 | "ax1.set_xlabel(\"$x_1$\")\n",
456 | "ax1.set_ylabel(\"$x_2$\")\n",
457 | "\n",
458 | "ax2 = plt.subplot(2,2,3)\n",
459 | "ax2.set_title(\"Gradient\")\n",
460 | "ax2.set_xlabel(\"$x_1$\")\n",
461 | "ax2.set_ylabel(\"$x_2$\")\n",
462 | "ax2.quiver(x1[0::5, 0::5], x2[0::5, 0::5], \n",
463 | " dfx1(x1[0::5, 0::5],x2[0::5, 0::5]), dfx2(x1[0::5, 0::5],x2[0::5, 0::5]), scale=100)\n",
464 | "\n",
465 | "ax3 = plt.subplot(2,2,4)\n",
466 | "ax3.set_title(\"Gradient\")\n",
467 | "ax3.set_xlabel(\"$x_1$\")\n",
468 | "ax3.set_ylabel(\"$x_2$\")\n",
469 | "cf = ax3.contourf(x1, x2, f(x1,x2), 20, cmap=cm.coolwarm)\n",
470 | "plt.colorbar(cf)\n",
471 | "ax3.quiver(x1[0::5, 0::5], x2[0::5, 0::5], \n",
472 | " dfx1(x1[0::5, 0::5],x2[0::5, 0::5]), dfx2(x1[0::5, 0::5],x2[0::5, 0::5]), scale=100)\n",
473 | "\n",
474 | "plt.tight_layout()\n",
475 | "\n",
476 | "plt.show()"
477 | ]
478 | },
479 | {
480 | "cell_type": "markdown",
481 | "metadata": {},
482 | "source": [
483 | "Исследование влияния значения коэффициента альфа
"
484 | ]
485 | },
486 | {
487 | "cell_type": "markdown",
488 | "metadata": {},
489 | "source": [
490 | "Начальная точка 1
"
491 | ]
492 | },
493 | {
494 | "cell_type": "markdown",
495 | "metadata": {},
496 | "source": [
497 | "$$x_{1,0} = 3$$\n",
498 | "$$x_{2,0} = 0$$\n",
499 | "$$\\alpha \\in \\{ 0.02, 0.05, 0.1, 0.2, 0.3, 0.45\\}$$\n",
500 | "$$err_{min} = 10^{-3}$$\n",
501 | "$$iteration_{max} = 20$$"
502 | ]
503 | },
504 | {
505 | "cell_type": "markdown",
506 | "metadata": {},
507 | "source": [
508 | "
"
509 | ]
510 | },
511 | {
512 | "cell_type": "markdown",
513 | "metadata": {},
514 | "source": [
515 | "Начальная точка 2
"
516 | ]
517 | },
518 | {
519 | "cell_type": "markdown",
520 | "metadata": {},
521 | "source": [
522 | "$$x_{1,0} = -3$$\n",
523 | "$$x_{2,0} = -2$$\n",
524 | "$$\\alpha \\in \\{ 0.02, 0.05, 0.1, 0.2, 0.3, 0.45\\}$$\n",
525 | "$$err_{min} = 10^{-3}$$\n",
526 | "$$iteration_{max} = 20$$"
527 | ]
528 | },
529 | {
530 | "cell_type": "markdown",
531 | "metadata": {},
532 | "source": [
533 | "
"
534 | ]
535 | },
536 | {
537 | "cell_type": "code",
538 | "execution_count": null,
539 | "metadata": {},
540 | "outputs": [],
541 | "source": []
542 | }
543 | ],
544 | "metadata": {
545 | "kernelspec": {
546 | "display_name": "Python 3",
547 | "language": "python",
548 | "name": "python3"
549 | },
550 | "language_info": {
551 | "codemirror_mode": {
552 | "name": "ipython",
553 | "version": 3
554 | },
555 | "file_extension": ".py",
556 | "mimetype": "text/x-python",
557 | "name": "python",
558 | "nbconvert_exporter": "python",
559 | "pygments_lexer": "ipython3",
560 | "version": "3.7.3"
561 | }
562 | },
563 | "nbformat": 4,
564 | "nbformat_minor": 2
565 | }
566 |
--------------------------------------------------------------------------------
/notebooks/C3_GD_Appendix.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "СЕМИНАР. Оптимизация. Часть 1. Исследование влияния значения коэффициента альфа в градиентном спуске
\n",
8 | "Папулин С.Ю. (papulin.study@yandex.ru)
"
9 | ]
10 | },
11 | {
12 | "cell_type": "code",
13 | "execution_count": null,
14 | "metadata": {},
15 | "outputs": [],
16 | "source": [
17 | "import numpy as np\n",
18 | "import pandas as pnd\n",
19 | "import matplotlib.pyplot as plt\n",
20 | "from mpl_toolkits.mplot3d import Axes3D\n",
21 | "from matplotlib import cm\n",
22 | "%matplotlib inline"
23 | ]
24 | },
25 | {
26 | "cell_type": "markdown",
27 | "metadata": {},
28 | "source": [
29 | "## Функция одной переменной"
30 | ]
31 | },
32 | {
33 | "cell_type": "code",
34 | "execution_count": null,
35 | "metadata": {},
36 | "outputs": [],
37 | "source": [
38 | "def gradient_descent(f, df, start_pos, alpha, max_iter=20, tol=0.0001, return_progress=False):\n",
39 | " \"\"\"Градиентный спуск.\"\"\"\n",
40 | " \n",
41 | " curr_pos = start_pos\n",
42 | " f_prev = f(*start_pos)\n",
43 | " \n",
44 | " if return_progress:\n",
45 | " progress = dict()\n",
46 | " progress[\"points\"] = [curr_pos]\n",
47 | " \n",
48 | " \n",
49 | " for i in range(max_iter):\n",
50 | " \n",
51 | " curr_pos = curr_pos - alpha * df(*curr_pos)\n",
52 | " \n",
53 | " if return_progress:\n",
54 | " progress[\"points\"].append(curr_pos)\n",
55 | " \n",
56 | " f_curr = f(*curr_pos)\n",
57 | "\n",
58 | " if abs(f_prev - f_curr) <= tol:\n",
59 | " break\n",
60 | "\n",
61 | " f_prev = f_curr\n",
62 | " \n",
63 | " if return_progress:\n",
64 | " progress[\"points\"] = np.array(progress[\"points\"])\n",
65 | " return (curr_pos, f_curr, i+1, progress)\n",
66 | " \n",
67 | " return (curr_pos, f_curr, i+1)"
68 | ]
69 | },
70 | {
71 | "cell_type": "code",
72 | "execution_count": null,
73 | "metadata": {},
74 | "outputs": [],
75 | "source": [
76 | "def plot_progress(x, f, points):\n",
77 | " \n",
78 | " for i in range(1, len(points)):\n",
79 | " start_xy = (points[i-1], f(points[i-1]))\n",
80 | " end_xy = (points[i], f(points[i]))\n",
81 | " plt.annotate(\"\", \n",
82 | " xy=start_xy, xytext=end_xy, \n",
83 | " arrowprops=dict(\n",
84 | " arrowstyle=\"<-\", \n",
85 | " color=\"grey\", \n",
86 | " linestyle =\"dashed\"), \n",
87 | " zorder=3) \n",
88 | " plt.annotate(\"\", \n",
89 | " xy=(points[0], f(points[0])), xytext=(points[-1], f(points[-1])), \n",
90 | " arrowprops=dict(arrowstyle=\"<-\", color=\"red\"), \n",
91 | " zorder=4)\n",
92 | " plt.plot(points[0], f(points[0]), \"o\", color=\"green\", zorder=4)\n",
93 | " plt.plot(points[-1], f(points[-1]), \"o\", color=\"red\", zorder=4)\n",
94 | " \n",
95 | "\n",
96 | "def plot_function(x, f):\n",
97 | " plt.plot(x, f(x), '-', color = \"blue\", zorder=1)\n",
98 | "\n",
99 | "\n",
100 | "def plot_derivative(x, df):\n",
101 | " plt.plot(x, df(x), '-', color = \"orange\", zorder=2)"
102 | ]
103 | },
104 | {
105 | "cell_type": "markdown",
106 | "metadata": {},
107 | "source": [
108 | "$$f(x) = x^2 + 10 \\cdot \\cos(x)$$"
109 | ]
110 | },
111 | {
112 | "cell_type": "code",
113 | "execution_count": null,
114 | "metadata": {},
115 | "outputs": [],
116 | "source": [
117 | "# Функция и производная\n",
118 | "\n",
119 | "f = lambda x: x**2 + 10*np.cos(x)\n",
120 | "df = lambda x: 2*x - 10*np.sin(x)\n",
121 | "\n",
122 | "\n",
123 | "x = np.arange(-10, 10, 0.5)\n",
124 | "\n",
125 | "\n",
126 | "# Параметры\n",
127 | "\n",
128 | "alpha = 0.05\n",
129 | "max_iter = 20\n",
130 | "alphas = [0.01, 0.05, 0.1, 0.15]\n",
131 | "\n",
132 | "\n",
133 | "# Начальное значение\n",
134 | "\n",
135 | "x_start = (8,)\n",
136 | "\n",
137 | "\n",
138 | "# Количеста строк при отображении графиков\n",
139 | "\n",
140 | "subplot_rows = np.ceil(len(alphas) / 2.0)\n",
141 | "\n",
142 | "\n",
143 | "# Создание области отображения графиков\n",
144 | "\n",
145 | "plt.figure(figsize=(10, 4*subplot_rows))\n",
146 | "plt.suptitle(\"$x^2 + 10 \\cos(x)$\", fontsize=16, y=1.05)\n",
147 | "\n",
148 | "# Поиск минимального значения функции при различных alpha\n",
149 | "\n",
150 | "for i in range(len(alphas)):\n",
151 | " \n",
152 | " # Градиентный спуск\n",
153 | " final_x, final_f, num_iter, progress = gradient_descent(f, df, x_start, alphas[i], return_progress=True)\n",
154 | " \n",
155 | " # Отображение результата на графике\n",
156 | " plt.subplot(subplot_rows, 2, i+1)\n",
157 | " plt.title(\"$x_0=%s, \\\\alpha=%s$\" % (x_start, alphas[i]))\n",
158 | " plot_function(x, f)\n",
159 | " plot_derivative(x, df)\n",
160 | " plot_progress(x, f, progress[\"points\"])\n",
161 | " plt.xlabel(\"$x$\")\n",
162 | " plt.ylabel(\"$f(x)$\")\n",
163 | " plt.grid(True)\n",
164 | "\n",
165 | " \n",
166 | "# Отбражения графиков\n",
167 | "\n",
168 | "plt.tight_layout()\n",
169 | "plt.show()"
170 | ]
171 | },
172 | {
173 | "cell_type": "code",
174 | "execution_count": null,
175 | "metadata": {},
176 | "outputs": [],
177 | "source": [
178 | "def show_plots(f, df, x_start, alphas, max_iter, tol, func):\n",
179 | "\n",
180 | " # Количеста строк при отображении графиков\n",
181 | "\n",
182 | " subplot_rows = np.ceil(len(alphas) / 2.0)\n",
183 | "\n",
184 | "\n",
185 | " # Количеста строк при отображении графиков\n",
186 | "\n",
187 | " subplot_rows = np.ceil(len(alphas) / 2.0)\n",
188 | "\n",
189 | "\n",
190 | " # Создание области отображения графиков\n",
191 | "\n",
192 | " plt.figure(figsize=(10, 4*subplot_rows))\n",
193 | " plt.suptitle(\"$x^2 + 10 \\sin(x)$\", fontsize=16, y=1.05)\n",
194 | "\n",
195 | " # Поиск минимального значения функции при различных alpha\n",
196 | "\n",
197 | " for i in range(len(alphas)):\n",
198 | "\n",
199 | " # Градиентный спуск\n",
200 | " final_x, final_f, num_iter, progress = func(f, df, x_start, alphas[i], max_iter=max_iter, tol=err, return_progress=True)\n",
201 | "\n",
202 | " # Отображение результата на графике\n",
203 | " plt.subplot(subplot_rows, 2, i+1)\n",
204 | " plt.title(\"$x_0=%s, \\\\alpha=%s$\" % (x_start, alphas[i]))\n",
205 | " plot_function(x, f)\n",
206 | " plot_derivative(x, df)\n",
207 | " plot_progress(x, f, progress[\"points\"])\n",
208 | " plt.xlabel(\"$x$\")\n",
209 | " plt.ylabel(\"$f(x)$\")\n",
210 | " plt.grid(True)\n",
211 | "\n",
212 | "\n",
213 | " # Отбражения графиков\n",
214 | "\n",
215 | " plt.tight_layout()\n",
216 | " plt.show()"
217 | ]
218 | },
219 | {
220 | "cell_type": "code",
221 | "execution_count": null,
222 | "metadata": {},
223 | "outputs": [],
224 | "source": [
225 | "# Функция и производная\n",
226 | "\n",
227 | "f = lambda x: x**2 + 10 * np.sin(x)\n",
228 | "df = lambda x: 2*x + 10 * np.cos(x)\n",
229 | "\n",
230 | "\n",
231 | "x = np.arange(-10, 10, 0.5)\n",
232 | "\n",
233 | "\n",
234 | "# Параметры\n",
235 | "\n",
236 | "alpha = 0.05\n",
237 | "max_iter = 20\n",
238 | "alphas = [0.02, 0.05, 0.1, 0.2, 0.4, 0.6]\n",
239 | "err = 1e-3 # минимальное изменение функции (ошибка)"
240 | ]
241 | },
242 | {
243 | "cell_type": "code",
244 | "execution_count": null,
245 | "metadata": {},
246 | "outputs": [],
247 | "source": [
248 | "# Начальное значение\n",
249 | "\n",
250 | "x_start = (8,)\n",
251 | "show_plots(f, df, x_start, alphas, max_iter, tol=err, func=gradient_descent)"
252 | ]
253 | },
254 | {
255 | "cell_type": "code",
256 | "execution_count": null,
257 | "metadata": {},
258 | "outputs": [],
259 | "source": [
260 | "# Начальное значение\n",
261 | "\n",
262 | "x_start = (-8,)\n",
263 | "show_plots(f, df, x_start, alphas, max_iter, tol=err, func=gradient_descent)"
264 | ]
265 | },
266 | {
267 | "cell_type": "markdown",
268 | "metadata": {},
269 | "source": [
270 | "## Функция двух переменных"
271 | ]
272 | },
273 | {
274 | "cell_type": "markdown",
275 | "metadata": {},
276 | "source": [
277 | "Функция с двумя переменными $f(x_1,x_2)$:\n",
278 | "\n",
279 | "$$f(x_1, x_2) = 2x_1^2 + x_2^2 + x_1x_2$$"
280 | ]
281 | },
282 | {
283 | "cell_type": "code",
284 | "execution_count": null,
285 | "metadata": {},
286 | "outputs": [],
287 | "source": [
288 | "# Функция и частные производные по x1 и x2\n",
289 | "\n",
290 | "f = lambda x1, x2: 2*x1**2 + x2**2 +x1*x2\n",
291 | "dfx1 = lambda x1, x2: 4*x1 + x2\n",
292 | "dfx2 = lambda x1, x2: 2*x2 + x1\n",
293 | "\n",
294 | "df = lambda x1, x2: np.array((dfx1(x1, x2), dfx2(x1,x2)))\n",
295 | "\n",
296 | "coord_x1 = np.arange(-4, 5, 0.1) # Значения x1 c шагом 1\n",
297 | "coord_x2 = np.arange(-4, 5, 0.1) # Значения x2 c шагом 1\n",
298 | "\n",
299 | "x1, x2 = np.meshgrid(coord_x1, coord_x2)\n",
300 | "\n",
301 | "\n",
302 | "# Отображение фукции и её градиент\n",
303 | "\n",
304 | "fig = plt.figure(1, figsize=(10, 10))\n",
305 | "\n",
306 | "ax0 = fig.add_subplot(2, 2, 1, projection=\"3d\")\n",
307 | "ax0.plot_surface(x1, x2, f(x1,x2), rstride=1, cstride=1, cmap=cm.coolwarm,\n",
308 | " linewidth=0, antialiased=True)\n",
309 | "ax0.set_title(\"$f(x_1,x_2)=2x^2_{1}+x^2_{2}+x_{1}x_{2}$\")\n",
310 | "ax0.set_xlabel(\"$x_1$\")\n",
311 | "ax0.set_ylabel(\"$x_2$\")\n",
312 | "ax0.set_zlabel(\"$f(x_1,x_2)$\")\n",
313 | "\n",
314 | "ax1 = plt.subplot(2,2,2)\n",
315 | "cf = ax1.contourf(x1, x2, f(x1,x2), 50, alpha=0.5, cmap=cm.coolwarm)\n",
316 | "plt.colorbar(cf)\n",
317 | "ax1.set_title(\"$f(x_1,x_2)=2x^2_{1}+x^2_{2}+x_{1}x_{2}$\")\n",
318 | "ax1.set_xlabel(\"$x_1$\")\n",
319 | "ax1.set_ylabel(\"$x_2$\")\n",
320 | "\n",
321 | "ax2 = plt.subplot(2,2,3)\n",
322 | "ax2.set_title(\"Gradient\")\n",
323 | "ax2.set_xlabel(\"$x_1$\")\n",
324 | "ax2.set_ylabel(\"$x_2$\")\n",
325 | "ax2.quiver(x1[0::5, 0::5], x2[0::5, 0::5], dfx1(x1[0::5, 0::5],x2[0::5, 0::5]), dfx2(x1[0::5, 0::5],x2[0::5, 0::5]), scale=100)\n",
326 | "\n",
327 | "ax3 = plt.subplot(2,2,4)\n",
328 | "ax3.set_xlabel(\"$x_1$\")\n",
329 | "ax3.set_ylabel(\"$x_2$\")\n",
330 | "ax3.set_title(\"Gradient\")\n",
331 | "ax3.contourf(x1, x2, f(x1,x2), 50, cmap=cm.coolwarm)\n",
332 | "ax3.quiver(x1[0::5, 0::5], x2[0::5, 0::5], dfx1(x1[0::5, 0::5],x2[0::5, 0::5]), dfx2(x1[0::5, 0::5],x2[0::5, 0::5]), scale=100)\n",
333 | "\n",
334 | "plt.tight_layout()\n",
335 | "\n",
336 | "plt.show()"
337 | ]
338 | },
339 | {
340 | "cell_type": "code",
341 | "execution_count": null,
342 | "metadata": {},
343 | "outputs": [],
344 | "source": [
345 | "def plot_progress_two(points):\n",
346 | " \n",
347 | " for i in range(1, len(points)):\n",
348 | " start_xy = points[i-1]\n",
349 | " end_xy = points[i]\n",
350 | " plt.plot(start_xy[0], start_xy[1], \"o\", color = \"blue\")\n",
351 | " plt.annotate(\"\", xy=start_xy, xytext=end_xy, arrowprops=dict(arrowstyle=\"<-\", color=\"grey\", \n",
352 | " linestyle =\"dashed\"), zorder=3)\n",
353 | " \n",
354 | " plt.annotate(\"\", xy=points[0], xytext=points[-1], \n",
355 | " arrowprops=dict(arrowstyle=\"<-\", color=\"red\", linestyle =\"dashed\"), \n",
356 | " zorder=3)\n",
357 | " plt.plot(*points[0], \"o\", color=\"green\", zorder=4)\n",
358 | " plt.plot(*points[-1], \"o\", color=\"red\", zorder=4)\n",
359 | "\n",
360 | " \n",
361 | "def plot_info_two(final_x, final_f, num_iter):\n",
362 | " text = \"Number of iterations: \"+str(num_iter) + \"\\n $x_{1,min} = \" + \\\n",
363 | " str(np.around(final_x[0], decimals = 2)) +\"$, $x_{2,min} = \" + str(np.around(final_x[1], decimals = 2)) +\"$ \\n\" + \\\n",
364 | " \"$f(x_{1,min}, x_{2,min}) = \" + str(np.around(final_f, decimals = 4)) + \"$\"\n",
365 | " plt.annotate(text, (0.10, 0.80), xytext=(0.10, 0.75), textcoords=\"axes fraction\", size=14)\n",
366 | " \n",
367 | "\n",
368 | "def plot_function_two(x1, x2, f):\n",
369 | " plt.contourf(x1, x2, f(x1, x2), 10, alpha=0.5, cmap=cm.coolwarm)\n",
370 | " \n",
371 | "\n",
372 | "def show_plots_two(f, df, start_pos, alphas, max_iter, tol, func):\n",
373 | "\n",
374 | " # Количеста строк при отображении графиков\n",
375 | "\n",
376 | " subplot_rows = np.ceil(len(alphas) / 2.0)\n",
377 | "\n",
378 | "\n",
379 | " # Количеста строк при отображении графиков\n",
380 | "\n",
381 | " subplot_rows = np.ceil(len(alphas) / 2.0)\n",
382 | "\n",
383 | "\n",
384 | " # Создание области отображения графиков\n",
385 | "\n",
386 | " plt.figure(figsize=(10, 4*subplot_rows))\n",
387 | " plt.suptitle(\"$f(x_1, x_2) = 2x_1^2 + x_2^2 + x_1x_2$\", fontsize=16, y=1.05)\n",
388 | "\n",
389 | " # Поиск минимального значения функции при различных alpha\n",
390 | "\n",
391 | " for i in range(len(alphas)):\n",
392 | "\n",
393 | " # Градиентный спуск\n",
394 | " final_x, final_f, num_iter, progress = func(f, df, start_pos, alphas[i], max_iter=max_iter, tol=err, return_progress=True)\n",
395 | "\n",
396 | " # Отображение результата на графике\n",
397 | " plt.subplot(subplot_rows, 2, i+1)\n",
398 | " plt.title(\"$x_0=%s, \\\\alpha=%s$\" % (start_pos, alpha))\n",
399 | " plot_function_two(x1, x2, f)\n",
400 | " plot_info_two(final_x, final_f, num_iter)\n",
401 | " plot_progress_two(progress[\"points\"])\n",
402 | " plt.xlabel(\"$x1$\")\n",
403 | " plt.ylabel(\"$x2$\")\n",
404 | " plt.grid(True)\n",
405 | "\n",
406 | " # Отбражения графиков\n",
407 | "\n",
408 | " plt.tight_layout()\n",
409 | " plt.show()"
410 | ]
411 | },
412 | {
413 | "cell_type": "code",
414 | "execution_count": null,
415 | "metadata": {},
416 | "outputs": [],
417 | "source": [
418 | "# Параметры\n",
419 | "\n",
420 | "max_iter = 20\n",
421 | "err = 0.0001\n",
422 | "alphas = [0.01, 0.05, 0.1, 0.2, 0.3, 0.45]"
423 | ]
424 | },
425 | {
426 | "cell_type": "code",
427 | "execution_count": null,
428 | "metadata": {},
429 | "outputs": [],
430 | "source": [
431 | "# Начальное значение\n",
432 | "\n",
433 | "start_pos = (3, 0)\n",
434 | "show_plots_two(f, df, start_pos, alphas, max_iter, tol=err, func=gradient_descent)"
435 | ]
436 | },
437 | {
438 | "cell_type": "code",
439 | "execution_count": null,
440 | "metadata": {},
441 | "outputs": [],
442 | "source": [
443 | "# Начальное значение\n",
444 | "\n",
445 | "start_pos = (-3, -2)\n",
446 | "show_plots_two(f, df, start_pos, alphas, max_iter, tol=err, func=gradient_descent)"
447 | ]
448 | },
449 | {
450 | "cell_type": "code",
451 | "execution_count": null,
452 | "metadata": {},
453 | "outputs": [],
454 | "source": []
455 | },
456 | {
457 | "cell_type": "code",
458 | "execution_count": null,
459 | "metadata": {},
460 | "outputs": [],
461 | "source": []
462 | }
463 | ],
464 | "metadata": {
465 | "kernelspec": {
466 | "display_name": "Python 3",
467 | "language": "python",
468 | "name": "python3"
469 | },
470 | "language_info": {
471 | "codemirror_mode": {
472 | "name": "ipython",
473 | "version": 3
474 | },
475 | "file_extension": ".py",
476 | "mimetype": "text/x-python",
477 | "name": "python",
478 | "nbconvert_exporter": "python",
479 | "pygments_lexer": "ipython3",
480 | "version": "3.7.4"
481 | }
482 | },
483 | "nbformat": 4,
484 | "nbformat_minor": 2
485 | }
486 |
--------------------------------------------------------------------------------
/notebooks/C3_Sklearn_Basics.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Основы разработки под sklearn\n",
8 | "---\n",
9 | "С.Ю. Папулин (papulin.study@yandex.ru)"
10 | ]
11 | },
12 | {
13 | "cell_type": "markdown",
14 | "metadata": {},
15 | "source": [
16 | "### Содержание\n",
17 | "\n",
18 | "- [Общие сведения](#Общие-сведения)\n",
19 | "- [Реализация модели предсказания](#Реализация-модели-предсказания)\n",
20 | "- [Реализация транформации](#Реализация-транформации)\n",
21 | "- [Применение `Pipeline`](#Применение-Pipeline)\n",
22 | "- [Источники](#Источники)"
23 | ]
24 | },
25 | {
26 | "cell_type": "code",
27 | "execution_count": null,
28 | "metadata": {},
29 | "outputs": [],
30 | "source": [
31 | "import numpy as np\n",
32 | "import matplotlib.pyplot as plt\n",
33 | "%matplotlib inline"
34 | ]
35 | },
36 | {
37 | "cell_type": "markdown",
38 | "metadata": {},
39 | "source": [
40 | "## Общие сведения"
41 | ]
42 | },
43 | {
44 | "cell_type": "markdown",
45 | "metadata": {},
46 | "source": [
47 | "Объекты `sklearn` и их методы:\n",
48 | "- **Estimator**: `fit` и `partial_fit` (дообучение)\n",
49 | "- **Predictor**: `predict` + для классификации дополнительно `decision_function` и/или `predict_proba`\n",
50 | "- **Transformer**: `transform` и `fit_transform`\n",
51 | "- **Model**: `score`"
52 | ]
53 | },
54 | {
55 | "cell_type": "markdown",
56 | "metadata": {},
57 | "source": [
58 | "Аргументы методов:\n",
59 | "- `fit(X, y, **kwargs) -> self`\n",
60 | "- `partial_fit(X, y, **kwargs) -> self`\n",
61 | "- `set_params(*args, **kwargs)` и `get_params(deep=True) -> dict`\n",
62 | "- `score(X, y, **kwargs) -> float`\n",
63 | "- `transform(X, **kwargs) -> X_t`\n",
64 | "- `fit_transform(X, y, **kwargs) -> X_t`\n",
65 | "\n",
66 | "`X` - массив размера (n_samples, n_features), `y` - массив размера (n_samples,)\n",
67 | "\n",
68 | "Результат оценки (обучения):\n",
69 | "- `coef_`, `idf_` и пр.\n",
70 | "\n",
71 | "Перезаписываются каждый раз после вызова `fit`"
72 | ]
73 | },
74 | {
75 | "cell_type": "markdown",
76 | "metadata": {},
77 | "source": [
78 | "## Реализация модели предсказания"
79 | ]
80 | },
81 | {
82 | "cell_type": "code",
83 | "execution_count": null,
84 | "metadata": {},
85 | "outputs": [],
86 | "source": [
87 | "from sklearn.base import BaseEstimator, RegressorMixin, TransformerMixin\n",
88 | "from sklearn.utils.validation import check_X_y, check_array, check_is_fitted"
89 | ]
90 | },
91 | {
92 | "cell_type": "code",
93 | "execution_count": null,
94 | "metadata": {},
95 | "outputs": [],
96 | "source": [
97 | "help(BaseEstimator)"
98 | ]
99 | },
100 | {
101 | "cell_type": "code",
102 | "execution_count": null,
103 | "metadata": {},
104 | "outputs": [],
105 | "source": [
106 | "help(RegressorMixin)"
107 | ]
108 | },
109 | {
110 | "cell_type": "code",
111 | "execution_count": null,
112 | "metadata": {},
113 | "outputs": [],
114 | "source": [
115 | "class CustomLinearRegression(BaseEstimator, RegressorMixin):\n",
116 | " \n",
117 | " def __init__(self, method='ols'):\n",
118 | " self.method = method\n",
119 | " \n",
120 | " def fit(self, X, y):\n",
121 | " X, y = check_X_y(X, y)\n",
122 | " self.n_features_in_ = X.shape[1]\n",
123 | " X_ = np.c_[np.ones(X.shape[0]), X]\n",
124 | " # Вариант 1. не пройдет тесты check_estimator\n",
125 | " self.coef_ = np.linalg.inv(X_.T @ X_) @ X_.T @ y\n",
126 | " # Вариант 2. пройдет тесты check_estimator\n",
127 | " # self.coef_ = np.linalg.pinv(X_) @ y\n",
128 | " return self\n",
129 | " \n",
130 | " def predict(self, X):\n",
131 | " check_is_fitted(self, 'coef_')\n",
132 | " X = check_array(X)\n",
133 | " X_ = np.c_[np.ones(X.shape[0]), X]\n",
134 | " return X_ @ self.coef_"
135 | ]
136 | },
137 | {
138 | "cell_type": "markdown",
139 | "metadata": {},
140 | "source": [
141 | "Доступ к параметрам"
142 | ]
143 | },
144 | {
145 | "cell_type": "code",
146 | "execution_count": null,
147 | "metadata": {},
148 | "outputs": [],
149 | "source": [
150 | "model = CustomLinearRegression()\n",
151 | "# model.get_params(deep=True)"
152 | ]
153 | },
154 | {
155 | "cell_type": "code",
156 | "execution_count": null,
157 | "metadata": {},
158 | "outputs": [],
159 | "source": [
160 | "# model.set_params(method='gd')"
161 | ]
162 | },
163 | {
164 | "cell_type": "markdown",
165 | "metadata": {},
166 | "source": [
167 | "Проверка на совместимость с `sklearn`"
168 | ]
169 | },
170 | {
171 | "cell_type": "code",
172 | "execution_count": null,
173 | "metadata": {},
174 | "outputs": [],
175 | "source": [
176 | "from sklearn.utils.estimator_checks import check_estimator\n",
177 | "from sklearn.base import is_regressor "
178 | ]
179 | },
180 | {
181 | "cell_type": "code",
182 | "execution_count": null,
183 | "metadata": {},
184 | "outputs": [],
185 | "source": [
186 | "# Если есть проблемы с нижележащей командой, обновите threadpoolctl\n",
187 | "# %pip install threadpoolctl==3.1.0"
188 | ]
189 | },
190 | {
191 | "cell_type": "code",
192 | "execution_count": null,
193 | "metadata": {},
194 | "outputs": [],
195 | "source": [
196 | "try:\n",
197 | " check_estimator(estimator=CustomLinearRegression())\n",
198 | "except Exception as e:\n",
199 | " print(e)"
200 | ]
201 | },
202 | {
203 | "cell_type": "code",
204 | "execution_count": null,
205 | "metadata": {},
206 | "outputs": [],
207 | "source": [
208 | "# Note: Based on _estimator_type\n",
209 | "is_regressor(CustomLinearRegression())"
210 | ]
211 | },
212 | {
213 | "cell_type": "markdown",
214 | "metadata": {},
215 | "source": [
216 | "Совместимые с `sklearn` объекты можно использовать в `GridSearchCV` для выбора моделей и в `Pipeline` для организации последовательности обработки данных."
217 | ]
218 | },
219 | {
220 | "cell_type": "markdown",
221 | "metadata": {},
222 | "source": [
223 | "### Пример"
224 | ]
225 | },
226 | {
227 | "cell_type": "code",
228 | "execution_count": null,
229 | "metadata": {},
230 | "outputs": [],
231 | "source": [
232 | "def generate_data(n=100, start_x=4, length_x=8, mu=0, sigma=0.5):\n",
233 | " \"\"\"Генерация данных.\"\"\"\n",
234 | " from scipy import stats\n",
235 | " f = lambda x: 2 + 0.3*x\n",
236 | " x = stats.uniform.rvs(size=n, loc=start_x, scale=length_x, random_state=1)\n",
237 | " e = stats.norm.rvs(size=n, loc=mu, scale=sigma, random_state=1)\n",
238 | " return x.reshape(-1,1), f(x) + e"
239 | ]
240 | },
241 | {
242 | "cell_type": "code",
243 | "execution_count": null,
244 | "metadata": {},
245 | "outputs": [],
246 | "source": [
247 | "X, y = generate_data()"
248 | ]
249 | },
250 | {
251 | "cell_type": "code",
252 | "execution_count": null,
253 | "metadata": {},
254 | "outputs": [],
255 | "source": [
256 | "# Отображение наблюдений\n",
257 | "plt.figure(1, figsize=[4, 4])\n",
258 | "\n",
259 | "plt.subplot(1,1,1)\n",
260 | "plt.scatter(X[:,0], y, color=\"green\", label=\"Sample\", zorder=2)\n",
261 | "plt.legend()\n",
262 | "plt.xlabel(\"$x$\")\n",
263 | "plt.ylabel(\"$f(x)$\")\n",
264 | "plt.grid(True)"
265 | ]
266 | },
267 | {
268 | "cell_type": "code",
269 | "execution_count": null,
270 | "metadata": {},
271 | "outputs": [],
272 | "source": [
273 | "from sklearn.model_selection import train_test_split"
274 | ]
275 | },
276 | {
277 | "cell_type": "code",
278 | "execution_count": null,
279 | "metadata": {},
280 | "outputs": [],
281 | "source": [
282 | "# Разбиение данных на обучающие и тестовые\n",
283 | "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=10)\n",
284 | "X_train[:5], y_train[:5]"
285 | ]
286 | },
287 | {
288 | "cell_type": "code",
289 | "execution_count": null,
290 | "metadata": {},
291 | "outputs": [],
292 | "source": [
293 | "model = CustomLinearRegression()"
294 | ]
295 | },
296 | {
297 | "cell_type": "code",
298 | "execution_count": null,
299 | "metadata": {},
300 | "outputs": [],
301 | "source": [
302 | "# Обучение\n",
303 | "model.fit(X_train, y_train)\n",
304 | "model.coef_"
305 | ]
306 | },
307 | {
308 | "cell_type": "code",
309 | "execution_count": null,
310 | "metadata": {},
311 | "outputs": [],
312 | "source": [
313 | "# Отображение наблюдений и линии регрессии\n",
314 | "plt.figure(2, figsize=[4, 4])\n",
315 | "\n",
316 | "xx = np.linspace(X[:,0].min(),X[:,0].max(), 2).reshape(-1,1)\n",
317 | "\n",
318 | "plt.subplot(1,1,1)\n",
319 | "plt.scatter(X[:,0], y, color=\"green\", label=\"Sample\", zorder=2)\n",
320 | "plt.plot(xx, model.predict(xx), \"-\", color=\"grey\", label=\"Regression\")\n",
321 | "plt.xlabel(\"$x$\")\n",
322 | "plt.ylabel(\"$f(x)$\")\n",
323 | "plt.legend()\n",
324 | "plt.grid(True)"
325 | ]
326 | },
327 | {
328 | "cell_type": "code",
329 | "execution_count": null,
330 | "metadata": {},
331 | "outputs": [],
332 | "source": [
333 | "model.score(X_test, y_test)"
334 | ]
335 | },
336 | {
337 | "cell_type": "markdown",
338 | "metadata": {},
339 | "source": [
340 | "Сравнение с реализацией в `sklearn`"
341 | ]
342 | },
343 | {
344 | "cell_type": "code",
345 | "execution_count": null,
346 | "metadata": {},
347 | "outputs": [],
348 | "source": [
349 | "from sklearn.linear_model import LinearRegression"
350 | ]
351 | },
352 | {
353 | "cell_type": "code",
354 | "execution_count": null,
355 | "metadata": {},
356 | "outputs": [],
357 | "source": [
358 | "buildin_model = LinearRegression().fit(X_train, y_train)\n",
359 | "buildin_model.intercept_, buildin_model.coef_"
360 | ]
361 | },
362 | {
363 | "cell_type": "code",
364 | "execution_count": null,
365 | "metadata": {},
366 | "outputs": [],
367 | "source": [
368 | "buildin_model.score(X_test, y_test)"
369 | ]
370 | },
371 | {
372 | "cell_type": "markdown",
373 | "metadata": {},
374 | "source": [
375 | "## Реализация транформации"
376 | ]
377 | },
378 | {
379 | "cell_type": "code",
380 | "execution_count": null,
381 | "metadata": {},
382 | "outputs": [],
383 | "source": [
384 | "help(TransformerMixin)"
385 | ]
386 | },
387 | {
388 | "cell_type": "code",
389 | "execution_count": null,
390 | "metadata": {},
391 | "outputs": [],
392 | "source": [
393 | "class CustomStandardTransformer(BaseEstimator, TransformerMixin):\n",
394 | " \n",
395 | " def __init__(self):\n",
396 | " pass\n",
397 | " \n",
398 | " def fit(self, X, y=None):\n",
399 | " # TODO(X, y)\n",
400 | " self.params_ = ...\n",
401 | " return self\n",
402 | " \n",
403 | " def transform(self, X):\n",
404 | " # TODO(X, params_)\n",
405 | " X_ = ...\n",
406 | " return X_"
407 | ]
408 | },
409 | {
410 | "cell_type": "markdown",
411 | "metadata": {},
412 | "source": [
413 | "## Применение Pipeline"
414 | ]
415 | },
416 | {
417 | "cell_type": "code",
418 | "execution_count": null,
419 | "metadata": {},
420 | "outputs": [],
421 | "source": [
422 | "from sklearn.pipeline import Pipeline"
423 | ]
424 | },
425 | {
426 | "cell_type": "code",
427 | "execution_count": null,
428 | "metadata": {},
429 | "outputs": [],
430 | "source": [
431 | "# TODO: AddOneTransformer\n",
432 | "# TODO: CustomLinearRegression"
433 | ]
434 | },
435 | {
436 | "cell_type": "code",
437 | "execution_count": null,
438 | "metadata": {},
439 | "outputs": [],
440 | "source": [
441 | "pipeline = Pipeline([\n",
442 | " (\"addone\", AddOneTransformer()),\n",
443 | " (\"regressor\", CustomLinearRegression())\n",
444 | "])"
445 | ]
446 | },
447 | {
448 | "cell_type": "code",
449 | "execution_count": null,
450 | "metadata": {},
451 | "outputs": [],
452 | "source": [
453 | "# Обучение\n",
454 | "pipeline.fit(X_train, y_train)\n",
455 | "\n",
456 | "# Параметры модели\n",
457 | "print(f'w = {pipeline.named_steps[\"regressor\"].coef_}')\n",
458 | "\n",
459 | "# Качество модели\n",
460 | "print(f'R^2 = {pipeline.score(X_test, y_test)}')\n",
461 | "\n",
462 | "# Предсказание\n",
463 | "y_test__pred = pipeline.predict(X_test)\n",
464 | "y_test__pred[:5]"
465 | ]
466 | },
467 | {
468 | "cell_type": "code",
469 | "execution_count": null,
470 | "metadata": {},
471 | "outputs": [],
472 | "source": [
473 | "# Отображение наблюдений и линии регрессии\n",
474 | "plt.figure(2, figsize=[4, 4])\n",
475 | "\n",
476 | "xx = np.linspace(X[:,0].min(),X[:,0].max(), 2).reshape(-1,1)\n",
477 | "\n",
478 | "plt.subplot(1,1,1)\n",
479 | "plt.scatter(X[:,0], y, color=\"green\", label=\"Sample\", zorder=2)\n",
480 | "plt.plot(xx, pipeline.predict(xx), \"-\", color=\"grey\", label=\"Regression\")\n",
481 | "plt.xlabel(\"$x$\")\n",
482 | "plt.ylabel(\"$f(x)$\")\n",
483 | "plt.legend()\n",
484 | "plt.grid(True)"
485 | ]
486 | },
487 | {
488 | "cell_type": "markdown",
489 | "metadata": {},
490 | "source": [
491 | "## Сериализация модели"
492 | ]
493 | },
494 | {
495 | "cell_type": "markdown",
496 | "metadata": {},
497 | "source": [
498 | "`joblib`"
499 | ]
500 | },
501 | {
502 | "cell_type": "code",
503 | "execution_count": null,
504 | "metadata": {},
505 | "outputs": [],
506 | "source": [
507 | "from joblib import dump, load"
508 | ]
509 | },
510 | {
511 | "cell_type": "code",
512 | "execution_count": null,
513 | "metadata": {},
514 | "outputs": [],
515 | "source": [
516 | "# Обученная модель\n",
517 | "linear_model = LinearRegression().fit(X_train, y_train)\n",
518 | "linear_model.intercept_, buildin_model.coef_"
519 | ]
520 | },
521 | {
522 | "cell_type": "code",
523 | "execution_count": null,
524 | "metadata": {},
525 | "outputs": [],
526 | "source": [
527 | "FILE_NAME = 'linear_model.joblib'"
528 | ]
529 | },
530 | {
531 | "cell_type": "code",
532 | "execution_count": null,
533 | "metadata": {},
534 | "outputs": [],
535 | "source": [
536 | "# Сохранение модели (сериализация модели)\n",
537 | "dump(linear_model, FILE_NAME) "
538 | ]
539 | },
540 | {
541 | "cell_type": "code",
542 | "execution_count": null,
543 | "metadata": {},
544 | "outputs": [],
545 | "source": [
546 | "# Там где загружается модель, должен быть\n",
547 | "# доступен класс модели\n",
548 | "from sklearn.linear_model import LinearRegression\n",
549 | "\n",
550 | "# Загрузка модели (десериализация модели)\n",
551 | "linear_model = load(FILE_NAME)\n",
552 | "\n",
553 | "# Проверка\n",
554 | "linear_model.intercept_, buildin_model.coef_"
555 | ]
556 | },
557 | {
558 | "cell_type": "markdown",
559 | "metadata": {},
560 | "source": [
561 | "## Источники"
562 | ]
563 | },
564 | {
565 | "cell_type": "markdown",
566 | "metadata": {},
567 | "source": [
568 | "- [Developing scikit-learn estimators](https://scikit-learn.org/stable/developers/develop.html)\n",
569 | "- [Utilities for Developers](https://scikit-learn.org/stable/developers/utilities.html#developers-utils)\n",
570 | "- [Glossary of Common Terms and API Elements](https://scikit-learn.org/stable/glossary.html#glossary)\n",
571 | "- [A template for scikit-learn contributions](https://github.com/scikit-learn-contrib/project-template)"
572 | ]
573 | },
574 | {
575 | "cell_type": "code",
576 | "execution_count": null,
577 | "metadata": {},
578 | "outputs": [],
579 | "source": []
580 | }
581 | ],
582 | "metadata": {
583 | "kernelspec": {
584 | "display_name": "Python 3 (ipykernel)",
585 | "language": "python",
586 | "name": "python3"
587 | },
588 | "language_info": {
589 | "codemirror_mode": {
590 | "name": "ipython",
591 | "version": 3
592 | },
593 | "file_extension": ".py",
594 | "mimetype": "text/x-python",
595 | "name": "python",
596 | "nbconvert_exporter": "python",
597 | "pygments_lexer": "ipython3",
598 | "version": "3.11.7"
599 | }
600 | },
601 | "nbformat": 4,
602 | "nbformat_minor": 4
603 | }
604 |
--------------------------------------------------------------------------------
/notebooks/C5_HAR.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Распознавание активности (Human Activity Recognition - HAR)\n",
8 | "\n",
9 | "С.Ю. Папулин (papulin.study@yandex.ru)"
10 | ]
11 | },
12 | {
13 | "cell_type": "markdown",
14 | "metadata": {},
15 | "source": [
16 | "### Содержание\n",
17 | "\n",
18 | "- [Анализ исходных данных]()\n",
19 | "- [Построение модели распознавания активности]()\n",
20 | "- [Выбор модели]()"
21 | ]
22 | },
23 | {
24 | "cell_type": "markdown",
25 | "metadata": {},
26 | "source": [
27 | "Подключение модулей"
28 | ]
29 | },
30 | {
31 | "cell_type": "code",
32 | "execution_count": null,
33 | "metadata": {},
34 | "outputs": [],
35 | "source": [
36 | "import pandas as pd\n",
37 | "import numpy as np\n",
38 | "\n",
39 | "import matplotlib.pyplot as plt\n",
40 | "%matplotlib inline"
41 | ]
42 | },
43 | {
44 | "cell_type": "code",
45 | "execution_count": null,
46 | "metadata": {},
47 | "outputs": [],
48 | "source": [
49 | "RANDOM_STATE = 1234"
50 | ]
51 | },
52 | {
53 | "cell_type": "markdown",
54 | "metadata": {},
55 | "source": [
56 | "## Анализ исходных данных"
57 | ]
58 | },
59 | {
60 | "cell_type": "markdown",
61 | "metadata": {},
62 | "source": [
63 | "Описание: [Human Activity Recognition Using Smartphones Data Set](https://archive.ics.uci.edu/ml/datasets/Human+Activity+Recognition+Using+Smartphones)\n",
64 | "\n",
65 | "Ссылка: [UCI HAR Dataset.zip](https://archive.ics.uci.edu/ml/machine-learning-databases/00240/UCI%20HAR%20Dataset.zip)"
66 | ]
67 | },
68 | {
69 | "cell_type": "markdown",
70 | "metadata": {},
71 | "source": [
72 | "Загрузка исходных данных"
73 | ]
74 | },
75 | {
76 | "cell_type": "code",
77 | "execution_count": null,
78 | "metadata": {},
79 | "outputs": [],
80 | "source": [
81 | "YOUR_PATH = \"/YOUR_PATH/UCI HAR Dataset\""
82 | ]
83 | },
84 | {
85 | "cell_type": "code",
86 | "execution_count": null,
87 | "metadata": {},
88 | "outputs": [],
89 | "source": [
90 | "# Наименования активностей\n",
91 | "LABEL_NAMES_FILE = f\"{YOUR_PATH}/activity_labels.txt\"\n",
92 | "\n",
93 | "# Наименование столбцов признаков\n",
94 | "FEATURE_NAMES_FILE = f\"{YOUR_PATH}/features.txt\"\n",
95 | "\n",
96 | "# Идентификаторы испытуемых\n",
97 | "X_TRAIN_SUBJECT_FILE = f\"{YOUR_PATH}/train/subject_train.txt\"\n",
98 | "\n",
99 | "\n",
100 | "# Признаки (временные, частотные) и целевых значения (активности)\n",
101 | "\n",
102 | "# Обучающая часть\n",
103 | "X_TRAIN_FILE = f\"{YOUR_PATH}/train/X_train.txt\"\n",
104 | "Y_TRAIN_FILE = f\"{YOUR_PATH}/train/y_train.txt\"\n",
105 | "\n",
106 | "# Тестовая часть\n",
107 | "X_TEST_FILE = f\"{YOUR_PATH}/test/X_test.txt\"\n",
108 | "Y_TEST_FILE = f\"{YOUR_PATH}/test/y_test.txt\""
109 | ]
110 | },
111 | {
112 | "cell_type": "code",
113 | "execution_count": null,
114 | "metadata": {},
115 | "outputs": [],
116 | "source": [
117 | "# Загрузка наименований активностей\n",
118 | "LABEL_NAMES = list()\n",
119 | "with open(LABEL_NAMES_FILE, \"r\") as fin:\n",
120 | " for line in fin:\n",
121 | " LABEL_NAMES.append(line.split()[1])\n",
122 | "LABEL_NAMES"
123 | ]
124 | },
125 | {
126 | "cell_type": "code",
127 | "execution_count": null,
128 | "metadata": {},
129 | "outputs": [],
130 | "source": [
131 | "# Загрузка наименований столбцов\n",
132 | "CLMS = list()\n",
133 | "with open(FEATURE_NAMES_FILE, \"r\") as fin:\n",
134 | " for line in fin:\n",
135 | " CLMS.append(line.split()[1])\n",
136 | "len(CLMS)"
137 | ]
138 | },
139 | {
140 | "cell_type": "code",
141 | "execution_count": null,
142 | "metadata": {},
143 | "outputs": [],
144 | "source": [
145 | "CLMS = [str(indx+1) +\".\" + el for indx, el in enumerate(CLMS)]"
146 | ]
147 | },
148 | {
149 | "cell_type": "code",
150 | "execution_count": null,
151 | "metadata": {},
152 | "outputs": [],
153 | "source": [
154 | "# Вывод нескольких наименований\n",
155 | "CLMS[:5]"
156 | ]
157 | },
158 | {
159 | "cell_type": "code",
160 | "execution_count": null,
161 | "metadata": {},
162 | "outputs": [],
163 | "source": [
164 | "# Загрузка идентификаторов испытуемых\n",
165 | "df_subjects = pd.read_csv(X_TRAIN_SUBJECT_FILE, header=None, sep=\"\\s+\", names=[\"subject\"])\n",
166 | "df_subjects.head()"
167 | ]
168 | },
169 | {
170 | "cell_type": "code",
171 | "execution_count": null,
172 | "metadata": {},
173 | "outputs": [],
174 | "source": [
175 | "# Загрузка признаков\n",
176 | "df_features = pd.read_csv(X_TRAIN_FILE, header=None, sep=\"\\s+\", names=CLMS)\n",
177 | "df_features.head()"
178 | ]
179 | },
180 | {
181 | "cell_type": "code",
182 | "execution_count": null,
183 | "metadata": {},
184 | "outputs": [],
185 | "source": [
186 | "# Загрузка целевых значений\n",
187 | "df_labels = pd.read_csv(Y_TRAIN_FILE, header=None, names=[\"activity\"])\n",
188 | "df_labels.head(5)"
189 | ]
190 | },
191 | {
192 | "cell_type": "code",
193 | "execution_count": null,
194 | "metadata": {},
195 | "outputs": [],
196 | "source": [
197 | "# Формирование одного датафрейма\n",
198 | "df = pd.concat([df_subjects, df_features, df_labels], axis=1)\n",
199 | "df.head()"
200 | ]
201 | },
202 | {
203 | "cell_type": "markdown",
204 | "metadata": {},
205 | "source": [
206 | "Отображение количества различных активностей"
207 | ]
208 | },
209 | {
210 | "cell_type": "code",
211 | "execution_count": null,
212 | "metadata": {},
213 | "outputs": [],
214 | "source": [
215 | "fig, ax = plt.subplots(1, 1)\n",
216 | "\n",
217 | "fig.set_figheight(4)\n",
218 | "fig.set_figwidth(6)\n",
219 | "\n",
220 | "\n",
221 | "ax = df.groupby(\"activity\").size().plot.bar(ax=ax)\n",
222 | "ax.set_xticklabels(LABEL_NAMES, rotation=60)\n",
223 | "ax.set_ylabel(\"window count\")\n",
224 | "ax.grid(True)"
225 | ]
226 | },
227 | {
228 | "cell_type": "markdown",
229 | "metadata": {},
230 | "source": [
231 | "Распределение активностей 5-го испытуемого"
232 | ]
233 | },
234 | {
235 | "cell_type": "code",
236 | "execution_count": null,
237 | "metadata": {},
238 | "outputs": [],
239 | "source": [
240 | "SUBJECT_ID = 5"
241 | ]
242 | },
243 | {
244 | "cell_type": "code",
245 | "execution_count": null,
246 | "metadata": {},
247 | "outputs": [],
248 | "source": [
249 | "fig, ax = plt.subplots(1, 1)\n",
250 | "\n",
251 | "fig.set_figheight(4)\n",
252 | "fig.set_figwidth(6)\n",
253 | "\n",
254 | "ax = df[df[\"subject\"]==SUBJECT_ID]\\\n",
255 | " .groupby(\"activity\")\\\n",
256 | " .size()\\\n",
257 | " .plot.bar(ax=ax)\n",
258 | "ax.set_xticklabels(LABEL_NAMES, rotation=60)\n",
259 | "ax.set_ylabel(\"window count\")\n",
260 | "ax.grid(True)"
261 | ]
262 | },
263 | {
264 | "cell_type": "markdown",
265 | "metadata": {},
266 | "source": [
267 | "Отображение данных от акселерометра по координатам"
268 | ]
269 | },
270 | {
271 | "cell_type": "code",
272 | "execution_count": null,
273 | "metadata": {},
274 | "outputs": [],
275 | "source": [
276 | "ACC_CLMS = [\"1.tBodyAcc-mean()-X\", \"2.tBodyAcc-mean()-Y\", \"3.tBodyAcc-mean()-Z\", \"activity\"]\n",
277 | "\n",
278 | "df_acc = df.loc[\n",
279 | " df[\"subject\"]==SUBJECT_ID,\n",
280 | " ACC_CLMS\n",
281 | "]\n",
282 | "\n",
283 | "df_acc.index = range(len(df_acc))\n",
284 | "df_acc.head()"
285 | ]
286 | },
287 | {
288 | "cell_type": "code",
289 | "execution_count": null,
290 | "metadata": {},
291 | "outputs": [],
292 | "source": [
293 | "fig, axes = plt.subplots(3, 1)\n",
294 | "\n",
295 | "fig.set_figheight(12)\n",
296 | "fig.set_figwidth(12)\n",
297 | "\n",
298 | "for indx, ax in enumerate(axes):\n",
299 | "\n",
300 | " df_acc[ACC_CLMS[indx]].plot(ax=ax, color=\"grey\")\n",
301 | "\n",
302 | " ax.set_title(\"Subject {}: {}\".format(SUBJECT_ID, ACC_CLMS[indx]))\n",
303 | " ax.set_xlabel(\"window index\")\n",
304 | " ax.set_ylabel(\"acc\")\n",
305 | "\n",
306 | " for i in range(1, len(LABEL_NAMES)+1):\n",
307 | " df_acc[df_acc[\"activity\"]==i][ACC_CLMS[indx]].plot(\n",
308 | " marker=\"o\", linestyle=\"\", ax=ax, \n",
309 | " label=LABEL_NAMES[i-1]\n",
310 | " )\n",
311 | "\n",
312 | " ax.grid(True)\n",
313 | " ax.legend()\n",
314 | "\n",
315 | "plt.tight_layout()\n",
316 | "plt.show()"
317 | ]
318 | },
319 | {
320 | "cell_type": "markdown",
321 | "metadata": {},
322 | "source": [
323 | "## Построение модели распознавания активности"
324 | ]
325 | },
326 | {
327 | "cell_type": "code",
328 | "execution_count": null,
329 | "metadata": {},
330 | "outputs": [],
331 | "source": [
332 | "from sklearn.linear_model import LogisticRegression\n",
333 | "from sklearn.metrics import classification_report"
334 | ]
335 | },
336 | {
337 | "cell_type": "code",
338 | "execution_count": null,
339 | "metadata": {},
340 | "outputs": [],
341 | "source": [
342 | "import sys\n",
343 | "sys.path.insert(0, \"../lib/\")\n",
344 | "from plot_confusion_matrix import plot_confusion_matrix"
345 | ]
346 | },
347 | {
348 | "cell_type": "code",
349 | "execution_count": null,
350 | "metadata": {},
351 | "outputs": [],
352 | "source": [
353 | "from sklearn.utils import shuffle"
354 | ]
355 | },
356 | {
357 | "cell_type": "code",
358 | "execution_count": null,
359 | "metadata": {},
360 | "outputs": [],
361 | "source": [
362 | "df_ = shuffle(df, random_state=RANDOM_STATE)\n",
363 | "df_.head()"
364 | ]
365 | },
366 | {
367 | "cell_type": "code",
368 | "execution_count": null,
369 | "metadata": {},
370 | "outputs": [],
371 | "source": [
372 | "# Построение модели\n",
373 | "model = LogisticRegression(penalty=\"l2\", \n",
374 | " max_iter=100, \n",
375 | " solver=\"newton-cg\", \n",
376 | " multi_class=\"multinomial\",\n",
377 | " random_state=RANDOM_STATE)\n",
378 | "\n",
379 | "# Обучение\n",
380 | "model.fit(df_[CLMS], df_[\"activity\"])"
381 | ]
382 | },
383 | {
384 | "cell_type": "markdown",
385 | "metadata": {},
386 | "source": [
387 | "Базовая отметка"
388 | ]
389 | },
390 | {
391 | "cell_type": "code",
392 | "execution_count": null,
393 | "metadata": {},
394 | "outputs": [],
395 | "source": [
396 | "# TODO"
397 | ]
398 | },
399 | {
400 | "cell_type": "markdown",
401 | "metadata": {},
402 | "source": [
403 | "Проверка на тестовом множестве"
404 | ]
405 | },
406 | {
407 | "cell_type": "code",
408 | "execution_count": null,
409 | "metadata": {},
410 | "outputs": [],
411 | "source": [
412 | "# Загрузка тестового множества\n",
413 | "df_test_features = pd.read_csv(X_TEST_FILE, header=None, sep=\"\\s+\", names=CLMS)\n",
414 | "df_test_labels = pd.read_csv(Y_TEST_FILE, header=None, names=[\"activity\"])\n",
415 | "\n",
416 | "df_test = pd.concat([df_test_features, df_test_labels], axis=1)\n",
417 | "df_test.head()"
418 | ]
419 | },
420 | {
421 | "cell_type": "code",
422 | "execution_count": null,
423 | "metadata": {},
424 | "outputs": [],
425 | "source": [
426 | "# Предсказанные значения\n",
427 | "df_test[\"pred\"] = model.predict(df_test[CLMS])\n",
428 | "df_test[[\"pred\", \"activity\"]].head()"
429 | ]
430 | },
431 | {
432 | "cell_type": "code",
433 | "execution_count": null,
434 | "metadata": {},
435 | "outputs": [],
436 | "source": [
437 | "# Предсказанные значения\n",
438 | "df_test[\"pred\"] = model.predict(df_test[CLMS])\n",
439 | "\n",
440 | "# Расчет доли правильных классификаций\n",
441 | "accuracy = model.score(df_test[CLMS], df_test[\"activity\"])\n",
442 | "print(\"Accuracy = {}\\n\".format(accuracy))\n",
443 | "\n",
444 | "# Вывод других метрик\n",
445 | "print(classification_report(df_test[\"activity\"], \n",
446 | " df_test[\"pred\"], \n",
447 | " target_names=LABEL_NAMES))"
448 | ]
449 | },
450 | {
451 | "cell_type": "code",
452 | "execution_count": null,
453 | "metadata": {},
454 | "outputs": [],
455 | "source": [
456 | "# Вывод матрицы ошибок\n",
457 | "ax = plot_confusion_matrix(df_test[\"activity\"]-1,\n",
458 | " df_test[\"pred\"]-1,\n",
459 | " classes=np.array(LABEL_NAMES),\n",
460 | " figsize=[8,8])"
461 | ]
462 | },
463 | {
464 | "cell_type": "markdown",
465 | "metadata": {},
466 | "source": [
467 | "## Выбор модели"
468 | ]
469 | },
470 | {
471 | "cell_type": "code",
472 | "execution_count": null,
473 | "metadata": {},
474 | "outputs": [],
475 | "source": [
476 | "from sklearn.model_selection import StratifiedKFold\n",
477 | "from sklearn.model_selection import GridSearchCV\n",
478 | "\n",
479 | "from sklearn.ensemble import RandomForestClassifier\n",
480 | "from sklearn.neighbors import KNeighborsClassifier"
481 | ]
482 | },
483 | {
484 | "cell_type": "code",
485 | "execution_count": null,
486 | "metadata": {},
487 | "outputs": [],
488 | "source": [
489 | "kf = StratifiedKFold(n_splits=3)"
490 | ]
491 | },
492 | {
493 | "cell_type": "code",
494 | "execution_count": null,
495 | "metadata": {},
496 | "outputs": [],
497 | "source": [
498 | "models = dict()\n",
499 | "\n",
500 | "# k-ближайших соседей\n",
501 | "models[\"knn\"] = (\n",
502 | " KNeighborsClassifier(), {\n",
503 | " \"n_neighbors\": [5, 11]\n",
504 | " })\n",
505 | "\n",
506 | "# Логистическая регрессия\n",
507 | "models[\"logreg\"] = (\n",
508 | " LogisticRegression(\n",
509 | " penalty=\"l2\", \n",
510 | " solver=\"newton-cg\", \n",
511 | " multi_class=\"multinomial\",\n",
512 | " random_state=1234), {\n",
513 | " \"C\": [0.1, 1] \n",
514 | " })\n",
515 | "\n",
516 | "# Случайный лес\n",
517 | "models[\"rforest\"] = (\n",
518 | " RandomForestClassifier(\n",
519 | " criterion=\"gini\",\n",
520 | " random_state=RANDOM_STATE), {\n",
521 | " \"n_estimators\": [50, 100] \n",
522 | " })"
523 | ]
524 | },
525 | {
526 | "cell_type": "code",
527 | "execution_count": null,
528 | "metadata": {},
529 | "outputs": [],
530 | "source": [
531 | "df_result = pd.DataFrame(columns=[\"params\", \"accuracy\"])\n",
532 | "\n",
533 | "for name, (model, params) in models.items():\n",
534 | " grid = GridSearchCV(estimator=model, \n",
535 | " param_grid=params, \n",
536 | " cv=kf,\n",
537 | " verbose=2)\n",
538 | " grid.fit(df_[CLMS], df_[\"activity\"])\n",
539 | " df_result.loc[model.__class__.__name__] = (\n",
540 | " grid.best_params_,\n",
541 | " grid.score(df_test[CLMS], df_test[\"activity\"]))"
542 | ]
543 | },
544 | {
545 | "cell_type": "code",
546 | "execution_count": null,
547 | "metadata": {},
548 | "outputs": [],
549 | "source": [
550 | "# Вывод лучших моделей, их параметров и доли правильных классификаций\n",
551 | "df_result.head()"
552 | ]
553 | },
554 | {
555 | "cell_type": "code",
556 | "execution_count": null,
557 | "metadata": {},
558 | "outputs": [],
559 | "source": []
560 | }
561 | ],
562 | "metadata": {
563 | "kernelspec": {
564 | "display_name": "Python 3 (ipykernel)",
565 | "language": "python",
566 | "name": "python3"
567 | },
568 | "language_info": {
569 | "codemirror_mode": {
570 | "name": "ipython",
571 | "version": 3
572 | },
573 | "file_extension": ".py",
574 | "mimetype": "text/x-python",
575 | "name": "python",
576 | "nbconvert_exporter": "python",
577 | "pygments_lexer": "ipython3",
578 | "version": "3.11.7"
579 | }
580 | },
581 | "nbformat": 4,
582 | "nbformat_minor": 4
583 | }
584 |
--------------------------------------------------------------------------------
/notebooks/C5_Language_Detector.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Распознавание языка текста\n",
8 | "\n",
9 | "
\n",
10 | "\n",
11 | "С.Ю. Папулин (papulin.study@yandex.ru)"
12 | ]
13 | },
14 | {
15 | "cell_type": "markdown",
16 | "metadata": {},
17 | "source": [
18 | "### Содержание\n",
19 | "\n",
20 | "- [Статический текст](#Статический-текст)\n",
21 | "- [Динамический текст](#Динамический-текст)\n",
22 | " - [Построение модели](#Построение-модели)\n",
23 | " - [Проверка динамического распознавания](#Проверка-динамического-распознавания)"
24 | ]
25 | },
26 | {
27 | "cell_type": "markdown",
28 | "metadata": {},
29 | "source": [
30 | "Подключение библиотек:"
31 | ]
32 | },
33 | {
34 | "cell_type": "code",
35 | "execution_count": null,
36 | "metadata": {},
37 | "outputs": [],
38 | "source": [
39 | "import numpy as np\n",
40 | "import pandas as pd\n",
41 | "import matplotlib.pyplot as plt\n",
42 | "%matplotlib inline"
43 | ]
44 | },
45 | {
46 | "cell_type": "code",
47 | "execution_count": null,
48 | "metadata": {},
49 | "outputs": [],
50 | "source": [
51 | "from sklearn.model_selection import train_test_split\n",
52 | "from sklearn.pipeline import Pipeline\n",
53 | "from sklearn.preprocessing import PolynomialFeatures\n",
54 | "from sklearn.feature_extraction.text import TfidfVectorizer\n",
55 | "from sklearn.naive_bayes import MultinomialNB"
56 | ]
57 | },
58 | {
59 | "cell_type": "code",
60 | "execution_count": null,
61 | "metadata": {},
62 | "outputs": [],
63 | "source": [
64 | "import sys\n",
65 | "sys.path.insert(0, \"../lib/\")\n",
66 | "from datasets import fetch_20languages"
67 | ]
68 | },
69 | {
70 | "cell_type": "markdown",
71 | "metadata": {},
72 | "source": [
73 | "## Статический текст"
74 | ]
75 | },
76 | {
77 | "cell_type": "markdown",
78 | "metadata": {},
79 | "source": [
80 | "[Набор данных](https://huggingface.co/datasets/papluca/language-identification)"
81 | ]
82 | },
83 | {
84 | "cell_type": "code",
85 | "execution_count": null,
86 | "metadata": {},
87 | "outputs": [],
88 | "source": [
89 | "# Загрузка данных\n",
90 | "dataset = fetch_20languages(return_X_y=True)\n",
91 | "\n",
92 | "# Вывод описания\n",
93 | "print(dataset.DESCR)"
94 | ]
95 | },
96 | {
97 | "cell_type": "code",
98 | "execution_count": null,
99 | "metadata": {},
100 | "outputs": [],
101 | "source": [
102 | "df_train = dataset.data['train']\n",
103 | "df_train"
104 | ]
105 | },
106 | {
107 | "cell_type": "code",
108 | "execution_count": null,
109 | "metadata": {},
110 | "outputs": [],
111 | "source": [
112 | "# from sklearn.datasets import get_data_home\n",
113 | "\n",
114 | "# # Директория по умолчанию, где хранятся данные\n",
115 | "# get_data_home()"
116 | ]
117 | },
118 | {
119 | "cell_type": "code",
120 | "execution_count": null,
121 | "metadata": {},
122 | "outputs": [],
123 | "source": [
124 | "df_train.describe()"
125 | ]
126 | },
127 | {
128 | "cell_type": "code",
129 | "execution_count": null,
130 | "metadata": {},
131 | "outputs": [],
132 | "source": [
133 | "# Количество текстов по каждому классу\n",
134 | "df_train['labels'].value_counts()"
135 | ]
136 | },
137 | {
138 | "cell_type": "code",
139 | "execution_count": null,
140 | "metadata": {},
141 | "outputs": [],
142 | "source": [
143 | "# Среднее количество символов в текстах по каждому классу\n",
144 | "df_train.groupby('labels')['text'].agg(\n",
145 | " lambda text: text.str.len().mean()\n",
146 | ")"
147 | ]
148 | },
149 | {
150 | "cell_type": "code",
151 | "execution_count": null,
152 | "metadata": {},
153 | "outputs": [],
154 | "source": [
155 | "pipeline = Pipeline([\n",
156 | " ('vectorizer', TfidfVectorizer()),\n",
157 | " ('classifier', MultinomialNB())\n",
158 | "])"
159 | ]
160 | },
161 | {
162 | "cell_type": "code",
163 | "execution_count": null,
164 | "metadata": {},
165 | "outputs": [],
166 | "source": [
167 | "pipeline.fit(df_train['text'], df_train['labels'])"
168 | ]
169 | },
170 | {
171 | "cell_type": "code",
172 | "execution_count": null,
173 | "metadata": {},
174 | "outputs": [],
175 | "source": [
176 | "df_test = dataset.data['test']"
177 | ]
178 | },
179 | {
180 | "cell_type": "code",
181 | "execution_count": null,
182 | "metadata": {},
183 | "outputs": [],
184 | "source": [
185 | "pipeline.score(df_test['text'], df_test['labels'])"
186 | ]
187 | },
188 | {
189 | "cell_type": "markdown",
190 | "metadata": {},
191 | "source": [
192 | "## Динамический текст"
193 | ]
194 | },
195 | {
196 | "cell_type": "markdown",
197 | "metadata": {},
198 | "source": [
199 | "### Построение модели"
200 | ]
201 | },
202 | {
203 | "cell_type": "code",
204 | "execution_count": null,
205 | "metadata": {},
206 | "outputs": [],
207 | "source": [
208 | "import re"
209 | ]
210 | },
211 | {
212 | "cell_type": "code",
213 | "execution_count": null,
214 | "metadata": {
215 | "scrolled": true
216 | },
217 | "outputs": [],
218 | "source": [
219 | "# Исходные данные\n",
220 | "df_train.head()"
221 | ]
222 | },
223 | {
224 | "cell_type": "code",
225 | "execution_count": null,
226 | "metadata": {},
227 | "outputs": [],
228 | "source": [
229 | "# Шаблон для делителя строки на слова\n",
230 | "COMPILER = re.compile(\"\\W+\", re.UNICODE)"
231 | ]
232 | },
233 | {
234 | "cell_type": "code",
235 | "execution_count": null,
236 | "metadata": {},
237 | "outputs": [],
238 | "source": [
239 | "def split_sentence(lang, text):\n",
240 | " for word in set(COMPILER.split(text)):\n",
241 | " if word:\n",
242 | " yield (lang, word)\n",
243 | "\n",
244 | "\n",
245 | "def sentence_flat_map(df):\n",
246 | " def fetch_word_lang_pair():\n",
247 | " for i, row in df.iterrows():\n",
248 | " for item in split_sentence(row['labels'], row['text']):\n",
249 | " yield item\n",
250 | " return pd.DataFrame(\n",
251 | " data=fetch_word_lang_pair(), \n",
252 | " columns=['labels', 'word']\n",
253 | " ).drop_duplicates()\n",
254 | "\n",
255 | "\n",
256 | "# Формирование датафрейма язык-слово и удаление повторений\n",
257 | "df_train_new = sentence_flat_map(df_train)\n",
258 | "df_test_new = sentence_flat_map(df_test)\n",
259 | "\n",
260 | "df_train_new"
261 | ]
262 | },
263 | {
264 | "cell_type": "code",
265 | "execution_count": null,
266 | "metadata": {},
267 | "outputs": [],
268 | "source": [
269 | "df_train_new.shape"
270 | ]
271 | },
272 | {
273 | "cell_type": "code",
274 | "execution_count": null,
275 | "metadata": {},
276 | "outputs": [],
277 | "source": [
278 | "INPUT = 'обуч'\n",
279 | "\n",
280 | "print(\n",
281 | " df_train_new[df_train_new['word'].str.contains(INPUT)]\\\n",
282 | " .groupby('labels')\\\n",
283 | " .count().T\n",
284 | ")"
285 | ]
286 | },
287 | {
288 | "cell_type": "code",
289 | "execution_count": null,
290 | "metadata": {},
291 | "outputs": [],
292 | "source": [
293 | "# Априорные вероятности классов\n",
294 | "# class_prior=[\n",
295 | "# 0.04, 0.04, 0.05, 0.05, 0.1, 0.05, 0.05, 0.04, 0.05, 0.05,\n",
296 | "# 0.05, 0.05, 0.05, 0.05, 0.05, 0.04, 0.04, 0.05, 0.05, 0.05\n",
297 | "# ]\n",
298 | "class_prior=[0.05]*20\n",
299 | "\n",
300 | "# Построение модели классификации\n",
301 | "pipeline = Pipeline([\n",
302 | " ('vectorizer', TfidfVectorizer(analyzer='char', ngram_range=(2,4))),\n",
303 | " ('classifier', MultinomialNB(class_prior=class_prior))\n",
304 | "])\n",
305 | "\n",
306 | "# Обучение модели\n",
307 | "pipeline.fit(df_train_new['word'], df_train_new['labels'])\n",
308 | "\n",
309 | "test_accuracy_on_words = pipeline.score(df_test_new['word'], df_test_new['labels'])\n",
310 | "test_accuracy_on_texts = pipeline.score(df_test['text'], df_test['labels'])\n",
311 | "\n",
312 | "# Оценка качества на тестовом множестве\n",
313 | "print(f\"Accuracy on Test (word-lang) = {test_accuracy_on_words}\")"
314 | ]
315 | },
316 | {
317 | "cell_type": "code",
318 | "execution_count": null,
319 | "metadata": {},
320 | "outputs": [],
321 | "source": [
322 | "# Оценка качества на тестовом множестве (из первой задачи)\n",
323 | "print(f\"Accuracy on Test (text-lang) = {test_accuracy_on_texts}\")"
324 | ]
325 | },
326 | {
327 | "cell_type": "code",
328 | "execution_count": null,
329 | "metadata": {},
330 | "outputs": [],
331 | "source": [
332 | "# Словарь\n",
333 | "pipeline.named_steps['vectorizer'].vocabulary_"
334 | ]
335 | },
336 | {
337 | "cell_type": "code",
338 | "execution_count": null,
339 | "metadata": {},
340 | "outputs": [],
341 | "source": [
342 | "# Классы\n",
343 | "langs = pipeline.named_steps['classifier'].classes_\n",
344 | "langs"
345 | ]
346 | },
347 | {
348 | "cell_type": "code",
349 | "execution_count": null,
350 | "metadata": {},
351 | "outputs": [],
352 | "source": [
353 | "INPUT = 'tra'\n",
354 | "\n",
355 | "# Вероятности принадлежности классам для некоторого слова\n",
356 | "probs = pipeline.predict_proba([INPUT,])[0]\n",
357 | "probs"
358 | ]
359 | },
360 | {
361 | "cell_type": "code",
362 | "execution_count": null,
363 | "metadata": {},
364 | "outputs": [],
365 | "source": [
366 | "print(\n",
367 | " pd.DataFrame(\n",
368 | " data={'prob': probs}, \n",
369 | " index=langs)\\\n",
370 | " .sort_values(by='prob', ascending=False)\n",
371 | ")"
372 | ]
373 | },
374 | {
375 | "cell_type": "markdown",
376 | "metadata": {},
377 | "source": [
378 | "### Проверка динамического распознавания "
379 | ]
380 | },
381 | {
382 | "cell_type": "code",
383 | "execution_count": null,
384 | "metadata": {},
385 | "outputs": [],
386 | "source": [
387 | "import ipywidgets as widgets\n",
388 | "from IPython.display import display, clear_output"
389 | ]
390 | },
391 | {
392 | "cell_type": "code",
393 | "execution_count": null,
394 | "metadata": {},
395 | "outputs": [],
396 | "source": [
397 | "def display_prediction(langs, probs):\n",
398 | " \"\"\"\n",
399 | " Отображение вероятностей по языкам \n",
400 | " в виде датафрейма.\n",
401 | " \"\"\"\n",
402 | " print(\n",
403 | " pd.DataFrame(\n",
404 | " data={'prob': probs},\n",
405 | " index=langs\n",
406 | " )\\\n",
407 | " .sort_values('prob', ascending=False)\\\n",
408 | " .head(10)\n",
409 | " )"
410 | ]
411 | },
412 | {
413 | "cell_type": "code",
414 | "execution_count": null,
415 | "metadata": {},
416 | "outputs": [],
417 | "source": [
418 | "# Ввод текста\n",
419 | "text_input = widgets.Text()\n",
420 | "display(text_input)\n",
421 | "\n",
422 | "# Вывод результата предсказания\n",
423 | "output = widgets.Output()\n",
424 | "display(output)\n",
425 | "\n",
426 | "\n",
427 | "def handle_process_text(sender):\n",
428 | " with output:\n",
429 | " clear_output()\n",
430 | " probs = pipeline.predict_proba([sender.new,])[0]\n",
431 | " langs = pipeline.named_steps['classifier'].classes_\n",
432 | " display_prediction(langs, probs)\n",
433 | "\n",
434 | "\n",
435 | "# Отслеживание ввода\n",
436 | "text_input.observe(handle_process_text, names='value')"
437 | ]
438 | },
439 | {
440 | "cell_type": "code",
441 | "execution_count": null,
442 | "metadata": {},
443 | "outputs": [],
444 | "source": []
445 | }
446 | ],
447 | "metadata": {
448 | "kernelspec": {
449 | "display_name": "Python 3 (ipykernel)",
450 | "language": "python",
451 | "name": "python3"
452 | },
453 | "language_info": {
454 | "codemirror_mode": {
455 | "name": "ipython",
456 | "version": 3
457 | },
458 | "file_extension": ".py",
459 | "mimetype": "text/x-python",
460 | "name": "python",
461 | "nbconvert_exporter": "python",
462 | "pygments_lexer": "ipython3",
463 | "version": "3.11.7"
464 | }
465 | },
466 | "nbformat": 4,
467 | "nbformat_minor": 4
468 | }
469 |
--------------------------------------------------------------------------------
/notebooks/C5_Linear_Regression_Fuel_Consumption.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Линейная полиномиальная регрессия\n",
8 | "\n",
9 | "Предсказание расхода топлива автомобилем\n",
10 | "\n",
11 | "
\n",
12 | "\n",
13 | "С.Ю. Папулин (papulin.study@yandex.ru)"
14 | ]
15 | },
16 | {
17 | "cell_type": "markdown",
18 | "metadata": {},
19 | "source": [
20 | "### Содержание\n",
21 | "\n",
22 | "- [Загрузка данных](#Загрузка-данных)\n",
23 | "- [Предсказание расхода топлива](#Предсказание-расхода-топлива)\n",
24 | "- [Источники](#Источники)"
25 | ]
26 | },
27 | {
28 | "cell_type": "markdown",
29 | "metadata": {},
30 | "source": [
31 | "Увеличение области вывода:"
32 | ]
33 | },
34 | {
35 | "cell_type": "code",
36 | "execution_count": null,
37 | "metadata": {},
38 | "outputs": [],
39 | "source": [
40 | "# FIXME: \n",
41 | "# %%javascript\n",
42 | "# IPython.OutputArea.auto_scroll_threshold = 9999;"
43 | ]
44 | },
45 | {
46 | "cell_type": "markdown",
47 | "metadata": {},
48 | "source": [
49 | "Подключение библиотек:"
50 | ]
51 | },
52 | {
53 | "cell_type": "code",
54 | "execution_count": null,
55 | "metadata": {},
56 | "outputs": [],
57 | "source": [
58 | "import numpy as np\n",
59 | "import pandas as pd\n",
60 | "import matplotlib.pyplot as plt\n",
61 | "%matplotlib inline"
62 | ]
63 | },
64 | {
65 | "cell_type": "code",
66 | "execution_count": null,
67 | "metadata": {},
68 | "outputs": [],
69 | "source": [
70 | "from pandas.plotting import scatter_matrix"
71 | ]
72 | },
73 | {
74 | "cell_type": "code",
75 | "execution_count": null,
76 | "metadata": {},
77 | "outputs": [],
78 | "source": [
79 | "from sklearn.model_selection import train_test_split\n",
80 | "from sklearn.linear_model import LinearRegression\n",
81 | "from sklearn.metrics import mean_squared_error"
82 | ]
83 | },
84 | {
85 | "cell_type": "code",
86 | "execution_count": null,
87 | "metadata": {},
88 | "outputs": [],
89 | "source": [
90 | "from sklearn.preprocessing import PolynomialFeatures"
91 | ]
92 | },
93 | {
94 | "cell_type": "markdown",
95 | "metadata": {},
96 | "source": [
97 | "## Загрузка данных"
98 | ]
99 | },
100 | {
101 | "cell_type": "code",
102 | "execution_count": null,
103 | "metadata": {},
104 | "outputs": [],
105 | "source": [
106 | "FILE_PATH = \"../data/auto-mpg.data\""
107 | ]
108 | },
109 | {
110 | "cell_type": "markdown",
111 | "metadata": {},
112 | "source": [
113 | "Признаки:\n",
114 | "1. `mpg`: миль на галлон, действительное значение\n",
115 | "2. `cylinders`: количество цилиндров, дискретное значение\n",
116 | "3. `displacement`: объем двигателя, куб. дюймы, действительное значение\n",
117 | "4. `horsepower`: horsepower: действительное значение\n",
118 | "5. `weight`: вес автомобиля: lbs., действительное значение\n",
119 | "6. `acceleration`: время разгона до 60 mph, сек., действительное значение\n",
120 | "7. `model_year`: год выпуска модели, (по модулю 100), дискретное значение\n",
121 | "8. `origin`: регион (1. American, 2. European, 3. Japanese), дискретное значение\n",
122 | "9. `name`: наименование модели, строка (уникально для каждого экземпляра)"
123 | ]
124 | },
125 | {
126 | "cell_type": "code",
127 | "execution_count": null,
128 | "metadata": {},
129 | "outputs": [],
130 | "source": [
131 | "CLMNS = [\n",
132 | " \"mpg\", \"cylinders\", \"displacement\", \"horsepower\", \n",
133 | " \"weight\", \"acceleration\", \"model_year\", \"origin\", \"name\"\n",
134 | "]\n",
135 | "\n",
136 | "# Загрузка датасета\n",
137 | "# Замечание: \n",
138 | "# 1) Разбиваем на столбцы по пробелам (один и более)\n",
139 | "# 2) Там, где ?, заменяем на NaN\n",
140 | "# 3) Удаляем строки с NaN\n",
141 | "\n",
142 | "df = pd.read_csv(FILE_PATH, \n",
143 | " sep=\"\\s+\", \n",
144 | " names=CLMNS, \n",
145 | " na_values=[\"?\",]).dropna()\n",
146 | "\n",
147 | "df.head()"
148 | ]
149 | },
150 | {
151 | "cell_type": "code",
152 | "execution_count": null,
153 | "metadata": {},
154 | "outputs": [],
155 | "source": [
156 | "scatter_matrix(df, figsize=[12,12])\n",
157 | "plt.show()"
158 | ]
159 | },
160 | {
161 | "cell_type": "code",
162 | "execution_count": null,
163 | "metadata": {},
164 | "outputs": [],
165 | "source": [
166 | "# График\n",
167 | "plt.figure(1, figsize=[12, 4])\n",
168 | "plt.subplot(1,2,1)\n",
169 | "plt.title(\"horsepower\")\n",
170 | "plt.scatter(df[\"horsepower\"], df[\"mpg\"], color=\"green\")\n",
171 | "plt.xlabel(\"$horsepower$\")\n",
172 | "plt.ylabel(\"$mpg$\")\n",
173 | "plt.grid(True)\n",
174 | "\n",
175 | "plt.subplot(1,2,2)\n",
176 | "plt.title(\"weight\")\n",
177 | "plt.scatter(df[\"weight\"], df[\"mpg\"], color=\"green\")\n",
178 | "plt.xlabel(\"$weight$\")\n",
179 | "plt.ylabel(\"$mpg$\")\n",
180 | "plt.grid(True)\n",
181 | "plt.show()"
182 | ]
183 | },
184 | {
185 | "cell_type": "markdown",
186 | "metadata": {},
187 | "source": [
188 | "## Предсказание расхода топлива"
189 | ]
190 | },
191 | {
192 | "cell_type": "markdown",
193 | "metadata": {},
194 | "source": [
195 | "Модель 1:\n",
196 | "\n",
197 | "$$h_1(x) = \\theta_0 + \\theta_1\\cdot\\text{horsepower}$$\n",
198 | "\n",
199 | "Модель 2:\n",
200 | "\n",
201 | "$$h_2(x) = \\theta_0 + \\theta_1\\cdot\\text{horsepower} + \\theta_2\\cdot\\text{horsepower}^2$$\n",
202 | "\n",
203 | "Модель 3:\n",
204 | "\n",
205 | "$$h_3(x) = \\theta_0 + \\theta_1\\cdot\\text{horsepower} + \\theta_2\\cdot\\text{weight} $$\n",
206 | "\n",
207 | "Модель 4:\n",
208 | "\n",
209 | "$$h_4(x) = \\theta_0 + \\theta_1\\cdot\\text{horsepower} + \\theta_2\\cdot\\text{horsepower}^2 + \\theta_3\\cdot\\text{weight} $$"
210 | ]
211 | },
212 | {
213 | "cell_type": "code",
214 | "execution_count": null,
215 | "metadata": {},
216 | "outputs": [],
217 | "source": [
218 | "def create_poly_as_dataframe(df_train, df_test, degree):\n",
219 | " \"\"\"\n",
220 | " Создает датафреймы с полиномами для обучающей и тестовой частей.\n",
221 | " \n",
222 | " Замечание: В данном случае нет необходимости создавать полиномы отдельно\n",
223 | " для обучающего и тестового датафреймов. Можно было бы это сделать для всего\n",
224 | " исходного датафрейма. Однако интерфейс PolynomialFeatures подразумевает\n",
225 | " использование методов fit и transform. Поэтому разделение применяется\n",
226 | " для соблюдения общего подхода.\n",
227 | " \"\"\"\n",
228 | " pf = PolynomialFeatures(degree=degree)\n",
229 | " train_poly = pf.fit_transform(df_train)\n",
230 | " test_poly = pf.transform(df_test)\n",
231 | " return pd.DataFrame(train_poly, index=df_train.index), pd.DataFrame(test_poly, index=df_test.index)\n",
232 | "\n",
233 | "\n",
234 | "def create_poly_as_matrix(df_train, df_test, degree):\n",
235 | " \"\"\"\n",
236 | " Создает матрицы с полиномами для обучающей и тестовой частей.\n",
237 | " \"\"\"\n",
238 | " pf = PolynomialFeatures(degree=degree)\n",
239 | " train_poly = pf.fit_transform(df_train)\n",
240 | " test_poly = pf.transform(df_test)\n",
241 | " return np.asmatrix(train_poly), np.asmatrix(test_poly)"
242 | ]
243 | },
244 | {
245 | "cell_type": "code",
246 | "execution_count": null,
247 | "metadata": {},
248 | "outputs": [],
249 | "source": [
250 | "# # Формирование датафрейма признаков: \n",
251 | "# # исходный датафрейм признаков + датафрейм полиномов (кроме самих признаков и 1)\n",
252 | "# poly_degree = 3\n",
253 | "# num_poly_features = df_train_X[feature_clmns_1].columns.size\n",
254 | "\n",
255 | "# df_train_poly, df_test_poly = create_poly_as_dataframe(df_train_X[feature_clmns_1], \n",
256 | "# df_test_X[feature_clmns_1], \n",
257 | "# degree=poly_degree)\n",
258 | "\n",
259 | "# # Замечание: Используется цикл вместо concat, чтобы избежать\n",
260 | "# # повторного включения столбцов\n",
261 | "# for i in range(num_poly_features+1, df_train_poly.columns.size):\n",
262 | "# df_train_X[i] = df_train_poly[i]\n",
263 | " \n",
264 | "# df_train_X.head()"
265 | ]
266 | },
267 | {
268 | "cell_type": "code",
269 | "execution_count": null,
270 | "metadata": {},
271 | "outputs": [],
272 | "source": [
273 | "def plot_true_predicted(df_X, df_y, label_clmn=\"label\", prediction_clmn=\"prediction\", title=None):\n",
274 | " \"\"\"\n",
275 | " Построение графиков действительных значений и предсказанных \n",
276 | " по каждому признаку.\n",
277 | " \"\"\"\n",
278 | " \n",
279 | " feature_names = df_X.columns\n",
280 | " num_features = feature_names.size\n",
281 | " num_plot_rows = int(np.ceil((num_features+1)/2.0))\n",
282 | " columns = df_X.columns\n",
283 | " \n",
284 | " fig = plt.figure(figsize=[12, 4*num_plot_rows])\n",
285 | " for i in range(num_features):\n",
286 | " plt.subplot(num_plot_rows, 2, i+1)\n",
287 | " plt.vlines(df_X[feature_names[i]], ymin=df_y[label_clmn], ymax=df_y[prediction_clmn], \n",
288 | " colors=\"black\", linestyles=\"dotted\", lw=1, zorder=1)\n",
289 | " plt.scatter(df_X[feature_names[i]], df_y[label_clmn], \n",
290 | " color=\"green\", label=\"true\", zorder=2)\n",
291 | " plt.scatter(df_X[feature_names[i]], df_y[prediction_clmn], \n",
292 | " color=\"red\", label=\"predicted\", zorder=3)\n",
293 | " plt.xlabel(\"$%s$\" % feature_names[i])\n",
294 | " plt.ylabel(\"$%s$\" % label_clmn)\n",
295 | " plt.legend()\n",
296 | " plt.grid(True) \n",
297 | " \n",
298 | " plt.subplot(num_plot_rows, 2, num_features+1)\n",
299 | " plt.scatter(df_y[prediction_clmn], df_y[label_clmn], color=\"slategrey\")\n",
300 | " xlim = plt.gca().get_xlim() \n",
301 | " plt.plot(xlim, xlim, '--', color=\"grey\")\n",
302 | " plt.xlim(xlim) \n",
303 | " plt.xlabel(\"$\\\\bar{y}$\")\n",
304 | " plt.ylabel(\"$y$\")\n",
305 | " plt.grid(True) \n",
306 | " \n",
307 | " plt.tight_layout()\n",
308 | " \n",
309 | "# if title:\n",
310 | "# plt.subplots_adjust(left=0.1, right=0.9, top=0.9, bottom=0.1)\n",
311 | "# plt.suptitle(title, y=.98, fontsize=16)\n",
312 | " plt.show()"
313 | ]
314 | },
315 | {
316 | "cell_type": "code",
317 | "execution_count": null,
318 | "metadata": {
319 | "scrolled": true
320 | },
321 | "outputs": [],
322 | "source": [
323 | "# Столбец целевого значения (действительного значения)\n",
324 | "target_clmn = [\"mpg\"]\n",
325 | "\n",
326 | "# Столбцы признаков (все кроме целевого значения)\n",
327 | "all_feature_clmns = df.columns.delete(df.columns.get_loc(target_clmn[0]))\n",
328 | "\n",
329 | "# Столбцы признаков для моделей\n",
330 | "feature_clmns_1 = [\"horsepower\"]\n",
331 | "feature_clmns_2 = [\"horsepower\", \"horsepower^2\"]\n",
332 | "feature_clmns_3 = [\"horsepower\", \"weight\"]\n",
333 | "feature_clmns_4 = [\"horsepower\", \"horsepower^2\", \"weight\"]\n",
334 | "\n",
335 | "# Разбиение исходных данных на обучающее и тестовое множества\n",
336 | "df_train_X, df_test_X, df_train_y, df_test_y = train_test_split(\n",
337 | " df[all_feature_clmns], df[target_clmn], \n",
338 | " test_size=0.3, random_state=1234)\n",
339 | "\n",
340 | "# Добавление полинома в датафрейм признаков (в данном случае \n",
341 | "# нужна только степень 2 для horsepower)\n",
342 | "\n",
343 | "# Вариант 1\n",
344 | "\n",
345 | "# Добавление столбца (можно сделать и для всего датафрейма)\n",
346 | "df_train_X[\"horsepower^2\"] = df_train_X[\"horsepower\"]**2\n",
347 | "df_test_X[\"horsepower^2\"] = df_test_X[\"horsepower\"]**2\n",
348 | "\n",
349 | "\n",
350 | "# Вариант 2 (с использованием PolynomialFeatures)\n",
351 | "\n",
352 | "# poly_degree = 2\n",
353 | "\n",
354 | "# train_poly_matrix, test_poly_matrix = create_poly_as_matrix(df_train_X[feature_clmns_1], \n",
355 | "# df_test_X[feature_clmns_1], \n",
356 | "# degree=poly_degree)\n",
357 | "# df_train_X[\"horsepower^2\"] = train_poly_matrix[:,2]\n",
358 | "# df_test_X[\"horsepower^2\"] = test_poly_matrix[:,2]\n",
359 | "\n",
360 | "\n",
361 | "# Список столбцов признаков для всех моделей\n",
362 | "features_set = [feature_clmns_1, feature_clmns_2, feature_clmns_3, feature_clmns_4]\n",
363 | "\n",
364 | "# Обучение и оценка качества моделей\n",
365 | "for indx, features in enumerate(features_set):\n",
366 | " \n",
367 | " # Обучение\n",
368 | " model = LinearRegression()\n",
369 | " model.fit(df_train_X[features], df_train_y[target_clmn[0]])\n",
370 | " \n",
371 | " # Параметры обученных моделей\n",
372 | " print(\"Model\", indx + 1)\n",
373 | " print(\"\\tw0 =\", model.intercept_)\n",
374 | " for i, coef in enumerate(model.coef_):\n",
375 | " print(\"\\tw{} = {}\".format(i+1, coef))\n",
376 | " \n",
377 | " # Предсказания\n",
378 | " model_name = \"model_{}_pred\".format(indx+1)\n",
379 | " df_train_y[model_name] = model.predict(df_train_X[features])\n",
380 | " df_test_y[model_name] = model.predict(df_test_X[features])\n",
381 | " \n",
382 | " # Среднеквадратические ошибки на тестовом подмножестве для всех моделей\n",
383 | " mse = mean_squared_error(df_test_y[target_clmn], model.predict(df_test_X[features]))\n",
384 | " print(\"\\tMSE = {}\".format(mse))\n",
385 | " \n",
386 | " # Графики\n",
387 | " plot_true_predicted(df_test_X[features], \n",
388 | " df_test_y, \n",
389 | " label_clmn=target_clmn[0], \n",
390 | " prediction_clmn=model_name)\n",
391 | "\n",
392 | "# Действительные и предсказанные значения для тестовых данных (первые пять)\n",
393 | "df_test_y.head(5)"
394 | ]
395 | },
396 | {
397 | "cell_type": "markdown",
398 | "metadata": {},
399 | "source": [
400 | "## Альтернативная реализация"
401 | ]
402 | },
403 | {
404 | "cell_type": "code",
405 | "execution_count": null,
406 | "metadata": {},
407 | "outputs": [],
408 | "source": [
409 | "def transform_to_poly(df, clmn, inplace=True):\n",
410 | " \"\"\"Трансформация\"\"\"\n",
411 | " clmn_new = \"{}^2\".format(clmn)\n",
412 | " if inplace:\n",
413 | " df[clmn_new] = df[clmn]**2\n",
414 | " return df\n",
415 | " return df.assign(**{clmn_new: df[clmn]**2})\n",
416 | "\n",
417 | "\n",
418 | "def train_and_predit(df, model, feature_clmns, label_clmn, predicted_clmn=\"predicted\", inplace=True):\n",
419 | " \"\"\"Обучение и предсказание\"\"\"\n",
420 | " model.fit(df[feature_clmns], df[label_clmn])\n",
421 | " if inplace:\n",
422 | " df[predicted_clmn] = model.predict(df[feature_clmns])\n",
423 | " return df\n",
424 | " return df.assign(**{predicted_clmn: model.predict(df[feature_clmns])})\n",
425 | "\n",
426 | "\n",
427 | "def predict(df, model, feature_clmns, predicted_clmn=\"predicted\", inplace=True):\n",
428 | " \"\"\"Предсказание\"\"\"\n",
429 | " if inplace:\n",
430 | " df[predicted_clmn] = model.predict(df[feature_clmns])\n",
431 | " return df\n",
432 | " return df.assign(**{predicted_clmn: model.predict(df[feature_clmns])})\n",
433 | "\n",
434 | "\n",
435 | "def describe(df, name, model, feature_clmns, label_clmn, predicted_clmn=\"predicted\"):\n",
436 | " \"\"\"Вывод информации о модели\"\"\"\n",
437 | " \n",
438 | " # Наименование\n",
439 | " print(name)\n",
440 | " \n",
441 | " # Параметры обученных моделей\n",
442 | " print(\"\\tw0 =\", model.intercept_)\n",
443 | " for i, coef in enumerate(model.coef_):\n",
444 | " print(\"\\tw{} = {}\".format(i+1, coef))\n",
445 | " \n",
446 |     "    # Среднеквадратическая ошибка на переданном подмножестве\n",
447 | " mse = mean_squared_error(df[label_clmn], df[predicted_clmn])\n",
448 | " print(\"\\tMSE = {}\".format(mse))\n",
449 | " \n",
450 | " # Графики\n",
451 | " plot_true_predicted(df[feature_clmns], df, \n",
452 | " label_clmn=label_clmn, \n",
453 | " prediction_clmn=predicted_clmn)\n",
454 | " \n",
455 | " return df\n",
456 | " \n",
457 | " \n",
458 | "# Разбиение исходных данных на обучающее и тестовое множества\n",
459 | "df_train, df_test = train_test_split(df, test_size=0.3, random_state=1234)\n",
460 | "\n",
461 | "# Столбцы признаков и целевого значения\n",
462 | "target_clmn = \"mpg\"\n",
463 | "feature_clmns = [\"horsepower\", \"horsepower^2\"]\n",
464 | "\n",
465 | "# Инициализация модели\n",
466 | "model = LinearRegression() \n",
467 | "\n",
468 | "# Обучение и тестирование\n",
469 | "df_train__predicted = df_train\\\n",
470 | " .pipe(transform_to_poly, \"horsepower\", False)\\\n",
471 | " .pipe(train_and_predit, model, feature_clmns, target_clmn)\\\n",
472 | " .pipe(describe, \"Train\", model, feature_clmns, target_clmn)\n",
473 | "\n",
474 | "df_test__predicted = df_test\\\n",
475 | " .pipe(transform_to_poly, \"horsepower\", False)\\\n",
476 | " .pipe(predict, model, feature_clmns)\\\n",
477 | " .pipe(describe, \"Test\", model, feature_clmns, target_clmn)\n",
478 | "\n",
479 | "df_test__predicted[feature_clmns + [target_clmn] + [\"predicted\"]].head(5)"
480 | ]
481 | },
482 | {
483 | "cell_type": "markdown",
484 | "metadata": {},
485 | "source": [
486 | "## Источники"
487 | ]
488 | },
489 | {
490 | "cell_type": "markdown",
491 | "metadata": {},
492 | "source": [
493 | "[Auto MPG Data Set](https://archive.ics.uci.edu/ml/datasets/auto+mpg)"
494 | ]
495 | },
496 | {
497 | "cell_type": "code",
498 | "execution_count": null,
499 | "metadata": {},
500 | "outputs": [],
501 | "source": []
502 | }
503 | ],
504 | "metadata": {
505 | "kernelspec": {
506 | "display_name": "Python 3 (ipykernel)",
507 | "language": "python",
508 | "name": "python3"
509 | },
510 | "language_info": {
511 | "codemirror_mode": {
512 | "name": "ipython",
513 | "version": 3
514 | },
515 | "file_extension": ".py",
516 | "mimetype": "text/x-python",
517 | "name": "python",
518 | "nbconvert_exporter": "python",
519 | "pygments_lexer": "ipython3",
520 | "version": "3.11.7"
521 | }
522 | },
523 | "nbformat": 4,
524 | "nbformat_minor": 4
525 | }
526 |
--------------------------------------------------------------------------------
/notebooks/C8_Preprocessing_And_SemiSupervised.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Кластеризация. Предобработка и обучение с частичным привлечением учителя\n",
8 | "\n",
9 | "\n",
10 | "
\n",
11 | "\n",
12 | "С.Ю. Папулин (papulin.study@yandex.ru)"
13 | ]
14 | },
15 | {
16 | "cell_type": "markdown",
17 | "metadata": {},
18 | "source": [
19 | "### Содержание\n",
20 | "\n",
21 | "- [Предобработка данных для задачи классификации](#Предобработка-данных-для-задачи-классификации)\n",
22 | "- [Обучение с частичным привлечением учителя](#Обучение-с-частичным-привлечением-учителя)\n",
23 | "- [Источники](#Источники)"
24 | ]
25 | },
26 | {
27 | "cell_type": "markdown",
28 | "metadata": {},
29 | "source": [
30 | "Подключение библиотек:"
31 | ]
32 | },
33 | {
34 | "cell_type": "code",
35 | "execution_count": null,
36 | "metadata": {},
37 | "outputs": [],
38 | "source": [
39 | "import warnings\n",
40 | "warnings.filterwarnings('ignore') "
41 | ]
42 | },
43 | {
44 | "cell_type": "code",
45 | "execution_count": null,
46 | "metadata": {},
47 | "outputs": [],
48 | "source": [
49 | "import time\n",
50 | "import numpy as np\n",
51 | "import matplotlib.pyplot as plt\n",
52 | "%matplotlib inline"
53 | ]
54 | },
55 | {
56 | "cell_type": "code",
57 | "execution_count": null,
58 | "metadata": {},
59 | "outputs": [],
60 | "source": [
61 | "from sklearn.svm import SVC\n",
62 | "from sklearn.linear_model import LogisticRegression\n",
63 | "from sklearn.cluster import KMeans\n",
64 | "from sklearn.pipeline import Pipeline"
65 | ]
66 | },
67 | {
68 | "cell_type": "code",
69 | "execution_count": null,
70 | "metadata": {},
71 | "outputs": [],
72 | "source": [
73 | "import sys\n",
74 | "sys.path.insert(0, \"../lib/\")\n",
75 | "from datasets import fetch_fashion_mnist"
76 | ]
77 | },
78 | {
79 | "cell_type": "markdown",
80 | "metadata": {},
81 | "source": [
82 | "## Предобработка данных для задачи классификации"
83 | ]
84 | },
85 | {
86 | "cell_type": "markdown",
87 | "metadata": {},
88 | "source": [
89 | "Загрузка данных"
90 | ]
91 | },
92 | {
93 | "cell_type": "code",
94 | "execution_count": null,
95 | "metadata": {},
96 | "outputs": [],
97 | "source": [
98 | "fashion_dataset = fetch_fashion_mnist(return_X_y=True)\n",
99 | "\n",
100 | "print('Overview\\n', fashion_dataset.DESCR)\n",
101 | "print('Feature names\\n', fashion_dataset.feature_names)"
102 | ]
103 | },
104 | {
105 | "cell_type": "code",
106 | "execution_count": null,
107 | "metadata": {},
108 | "outputs": [],
109 | "source": [
110 | "IMAGE_INDX = 20\n",
111 | "\n",
112 | "print('Image:')\n",
113 | "plt.figure(figsize=[4, 4])\n",
114 | "plt.imshow(fashion_dataset.data['train'][IMAGE_INDX].reshape(-1, 28))\n",
115 | "plt.show()\n",
116 | "\n",
117 | "print('Target:', fashion_dataset.target['train'][IMAGE_INDX])\n",
118 | "print('Name:', fashion_dataset.feature_names[fashion_dataset.target['train'][IMAGE_INDX]])"
119 | ]
120 | },
121 | {
122 | "cell_type": "markdown",
123 | "metadata": {},
124 | "source": [
125 | "Размерность данных"
126 | ]
127 | },
128 | {
129 | "cell_type": "code",
130 | "execution_count": null,
131 | "metadata": {},
132 | "outputs": [],
133 | "source": [
134 | "fashion_dataset.data['train'].shape, fashion_dataset.data['test'].shape"
135 | ]
136 | },
137 | {
138 | "cell_type": "code",
139 | "execution_count": null,
140 | "metadata": {},
141 | "outputs": [],
142 | "source": [
143 | "fashion_dataset.target['train'].shape, fashion_dataset.target['test'].shape"
144 | ]
145 | },
146 | {
147 | "cell_type": "markdown",
148 | "metadata": {},
149 | "source": [
150 | "Обучающее и тестовое подмножества"
151 | ]
152 | },
153 | {
154 | "cell_type": "code",
155 | "execution_count": null,
156 | "metadata": {},
157 | "outputs": [],
158 | "source": [
159 | "X_train_, X_test, y_train_, y_test = *fashion_dataset.data.values(), *fashion_dataset.target.values()\n",
160 | "\n",
161 | "# Уменьшение количества элементов обучающего множества\n",
162 | "X_train = X_train_[:10000]\n",
163 | "y_train = y_train_[:10000]\n",
164 | "\n",
165 | "# Уменьшение размера изображений\n",
166 | "# X_train = X_train.reshape(-1, 28, 28)[:, ::2, ::2].reshape(-1, 14*14)"
167 | ]
168 | },
169 | {
170 | "cell_type": "code",
171 | "execution_count": null,
172 | "metadata": {},
173 | "outputs": [],
174 | "source": [
175 | "np.unique(y_train, return_counts=True)"
176 | ]
177 | },
178 | {
179 | "cell_type": "markdown",
180 | "metadata": {},
181 | "source": [
182 |     "Обучение классификатора"
183 | ]
184 | },
185 | {
186 | "cell_type": "code",
187 | "execution_count": null,
188 | "metadata": {},
189 | "outputs": [],
190 | "source": [
191 | "tick = time.time()\n",
192 | "model = SVC(**{'C': 10, 'kernel': 'poly', 'gamma': 'scale', 'degree': 2})\n",
193 | "model.fit(X_train, y_train)\n",
194 | "print(\"Time =\", time.time() - tick)\n",
195 | "model.score(X_test, y_test)"
196 | ]
197 | },
198 | {
199 | "cell_type": "markdown",
200 | "metadata": {},
201 | "source": [
202 | "Формирование признаков посредством кластеризации"
203 | ]
204 | },
205 | {
206 | "cell_type": "code",
207 | "execution_count": null,
208 | "metadata": {},
209 | "outputs": [],
210 | "source": [
211 | "tick = time.time()\n",
212 | "cluster_model = KMeans(n_clusters=50, random_state=12345)\n",
213 | "cluster_model.fit(X_train)\n",
214 | "print(\"Time =\", time.time() - tick)"
215 | ]
216 | },
217 | {
218 | "cell_type": "code",
219 | "execution_count": null,
220 | "metadata": {},
221 | "outputs": [],
222 | "source": [
223 | "cluster_model.transform(X_train).shape"
224 | ]
225 | },
226 | {
227 | "cell_type": "code",
228 | "execution_count": null,
229 | "metadata": {},
230 | "outputs": [],
231 | "source": [
232 | "# Расстояние до центров кластеров\n",
233 | "cluster_model.transform(X_train)[:1]"
234 | ]
235 | },
236 | {
237 | "cell_type": "code",
238 | "execution_count": null,
239 | "metadata": {},
240 | "outputs": [],
241 | "source": [
242 | "# Предсказание кластеров\n",
243 | "# print(np.argmin(cluster_model.transform(X_train), axis=1)[:5])\n",
244 | "# print(cluster_model.predict(X_train[:5]))"
245 | ]
246 | },
247 | {
248 | "cell_type": "markdown",
249 | "metadata": {},
250 | "source": [
251 |     "Классификатор с новым набором признаков"
252 | ]
253 | },
254 | {
255 | "cell_type": "code",
256 | "execution_count": null,
257 | "metadata": {},
258 | "outputs": [],
259 | "source": [
260 | "tick = time.time()\n",
261 | "model = SVC(**{'C':10, 'kernel':'poly', 'gamma': 'scale', 'degree': 2})\n",
262 | "model.fit(cluster_model.transform(X_train), y_train)\n",
263 | "print('Time =', time.time() - tick)\n",
264 | "model.score(cluster_model.transform(X_test), y_test)"
265 | ]
266 | },
267 | {
268 | "cell_type": "markdown",
269 | "metadata": {},
270 | "source": [
271 | "Реализация посредством `Pipeline`"
272 | ]
273 | },
274 | {
275 | "cell_type": "code",
276 | "execution_count": null,
277 | "metadata": {},
278 | "outputs": [],
279 | "source": [
280 | "tick = time.time()\n",
281 | "pipeline = Pipeline([\n",
282 | " ('cluster_model', KMeans(n_clusters=50, random_state=12345)),\n",
283 | " ('classifier', SVC(**{'C':10, 'kernel': 'poly', 'gamma': 'scale', 'degree': 2})),\n",
284 | "])\n",
285 | "pipeline.fit(X_train, y_train)\n",
286 | "print('Time =', time.time() - tick)\n",
287 | "pipeline.score(X_test, y_test)"
288 | ]
289 | },
290 | {
291 | "cell_type": "markdown",
292 | "metadata": {},
293 | "source": [
294 | "## Обучение с частичным привлечением учителя"
295 | ]
296 | },
297 | {
298 | "cell_type": "code",
299 | "execution_count": null,
300 | "metadata": {},
301 | "outputs": [],
302 | "source": [
303 |     "# Возьмем первые n наблюдений\n",
304 | "N = 50\n",
305 | "X_train_n = X_train[:N]\n",
306 | "y_train_n = y_train[:N]"
307 | ]
308 | },
309 | {
310 | "cell_type": "code",
311 | "execution_count": null,
312 | "metadata": {},
313 | "outputs": [],
314 | "source": [
315 | "np.unique(y_train_n, return_counts=True)"
316 | ]
317 | },
318 | {
319 | "cell_type": "code",
320 | "execution_count": null,
321 | "metadata": {},
322 | "outputs": [],
323 | "source": [
324 | "# Обучаем модель классификации\n",
325 | "tick = time.time()\n",
326 | "model = SVC(**{'C': 10, 'kernel': 'poly', 'gamma': 'scale', 'degree': 2})\n",
327 | "# model = LogisticRegression()\n",
328 | "model.fit(X_train_n, y_train_n)\n",
329 | "print('Accuracy =', model.score(X_test, y_test))\n",
330 | "print('Time =', time.time() - tick)"
331 | ]
332 | },
333 | {
334 | "cell_type": "code",
335 | "execution_count": null,
336 | "metadata": {},
337 | "outputs": [],
338 | "source": [
339 | "# Обучаем модель кластеризации\n",
340 | "tick = time.time()\n",
341 | "cluster_model = KMeans(n_clusters=N, random_state=12345)\n",
342 | "cluster_model.fit(X_train)\n",
343 | "print('Time =', time.time() - tick)"
344 | ]
345 | },
346 | {
347 | "cell_type": "markdown",
348 | "metadata": {},
349 | "source": [
350 |     "Разметка данных, ближайших к центрам кластеров"
351 | ]
352 | },
353 | {
354 | "cell_type": "code",
355 | "execution_count": null,
356 | "metadata": {},
357 | "outputs": [],
358 | "source": [
359 |     "# Индексы наблюдений с минимальным расстоянием до ближайшего кластера\n",
360 | "indices = np.argmin(cluster_model.transform(X_train), axis=0)\n",
361 | "indices"
362 | ]
363 | },
364 | {
365 | "cell_type": "code",
366 | "execution_count": null,
367 | "metadata": {},
368 | "outputs": [],
369 | "source": [
370 | "cols = 10\n",
371 | "row_num = -(-len(indices) // cols)\n",
372 | "\n",
373 | "fig, axs = plt.subplots(row_num, cols, figsize=(14, 2*row_num), squeeze=False)\n",
374 | "for i in range(row_num):\n",
375 | " for j in range(cols):\n",
376 | " indx = i * cols + j\n",
377 | " if indx >= len(indices):\n",
378 | " fig.delaxes(axs[i, j])\n",
379 | " else:\n",
380 | " image = X_train[indices[indx]].reshape(-1, 28)\n",
381 | " axs[i, j].imshow(image)\n",
382 | " axs[i, j].set_title(\n",
383 | " \"cluster={}\".format(indx))\n",
384 | " axs[i, j].axis(\"off\")\n",
385 | "# plt.tight_layout()\n",
386 | "plt.show()"
387 | ]
388 | },
389 | {
390 | "cell_type": "code",
391 | "execution_count": null,
392 | "metadata": {},
393 | "outputs": [],
394 | "source": [
395 | "# Массив наблюдений, соответствующих ранее полученным индексам\n",
396 | "X_train_n_labeled = X_train[indices]\n",
397 | "\n",
398 |     "# Замечание: Эти значения должны быть внесены вручную на основе\n",
399 | "# изображений выше. Однако здесь мы используем уже размеченный \n",
400 | "# набор с целевыми значениями\n",
401 | "y_train_n_labeled = y_train[indices]"
402 | ]
403 | },
404 | {
405 | "cell_type": "code",
406 | "execution_count": null,
407 | "metadata": {},
408 | "outputs": [],
409 | "source": [
410 | "np.unique(y_train_n_labeled, return_counts=True)"
411 | ]
412 | },
413 | {
414 | "cell_type": "code",
415 | "execution_count": null,
416 | "metadata": {},
417 | "outputs": [],
418 | "source": [
419 | "# Обучение на новом наборе из N размеченных данных\n",
420 | "model = SVC(**{'C': 10, 'kernel': 'poly', 'gamma': 'scale', 'degree': 2})\n",
421 | "model.fit(X_train_n_labeled, y_train_n_labeled)\n",
422 | "print('Accuracy =', model.score(X_test, y_test))\n",
423 | "print('Time =', time.time() - tick)"
424 | ]
425 | },
426 | {
427 | "cell_type": "markdown",
428 | "metadata": {},
429 | "source": [
430 | "Разметка всего набора данных"
431 | ]
432 | },
433 | {
434 | "cell_type": "code",
435 | "execution_count": null,
436 | "metadata": {},
437 | "outputs": [],
438 | "source": [
439 | "# Предсказание кластеров\n",
440 | "с__pred = cluster_model.predict(X_train)"
441 | ]
442 | },
443 | {
444 | "cell_type": "code",
445 | "execution_count": null,
446 | "metadata": {},
447 | "outputs": [],
448 | "source": [
449 | "# Сопоставим индексы кластеров и индексы классов (для размеченных вручную изображений)\n",
450 | "y_train_labeled = y_train[indices[с__pred]]\n",
451 | "y_train_labeled"
452 | ]
453 | },
454 | {
455 | "cell_type": "code",
456 | "execution_count": null,
457 | "metadata": {},
458 | "outputs": [],
459 | "source": [
460 | "# Обучение на новых размеченных данных\n",
461 | "tick = time.time()\n",
462 | "model = SVC(**{'C': 10, 'kernel': 'poly', 'gamma': 'scale', 'degree': 2})\n",
463 | "model.fit(X_train, y_train_labeled)\n",
464 | "print('Accuracy =', model.score(X_test, y_test))\n",
465 | "print('Time =', time.time() - tick)"
466 | ]
467 | },
468 | {
469 | "cell_type": "markdown",
470 | "metadata": {},
471 | "source": [
472 | "Приведенные выше способы подходят для повышения качества предсказания моделей при небольшом количестве размеченных данных. Если у нас достаточно большой набор размеченных данных, то не стоит ожидать значительного увеличения качества предсказания"
473 | ]
474 | },
475 | {
476 | "cell_type": "markdown",
477 | "metadata": {},
478 | "source": [
479 | "## Источники"
480 | ]
481 | },
482 | {
483 | "cell_type": "markdown",
484 | "metadata": {},
485 | "source": [
486 | "Hands-on Machine Learning with Scikit-Learn, Keras, and TensorFlow by Aurélien Géron"
487 | ]
488 | },
489 | {
490 | "cell_type": "code",
491 | "execution_count": null,
492 | "metadata": {},
493 | "outputs": [],
494 | "source": []
495 | }
496 | ],
497 | "metadata": {
498 | "kernelspec": {
499 | "display_name": "Python 3 (ipykernel)",
500 | "language": "python",
501 | "name": "python3"
502 | },
503 | "language_info": {
504 | "codemirror_mode": {
505 | "name": "ipython",
506 | "version": 3
507 | },
508 | "file_extension": ".py",
509 | "mimetype": "text/x-python",
510 | "name": "python",
511 | "nbconvert_exporter": "python",
512 | "pygments_lexer": "ipython3",
513 | "version": "3.11.7"
514 | }
515 | },
516 | "nbformat": 4,
517 | "nbformat_minor": 4
518 | }
519 |
--------------------------------------------------------------------------------
/notebooks/css/style.css:
--------------------------------------------------------------------------------
1 | .msg-block {
2 | padding: 10px;
3 | }
4 | .msg-warning {
5 | background-color: lightyellow;
6 | border: 1px solid orange;
7 | border-left: 10px solid orange;
8 | }
9 | .msg-info {
10 | background-color: #e6f2ff;
11 | border: 1px solid #0099ff;
12 | border-left: 10px solid #0099ff;
13 | }
14 | .msg-text-warn {
15 |
16 | }
17 | .msg-text-warn:before {
18 | content: "предупреждение!\A";
19 | text-transform: uppercase;
20 | font-weight: bold;
21 | color: darkorange;
22 | white-space: pre;
23 | }
24 | .msg-text-info {
25 |
26 | }
27 | .msg-text-info:before {
28 | content: "замечание!\A";
29 | text-transform: uppercase;
30 | font-weight: bold;
31 | color: #007acc;
32 | white-space: pre;
33 | }
34 | .msg-imp {
35 | background-color: #ffe6e6;
36 | border: 1px solid red;
37 | border-left: 10px solid red;
38 | }
39 | .msg-text-imp {
40 |
41 | }
42 | .msg-text-imp:before {
43 | content: "внимание!\A";
44 | text-transform: uppercase;
45 | font-weight: bold;
46 | color: darkred;
47 | margin-bottom: 15px;
48 | white-space: pre;
49 |
50 | }
51 | .code-font, .bold, .code-key {
52 | font-family: monospace;
53 | }
54 | .code-key {
55 | color: green;
56 | font-weight: bold;
57 | }
58 | .code-text-key {
59 | color: #007a99;
60 | font-weight: bold;
61 | }
62 |
63 | .code-block {
64 | margin-left: 20px;
65 | padding: 10px;
66 | border-left: 2px solid lightgrey;
67 | }
68 | .bold {
69 | font-weight: bold;
70 | }
71 |
--------------------------------------------------------------------------------
/notebooks/img/gd-one-var-alpha-left.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MLMethods/Practice/ed03a18b83e0973a8fd540fcc6c1f766db857c64/notebooks/img/gd-one-var-alpha-left.png
--------------------------------------------------------------------------------
/notebooks/img/gd-one-var-alpha-right.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MLMethods/Practice/ed03a18b83e0973a8fd540fcc6c1f766db857c64/notebooks/img/gd-one-var-alpha-right.png
--------------------------------------------------------------------------------
/notebooks/img/gd-two-var-alpha-left-sum.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MLMethods/Practice/ed03a18b83e0973a8fd540fcc6c1f766db857c64/notebooks/img/gd-two-var-alpha-left-sum.png
--------------------------------------------------------------------------------
/notebooks/img/gd-two-var-alpha-left.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MLMethods/Practice/ed03a18b83e0973a8fd540fcc6c1f766db857c64/notebooks/img/gd-two-var-alpha-left.png
--------------------------------------------------------------------------------
/notebooks/img/gd-two-var-alpha-right-sum.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MLMethods/Practice/ed03a18b83e0973a8fd540fcc6c1f766db857c64/notebooks/img/gd-two-var-alpha-right-sum.png
--------------------------------------------------------------------------------
/notebooks/img/gd-two-var-alpha-right.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MLMethods/Practice/ed03a18b83e0973a8fd540fcc6c1f766db857c64/notebooks/img/gd-two-var-alpha-right.png
--------------------------------------------------------------------------------
/notebooks/img/sgd-two-var-alpha-left.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MLMethods/Practice/ed03a18b83e0973a8fd540fcc6c1f766db857c64/notebooks/img/sgd-two-var-alpha-left.png
--------------------------------------------------------------------------------
/notebooks/img/sgd-two-var-alpha-right.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MLMethods/Practice/ed03a18b83e0973a8fd540fcc6c1f766db857c64/notebooks/img/sgd-two-var-alpha-right.png
--------------------------------------------------------------------------------
/notebooks/img/vectors.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MLMethods/Practice/ed03a18b83e0973a8fd540fcc6c1f766db857c64/notebooks/img/vectors.png
--------------------------------------------------------------------------------