├── Python ├── 03 │ ├── lm_boston.ipynb │ ├── lm_ridge_lasso_boston.ipynb │ ├── lm_ridge_lasso_tokyo.ipynb │ └── lm_tokyo.ipynb ├── 04 │ ├── decisionTree_iris.ipynb │ ├── decisionTree_tweets.ipynb │ ├── get_tweets.ipynb │ ├── logit_iris.ipynb │ ├── logit_tweets.ipynb │ ├── randomForest_iris.ipynb │ ├── randomForest_tweets.ipynb │ └── tweets.tsv ├── 05 │ ├── Kmeans_iris.ipynb │ ├── Kmeans_prefecture.ipynb │ ├── data_prefecture_category.csv │ ├── pca_iris.ipynb │ └── pca_prefecture.ipynb ├── 06 │ ├── classification.ipynb │ └── regression.ipynb ├── 07 │ ├── cnn_mnist.ipynb │ ├── cnn_temple_shrine.ipynb │ ├── get_imaeg.py │ ├── nn_mnist.ipynb │ └── nn_temple_shrine.ipynb └── 08 │ ├── collaborative_filtering.ipynb │ └── word2vec_tweets.ipynb ├── R ├── 03 │ ├── lm_boston.R │ ├── lm_ridge_lasso_boston.R │ ├── lm_ridge_lasso_tokyo.R │ └── lm_tokyo.R ├── 04 │ ├── decisionTree_iris.R │ ├── decisionTree_tweets.R │ ├── logit_iris.R │ ├── logit_tweets.R │ ├── randomForest_iris.R │ ├── randomForest_tweets.R │ └── tweets.tsv ├── 05 │ ├── Kmeans_iris.R │ ├── Kmeans_prefecture.R │ ├── data_prefecture_category.csv │ ├── pca_iris.R │ └── pca_prefecture.R ├── 06 │ ├── classification.R │ └── regression.R └── 07 │ ├── cnn_mnist.R │ └── nn_mnist.R └── README.md /Python/03/lm_boston.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "name": "Ch03-boston-lm.ipynb", 7 | "version": "0.3.2", 8 | "provenance": [], 9 | "collapsed_sections": [] 10 | }, 11 | "kernelspec": { 12 | "name": "python3", 13 | "display_name": "Python 3" 14 | }, 15 | "accelerator": "GPU" 16 | }, 17 | "cells": [ 18 | { 19 | "cell_type": "code", 20 | "metadata": { 21 | "id": "o5dgWD9rz4LG", 22 | "colab_type": "code", 23 | "colab": {} 24 | }, 25 | "source": [ 26 | "import numpy as np\n", 27 | "import pandas as pd\n", 28 | "import matplotlib.pyplot as plt\n", 29 | "%matplotlib inline\n", 30 | "import 
seaborn as sns\n", 31 | "from sklearn.linear_model import LinearRegression\n", 32 | "from sklearn.datasets import load_boston" 33 | ], 34 | "execution_count": 0, 35 | "outputs": [] 36 | }, 37 | { 38 | "cell_type": "markdown", 39 | "metadata": { 40 | "id": "GAqEPf-uYZZ5", 41 | "colab_type": "text" 42 | }, 43 | "source": [ 44 | "## データ読み込み" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "metadata": { 50 | "id": "Z38CWl_Fz61A", 51 | "colab_type": "code", 52 | "colab": {} 53 | }, 54 | "source": [ 55 | "boston = load_boston()\n", 56 | "print(boston.DESCR)" 57 | ], 58 | "execution_count": 0, 59 | "outputs": [] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "metadata": { 64 | "id": "2HC7DcOv9Mib", 65 | "colab_type": "code", 66 | "colab": {} 67 | }, 68 | "source": [ 69 | "data_boston = pd.DataFrame(boston.data, columns=boston.feature_names)\n", 70 | "data_boston['PRICE'] = boston.target" 71 | ], 72 | "execution_count": 0, 73 | "outputs": [] 74 | }, 75 | { 76 | "cell_type": "code", 77 | "metadata": { 78 | "id": "x10DS7QWZRiY", 79 | "colab_type": "code", 80 | "colab": {} 81 | }, 82 | "source": [ 83 | "print(data_boston.head())" 84 | ], 85 | "execution_count": 0, 86 | "outputs": [] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "metadata": { 91 | "id": "bDWlr_nRq8v1", 92 | "colab_type": "code", 93 | "colab": {} 94 | }, 95 | "source": [ 96 | "print(data_boston.tail())" 97 | ], 98 | "execution_count": 0, 99 | "outputs": [] 100 | }, 101 | { 102 | "cell_type": "markdown", 103 | "metadata": { 104 | "id": "Sg70gJxoYgt6", 105 | "colab_type": "text" 106 | }, 107 | "source": [ 108 | "## 可視化" 109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "metadata": { 114 | "id": "OyirnZ2a9U8X", 115 | "colab_type": "code", 116 | "colab": {} 117 | }, 118 | "source": [ 119 | "sns.jointplot('RM', 'PRICE', data=data_boston)" 120 | ], 121 | "execution_count": 0, 122 | "outputs": [] 123 | }, 124 | { 125 | "cell_type": "code", 126 | "metadata": { 127 | "id": "TuMKNWCHBS1S", 128 | "colab_type": 
"code", 129 | "colab": {} 130 | }, 131 | "source": [ 132 | "sns.pairplot(data_boston)" 133 | ], 134 | "execution_count": 0, 135 | "outputs": [] 136 | }, 137 | { 138 | "cell_type": "code", 139 | "metadata": { 140 | "id": "xIdkp9Q_B4-Y", 141 | "colab_type": "code", 142 | "colab": {} 143 | }, 144 | "source": [ 145 | "sns.pairplot(data_boston, vars=[\"PRICE\", \"RM\", \"DIS\"])" 146 | ], 147 | "execution_count": 0, 148 | "outputs": [] 149 | }, 150 | { 151 | "cell_type": "markdown", 152 | "metadata": { 153 | "id": "akWwJ5J8yx3f", 154 | "colab_type": "text" 155 | }, 156 | "source": [ 157 | "## 線形回帰を実践" 158 | ] 159 | }, 160 | { 161 | "cell_type": "code", 162 | "metadata": { 163 | "id": "37T2q4X11NIm", 164 | "colab_type": "code", 165 | "colab": {} 166 | }, 167 | "source": [ 168 | "lr = LinearRegression()" 169 | ], 170 | "execution_count": 0, 171 | "outputs": [] 172 | }, 173 | { 174 | "cell_type": "code", 175 | "metadata": { 176 | "id": "UaCgv7VMCQaQ", 177 | "colab_type": "code", 178 | "colab": {} 179 | }, 180 | "source": [ 181 | "x_column_list = ['RM']\n", 182 | "y_column_list = ['PRICE']\n", 183 | "\n", 184 | "data_boston_x = data_boston[x_column_list]\n", 185 | "data_boston_y = data_boston[y_column_list]\n", 186 | "\n", 187 | "lr.fit(data_boston_x, data_boston_y)" 188 | ], 189 | "execution_count": 0, 190 | "outputs": [] 191 | }, 192 | { 193 | "cell_type": "code", 194 | "metadata": { 195 | "id": "CmF-CkZkzBRi", 196 | "colab_type": "code", 197 | "colab": {} 198 | }, 199 | "source": [ 200 | "print(lr.coef_)\n", 201 | "print(lr.intercept_)" 202 | ], 203 | "execution_count": 0, 204 | "outputs": [] 205 | }, 206 | { 207 | "cell_type": "markdown", 208 | "metadata": { 209 | "id": "vb42hci58i5V", 210 | "colab_type": "text" 211 | }, 212 | "source": [ 213 | "### 重回帰分析" 214 | ] 215 | }, 216 | { 217 | "cell_type": "code", 218 | "metadata": { 219 | "id": "d3wJ2V2CAFEK", 220 | "colab_type": "code", 221 | "colab": {} 222 | }, 223 | "source": [ 224 | "lr_multi = LinearRegression()\n", 225 
| "\n", 226 | "x_column_list_for_multi = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT']\n", 227 | "y_column_list_for_multi = ['PRICE']\n", 228 | "\n", 229 | "data_boston_x = data_boston[x_column_list_for_multi]\n", 230 | "data_boston_y = data_boston[y_column_list_for_multi]\n", 231 | "\n", 232 | "lr_multi.fit(data_boston_x, data_boston_y)" 233 | ], 234 | "execution_count": 0, 235 | "outputs": [] 236 | }, 237 | { 238 | "cell_type": "code", 239 | "metadata": { 240 | "id": "QDwGOyzVAc_f", 241 | "colab_type": "code", 242 | "colab": {} 243 | }, 244 | "source": [ 245 | "print(lr_multi.coef_)\n", 246 | "print(lr_multi.intercept_)" 247 | ], 248 | "execution_count": 0, 249 | "outputs": [] 250 | }, 251 | { 252 | "cell_type": "markdown", 253 | "metadata": { 254 | "id": "alpJSV_OY_UP", 255 | "colab_type": "text" 256 | }, 257 | "source": [ 258 | "## 予測" 259 | ] 260 | }, 261 | { 262 | "cell_type": "code", 263 | "metadata": { 264 | "id": "IYDRbFKCZA2n", 265 | "colab_type": "code", 266 | "colab": {} 267 | }, 268 | "source": [ 269 | "from sklearn.model_selection import train_test_split\n", 270 | "\n", 271 | "X_train, X_test, y_train, y_test = train_test_split(data_boston_x, data_boston_y, test_size=0.3)" 272 | ], 273 | "execution_count": 0, 274 | "outputs": [] 275 | }, 276 | { 277 | "cell_type": "code", 278 | "metadata": { 279 | "id": "3tlg7O37brti", 280 | "colab_type": "code", 281 | "colab": {} 282 | }, 283 | "source": [ 284 | "print(X_train.shape)\n", 285 | "print(X_test.shape)\n", 286 | "print(y_train.shape)\n", 287 | "print(y_test.shape)" 288 | ], 289 | "execution_count": 0, 290 | "outputs": [] 291 | }, 292 | { 293 | "cell_type": "code", 294 | "metadata": { 295 | "id": "yDbbRGARejnC", 296 | "colab_type": "code", 297 | "colab": {} 298 | }, 299 | "source": [ 300 | "lr_multi2 = LinearRegression()\n", 301 | "\n", 302 | "lr_multi2.fit(X_train, y_train) \n", 303 | "print(lr_multi2.coef_)\n", 304 | "print(lr_multi2.intercept_)" 305 | 
], 306 | "execution_count": 0, 307 | "outputs": [] 308 | }, 309 | { 310 | "cell_type": "code", 311 | "metadata": { 312 | "id": "8dqcvzMMfrmL", 313 | "colab_type": "code", 314 | "colab": {} 315 | }, 316 | "source": [ 317 | "y_pred = lr_multi2.predict(X_test)" 318 | ], 319 | "execution_count": 0, 320 | "outputs": [] 321 | }, 322 | { 323 | "cell_type": "code", 324 | "metadata": { 325 | "id": "m_K6lCv2g0f9", 326 | "colab_type": "code", 327 | "colab": {} 328 | }, 329 | "source": [ 330 | "print(y_pred - y_test)" 331 | ], 332 | "execution_count": 0, 333 | "outputs": [] 334 | }, 335 | { 336 | "cell_type": "markdown", 337 | "metadata": { 338 | "id": "ZPwNArsCk9JX", 339 | "colab_type": "text" 340 | }, 341 | "source": [ 342 | "## MAE" 343 | ] 344 | }, 345 | { 346 | "cell_type": "code", 347 | "metadata": { 348 | "id": "-TGrKPMGg10f", 349 | "colab_type": "code", 350 | "colab": {} 351 | }, 352 | "source": [ 353 | "from sklearn.metrics import mean_absolute_error" 354 | ], 355 | "execution_count": 0, 356 | "outputs": [] 357 | }, 358 | { 359 | "cell_type": "code", 360 | "metadata": { 361 | "id": "8YeXhiqWhPtI", 362 | "colab_type": "code", 363 | "colab": {} 364 | }, 365 | "source": [ 366 | "x_column_list = ['RM']\n", 367 | "y_column_list = ['PRICE']\n", 368 | "\n", 369 | "X_train, X_test, y_train, y_test = train_test_split(data_boston[x_column_list], data_boston[y_column_list], test_size=0.3)\n", 370 | "\n", 371 | "lr_single = LinearRegression()\n", 372 | "\n", 373 | "lr_single.fit(X_train, y_train) \n", 374 | "y_pred = lr_single.predict(X_test)\n", 375 | "\n", 376 | "print(mean_absolute_error(y_pred, y_test))" 377 | ], 378 | "execution_count": 0, 379 | "outputs": [] 380 | }, 381 | { 382 | "cell_type": "code", 383 | "metadata": { 384 | "id": "vXCexSrUlZ6K", 385 | "colab_type": "code", 386 | "colab": {} 387 | }, 388 | "source": [ 389 | "x_column_list_for_multi = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT']\n", 390 | 
"y_column_list_for_multi = ['PRICE']\n", 391 | "\n", 392 | "X_train, X_test, y_train, y_test = train_test_split(data_boston[x_column_list_for_multi], data_boston[y_column_list_for_multi], test_size=0.3)\n", 393 | "\n", 394 | "lr_multi2 = LinearRegression()\n", 395 | "\n", 396 | "lr_multi2.fit(X_train, y_train) \n", 397 | "y_pred = lr_multi2.predict(X_test)\n", 398 | "\n", 399 | "print(mean_absolute_error(y_pred, y_test))" 400 | ], 401 | "execution_count": 0, 402 | "outputs": [] 403 | }, 404 | { 405 | "cell_type": "code", 406 | "metadata": { 407 | "id": "pbCs-LY9lhs0", 408 | "colab_type": "code", 409 | "colab": {} 410 | }, 411 | "source": [ 412 | "" 413 | ], 414 | "execution_count": 0, 415 | "outputs": [] 416 | }, 417 | { 418 | "cell_type": "code", 419 | "metadata": { 420 | "id": "ZuiyKVZfnetv", 421 | "colab_type": "code", 422 | "colab": {} 423 | }, 424 | "source": [ 425 | "" 426 | ], 427 | "execution_count": 0, 428 | "outputs": [] 429 | } 430 | ] 431 | } -------------------------------------------------------------------------------- /Python/03/lm_ridge_lasso_boston.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "name": "Ch03-boston-ridge-lasso-lm.ipynb", 7 | "version": "0.3.2", 8 | "provenance": [], 9 | "collapsed_sections": [] 10 | }, 11 | "kernelspec": { 12 | "name": "python3", 13 | "display_name": "Python 3" 14 | }, 15 | "accelerator": "GPU" 16 | }, 17 | "cells": [ 18 | { 19 | "cell_type": "code", 20 | "metadata": { 21 | "id": "o5dgWD9rz4LG", 22 | "colab_type": "code", 23 | "colab": {} 24 | }, 25 | "source": [ 26 | "import numpy as np\n", 27 | "import pandas as pd\n", 28 | "import matplotlib.pyplot as plt\n", 29 | "%matplotlib inline\n", 30 | "import seaborn as sns\n", 31 | "from sklearn.linear_model import LinearRegression, Ridge, Lasso\n", 32 | "from sklearn.datasets import load_boston\n", 33 | "from sklearn.metrics import 
mean_absolute_error\n", 34 | "from sklearn.model_selection import train_test_split" 35 | ], 36 | "execution_count": 0, 37 | "outputs": [] 38 | }, 39 | { 40 | "cell_type": "markdown", 41 | "metadata": { 42 | "id": "GAqEPf-uYZZ5", 43 | "colab_type": "text" 44 | }, 45 | "source": [ 46 | "## データ読み込み" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "metadata": { 52 | "id": "Z38CWl_Fz61A", 53 | "colab_type": "code", 54 | "colab": {} 55 | }, 56 | "source": [ 57 | "boston = load_boston()\n", 58 | "data_boston = pd.DataFrame(boston.data, columns=boston.feature_names)\n", 59 | "data_boston['PRICE'] = boston.target\n", 60 | "\n", 61 | "print(data_boston.head())\n", 62 | "print(data_boston.tail())" 63 | ], 64 | "execution_count": 0, 65 | "outputs": [] 66 | }, 67 | { 68 | "cell_type": "markdown", 69 | "metadata": { 70 | "id": "Obrkk6djkrQV", 71 | "colab_type": "text" 72 | }, 73 | "source": [ 74 | "## L1正則化なし" 75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "metadata": { 80 | "id": "Dxo6oyArkuZX", 81 | "colab_type": "code", 82 | "colab": {} 83 | }, 84 | "source": [ 85 | "lr_multi = LinearRegression()\n", 86 | "\n", 87 | "x_column_list_for_multi = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT']\n", 88 | "y_column_list_for_multi = ['PRICE']\n", 89 | "\n", 90 | "lr_multi.fit(data_boston[x_column_list_for_multi], data_boston[y_column_list_for_multi])\n", 91 | "\n", 92 | "print(lr_multi.coef_)\n", 93 | "print(lr_multi.intercept_)" 94 | ], 95 | "execution_count": 0, 96 | "outputs": [] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "metadata": { 101 | "id": "720BakmC4Khb", 102 | "colab_type": "code", 103 | "colab": {} 104 | }, 105 | "source": [ 106 | "X_train, X_test, y_train, y_test = train_test_split(data_boston[x_column_list_for_multi], data_boston[y_column_list_for_multi], test_size=0.3)" 107 | ], 108 | "execution_count": 0, 109 | "outputs": [] 110 | }, 111 | { 112 | "cell_type": "markdown", 113 | "metadata": { 114 | "id": 
"W0BRywgqkLKv", 115 | "colab_type": "text" 116 | }, 117 | "source": [ 118 | "### 予測と**MAE**" 119 | ] 120 | }, 121 | { 122 | "cell_type": "code", 123 | "metadata": { 124 | "id": "q-pBlusmkMt6", 125 | "colab_type": "code", 126 | "colab": {} 127 | }, 128 | "source": [ 129 | "lr_multi2 = LinearRegression()\n", 130 | "\n", 131 | "lr_multi2.fit(X_train, y_train) \n", 132 | "print(lr_multi2.coef_)\n", 133 | "print(lr_multi2.intercept_)\n", 134 | "\n", 135 | "y_pred_lr = lr_multi2.predict(X_test)\n", 136 | "\n", 137 | "# 残差\n", 138 | "# print(y_pred_lr-y_test)\n", 139 | "\n", 140 | "# MAE\n", 141 | "print(mean_absolute_error(y_pred_lr, y_test))" 142 | ], 143 | "execution_count": 0, 144 | "outputs": [] 145 | }, 146 | { 147 | "cell_type": "markdown", 148 | "metadata": { 149 | "id": "aA5RrYWvk-Pj", 150 | "colab_type": "text" 151 | }, 152 | "source": [ 153 | "## Lasso回帰" 154 | ] 155 | }, 156 | { 157 | "cell_type": "code", 158 | "metadata": { 159 | "id": "zu_8ABezlTgd", 160 | "colab_type": "code", 161 | "colab": {} 162 | }, 163 | "source": [ 164 | "lasso = Lasso(alpha=0.001, normalize=True)\n", 165 | "lasso.fit(X_train, y_train) \n", 166 | "print(lasso.coef_)\n", 167 | "print(lasso.intercept_)" 168 | ], 169 | "execution_count": 0, 170 | "outputs": [] 171 | }, 172 | { 173 | "cell_type": "markdown", 174 | "metadata": { 175 | "id": "T1FWbFm8lPDq", 176 | "colab_type": "text" 177 | }, 178 | "source": [ 179 | "### MAE" 180 | ] 181 | }, 182 | { 183 | "cell_type": "code", 184 | "metadata": { 185 | "id": "NMzuTOhXlBtQ", 186 | "colab_type": "code", 187 | "colab": {} 188 | }, 189 | "source": [ 190 | "y_pred_lasso = lasso.predict(X_test)\n", 191 | "\n", 192 | "# 残差\n", 193 | "# print(y_pred_lasso.reshape(-1,1) - y_test)\n", 194 | "\n", 195 | "# MAE\n", 196 | "print(mean_absolute_error(y_pred_lasso, y_test))" 197 | ], 198 | "execution_count": 0, 199 | "outputs": [] 200 | }, 201 | { 202 | "cell_type": "markdown", 203 | "metadata": { 204 | "id": "0zLEcnZFjkrk", 205 | "colab_type": "text" 206 
| }, 207 | "source": [ 208 | "## Ridge回帰" 209 | ] 210 | }, 211 | { 212 | "cell_type": "code", 213 | "metadata": { 214 | "id": "A1XZ4pV3lVxd", 215 | "colab_type": "code", 216 | "colab": {} 217 | }, 218 | "source": [ 219 | "ridge = Ridge(alpha=0.01, normalize=True)\n", 220 | "ridge.fit(X_train, y_train) \n", 221 | "print(ridge.coef_)\n", 222 | "print(ridge.intercept_)" 223 | ], 224 | "execution_count": 0, 225 | "outputs": [] 226 | }, 227 | { 228 | "cell_type": "markdown", 229 | "metadata": { 230 | "id": "Pm6sVXnXlUb-", 231 | "colab_type": "text" 232 | }, 233 | "source": [ 234 | "### MAE" 235 | ] 236 | }, 237 | { 238 | "cell_type": "code", 239 | "metadata": { 240 | "id": "ZuiyKVZfnetv", 241 | "colab_type": "code", 242 | "colab": {} 243 | }, 244 | "source": [ 245 | "y_pred_ridge = ridge.predict(X_test)\n", 246 | "\n", 247 | "# 残差\n", 248 | "# print(y_pred_ridge.reshape(-1,1) - y_test)\n", 249 | "\n", 250 | "# MAE\n", 251 | "print(mean_absolute_error(y_pred_ridge, y_test))" 252 | ], 253 | "execution_count": 0, 254 | "outputs": [] 255 | }, 256 | { 257 | "cell_type": "code", 258 | "metadata": { 259 | "id": "CXzbRDoXx24o", 260 | "colab_type": "code", 261 | "colab": {} 262 | }, 263 | "source": [ 264 | "" 265 | ], 266 | "execution_count": 0, 267 | "outputs": [] 268 | } 269 | ] 270 | } -------------------------------------------------------------------------------- /Python/03/lm_ridge_lasso_tokyo.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "name": "Ch03-tokyo-ridge-lasso-lm.ipynb", 7 | "version": "0.3.2", 8 | "provenance": [], 9 | "collapsed_sections": [] 10 | }, 11 | "kernelspec": { 12 | "name": "python3", 13 | "display_name": "Python 3" 14 | } 15 | }, 16 | "cells": [ 17 | { 18 | "cell_type": "code", 19 | "metadata": { 20 | "id": "hbPe1ouXtDkg", 21 | "colab_type": "code", 22 | "colab": {} 23 | }, 24 | "source": [ 25 | "import numpy as np\n", 26 | 
"import matplotlib.pyplot as plt\n", 27 | "import pandas as pd\n", 28 | "import random\n", 29 | "%matplotlib inline\n", 30 | "import seaborn as sns\n", 31 | "from sklearn.linear_model import LinearRegression, Ridge, Lasso\n", 32 | "\n", 33 | "from sklearn.model_selection import train_test_split\n", 34 | "from sklearn.metrics import mean_absolute_error\n", 35 | "\n", 36 | "import requests\n", 37 | "import json\n", 38 | "import re" 39 | ], 40 | "execution_count": 0, 41 | "outputs": [] 42 | }, 43 | { 44 | "cell_type": "markdown", 45 | "metadata": { 46 | "id": "jB1EII0lhGAW", 47 | "colab_type": "text" 48 | }, 49 | "source": [ 50 | "### CSVファイルからデータ読み込みとデータ整形\n" 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "metadata": { 56 | "id": "bk-9aVfz7CJW", 57 | "colab_type": "code", 58 | "colab": {} 59 | }, 60 | "source": [ 61 | "data_from_csv = pd.read_csv(\"13_Tokyo_20171_20184.csv\", encoding='cp932')\n", 62 | "data_used_apartment = data_from_csv.query('種類 == \"中古マンション等\"')\n", 63 | "\n", 64 | "columns_name_list = [\"最寄駅:距離(分)\", \"間取り\", \"面積(㎡)\",\"建築年\", \"建物の構造\", \"建ぺい率(%)\", \"容積率(%)\", \"市区町村名\", \"取引価格(総額)\"]\n", 65 | "\n", 66 | "data_selected = data_used_apartment[columns_name_list]\n", 67 | "data_selected_dropna = data_selected.dropna(how='any') # 一つでもNANデータを含む行を削除" 68 | ], 69 | "execution_count": 0, 70 | "outputs": [] 71 | }, 72 | { 73 | "cell_type": "code", 74 | "metadata": { 75 | "id": "oibI6rxwQTaX", 76 | "colab_type": "code", 77 | "colab": {} 78 | }, 79 | "source": [ 80 | "# 建築年を築年数に変更\n", 81 | "data_selected_dropna = data_selected_dropna[data_selected_dropna[\"建築年\"].str.match('^平成|昭和')]\n", 82 | "\n", 83 | "wareki_to_seireki = {'昭和': 1926-1, '平成': 1989-1}\n", 84 | "\n", 85 | "building_year_list = data_selected_dropna[\"建築年\"]\n", 86 | "\n", 87 | "building_age_list = []\n", 88 | "for building_year in building_year_list:\n", 89 | " # 昭和○年 → 昭和, ○ に変換、平成○年 → 平成, ○ に変換\n", 90 | " building_year_split = re.search(r'(.+?)([0-9]+|元)年', building_year)\n", 91 | 
" # 西暦に変換\n", 92 | " seireki = wareki_to_seireki[building_year_split.groups()[0]] + int(building_year_split.groups()[1])\n", 93 | " \n", 94 | " building_age = 2018 - seireki # 築年数に変換\n", 95 | " building_age_list.append(building_age)\n", 96 | "\n", 97 | " \n", 98 | "data_selected_dropna[\"築年数\"] = building_age_list # 新しく、築年数列を追加\n", 99 | "# もう使わないので、建築年列は削除\n", 100 | "data_added_building_age = data_selected_dropna.drop(\"建築年\", axis=1)" 101 | ], 102 | "execution_count": 0, 103 | "outputs": [] 104 | }, 105 | { 106 | "cell_type": "code", 107 | "metadata": { 108 | "id": "BoyjtlvLhZG4", 109 | "colab_type": "code", 110 | "colab": {} 111 | }, 112 | "source": [ 113 | "# ダミー変数化しないもの\n", 114 | "columns_name_list = [\"最寄駅:距離(分)\", \"面積(㎡)\",\"築年数\", \"建ぺい率(%)\", \"容積率(%)\", \"取引価格(総額)\"]\n", 115 | "# ダミー変数リスト\n", 116 | "dummy_list = [\"間取り\", \"建物の構造\", \"市区町村名\"]\n", 117 | "\n", 118 | "# ダミー変数を追加\n", 119 | "data_added_dummies = pd.concat([data_added_building_age[columns_name_list],\n", 120 | " pd.get_dummies(data_added_building_age[dummy_list], drop_first=True)], axis=1)\n", 121 | "\n", 122 | "\n", 123 | "# 文字列を数値化\n", 124 | "data_added_dummies[\"面積(㎡)\"] = data_added_dummies[\"面積(㎡)\"].astype(float)\n", 125 | "data_added_dummies = data_added_dummies[~data_added_dummies['最寄駅:距離(分)'].str.contains('\\?')]\n", 126 | "data_added_dummies[\"最寄駅:距離(分)\"] = data_added_dummies[\"最寄駅:距離(分)\"].astype(float)\n", 127 | "\n", 128 | "# 6000万円以下のデータのみ抽出\n", 129 | "data_added_dummies = data_added_dummies[data_added_dummies[\"取引価格(総額)\"] < 60000000]" 130 | ], 131 | "execution_count": 0, 132 | "outputs": [] 133 | }, 134 | { 135 | "cell_type": "code", 136 | "metadata": { 137 | "id": "zq7u9UhLib5q", 138 | "colab_type": "code", 139 | "colab": {} 140 | }, 141 | "source": [ 142 | "print(data_added_dummies.shape)" 143 | ], 144 | "execution_count": 0, 145 | "outputs": [] 146 | }, 147 | { 148 | "cell_type": "markdown", 149 | "metadata": { 150 | "id": "-TkxAlYlt4Zg", 151 | "colab_type": "text" 152 | }, 
153 | "source": [ 154 | "## L1正則化なし" 155 | ] 156 | }, 157 | { 158 | "cell_type": "code", 159 | "metadata": { 160 | "id": "s-AZ2qRlvCeU", 161 | "colab_type": "code", 162 | "colab": {} 163 | }, 164 | "source": [ 165 | "x = data_added_dummies.drop(\"取引価格(総額)\", axis=1)\n", 166 | "y = data_added_dummies[\"取引価格(総額)\"]\n", 167 | "\n", 168 | "X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.3)" 169 | ], 170 | "execution_count": 0, 171 | "outputs": [] 172 | }, 173 | { 174 | "cell_type": "markdown", 175 | "metadata": { 176 | "id": "BLquTeiTAHlI", 177 | "colab_type": "text" 178 | }, 179 | "source": [ 180 | "### 予測とMAE" 181 | ] 182 | }, 183 | { 184 | "cell_type": "code", 185 | "metadata": { 186 | "id": "f8VEXWtRwqUO", 187 | "colab_type": "code", 188 | "colab": {} 189 | }, 190 | "source": [ 191 | "lr_multi = LinearRegression()\n", 192 | "\n", 193 | "lr_multi.fit(X_train, y_train) \n", 194 | "print(lr_multi.coef_)\n", 195 | "print(lr_multi.intercept_)\n", 196 | "\n", 197 | "y_pred_lr = lr_multi.predict(X_test)\n", 198 | "\n", 199 | "# 残差\n", 200 | "# print(y_pred_lr - y_test)\n", 201 | "\n", 202 | "# MAE\n", 203 | "print(mean_absolute_error(y_pred_lr, y_test))" 204 | ], 205 | "execution_count": 0, 206 | "outputs": [] 207 | }, 208 | { 209 | "cell_type": "markdown", 210 | "metadata": { 211 | "id": "b6MSFJDWCWiw", 212 | "colab_type": "text" 213 | }, 214 | "source": [ 215 | "## Lasso回帰" 216 | ] 217 | }, 218 | { 219 | "cell_type": "code", 220 | "metadata": { 221 | "id": "_jOfClMfA7wG", 222 | "colab_type": "code", 223 | "colab": {} 224 | }, 225 | "source": [ 226 | "lasso = Lasso(alpha=1, normalize=True)\n", 227 | "lasso.fit(X_train, y_train) \n", 228 | "print(lasso.coef_)\n", 229 | "print(lasso.intercept_)" 230 | ], 231 | "execution_count": 0, 232 | "outputs": [] 233 | }, 234 | { 235 | "cell_type": "markdown", 236 | "metadata": { 237 | "id": "Z7IX8p9KCbr3", 238 | "colab_type": "text" 239 | }, 240 | "source": [ 241 | "### MAE" 242 | ] 243 | }, 244 | { 245 | 
"cell_type": "code", 246 | "metadata": { 247 | "id": "nSB4v9_s5P04", 248 | "colab_type": "code", 249 | "colab": {} 250 | }, 251 | "source": [ 252 | "y_pred_lasso = lasso.predict(X_test)\n", 253 | "\n", 254 | "# 残差\n", 255 | "# print(y_pred_lasso.reshape(-1,1) - y_test)\n", 256 | "\n", 257 | "# MAE\n", 258 | "print(mean_absolute_error(y_pred_lasso, y_test))" 259 | ], 260 | "execution_count": 0, 261 | "outputs": [] 262 | }, 263 | { 264 | "cell_type": "markdown", 265 | "metadata": { 266 | "id": "yZMnQ46MCX38", 267 | "colab_type": "text" 268 | }, 269 | "source": [ 270 | "## Ridge回帰" 271 | ] 272 | }, 273 | { 274 | "cell_type": "code", 275 | "metadata": { 276 | "id": "siOmINIpCZcP", 277 | "colab_type": "code", 278 | "colab": {} 279 | }, 280 | "source": [ 281 | "ridge = Ridge(alpha=0.1, normalize=True)\n", 282 | "ridge.fit(X_train, y_train) \n", 283 | "print(ridge.coef_)\n", 284 | "print(ridge.intercept_)" 285 | ], 286 | "execution_count": 0, 287 | "outputs": [] 288 | }, 289 | { 290 | "cell_type": "markdown", 291 | "metadata": { 292 | "id": "fdy0CHSSCdMu", 293 | "colab_type": "text" 294 | }, 295 | "source": [ 296 | "### MAE" 297 | ] 298 | }, 299 | { 300 | "cell_type": "code", 301 | "metadata": { 302 | "id": "gaH1r7WqCdwZ", 303 | "colab_type": "code", 304 | "colab": {} 305 | }, 306 | "source": [ 307 | "y_pred_ridge = ridge.predict(X_test)\n", 308 | "\n", 309 | "# 残差\n", 310 | "# print(y_pred_ridge.reshape(-1,1) - y_test)\n", 311 | "\n", 312 | "# MAE\n", 313 | "print(mean_absolute_error(y_pred_ridge, y_test))" 314 | ], 315 | "execution_count": 0, 316 | "outputs": [] 317 | }, 318 | { 319 | "cell_type": "code", 320 | "metadata": { 321 | "id": "-eW2Q8MCTyTj", 322 | "colab_type": "code", 323 | "colab": {} 324 | }, 325 | "source": [ 326 | "" 327 | ], 328 | "execution_count": 0, 329 | "outputs": [] 330 | } 331 | ] 332 | } -------------------------------------------------------------------------------- /Python/03/lm_tokyo.ipynb: 
-------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "name": "Ch03-tokyo-lm.ipynb", 7 | "version": "0.3.2", 8 | "provenance": [], 9 | "collapsed_sections": [], 10 | "toc_visible": true 11 | }, 12 | "kernelspec": { 13 | "name": "python3", 14 | "display_name": "Python 3" 15 | } 16 | }, 17 | "cells": [ 18 | { 19 | "cell_type": "code", 20 | "metadata": { 21 | "id": "hbPe1ouXtDkg", 22 | "colab_type": "code", 23 | "colab": {} 24 | }, 25 | "source": [ 26 | "import numpy as np\n", 27 | "import matplotlib.pyplot as plt\n", 28 | "import pandas as pd\n", 29 | "import random\n", 30 | "%matplotlib inline\n", 31 | "import seaborn as sns\n", 32 | "from sklearn.linear_model import LinearRegression\n", 33 | "from sklearn.model_selection import train_test_split\n", 34 | "\n", 35 | "import requests\n", 36 | "import json\n", 37 | "import re" 38 | ], 39 | "execution_count": 0, 40 | "outputs": [] 41 | }, 42 | { 43 | "cell_type": "markdown", 44 | "metadata": { 45 | "id": "RLSAgxJdJ0rr", 46 | "colab_type": "text" 47 | }, 48 | "source": [ 49 | "### APIでデータ読み込み" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "metadata": { 55 | "id": "kh4ZZr5Itq_u", 56 | "colab_type": "code", 57 | "colab": {} 58 | }, 59 | "source": [ 60 | "url_path = \"https://www.land.mlit.go.jp/webland/api/TradeListSearch?from=20171&to=20185&area=13\"\n", 61 | "request_result = requests.get(url_path)\n", 62 | "data_json = request_result.json()[\"data\"]" 63 | ], 64 | "execution_count": 0, 65 | "outputs": [] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "metadata": { 70 | "id": "haC9urQWKrC0", 71 | "colab_type": "code", 72 | "colab": {} 73 | }, 74 | "source": [ 75 | "print(len(data_json))" 76 | ], 77 | "execution_count": 0, 78 | "outputs": [] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "metadata": { 83 | "id": "sj82Ui5MKu19", 84 | "colab_type": "code", 85 | "colab": {} 86 | }, 87 | "source": [ 88 | 
"print(data_json[0])" 89 | ], 90 | "execution_count": 0, 91 | "outputs": [] 92 | }, 93 | { 94 | "cell_type": "code", 95 | "metadata": { 96 | "id": "7iJVhsm0LS8F", 97 | "colab_type": "code", 98 | "colab": {} 99 | }, 100 | "source": [ 101 | "print(data_json[1000])" 102 | ], 103 | "execution_count": 0, 104 | "outputs": [] 105 | }, 106 | { 107 | "cell_type": "code", 108 | "metadata": { 109 | "id": "BuOMyy7NNdsH", 110 | "colab_type": "code", 111 | "colab": {} 112 | }, 113 | "source": [ 114 | "data_pd = pd.io.json.json_normalize(data_json)\n", 115 | "print(data_pd.shape)" 116 | ], 117 | "execution_count": 0, 118 | "outputs": [] 119 | }, 120 | { 121 | "cell_type": "code", 122 | "metadata": { 123 | "id": "VHVU53mCr-47", 124 | "colab_type": "code", 125 | "colab": {} 126 | }, 127 | "source": [ 128 | "print(data_pd.head(10))" 129 | ], 130 | "execution_count": 0, 131 | "outputs": [] 132 | }, 133 | { 134 | "cell_type": "code", 135 | "metadata": { 136 | "id": "56KmFBgHssAI", 137 | "colab_type": "code", 138 | "colab": {} 139 | }, 140 | "source": [ 141 | "print(data_pd.isnull().sum())" 142 | ], 143 | "execution_count": 0, 144 | "outputs": [] 145 | }, 146 | { 147 | "cell_type": "markdown", 148 | "metadata": { 149 | "id": "jB1EII0lhGAW", 150 | "colab_type": "text" 151 | }, 152 | "source": [ 153 | "### CSVファイルからデータ読み込み" 154 | ] 155 | }, 156 | { 157 | "cell_type": "code", 158 | "metadata": { 159 | "id": "bk-9aVfz7CJW", 160 | "colab_type": "code", 161 | "colab": {} 162 | }, 163 | "source": [ 164 | "data_from_csv = pd.read_csv(\"13_Tokyo_20171_20184.csv\", encoding='cp932')" 165 | ], 166 | "execution_count": 0, 167 | "outputs": [] 168 | }, 169 | { 170 | "cell_type": "code", 171 | "metadata": { 172 | "id": "oibI6rxwQTaX", 173 | "colab_type": "code", 174 | "colab": {} 175 | }, 176 | "source": [ 177 | "print(data_from_csv.shape)" 178 | ], 179 | "execution_count": 0, 180 | "outputs": [] 181 | }, 182 | { 183 | "cell_type": "code", 184 | "metadata": { 185 | "id": "BoyjtlvLhZG4", 186 | 
"colab_type": "code", 187 | "colab": {} 188 | }, 189 | "source": [ 190 | "print(data_from_csv.iloc[0])" 191 | ], 192 | "execution_count": 0, 193 | "outputs": [] 194 | }, 195 | { 196 | "cell_type": "code", 197 | "metadata": { 198 | "id": "zq7u9UhLib5q", 199 | "colab_type": "code", 200 | "colab": {} 201 | }, 202 | "source": [ 203 | "print(data_from_csv.head(10))" 204 | ], 205 | "execution_count": 0, 206 | "outputs": [] 207 | }, 208 | { 209 | "cell_type": "markdown", 210 | "metadata": { 211 | "id": "4ejeo1d0ZWng", 212 | "colab_type": "text" 213 | }, 214 | "source": [ 215 | "### データ整形" 216 | ] 217 | }, 218 | { 219 | "cell_type": "code", 220 | "metadata": { 221 | "id": "-QNqhoSiivt7", 222 | "colab_type": "code", 223 | "colab": {} 224 | }, 225 | "source": [ 226 | "print(data_from_csv[\"種類\"].unique())" 227 | ], 228 | "execution_count": 0, 229 | "outputs": [] 230 | }, 231 | { 232 | "cell_type": "code", 233 | "metadata": { 234 | "id": "PLG4jg1RCnwA", 235 | "colab_type": "code", 236 | "colab": {} 237 | }, 238 | "source": [ 239 | "data_used_apartment = data_from_csv.query('種類 == \"中古マンション等\"')\n", 240 | "print(data_used_apartment.shape)\n", 241 | "print(data_used_apartment.head())\n", 242 | "print(data_used_apartment.iloc[0])" 243 | ], 244 | "execution_count": 0, 245 | "outputs": [] 246 | }, 247 | { 248 | "cell_type": "code", 249 | "metadata": { 250 | "id": "2tDqZZiEEH6U", 251 | "colab_type": "code", 252 | "colab": {} 253 | }, 254 | "source": [ 255 | "print(data_used_apartment.isnull().sum())" 256 | ], 257 | "execution_count": 0, 258 | "outputs": [] 259 | }, 260 | { 261 | "cell_type": "code", 262 | "metadata": { 263 | "id": "7vWpDjsISdNh", 264 | "colab_type": "code", 265 | "colab": {} 266 | }, 267 | "source": [ 268 | "columns_name_list = [\"最寄駅:距離(分)\", \"間取り\", \"面積(㎡)\",\"建築年\", \"建物の構造\", \"建ぺい率(%)\", \"容積率(%)\", \"市区町村名\", \"取引価格(総額)\"]\n", 269 | "\n", 270 | "data_selected = data_used_apartment[columns_name_list]\n", 271 | "print(data_selected.shape)\n", 272 | "\n", 273 
| "data_selected_dropna = data_selected.dropna(how='any') # 一つでもNANデータを含む行を削除\n", 274 | "print(data_selected_dropna.shape)\n", 275 | "print(data_selected_dropna.iloc[0])" 276 | ], 277 | "execution_count": 0, 278 | "outputs": [] 279 | }, 280 | { 281 | "cell_type": "code", 282 | "metadata": { 283 | "id": "SsRWqNUsSdQx", 284 | "colab_type": "code", 285 | "colab": {} 286 | }, 287 | "source": [ 288 | "data_selected_dropna[\"建築年\"].unique()" 289 | ], 290 | "execution_count": 0, 291 | "outputs": [] 292 | }, 293 | { 294 | "cell_type": "code", 295 | "metadata": { 296 | "id": "8gkx6DdCSdTM", 297 | "colab_type": "code", 298 | "colab": {} 299 | }, 300 | "source": [ 301 | "data_selected_dropna = data_selected_dropna[data_selected_dropna[\"建築年\"].str.match('^平成|昭和')]\n", 302 | "\n", 303 | "\n", 304 | "wareki_to_seireki = {'昭和': 1926-1, '平成': 1989-1}\n", 305 | "\n", 306 | "building_year_list = data_selected_dropna[\"建築年\"]\n", 307 | "\n", 308 | "building_age_list = []\n", 309 | "for building_year in building_year_list:\n", 310 | " # 昭和○年 → 昭和, ○ に変換、平成○年 → 平成, ○ に変換\n", 311 | " building_year_split = re.search(r'(.+?)([0-9]+|元)年', building_year)\n", 312 | " # 西暦に変換\n", 313 | " seireki = wareki_to_seireki[building_year_split.groups()[0]] + int(building_year_split.groups()[1])\n", 314 | " \n", 315 | " building_age = 2018 - seireki # 築年数に変換\n", 316 | " building_age_list.append(building_age)\n", 317 | "\n", 318 | " \n", 319 | "data_selected_dropna[\"築年数\"] = building_age_list # 新しく、築年数列を追加\n", 320 | "\n", 321 | "# もう使わないので、建築年列は削除\n", 322 | "data_added_building_age = data_selected_dropna.drop(\"建築年\", axis=1)\n", 323 | "print(data_added_building_age.head())" 324 | ], 325 | "execution_count": 0, 326 | "outputs": [] 327 | }, 328 | { 329 | "cell_type": "code", 330 | "metadata": { 331 | "id": "0twcuPJlfYm2", 332 | "colab_type": "code", 333 | "colab": {} 334 | }, 335 | "source": [ 336 | "# ダミー変数化しないもののリスト\n", 337 | "columns_name_list = [\"最寄駅:距離(分)\", \"面積(㎡)\",\"築年数\", \"建ぺい率(%)\", 
\"容積率(%)\", \"取引価格(総額)\"]\n", 338 | "\n", 339 | "# ダミー変数化するリスト\n", 340 | "dummy_list = [\"間取り\", \"建物の構造\", \"市区町村名\"]\n", 341 | "\n", 342 | "# ダミー変数を追加\n", 343 | "data_added_dummies = pd.concat([data_added_building_age[columns_name_list],\n", 344 | " pd.get_dummies(data_added_building_age[dummy_list], drop_first=True)], axis=1)\n", 345 | "\n", 346 | "print(data_added_dummies.shape)\n", 347 | "print(data_added_dummies.iloc[0])" 348 | ], 349 | "execution_count": 0, 350 | "outputs": [] 351 | }, 352 | { 353 | "cell_type": "code", 354 | "metadata": { 355 | "id": "SHRUOuaxSdV-", 356 | "colab_type": "code", 357 | "colab": {} 358 | }, 359 | "source": [ 360 | "print(data_added_dummies.dtypes)" 361 | ], 362 | "execution_count": 0, 363 | "outputs": [] 364 | }, 365 | { 366 | "cell_type": "code", 367 | "metadata": { 368 | "id": "Fn6U6LvDlDAW", 369 | "colab_type": "code", 370 | "colab": {} 371 | }, 372 | "source": [ 373 | "data_added_dummies[\"面積(㎡)\"] = data_added_dummies[\"面積(㎡)\"].astype(float)\n", 374 | "data_added_dummies = data_added_dummies[~data_added_dummies['最寄駅:距離(分)'].str.contains('\\?')]\n", 375 | "data_added_dummies[\"最寄駅:距離(分)\"] = data_added_dummies[\"最寄駅:距離(分)\"].astype(float)" 376 | ], 377 | "execution_count": 0, 378 | "outputs": [] 379 | }, 380 | { 381 | "cell_type": "code", 382 | "metadata": { 383 | "id": "QyJzbmqgSdY3", 384 | "colab_type": "code", 385 | "colab": {} 386 | }, 387 | "source": [ 388 | "print(data_added_dummies.dtypes)" 389 | ], 390 | "execution_count": 0, 391 | "outputs": [] 392 | }, 393 | { 394 | "cell_type": "markdown", 395 | "metadata": { 396 | "id": "w7p-xIA1Ap8I", 397 | "colab_type": "text" 398 | }, 399 | "source": [ 400 | "## 可視化" 401 | ] 402 | }, 403 | { 404 | "cell_type": "code", 405 | "metadata": { 406 | "id": "VQWx7PTAApQX", 407 | "colab_type": "code", 408 | "colab": {} 409 | }, 410 | "source": [ 411 | "plt.hist(data_added_dummies[\"取引価格(総額)\"])\n", 412 | "plt.show()\n", 413 | "\n", 414 | "tmp_data = 
data_added_dummies[data_added_dummies[\"取引価格(総額)\"] < 60000000]\n", 415 | "print(tmp_data.shape)\n", 416 | "plt.hist(tmp_data[\"取引価格(総額)\"])\n", 417 | "plt.show()" 418 | ], 419 | "execution_count": 0, 420 | "outputs": [] 421 | }, 422 | { 423 | "cell_type": "code", 424 | "metadata": { 425 | "id": "wgFEq8xwDOFw", 426 | "colab_type": "code", 427 | "colab": {} 428 | }, 429 | "source": [ 430 | "data_added_dummies = data_added_dummies[data_added_dummies[\"取引価格(総額)\"] < 60000000]" 431 | ], 432 | "execution_count": 0, 433 | "outputs": [] 434 | }, 435 | { 436 | "cell_type": "markdown", 437 | "metadata": { 438 | "id": "-TkxAlYlt4Zg", 439 | "colab_type": "text" 440 | }, 441 | "source": [ 442 | "## 線形回帰を実践" 443 | ] 444 | }, 445 | { 446 | "cell_type": "code", 447 | "metadata": { 448 | "id": "s-AZ2qRlvCeU", 449 | "colab_type": "code", 450 | "colab": {} 451 | }, 452 | "source": [ 453 | "lr = LinearRegression()\n", 454 | "\n", 455 | "x_column_list = ['面積(㎡)']\n", 456 | "y_column_list = ['取引価格(総額)']\n", 457 | "\n", 458 | "x = data_added_dummies[x_column_list]\n", 459 | "y = data_added_dummies[y_column_list]\n", 460 | "\n", 461 | "lr.fit(x, y)" 462 | ], 463 | "execution_count": 0, 464 | "outputs": [] 465 | }, 466 | { 467 | "cell_type": "code", 468 | "metadata": { 469 | "id": "7rYy2IflwCOF", 470 | "colab_type": "code", 471 | "colab": {} 472 | }, 473 | "source": [ 474 | "print(lr.coef_)\n", 475 | "print(lr.intercept_)" 476 | ], 477 | "execution_count": 0, 478 | "outputs": [] 479 | }, 480 | { 481 | "cell_type": "markdown", 482 | "metadata": { 483 | "id": "hLDyZjIEt_lm", 484 | "colab_type": "text" 485 | }, 486 | "source": [ 487 | "### 重回帰分析" 488 | ] 489 | }, 490 | { 491 | "cell_type": "code", 492 | "metadata": { 493 | "id": "TeyqwMDWwn7P", 494 | "colab_type": "code", 495 | "colab": {} 496 | }, 497 | "source": [ 498 | "x = data_added_dummies.drop(\"取引価格(総額)\", axis=1)\n", 499 | "y = data_added_dummies[\"取引価格(総額)\"]\n", 500 | "\n", 501 | "print(x.head())\n", 502 | "print(y.head())" 503 | 
], 504 | "execution_count": 0, 505 | "outputs": [] 506 | }, 507 | { 508 | "cell_type": "code", 509 | "metadata": { 510 | "id": "f8VEXWtRwqUO", 511 | "colab_type": "code", 512 | "colab": {} 513 | }, 514 | "source": [ 515 | "lr_multi = LinearRegression()\n", 516 | "lr_multi.fit(x, y)\n", 517 | "\n", 518 | "print(lr_multi.coef_)\n", 519 | "print(lr_multi.intercept_)" 520 | ], 521 | "execution_count": 0, 522 | "outputs": [] 523 | }, 524 | { 525 | "cell_type": "code", 526 | "metadata": { 527 | "id": "nSB4v9_s5P04", 528 | "colab_type": "code", 529 | "colab": {} 530 | }, 531 | "source": [ 532 | "for i in range(len(lr_multi.coef_)):\n", 533 | " print(x.columns[i], lr_multi.coef_[i])" 534 | ], 535 | "execution_count": 0, 536 | "outputs": [] 537 | }, 538 | { 539 | "cell_type": "markdown", 540 | "metadata": { 541 | "id": "lVmbQrgXuET8", 542 | "colab_type": "text" 543 | }, 544 | "source": [ 545 | "## 予測" 546 | ] 547 | }, 548 | { 549 | "cell_type": "code", 550 | "metadata": { 551 | "id": "8Be2-0Cs8fpl", 552 | "colab_type": "code", 553 | "colab": {} 554 | }, 555 | "source": [ 556 | "x = data_added_dummies.drop(\"取引価格(総額)\", axis=1)\n", 557 | "y = data_added_dummies[\"取引価格(総額)\"]\n", 558 | "\n", 559 | "X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.3)\n", 560 | "print(X_train.shape)\n", 561 | "print(X_test.shape)\n", 562 | "print(y_train.shape)\n", 563 | "print(y_test.shape)" 564 | ], 565 | "execution_count": 0, 566 | "outputs": [] 567 | }, 568 | { 569 | "cell_type": "code", 570 | "metadata": { 571 | "id": "4NOWZ3gF8iO6", 572 | "colab_type": "code", 573 | "colab": {} 574 | }, 575 | "source": [ 576 | "lr_multi2 = LinearRegression()\n", 577 | "\n", 578 | "lr_multi2.fit(X_train, y_train) \n", 579 | "print(lr_multi2.coef_)\n", 580 | "print(lr_multi2.intercept_)" 581 | ], 582 | "execution_count": 0, 583 | "outputs": [] 584 | }, 585 | { 586 | "cell_type": "code", 587 | "metadata": { 588 | "id": "AiJXsD0O8lce", 589 | "colab_type": "code", 590 | "colab": {} 591 | 
}, 592 | "source": [ 593 | "y_pred = lr_multi2.predict(X_test)\n", 594 | "print(y_pred)" 595 | ], 596 | "execution_count": 0, 597 | "outputs": [] 598 | }, 599 | { 600 | "cell_type": "code", 601 | "metadata": { 602 | "id": "1TXiAr4w9AJ0", 603 | "colab_type": "code", 604 | "colab": {} 605 | }, 606 | "source": [ 607 | "print(y_pred - y_test)" 608 | ], 609 | "execution_count": 0, 610 | "outputs": [] 611 | }, 612 | { 613 | "cell_type": "code", 614 | "metadata": { 615 | "id": "GMF1DnRuJ_pP", 616 | "colab_type": "code", 617 | "colab": {} 618 | }, 619 | "source": [ 620 | "from sklearn.metrics import r2_score\n", 621 | "r2_score(y_test, y_pred)" 622 | ], 623 | "execution_count": 0, 624 | "outputs": [] 625 | }, 626 | { 627 | "cell_type": "markdown", 628 | "metadata": { 629 | "id": "s8Q2afPluIJH", 630 | "colab_type": "text" 631 | }, 632 | "source": [ 633 | "## MAE" 634 | ] 635 | }, 636 | { 637 | "cell_type": "code", 638 | "metadata": { 639 | "id": "Ao8cg_wZO-XX", 640 | "colab_type": "code", 641 | "colab": {} 642 | }, 643 | "source": [ 644 | "from sklearn.metrics import mean_absolute_error\n", 645 | "\n", 646 | "x_column_list = ['面積(㎡)']\n", 647 | "y_column_list = ['取引価格(総額)']\n", 648 | "\n", 649 | "\n", 650 | "X_train, X_test, y_train, y_test = train_test_split(data_added_dummies[x_column_list], data_added_dummies[y_column_list], test_size=0.3)\n", 651 | "\n", 652 | "lr_single = LinearRegression()\n", 653 | "\n", 654 | "lr_single.fit(X_train, y_train) \n", 655 | "y_pred = lr_single.predict(X_test)\n", 656 | "\n", 657 | "print(mean_absolute_error(y_pred, y_test))" 658 | ], 659 | "execution_count": 0, 660 | "outputs": [] 661 | }, 662 | { 663 | "cell_type": "code", 664 | "metadata": { 665 | "id": "WfV7c4oQObxF", 666 | "colab_type": "code", 667 | "colab": {} 668 | }, 669 | "source": [ 670 | "x = data_added_dummies.drop(\"取引価格(総額)\", axis=1)\n", 671 | "y = data_added_dummies[\"取引価格(総額)\"]\n", 672 | "\n", 673 | "X_train, X_test, y_train, y_test = train_test_split(x, y, 
test_size=0.7)\n", 674 | "\n", 675 | "lr_multi2 = LinearRegression()\n", 676 | "\n", 677 | "lr_multi2.fit(X_train, y_train) \n", 678 | "y_pred = lr_multi2.predict(X_test)\n", 679 | "\n", 680 | "print(mean_absolute_error(y_pred, y_test))" 681 | ], 682 | "execution_count": 0, 683 | "outputs": [] 684 | }, 685 | { 686 | "cell_type": "markdown", 687 | "metadata": { 688 | "id": "vR70qGkg-LiC", 689 | "colab_type": "text" 690 | }, 691 | "source": [ 692 | "" 693 | ] 694 | } 695 | ] 696 | } 697 | -------------------------------------------------------------------------------- /Python/04/decisionTree_iris.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "name": "Ch04-iris-DecisionTree.ipynb", 7 | "version": "0.3.2", 8 | "provenance": [], 9 | "collapsed_sections": [], 10 | "toc_visible": true 11 | }, 12 | "kernelspec": { 13 | "name": "python3", 14 | "display_name": "Python 3" 15 | } 16 | }, 17 | "cells": [ 18 | { 19 | "cell_type": "code", 20 | "metadata": { 21 | "id": "kSb7IeHmFcWW", 22 | "colab_type": "code", 23 | "colab": {} 24 | }, 25 | "source": [ 26 | "import numpy as np\n", 27 | "import pandas as pd\n", 28 | "from sklearn.tree import DecisionTreeClassifier\n", 29 | "from sklearn.model_selection import train_test_split\n", 30 | "from sklearn.metrics import accuracy_score\n", 31 | "\n", 32 | "from sklearn.datasets import load_iris" 33 | ], 34 | "execution_count": 0, 35 | "outputs": [] 36 | }, 37 | { 38 | "cell_type": "markdown", 39 | "metadata": { 40 | "id": "lZqQjbcaG-4w", 41 | "colab_type": "text" 42 | }, 43 | "source": [ 44 | "## データ読み込み" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "metadata": { 50 | "id": "hHujZz1fFwJK", 51 | "colab_type": "code", 52 | "colab": {} 53 | }, 54 | "source": [ 55 | "iris = load_iris()\n", 56 | "X, Y = iris.data, iris.target\n", 57 | "\n", 58 | "X_train, X_test, y_train, y_test = train_test_split(X, Y, 
test_size=0.3)" 59 | ], 60 | "execution_count": 0, 61 | "outputs": [] 62 | }, 63 | { 64 | "cell_type": "markdown", 65 | "metadata": { 66 | "id": "CpGVPa-AHbLE", 67 | "colab_type": "text" 68 | }, 69 | "source": [ 70 | "## 決定木を実践" 71 | ] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "metadata": { 76 | "id": "pnWZvrnNHAOB", 77 | "colab_type": "code", 78 | "colab": {} 79 | }, 80 | "source": [ 81 | "clf = DecisionTreeClassifier(max_depth=5)" 82 | ], 83 | "execution_count": 0, 84 | "outputs": [] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "metadata": { 89 | "id": "TRPrDkjSbD9g", 90 | "colab_type": "code", 91 | "colab": {} 92 | }, 93 | "source": [ 94 | "# 学習\n", 95 | "clf.fit(X_train, y_train)\n", 96 | "\n", 97 | "# 評価\n", 98 | "y_pred = clf.predict(X_test)\n", 99 | "print(accuracy_score(y_test, y_pred))" 100 | ], 101 | "execution_count": 0, 102 | "outputs": [] 103 | }, 104 | { 105 | "cell_type": "markdown", 106 | "metadata": { 107 | "id": "3LoiTrkzRn9u", 108 | "colab_type": "text" 109 | }, 110 | "source": [ 111 | "### 可視化" 112 | ] 113 | }, 114 | { 115 | "cell_type": "code", 116 | "metadata": { 117 | "id": "0ou82IgHRF9v", 118 | "colab_type": "code", 119 | "colab": {} 120 | }, 121 | "source": [ 122 | "!sudo apt install graphviz\n", 123 | "!pip install dtreeviz" 124 | ], 125 | "execution_count": 0, 126 | "outputs": [] 127 | }, 128 | { 129 | "cell_type": "code", 130 | "metadata": { 131 | "id": "M57MOkkcRo8H", 132 | "colab_type": "code", 133 | "colab": {} 134 | }, 135 | "source": [ 136 | "from dtreeviz.trees import dtreeviz\n", 137 | "\n", 138 | "viz = dtreeviz(clf, X, Y,\n", 139 | " feature_names = iris.feature_names,\n", 140 | " target_name = 'breed',\n", 141 | " class_names=[str(i) for i in iris.target_names],\n", 142 | " )\n", 143 | "\n", 144 | "display(viz)\n", 145 | "# 保存する場合\n", 146 | "# viz.save(\"tree.svg\")" 147 | ], 148 | "execution_count": 0, 149 | "outputs": [] 150 | }, 151 | { 152 | "cell_type": "code", 153 | "metadata": { 154 | "id": "8Ge1rhSVRuMJ", 155 | 
"colab_type": "code", 156 | "colab": {} 157 | }, 158 | "source": [ 159 | "" 160 | ], 161 | "execution_count": 0, 162 | "outputs": [] 163 | }, 164 | { 165 | "cell_type": "code", 166 | "metadata": { 167 | "id": "qum_tMJkhuwF", 168 | "colab_type": "code", 169 | "colab": {} 170 | }, 171 | "source": [ 172 | "" 173 | ], 174 | "execution_count": 0, 175 | "outputs": [] 176 | } 177 | ] 178 | } -------------------------------------------------------------------------------- /Python/04/decisionTree_tweets.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "name": "Ch04-Tweet-DecisionTree.ipynb", 7 | "version": "0.3.2", 8 | "provenance": [], 9 | "collapsed_sections": [], 10 | "toc_visible": true 11 | }, 12 | "kernelspec": { 13 | "name": "python3", 14 | "display_name": "Python 3" 15 | } 16 | }, 17 | "cells": [ 18 | { 19 | "cell_type": "code", 20 | "metadata": { 21 | "id": "kSb7IeHmFcWW", 22 | "colab_type": "code", 23 | "colab": {} 24 | }, 25 | "source": [ 26 | "import numpy as np\n", 27 | "import pandas as pd\n", 28 | "from sklearn.tree import DecisionTreeClassifier\n", 29 | "from sklearn.model_selection import train_test_split\n", 30 | "from sklearn.metrics import accuracy_score\n", 31 | "from sklearn.feature_extraction.text import TfidfVectorizer\n", 32 | "\n", 33 | "from sklearn.datasets import load_iris" 34 | ], 35 | "execution_count": 0, 36 | "outputs": [] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "metadata": { 41 | "id": "hHujZz1fFwJK", 42 | "colab_type": "code", 43 | "colab": {} 44 | }, 45 | "source": [ 46 | "# mecabインストール\n", 47 | "!apt install aptitude\n", 48 | "!aptitude install mecab libmecab-dev mecab-ipadic-utf8 git make curl xz-utils file -y\n", 49 | "\n", 50 | "# mecab pythonインストール(pythonでmecabを動かすために必要)\n", 51 | "!pip install mecab-python3==0.7\n", 52 | "\n", 53 | "# neologd辞書インストール\n", 54 | "!git clone --depth 1 
https://github.com/neologd/mecab-ipadic-neologd.git\n", 55 | "!echo yes | mecab-ipadic-neologd/bin/install-mecab-ipadic-neologd -n\n", 56 | "\n", 57 | "# 辞書変更\n", 58 | "!sed -e \"s!/var/lib/mecab/dic/debian!/usr/lib/x86_64-linux-gnu/mecab/dic/mecab-ipadic-neologd!g\" /etc/mecabrc > /etc/mecabrc.new\n", 59 | "!cp /etc/mecabrc /etc/mecabrc.org\n", 60 | "!cp /etc/mecabrc.new /etc/mecabrc" 61 | ], 62 | "execution_count": 0, 63 | "outputs": [] 64 | }, 65 | { 66 | "cell_type": "markdown", 67 | "metadata": { 68 | "id": "hqNNWh9YTaAC", 69 | "colab_type": "text" 70 | }, 71 | "source": [ 72 | "## データ読み込み" 73 | ] 74 | }, 75 | { 76 | "cell_type": "code", 77 | "metadata": { 78 | "id": "ATIL0zuqTbt5", 79 | "colab_type": "code", 80 | "colab": {} 81 | }, 82 | "source": [ 83 | "import MeCab\n", 84 | "\n", 85 | "data_tweet = pd.read_csv('tweets.tsv', sep=\"\\t\")\n", 86 | "data_tweet = data_tweet.dropna()\n", 87 | "Y = data_tweet.iloc[:,1].values\n", 88 | "\n", 89 | "print(data_tweet.head())" 90 | ], 91 | "execution_count": 0, 92 | "outputs": [] 93 | }, 94 | { 95 | "cell_type": "code", 96 | "metadata": { 97 | "id": "ThOUKPvPTb2W", 98 | "colab_type": "code", 99 | "colab": {} 100 | }, 101 | "source": [ 102 | "tagger = MeCab.Tagger()\n", 103 | "tagger.parse('')\n", 104 | "\n", 105 | "# 文字列を単語で分割しリストに格納する\n", 106 | "def word_tokenaize(texts):\n", 107 | " node = tagger.parseToNode(texts)\n", 108 | " word_list = []\n", 109 | " while node:\n", 110 | " word_type = node.feature.split(\",\")[0]\n", 111 | " if (word_type == '名詞'):#|(word_type == '形容詞'):\n", 112 | " word = node.feature.split(\",\")[6]\n", 113 | " if word != '*':\n", 114 | " word_list.append(word)\n", 115 | " node = node.next\n", 116 | "\n", 117 | " return word_list" 118 | ], 119 | "execution_count": 0, 120 | "outputs": [] 121 | }, 122 | { 123 | "cell_type": "code", 124 | "metadata": { 125 | "id": "dKukpU80UAFx", 126 | "colab_type": "code", 127 | "colab": {} 128 | }, 129 | "source": [ 130 | "vectorizer = 
TfidfVectorizer(tokenizer=word_tokenaize)\n", 131 | "\n", 132 | "tweet_matrix = vectorizer.fit_transform(data_tweet.iloc[:,0])\n", 133 | "X = tweet_matrix.toarray()\n", 134 | "\n", 135 | "X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3)" 136 | ], 137 | "execution_count": 0, 138 | "outputs": [] 139 | }, 140 | { 141 | "cell_type": "markdown", 142 | "metadata": { 143 | "id": "CpGVPa-AHbLE", 144 | "colab_type": "text" 145 | }, 146 | "source": [ 147 | "## 決定木を実践" 148 | ] 149 | }, 150 | { 151 | "cell_type": "code", 152 | "metadata": { 153 | "id": "pnWZvrnNHAOB", 154 | "colab_type": "code", 155 | "colab": {} 156 | }, 157 | "source": [ 158 | "clf = DecisionTreeClassifier(max_depth = 30)" 159 | ], 160 | "execution_count": 0, 161 | "outputs": [] 162 | }, 163 | { 164 | "cell_type": "code", 165 | "metadata": { 166 | "id": "rZ3TQB9uHdQV", 167 | "colab_type": "code", 168 | "colab": {} 169 | }, 170 | "source": [ 171 | "# 学習\n", 172 | "clf.fit(X_train, y_train)\n", 173 | "\n", 174 | "# 評価\n", 175 | "y_pred = clf.predict(X_test)\n", 176 | "print(accuracy_score(y_test, y_pred))" 177 | ], 178 | "execution_count": 0, 179 | "outputs": [] 180 | }, 181 | { 182 | "cell_type": "markdown", 183 | "metadata": { 184 | "id": "3LoiTrkzRn9u", 185 | "colab_type": "text" 186 | }, 187 | "source": [ 188 | "### 可視化" 189 | ] 190 | }, 191 | { 192 | "cell_type": "code", 193 | "metadata": { 194 | "id": "0ou82IgHRF9v", 195 | "colab_type": "code", 196 | "colab": {} 197 | }, 198 | "source": [ 199 | "!sudo apt install graphviz\n", 200 | "!pip install dtreeviz" 201 | ], 202 | "execution_count": 0, 203 | "outputs": [] 204 | }, 205 | { 206 | "cell_type": "code", 207 | "metadata": { 208 | "id": "M57MOkkcRo8H", 209 | "colab_type": "code", 210 | "colab": {} 211 | }, 212 | "source": [ 213 | "from dtreeviz.trees import dtreeviz\n", 214 | "\n", 215 | "viz = dtreeviz(clf, X, Y,\n", 216 | " feature_names=[i for i in range(X.shape[1])],\n", 217 | " target_name = 'tweet',\n", 218 | " 
class_names=['NP-UR', 'C&R'],\n", 219 | " )\n", 220 | "\n", 221 | "display(viz)\n", 222 | "# 保存する場合\n", 223 | "viz.save(\"tree.svg\")" 224 | ], 225 | "execution_count": 0, 226 | "outputs": [] 227 | }, 228 | { 229 | "cell_type": "code", 230 | "metadata": { 231 | "id": "8Ge1rhSVRuMJ", 232 | "colab_type": "code", 233 | "colab": {} 234 | }, 235 | "source": [ 236 | "words_list = vectorizer.get_feature_names()" 237 | ], 238 | "execution_count": 0, 239 | "outputs": [] 240 | }, 241 | { 242 | "cell_type": "code", 243 | "metadata": { 244 | "id": "IGZ0BlFxdrSQ", 245 | "colab_type": "code", 246 | "colab": {} 247 | }, 248 | "source": [ 249 | "print(words_list[1606])\n", 250 | "print(words_list[1524])" 251 | ], 252 | "execution_count": 0, 253 | "outputs": [] 254 | }, 255 | { 256 | "cell_type": "code", 257 | "metadata": { 258 | "id": "FYf6-GFRhgV8", 259 | "colab_type": "code", 260 | "colab": {} 261 | }, 262 | "source": [ 263 | "" 264 | ], 265 | "execution_count": 0, 266 | "outputs": [] 267 | } 268 | ] 269 | } -------------------------------------------------------------------------------- /Python/04/get_tweets.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "name": "Ch04-Get-Tweet.ipynb", 7 | "version": "0.3.2", 8 | "provenance": [], 9 | "collapsed_sections": [] 10 | }, 11 | "kernelspec": { 12 | "name": "python3", 13 | "display_name": "Python 3" 14 | } 15 | }, 16 | "cells": [ 17 | { 18 | "cell_type": "code", 19 | "metadata": { 20 | "id": "QZ4D9fwktm5J", 21 | "colab_type": "code", 22 | "colab": {} 23 | }, 24 | "source": [ 25 | "#coding:utf-8\n", 26 | "import numpy as np\n", 27 | "import json\n", 28 | "import requests\n", 29 | "from requests_oauthlib import OAuth1Session, OAuth1\n", 30 | "import datetime\n", 31 | "import re\n", 32 | "import time\n", 33 | "\n", 34 | "from google.colab import files" 35 | ], 36 | "execution_count": 0, 37 | "outputs": [] 38 | 
}, 39 | { 40 | "cell_type": "markdown", 41 | "metadata": { 42 | "id": "inx3ZQ1ytwgU", 43 | "colab_type": "text" 44 | }, 45 | "source": [ 46 | "## 認証" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "metadata": { 52 | "id": "Oz54bCWxtv1u", 53 | "colab_type": "code", 54 | "colab": {} 55 | }, 56 | "source": [ 57 | "access_token = 'XXXXXXXXXX'\n", 58 | "access_token_secret = 'XXXXXXXXXX'\n", 59 | "consumer_key = 'XXXXXXXXXX'\n", 60 | "consumer_key_secret = 'XXXXXXXXXX'\n", 61 | "\n", 62 | "# タイムライン取得用のURL\n", 63 | "url = \"https://api.twitter.com/1.1/statuses/user_timeline.json\"\n", 64 | "\n", 65 | "#APIの認証\n", 66 | "twitter = OAuth1Session(consumer_key, consumer_key_secret, access_token, access_token_secret)" 67 | ], 68 | "execution_count": 0, 69 | "outputs": [] 70 | }, 71 | { 72 | "cell_type": "markdown", 73 | "metadata": { 74 | "id": "3U8l4Mg-tyZu", 75 | "colab_type": "text" 76 | }, 77 | "source": [ 78 | "## Np_Ur_ のツイート取得" 79 | ] 80 | }, 81 | { 82 | "cell_type": "code", 83 | "metadata": { 84 | "id": "XF279Mzw02Yh", 85 | "colab_type": "code", 86 | "colab": {} 87 | }, 88 | "source": [ 89 | "def normalize_text(text):\n", 90 | " text = re.sub(r'https?://[\\w/:%#\\$&\\?\\(\\)~\\.=\\+\\-…]+', \"\", text)\n", 91 | " text = re.sub('RT', \"\", text)\n", 92 | " text = re.sub('お気に入り', \"\", text)\n", 93 | " text = re.sub('まとめ', \"\", text)\n", 94 | " text = re.sub(r'[!-~]', \"\", text)\n", 95 | " text = re.sub(r'[︰-@]', \"\", text)\n", 96 | " text = re.sub('\\u3000',\"\", text)\n", 97 | " text = re.sub('\\t', \"\", text)\n", 98 | " text = re.sub('\\n', \"\", text)\n", 99 | " text = text.strip()\n", 100 | " return text" 101 | ], 102 | "execution_count": 0, 103 | "outputs": [] 104 | }, 105 | { 106 | "cell_type": "code", 107 | "metadata": { 108 | "id": "v4TAWF4qttB5", 109 | "colab_type": "code", 110 | "colab": {} 111 | }, 112 | "source": [ 113 | "# パラメータの定義\n", 114 | "params = {'screen_name': 'Np_Ur_',\n", 115 | " 'exclude_replies':True,\n", 116 | " 'include_rts':False,\n",
117 | " 'count':200\n", 118 | " }\n", 119 | "\n", 120 | "f_out = open('np_ur_.tsv','w')\n", 121 | "\n", 122 | "for _ in range(20):\n", 123 | " res = twitter.get(url, params = params)\n", 124 | "\n", 125 | " if res.status_code == 200:\n", 126 | "\n", 127 | " timeline = json.loads(res.text)\n", 128 | " if len(timeline) == 0:\n", 129 | " break\n", 130 | " \n", 131 | " # 各ツイートの本文を表示\n", 132 | " for i in range(len(timeline)):\n", 133 | " #print(len(timeline[i]['text']))\n", 134 | " f_out.write(normalize_text(timeline[i]['text']) + '\\t' + \"0\" + '\\n')\n", 135 | " \n", 136 | " # 一番最後のツイートIDをパラメータmax_idに追加 \n", 137 | " params['max_id'] = timeline[len(timeline) - 1]['id'] - 1\n", 138 | "\n", 139 | "f_out.close()" 140 | ], 141 | "execution_count": 0, 142 | "outputs": [] 143 | }, 144 | { 145 | "cell_type": "code", 146 | "metadata": { 147 | "id": "NIUtJQ2Bt1Wq", 148 | "colab_type": "code", 149 | "colab": {} 150 | }, 151 | "source": [ 152 | "files.download('np_ur_.tsv')" 153 | ], 154 | "execution_count": 0, 155 | "outputs": [] 156 | }, 157 | { 158 | "cell_type": "markdown", 159 | "metadata": { 160 | "id": "AcEcCSiKt-1_", 161 | "colab_type": "text" 162 | }, 163 | "source": [ 164 | "## lucky_CandR のツイート取得" 165 | ] 166 | }, 167 | { 168 | "cell_type": "code", 169 | "metadata": { 170 | "id": "hCI91j-it6IM", 171 | "colab_type": "code", 172 | "colab": {} 173 | }, 174 | "source": [ 175 | "# パラメータの定義\n", 176 | "params = {'screen_name':'lucky_CandR',\n", 177 | " 'exclude_replies':True,\n", 178 | " 'include_rts':False,\n", 179 | " 'count':200\n", 180 | " }\n", 181 | "\n", 182 | "f_out = open('lucky_CandR.tsv','w')\n", 183 | "\n", 184 | "for _ in range(20):\n", 185 | " res = twitter.get(url, params = params)\n", 186 | "\n", 187 | " if res.status_code == 200:\n", 188 | "\n", 189 | " timeline = json.loads(res.text)\n", 190 | " if len(timeline) == 0:\n", 191 | " break\n", 192 | " \n", 193 | " # 各ツイートの本文を表示\n", 194 | " for i in range(len(timeline)):\n", 195 | " 
f_out.write(normalize_text(timeline[i]['text']) + '\\t' + \"1\" + '\\n')\n", 196 | " \n", 197 | " # 一番最後のツイートIDをパラメータmax_idに追加 \n", 198 | " params['max_id'] = timeline[len(timeline) - 1]['id'] - 1\n", 199 | "\n", 200 | "f_out.close()" 201 | ], 202 | "execution_count": 0, 203 | "outputs": [] 204 | }, 205 | { 206 | "cell_type": "code", 207 | "metadata": { 208 | "id": "xO1jONm_uLAm", 209 | "colab_type": "code", 210 | "colab": {} 211 | }, 212 | "source": [ 213 | "files.download('lucky_CandR.tsv')" 214 | ], 215 | "execution_count": 0, 216 | "outputs": [] 217 | }, 218 | { 219 | "cell_type": "code", 220 | "metadata": { 221 | "id": "632Ox_gGuNti", 222 | "colab_type": "code", 223 | "colab": {} 224 | }, 225 | "source": [ 226 | "# データ結合\n", 227 | "import pandas as pd\n", 228 | "\n", 229 | "tsv_files = ['np_ur_.tsv', 'lucky_CandR.tsv']\n", 230 | "list = []\n", 231 | "\n", 232 | "for file in tsv_files:\n", 233 | " list.append(pd.read_csv(file, delimiter='\\t', header=None))\n", 234 | "df = pd.concat(list, sort=False)\n", 235 | "\n", 236 | "df.to_csv( 'tweets.tsv', sep='\\t',index=False)" 237 | ], 238 | "execution_count": 0, 239 | "outputs": [] 240 | }, 241 | { 242 | "cell_type": "code", 243 | "metadata": { 244 | "id": "iUo5GZOG3JuI", 245 | "colab_type": "code", 246 | "colab": {} 247 | }, 248 | "source": [ 249 | "files.download('tweets.tsv')" 250 | ], 251 | "execution_count": 0, 252 | "outputs": [] 253 | }, 254 | { 255 | "cell_type": "code", 256 | "metadata": { 257 | "id": "ZkWD27Pm4D_j", 258 | "colab_type": "code", 259 | "colab": {} 260 | }, 261 | "source": [ 262 | "" 263 | ], 264 | "execution_count": 0, 265 | "outputs": [] 266 | } 267 | ] 268 | } -------------------------------------------------------------------------------- /Python/04/logit_tweets.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "name": "Ch04-Tweet-logit_2.ipynb", 7 | "version": 
"0.3.2", 8 | "provenance": [], 9 | "collapsed_sections": [], 10 | "toc_visible": true 11 | }, 12 | "kernelspec": { 13 | "name": "python3", 14 | "display_name": "Python 3" 15 | } 16 | }, 17 | "cells": [ 18 | { 19 | "cell_type": "code", 20 | "metadata": { 21 | "id": "L_FdzJMz8TR7", 22 | "colab_type": "code", 23 | "colab": {} 24 | }, 25 | "source": [ 26 | "import numpy as np\n", 27 | "import pandas as pd\n", 28 | "import matplotlib.pyplot as plt\n", 29 | "%matplotlib inline\n", 30 | "\n", 31 | "from sklearn.datasets import load_iris\n", 32 | "from sklearn.linear_model import LogisticRegression\n", 33 | "from sklearn.metrics import accuracy_score\n", 34 | "from sklearn.model_selection import train_test_split\n", 35 | "\n", 36 | "from sklearn.feature_extraction.text import TfidfVectorizer" 37 | ], 38 | "execution_count": 0, 39 | "outputs": [] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "metadata": { 44 | "id": "vlvlKilesNjB", 45 | "colab_type": "code", 46 | "colab": {} 47 | }, 48 | "source": [ 49 | "# mecabインストール\n", 50 | "!apt install aptitude\n", 51 | "!aptitude install mecab libmecab-dev mecab-ipadic-utf8 git make curl xz-utils file -y\n", 52 | "\n", 53 | "# mecab pythonインストール(pythonでmecabを動かすために必要)\n", 54 | "!pip install mecab-python3==0.7\n", 55 | "\n", 56 | "# neologd辞書インストール\n", 57 | "!git clone --depth 1 https://github.com/neologd/mecab-ipadic-neologd.git\n", 58 | "!echo yes | mecab-ipadic-neologd/bin/install-mecab-ipadic-neologd -n\n", 59 | "\n", 60 | "# 辞書変更\n", 61 | "!sed -e \"s!/var/lib/mecab/dic/debian!/usr/lib/x86_64-linux-gnu/mecab/dic/mecab-ipadic-neologd!g\" /etc/mecabrc > /etc/mecabrc.new\n", 62 | "!cp /etc/mecabrc /etc/mecabrc.org\n", 63 | "!cp /etc/mecabrc.new /etc/mecabrc\n", 64 | "\n", 65 | "import MeCab" 66 | ], 67 | "execution_count": 0, 68 | "outputs": [] 69 | }, 70 | { 71 | "cell_type": "markdown", 72 | "metadata": { 73 | "id": "G3NXk634sC9Q", 74 | "colab_type": "text" 75 | }, 76 | "source": [ 77 | "## データ読み込み" 78 | ] 79 | }, 80 | { 81 | 
"cell_type": "code", 82 | "metadata": { 83 | "id": "OHO3YkrxiwTe", 84 | "colab_type": "code", 85 | "colab": {} 86 | }, 87 | "source": [ 88 | "data_tweet = pd.read_csv('tweets.tsv', sep=\"\\t\")\n", 89 | "data_tweet = data_tweet.dropna()\n", 90 | "Y = data_tweet.iloc[:,1].values\n", 91 | "\n", 92 | "print(data_tweet.head())" 93 | ], 94 | "execution_count": 0, 95 | "outputs": [] 96 | }, 97 | { 98 | "cell_type": "code", 99 | "metadata": { 100 | "id": "n1oDBaWYsExp", 101 | "colab_type": "code", 102 | "colab": {} 103 | }, 104 | "source": [ 105 | "tagger = MeCab.Tagger()\n", 106 | "tagger.parse('')\n", 107 | "\n", 108 | "# 文字列を単語で分割しリストに格納する\n", 109 | "def word_tokenaize(texts):\n", 110 | " node = tagger.parseToNode(texts)\n", 111 | " word_list = []\n", 112 | " while node:\n", 113 | " word_type = node.feature.split(\",\")[0]\n", 114 | " if (word_type == '名詞')|(word_type == '形容詞'):\n", 115 | " word = node.feature.split(\",\")[6]\n", 116 | " if word != '*':\n", 117 | " word_list.append(word)\n", 118 | " node = node.next\n", 119 | "\n", 120 | " return word_list" 121 | ], 122 | "execution_count": 0, 123 | "outputs": [] 124 | }, 125 | { 126 | "cell_type": "code", 127 | "metadata": { 128 | "id": "yJ-jlDAquhmp", 129 | "colab_type": "code", 130 | "colab": {} 131 | }, 132 | "source": [ 133 | "vectorizer = TfidfVectorizer(tokenizer=word_tokenaize)\n", 134 | "\n", 135 | "tweet_matrix = vectorizer.fit_transform(data_tweet.iloc[:,0])\n", 136 | "X = tweet_matrix.toarray()\n", 137 | "print(X.shape)" 138 | ], 139 | "execution_count": 0, 140 | "outputs": [] 141 | }, 142 | { 143 | "cell_type": "markdown", 144 | "metadata": { 145 | "id": "QIJD4zzzcJ6r", 146 | "colab_type": "text" 147 | }, 148 | "source": [ 149 | "## ロジスティック回帰を実践" 150 | ] 151 | }, 152 | { 153 | "cell_type": "code", 154 | "metadata": { 155 | "id": "Zj9MXim3vasH", 156 | "colab_type": "code", 157 | "colab": {} 158 | }, 159 | "source": [ 160 | "X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2)\n", 161 | 
"\n", 162 | "logit_multi2 = LogisticRegression()\n", 163 | "logit_multi2.fit(X_train, y_train)\n", 164 | "\n", 165 | "print(logit_multi2.coef_)\n", 166 | "print(logit_multi2.intercept_)" 167 | ], 168 | "execution_count": 0, 169 | "outputs": [] 170 | }, 171 | { 172 | "cell_type": "code", 173 | "metadata": { 174 | "id": "kCbO0ucmvzD_", 175 | "colab_type": "code", 176 | "colab": {} 177 | }, 178 | "source": [ 179 | "y_pred = logit_multi2.predict(X_test)\n", 180 | "print(accuracy_score(y_test, y_pred))" 181 | ], 182 | "execution_count": 0, 183 | "outputs": [] 184 | }, 185 | { 186 | "cell_type": "code", 187 | "metadata": { 188 | "id": "VGGJpwP7wS99", 189 | "colab_type": "code", 190 | "colab": {} 191 | }, 192 | "source": [ 193 | "" 194 | ], 195 | "execution_count": 0, 196 | "outputs": [] 197 | } 198 | ] 199 | } -------------------------------------------------------------------------------- /Python/04/randomForest_iris.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "name": "Ch04-iris-RandomForest.ipynb", 7 | "version": "0.3.2", 8 | "provenance": [], 9 | "collapsed_sections": [], 10 | "toc_visible": true 11 | }, 12 | "kernelspec": { 13 | "name": "python3", 14 | "display_name": "Python 3" 15 | } 16 | }, 17 | "cells": [ 18 | { 19 | "cell_type": "code", 20 | "metadata": { 21 | "id": "kSb7IeHmFcWW", 22 | "colab_type": "code", 23 | "colab": {} 24 | }, 25 | "source": [ 26 | "import numpy as np\n", 27 | "import pandas as pd\n", 28 | "import matplotlib.pyplot as plt\n", 29 | "%matplotlib inline\n", 30 | "\n", 31 | "from sklearn.ensemble import RandomForestClassifier\n", 32 | "from sklearn.model_selection import train_test_split\n", 33 | "from sklearn.metrics import accuracy_score\n", 34 | "\n", 35 | "from sklearn.datasets import load_iris" 36 | ], 37 | "execution_count": 0, 38 | "outputs": [] 39 | }, 40 | { 41 | "cell_type": "markdown", 42 | "metadata": { 
43 | "id": "lZqQjbcaG-4w", 44 | "colab_type": "text" 45 | }, 46 | "source": [ 47 | "## データ読み込み" 48 | ] 49 | }, 50 | { 51 | "cell_type": "code", 52 | "metadata": { 53 | "id": "hHujZz1fFwJK", 54 | "colab_type": "code", 55 | "colab": {} 56 | }, 57 | "source": [ 58 | "iris = load_iris()\n", 59 | "X, Y = iris.data, iris.target\n", 60 | "\n", 61 | "X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3)" 62 | ], 63 | "execution_count": 0, 64 | "outputs": [] 65 | }, 66 | { 67 | "cell_type": "markdown", 68 | "metadata": { 69 | "id": "CpGVPa-AHbLE", 70 | "colab_type": "text" 71 | }, 72 | "source": [ 73 | "## ランダムフォレストを実践" 74 | ] 75 | }, 76 | { 77 | "cell_type": "code", 78 | "metadata": { 79 | "id": "pnWZvrnNHAOB", 80 | "colab_type": "code", 81 | "colab": {} 82 | }, 83 | "source": [ 84 | "clf = RandomForestClassifier(n_estimators=10, max_depth=3)" 85 | ], 86 | "execution_count": 0, 87 | "outputs": [] 88 | }, 89 | { 90 | "cell_type": "code", 91 | "metadata": { 92 | "id": "rZ3TQB9uHdQV", 93 | "colab_type": "code", 94 | "colab": {} 95 | }, 96 | "source": [ 97 | "# 学習\n", 98 | "clf.fit(X_train, y_train)\n", 99 | "\n", 100 | "# 評価\n", 101 | "y_pred = clf.predict(X_test)\n", 102 | "print(accuracy_score(y_test, y_pred))" 103 | ], 104 | "execution_count": 0, 105 | "outputs": [] 106 | }, 107 | { 108 | "cell_type": "markdown", 109 | "metadata": { 110 | "id": "3LoiTrkzRn9u", 111 | "colab_type": "text" 112 | }, 113 | "source": [ 114 | "### 可視化" 115 | ] 116 | }, 117 | { 118 | "cell_type": "code", 119 | "metadata": { 120 | "id": "0ou82IgHRF9v", 121 | "colab_type": "code", 122 | "colab": {} 123 | }, 124 | "source": [ 125 | "# 特徴量の重要度\n", 126 | "importances = clf.feature_importances_" 127 | ], 128 | "execution_count": 0, 129 | "outputs": [] 130 | }, 131 | { 132 | "cell_type": "code", 133 | "metadata": { 134 | "id": "UZuZlDh-nX2t", 135 | "colab_type": "code", 136 | "colab": {} 137 | }, 138 | "source": [ 139 | "print(importances)" 140 | ], 141 | "execution_count": 0, 142 | 
"outputs": [] 143 | }, 144 | { 145 | "cell_type": "code", 146 | "metadata": { 147 | "id": "8Ge1rhSVRuMJ", 148 | "colab_type": "code", 149 | "colab": {} 150 | }, 151 | "source": [ 152 | "features = np.array(iris.feature_names)\n", 153 | "\n", 154 | "# プロット\n", 155 | "indices = np.argsort(importances)\n", 156 | "plt.figure(figsize=(6,6))\n", 157 | "plt.barh(range(len(indices)), importances[indices], color='b', align='center')\n", 158 | "plt.yticks(range(len(indices)), features[indices])\n", 159 | "plt.savefig('rf_importance_iris.png')" 160 | ], 161 | "execution_count": 0, 162 | "outputs": [] 163 | }, 164 | { 165 | "cell_type": "code", 166 | "metadata": { 167 | "id": "qum_tMJkhuwF", 168 | "colab_type": "code", 169 | "colab": {} 170 | }, 171 | "source": [ 172 | "features" 173 | ], 174 | "execution_count": 0, 175 | "outputs": [] 176 | }, 177 | { 178 | "cell_type": "code", 179 | "metadata": { 180 | "id": "6XM7QTN0blCc", 181 | "colab_type": "code", 182 | "colab": {} 183 | }, 184 | "source": [ 185 | "" 186 | ], 187 | "execution_count": 0, 188 | "outputs": [] 189 | } 190 | ] 191 | } -------------------------------------------------------------------------------- /Python/04/randomForest_tweets.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "name": "Ch04-Tweet-RandomForest.ipynb", 7 | "version": "0.3.2", 8 | "provenance": [], 9 | "collapsed_sections": [] 10 | }, 11 | "kernelspec": { 12 | "name": "python3", 13 | "display_name": "Python 3" 14 | } 15 | }, 16 | "cells": [ 17 | { 18 | "cell_type": "code", 19 | "metadata": { 20 | "id": "kSb7IeHmFcWW", 21 | "colab_type": "code", 22 | "colab": {} 23 | }, 24 | "source": [ 25 | "import numpy as np\n", 26 | "import pandas as pd\n", 27 | "import matplotlib.pyplot as plt\n", 28 | "%matplotlib inline\n", 29 | "\n", 30 | "from sklearn.ensemble import RandomForestClassifier\n", 31 | "from sklearn.model_selection 
import train_test_split\n", 32 | "from sklearn.metrics import accuracy_score\n", 33 | "from sklearn.feature_extraction.text import TfidfVectorizer" 34 | ], 35 | "execution_count": 0, 36 | "outputs": [] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "metadata": { 41 | "id": "hHujZz1fFwJK", 42 | "colab_type": "code", 43 | "colab": {} 44 | }, 45 | "source": [ 46 | "# mecabインストール\n", 47 | "!apt install aptitude\n", 48 | "!aptitude install mecab libmecab-dev mecab-ipadic-utf8 git make curl xz-utils file -y\n", 49 | "\n", 50 | "# mecab pythonインストール(pythonでmecabを動かすために必要)\n", 51 | "!pip install mecab-python3==0.7\n", 52 | "\n", 53 | "# neologd辞書インストール\n", 54 | "!git clone --depth 1 https://github.com/neologd/mecab-ipadic-neologd.git\n", 55 | "!echo yes | mecab-ipadic-neologd/bin/install-mecab-ipadic-neologd -n\n", 56 | "\n", 57 | "# 辞書変更\n", 58 | "!sed -e \"s!/var/lib/mecab/dic/debian!/usr/lib/x86_64-linux-gnu/mecab/dic/mecab-ipadic-neologd!g\" /etc/mecabrc > /etc/mecabrc.new\n", 59 | "!cp /etc/mecabrc /etc/mecabrc.org\n", 60 | "!cp /etc/mecabrc.new /etc/mecabrc\n", 61 | "\n", 62 | "import MeCab" 63 | ], 64 | "execution_count": 0, 65 | "outputs": [] 66 | }, 67 | { 68 | "cell_type": "markdown", 69 | "metadata": { 70 | "id": "hqNNWh9YTaAC", 71 | "colab_type": "text" 72 | }, 73 | "source": [ 74 | "## データ読み込み" 75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "metadata": { 80 | "id": "ATIL0zuqTbt5", 81 | "colab_type": "code", 82 | "colab": {} 83 | }, 84 | "source": [ 85 | "data_tweet = pd.read_csv('tweets.tsv', sep=\"\\t\")\n", 86 | "data_tweet = data_tweet.dropna()\n", 87 | "Y = data_tweet.iloc[:,1].values\n", 88 | "\n", 89 | "print(data_tweet.head())" 90 | ], 91 | "execution_count": 0, 92 | "outputs": [] 93 | }, 94 | { 95 | "cell_type": "code", 96 | "metadata": { 97 | "id": "ThOUKPvPTb2W", 98 | "colab_type": "code", 99 | "colab": {} 100 | }, 101 | "source": [ 102 | "tagger = MeCab.Tagger()\n", 103 | "tagger.parse('')\n", 104 | "\n", 105 | "# 文字列を単語で分割しリストに格納する\n", 106 | 
"def word_tokenaize(texts):\n", 107 | " node = tagger.parseToNode(texts)\n", 108 | " word_list = []\n", 109 | " while node:\n", 110 | " word_type = node.feature.split(\",\")[0]\n", 111 | " if (word_type == '名詞'):#|(word_type == '形容詞'):\n", 112 | " word = node.feature.split(\",\")[6]\n", 113 | " if word != '*':\n", 114 | " word_list.append(word)\n", 115 | " node = node.next\n", 116 | "\n", 117 | " return word_list" 118 | ], 119 | "execution_count": 0, 120 | "outputs": [] 121 | }, 122 | { 123 | "cell_type": "code", 124 | "metadata": { 125 | "id": "dKukpU80UAFx", 126 | "colab_type": "code", 127 | "colab": {} 128 | }, 129 | "source": [ 130 | "vectorizer = TfidfVectorizer(tokenizer=word_tokenaize)\n", 131 | "\n", 132 | "tweet_matrix = vectorizer.fit_transform(data_tweet.iloc[:,0])\n", 133 | "X = tweet_matrix.toarray()" 134 | ], 135 | "execution_count": 0, 136 | "outputs": [] 137 | }, 138 | { 139 | "cell_type": "code", 140 | "metadata": { 141 | "id": "3aJTcqZYoXsb", 142 | "colab_type": "code", 143 | "colab": {} 144 | }, 145 | "source": [ 146 | "X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3)" 147 | ], 148 | "execution_count": 0, 149 | "outputs": [] 150 | }, 151 | { 152 | "cell_type": "markdown", 153 | "metadata": { 154 | "id": "CpGVPa-AHbLE", 155 | "colab_type": "text" 156 | }, 157 | "source": [ 158 | "## ランダムフォレストを実践" 159 | ] 160 | }, 161 | { 162 | "cell_type": "code", 163 | "metadata": { 164 | "id": "pnWZvrnNHAOB", 165 | "colab_type": "code", 166 | "colab": {} 167 | }, 168 | "source": [ 169 | "clf = RandomForestClassifier(n_estimators= 50, max_depth=20)" 170 | ], 171 | "execution_count": 0, 172 | "outputs": [] 173 | }, 174 | { 175 | "cell_type": "code", 176 | "metadata": { 177 | "id": "rZ3TQB9uHdQV", 178 | "colab_type": "code", 179 | "colab": {} 180 | }, 181 | "source": [ 182 | "# 学習\n", 183 | "clf.fit(X_train, y_train)\n", 184 | "\n", 185 | "# 評価\n", 186 | "y_pred = clf.predict(X_test)\n", 187 | "print(accuracy_score(y_test, y_pred))" 188 | ], 
189 | "execution_count": 0, 190 | "outputs": [] 191 | }, 192 | { 193 | "cell_type": "markdown", 194 | "metadata": { 195 | "id": "3LoiTrkzRn9u", 196 | "colab_type": "text" 197 | }, 198 | "source": [ 199 | "### 可視化" 200 | ] 201 | }, 202 | { 203 | "cell_type": "code", 204 | "metadata": { 205 | "id": "0ou82IgHRF9v", 206 | "colab_type": "code", 207 | "colab": {} 208 | }, 209 | "source": [ 210 | "words_list = vectorizer.get_feature_names()\n", 211 | "\n", 212 | "features = np.array(np.arange(0,len(words_list)))\n", 213 | "# 特徴量の重要度\n", 214 | "importances = clf.feature_importances_" 215 | ], 216 | "execution_count": 0, 217 | "outputs": [] 218 | }, 219 | { 220 | "cell_type": "code", 221 | "metadata": { 222 | "id": "M57MOkkcRo8H", 223 | "colab_type": "code", 224 | "colab": {} 225 | }, 226 | "source": [ 227 | "indices = np.argsort(importances)[-11:]\n", 228 | "plt.figure(figsize=(6,6))\n", 229 | "plt.barh(range(len(indices)), importances[indices], color='b', align='center')\n", 230 | "plt.yticks(range(len(indices)), features[indices])\n", 231 | "#plt.show()\n", 232 | "plt.savefig('rf_importance_tweet.png')" 233 | ], 234 | "execution_count": 0, 235 | "outputs": [] 236 | }, 237 | { 238 | "cell_type": "code", 239 | "metadata": { 240 | "id": "8Ge1rhSVRuMJ", 241 | "colab_type": "code", 242 | "colab": {} 243 | }, 244 | "source": [ 245 | "for i in indices:\n", 246 | " print(i, words_list[i])" 247 | ], 248 | "execution_count": 0, 249 | "outputs": [] 250 | }, 251 | { 252 | "cell_type": "code", 253 | "metadata": { 254 | "id": "IGZ0BlFxdrSQ", 255 | "colab_type": "code", 256 | "colab": {} 257 | }, 258 | "source": [ 259 | "" 260 | ], 261 | "execution_count": 0, 262 | "outputs": [] 263 | }, 264 | { 265 | "cell_type": "code", 266 | "metadata": { 267 | "id": "FYf6-GFRhgV8", 268 | "colab_type": "code", 269 | "colab": {} 270 | }, 271 | "source": [ 272 | "" 273 | ], 274 | "execution_count": 0, 275 | "outputs": [] 276 | } 277 | ] 278 | } 
-------------------------------------------------------------------------------- /Python/05/Kmeans_prefecture.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "name": "Ch05-prefecture-Kmeans.ipynb", 7 | "version": "0.3.2", 8 | "provenance": [], 9 | "collapsed_sections": [] 10 | }, 11 | "kernelspec": { 12 | "name": "python3", 13 | "display_name": "Python 3" 14 | } 15 | }, 16 | "cells": [ 17 | { 18 | "cell_type": "code", 19 | "metadata": { 20 | "id": "fjn-xOmMcOZd", 21 | "colab_type": "code", 22 | "colab": {} 23 | }, 24 | "source": [ 25 | "import numpy as np\n", 26 | "import pandas as pd\n", 27 | "import matplotlib.pyplot as plt\n", 28 | "%matplotlib inline\n", 29 | "\n", 30 | "from sklearn.cluster import KMeans\n", 31 | "from sklearn.preprocessing import StandardScaler" 32 | ], 33 | "execution_count": 0, 34 | "outputs": [] 35 | }, 36 | { 37 | "cell_type": "markdown", 38 | "metadata": { 39 | "id": "6KjYW74wx4yT", 40 | "colab_type": "text" 41 | }, 42 | "source": [ 43 | "## データ読み込み" 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "metadata": { 49 | "id": "ALMzMhn9hn9Y", 50 | "colab_type": "code", 51 | "colab": {} 52 | }, 53 | "source": [ 54 | "data_prefecture = pd.read_csv(\"data_prefecture_category.csv\", encoding='utf-8', index_col=0)" 55 | ], 56 | "execution_count": 0, 57 | "outputs": [] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "metadata": { 62 | "id": "PLw8rXe-huZ2", 63 | "colab_type": "code", 64 | "colab": {} 65 | }, 66 | "source": [ 67 | "print(data_prefecture.head())" 68 | ], 69 | "execution_count": 0, 70 | "outputs": [] 71 | }, 72 | { 73 | "cell_type": "code", 74 | "metadata": { 75 | "id": "lEJwvHBWkqkT", 76 | "colab_type": "code", 77 | "colab": {} 78 | }, 79 | "source": [ 80 | "# カンマ区切りの文字列を数値に変換\n", 81 | "data_prefecture_float = data_prefecture.apply(lambda x: x.str.replace(',','')).astype(np.float)" 82 | ], 83 | 
"execution_count": 0, 84 | "outputs": [] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "metadata": { 89 | "id": "kx_SRyvpkAkX", 90 | "colab_type": "code", 91 | "colab": {} 92 | }, 93 | "source": [ 94 | "print(data_prefecture_float.head())" 95 | ], 96 | "execution_count": 0, 97 | "outputs": [] 98 | }, 99 | { 100 | "cell_type": "markdown", 101 | "metadata": { 102 | "id": "oT_9YoHPyXaN", 103 | "colab_type": "text" 104 | }, 105 | "source": [ 106 | "## 可視化" 107 | ] 108 | }, 109 | { 110 | "cell_type": "code", 111 | "metadata": { 112 | "id": "7DKDD7tnyDWD", 113 | "colab_type": "code", 114 | "colab": {} 115 | }, 116 | "source": [ 117 | "plt.hist(data_prefecture_float[\"食料\"])" 118 | ], 119 | "execution_count": 0, 120 | "outputs": [] 121 | }, 122 | { 123 | "cell_type": "code", 124 | "metadata": { 125 | "id": "BLvnLvHHzZM0", 126 | "colab_type": "code", 127 | "colab": {} 128 | }, 129 | "source": [ 130 | "plt.hist(data_prefecture_float[\"住居\"])" 131 | ], 132 | "execution_count": 0, 133 | "outputs": [] 134 | }, 135 | { 136 | "cell_type": "code", 137 | "metadata": { 138 | "id": "ZntDAgVqzb1T", 139 | "colab_type": "code", 140 | "colab": {} 141 | }, 142 | "source": [ 143 | "plt.hist(data_prefecture_float[\"教育\"])" 144 | ], 145 | "execution_count": 0, 146 | "outputs": [] 147 | }, 148 | { 149 | "cell_type": "markdown", 150 | "metadata": { 151 | "id": "K0v4nSjbzkiv", 152 | "colab_type": "text" 153 | }, 154 | "source": [ 155 | "## K平均法" 156 | ] 157 | }, 158 | { 159 | "cell_type": "code", 160 | "metadata": { 161 | "id": "IRDfZM0UyhNq", 162 | "colab_type": "code", 163 | "colab": {} 164 | }, 165 | "source": [ 166 | "# 標準化\n", 167 | "scaler = StandardScaler()\n", 168 | "data_std = scaler.fit_transform(data_prefecture_float)" 169 | ], 170 | "execution_count": 0, 171 | "outputs": [] 172 | }, 173 | { 174 | "cell_type": "code", 175 | "metadata": { 176 | "id": "B5b6usAMi4_n", 177 | "colab_type": "code", 178 | "colab": {} 179 | }, 180 | "source": [ 181 | "k_means = KMeans(n_clusters=4)\n", 182 
| "k_means.fit(data_std)" 183 | ], 184 | "execution_count": 0, 185 | "outputs": [] 186 | }, 187 | { 188 | "cell_type": "code", 189 | "metadata": { 190 | "id": "Wn-OuwSblxXP", 191 | "colab_type": "code", 192 | "colab": {} 193 | }, 194 | "source": [ 195 | "print(k_means.labels_)" 196 | ], 197 | "execution_count": 0, 198 | "outputs": [] 199 | }, 200 | { 201 | "cell_type": "code", 202 | "metadata": { 203 | "id": "EX6_rR_BlzbB", 204 | "colab_type": "code", 205 | "colab": {} 206 | }, 207 | "source": [ 208 | "data_prefecture_float[\"label\"] = k_means.labels_" 209 | ], 210 | "execution_count": 0, 211 | "outputs": [] 212 | }, 213 | { 214 | "cell_type": "code", 215 | "metadata": { 216 | "id": "CF-wfz-Jl9xp", 217 | "colab_type": "code", 218 | "colab": {} 219 | }, 220 | "source": [ 221 | "print(data_prefecture_float[data_prefecture_float[\"label\"] == 0][\"label\"])\n" 222 | ], 223 | "execution_count": 0, 224 | "outputs": [] 225 | }, 226 | { 227 | "cell_type": "code", 228 | "metadata": { 229 | "id": "AQS_ADfDngAX", 230 | "colab_type": "code", 231 | "colab": {} 232 | }, 233 | "source": [ 234 | "print(data_prefecture_float[data_prefecture_float[\"label\"] == 1][\"label\"])\n", 235 | "print(data_prefecture_float[data_prefecture_float[\"label\"] == 2][\"label\"])\n", 236 | "print(data_prefecture_float[data_prefecture_float[\"label\"] == 3][\"label\"])" 237 | ], 238 | "execution_count": 0, 239 | "outputs": [] 240 | }, 241 | { 242 | "cell_type": "code", 243 | "metadata": { 244 | "id": "OagBxk02mVXC", 245 | "colab_type": "code", 246 | "colab": {} 247 | }, 248 | "source": [ 249 | "# クラスターごとの平均値を計算して、一つのDataFrameに格納する\n", 250 | "k_means_feature = pd.concat([data_prefecture_float[data_prefecture_float[\"label\"] == 0].mean(), \n", 251 | " data_prefecture_float[data_prefecture_float[\"label\"] == 1].mean(),\n", 252 | " data_prefecture_float[data_prefecture_float[\"label\"] == 2].mean(), \n", 253 | " data_prefecture_float[data_prefecture_float[\"label\"] == 3].mean()], axis = 1)" 254 | 
], 255 | "execution_count": 0, 256 | "outputs": [] 257 | }, 258 | { 259 | "cell_type": "code", 260 | "metadata": { 261 | "id": "0DRA0HkEm2pX", 262 | "colab_type": "code", 263 | "colab": {} 264 | }, 265 | "source": [ 266 | "k_means_feature" 267 | ], 268 | "execution_count": 0, 269 | "outputs": [] 270 | }, 271 | { 272 | "cell_type": "code", 273 | "metadata": { 274 | "id": "PaewoBOhqv9U", 275 | "colab_type": "code", 276 | "colab": {} 277 | }, 278 | "source": [ 279 | "" 280 | ], 281 | "execution_count": 0, 282 | "outputs": [] 283 | }, 284 | { 285 | "cell_type": "code", 286 | "metadata": { 287 | "id": "ih00cgwxqx9f", 288 | "colab_type": "code", 289 | "colab": {} 290 | }, 291 | "source": [ 292 | "" 293 | ], 294 | "execution_count": 0, 295 | "outputs": [] 296 | } 297 | ] 298 | } -------------------------------------------------------------------------------- /Python/05/data_prefecture_category.csv: -------------------------------------------------------------------------------- 1 | 都道府県,食料,住居,光熱・水道,家具・家事,被服及び,保健医療,交通・通信,教育,教養娯楽,諸雑費 2 | 札幌市,"819,536","279,764","228,330","103,893","129,292","99,902","442,564","124,799","276,976","218,769" 3 | 青森市,"790,368","259,971","295,102","96,173","98,267","115,529","427,590","96,241","245,912","232,403" 4 | 盛岡市,"771,420","246,223","250,260","102,652","142,183","123,152","438,431","144,845","276,140","286,892" 5 | 仙台市,"862,052","240,690","197,006","117,818","116,682","109,467","379,888","150,622","317,874","280,381" 6 | 秋田市,"835,325","226,152","296,036","111,587","127,798","133,474","496,526","111,430","280,440","238,857" 7 | 山形市,"841,537","315,770","285,590","99,357","125,567","104,612","770,941","107,336","302,035","289,043" 8 | 福島市,"950,582","285,711","257,681","126,588","169,182","94,275","665,083","141,012","392,401","276,986" 9 | 水戸市,"877,968","235,274","231,740","127,631","174,481","119,688","695,369","200,251","390,123","322,231" 10 | 
宇都宮市,"970,391","294,398","243,081","104,325","171,918","125,397","622,628","175,432","375,213","292,779" 11 | 前橋市,"876,472","149,049","202,882","150,428","166,129","142,103","549,336","113,726","397,195","313,629" 12 | さいたま市,"1,042,267","350,989","216,828","110,043","173,828","174,833","501,966","275,513","330,177","276,978" 13 | 千葉市,"867,636","162,260","153,227","81,768","142,156","87,722","421,253","155,287","329,146","320,532" 14 | 東京都区部,"943,279","404,843","175,822","112,716","208,975","156,721","417,168","272,696","423,476","254,768" 15 | 横浜市,"926,253","215,616","184,484","124,547","172,798","136,661","517,576","251,826","420,737","275,789" 16 | 新潟市,"842,736","178,061","254,426","116,049","128,177","114,074","606,168","199,170","265,664","316,409" 17 | 富山市,"896,917","307,401","263,618","127,392","122,275","114,880","579,845","91,179","336,369","263,650" 18 | 金沢市,"971,470","220,831","246,180","125,704","167,773","101,640","680,653","245,222","405,272","355,490" 19 | 福井市,"925,413","151,093","249,017","94,646","114,519","99,707","462,830","122,414","328,129","277,653" 20 | 甲府市,"747,397","300,816","214,981","90,925","101,371","104,563","420,691","116,368","323,950","234,201" 21 | 長野市,"786,130","344,086","239,435","109,564","116,436","108,134","519,702","92,604","266,054","289,707" 22 | 岐阜市,"865,541","201,315","239,365","130,079","173,834","135,925","699,940","243,758","414,244","305,166" 23 | 静岡市,"807,241","358,014","204,189","106,298","139,274","109,700","432,415","119,306","316,773","227,907" 24 | 名古屋市,"821,916","249,793","156,478","82,537","139,540","104,044","480,970","107,105","394,293","224,362" 25 | 津市,"863,096","195,647","203,113","125,860","164,073","117,537","517,539","251,968","386,805","251,410" 26 | 大津市,"915,677","108,352","236,832","158,680","141,251","108,875","521,557","180,740","325,487","245,402" 27 | 京都市,"845,226","210,964","232,337","88,931","129,277","92,014","390,179","212,035","358,755","246,851" 28 | 
大阪市,"840,018","269,369","177,417","95,044","114,748","138,580","369,889","140,737","317,359","202,192" 29 | 神戸市,"656,924","136,381","103,216","67,591","110,686","54,228","319,734","31,347","208,916","184,317" 30 | 奈良市,"898,884","157,240","272,448","114,845","165,037","144,301","496,535","388,515","399,766","282,591" 31 | 和歌山市,"887,859","244,498","246,528","130,329","152,058","92,863","510,125","144,763","343,537","241,471" 32 | 鳥取市,"706,962","204,600","194,986","108,323","103,304","86,720","513,462","77,770","230,101","281,468" 33 | 松江市,"727,565","328,050","221,065","93,567","103,611","105,134","545,464","85,915","292,628","281,605" 34 | 岡山市,"765,652","289,496","202,733","96,181","161,001","136,606","502,230","151,293","302,995","233,083" 35 | 広島市,"810,255","219,623","182,511","105,210","127,351","104,142","605,174","181,977","284,268","220,201" 36 | 山口市,"607,019","363,261","177,832","86,593","100,132","108,410","586,591","59,450","298,965","223,511" 37 | 徳島市,"817,065","183,086","211,546","119,732","153,757","113,235","443,341","239,275","362,019","277,219" 38 | 高松市,"809,931","323,569","227,821","119,424","129,374","131,729","615,294","103,593","279,503","243,670" 39 | 松山市,"828,274","197,045","241,818","125,931","159,782","102,157","491,929","208,938","305,368","253,428" 40 | 高知市,"803,052","310,383","225,292","198,099","119,242","102,917","533,892","157,375","309,526","280,036" 41 | 福岡市,"760,638","188,295","156,097","116,400","152,971","96,334","471,238","120,417","355,085","273,449" 42 | 佐賀市,"814,400","262,685","224,972","98,570","140,041","144,157","515,064","144,634","359,726","284,169" 43 | 長崎市,"658,520","308,171","210,173","84,279","115,569","83,159","390,576","88,847","187,986","182,308" 44 | 熊本市,"870,311","311,909","243,256","143,752","152,455","133,442","509,583","223,684","345,740","338,671" 45 | 大分市,"789,001","355,356","207,281","135,103","162,991","100,884","561,382","97,157","451,635","313,381" 46 | 
宮崎市,"778,907","222,861","185,008","96,874","122,197","113,690","559,338","131,236","279,217","278,642" 47 | 鹿児島市,"787,120","345,632","198,035","116,358","164,759","121,532","552,727","108,190","298,067","254,743" 48 | 那覇市,"726,160","337,851","211,156","111,406","100,591","102,076","448,672","131,853","237,977","175,467" 49 | 川崎市,"872,136","427,698","158,914","87,568","112,286","94,975","324,322","124,695","292,594","215,952" 50 | 相模原市,"756,340","290,616","166,843","95,209","112,149","122,466","302,611","112,232","363,026","210,583" 51 | 浜松市,"803,305","198,314","187,801","105,721","127,230","114,431","680,801","128,846","281,603","236,023" 52 | 堺市,"927,069","236,032","257,915","123,388","155,071","129,308","640,550","247,428","419,882","251,675" 53 | 北九州市,"862,432","149,685","206,224","109,226","167,286","146,687","700,366","164,434","270,083","256,207" -------------------------------------------------------------------------------- /Python/05/pca_iris.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "name": "Ch05-iris-pca.ipynb", 7 | "version": "0.3.2", 8 | "provenance": [], 9 | "collapsed_sections": [] 10 | }, 11 | "kernelspec": { 12 | "name": "python3", 13 | "display_name": "Python 3" 14 | } 15 | }, 16 | "cells": [ 17 | { 18 | "cell_type": "code", 19 | "metadata": { 20 | "id": "1fXQD0FV11Kd", 21 | "colab_type": "code", 22 | "colab": {} 23 | }, 24 | "source": [ 25 | "import numpy as np\n", 26 | "import pandas as pd\n", 27 | "import matplotlib.pyplot as plt\n", 28 | "%matplotlib inline\n", 29 | "\n", 30 | "from sklearn.decomposition import PCA\n", 31 | "from sklearn.preprocessing import StandardScaler\n", 32 | "\n", 33 | "from sklearn.datasets import load_iris" 34 | ], 35 | "execution_count": 0, 36 | "outputs": [] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "metadata": { 41 | "id": "0O3Q1upd2VXx", 42 | "colab_type": "code", 43 | "colab": 
{} 44 | }, 45 | "source": [ 46 | "iris = load_iris()\n", 47 | "\n", 48 | "data_iris = pd.DataFrame(iris.data, columns=iris.feature_names)\n", 49 | "data_iris[\"target\"] = iris.target\n", 50 | "\n", 51 | "print(data_iris.head())\n", 52 | "print(data_iris.shape)" 53 | ], 54 | "execution_count": 0, 55 | "outputs": [] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "metadata": { 60 | "id": "aJ86062Q2XZd", 61 | "colab_type": "code", 62 | "colab": {} 63 | }, 64 | "source": [ 65 | "data_iris.describe()" 66 | ], 67 | "execution_count": 0, 68 | "outputs": [] 69 | }, 70 | { 71 | "cell_type": "markdown", 72 | "metadata": { 73 | "id": "bWeaduLQ2sP5", 74 | "colab_type": "text" 75 | }, 76 | "source": [ 77 | "## 主成分分析" 78 | ] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "metadata": { 83 | "id": "7jmVukQ-HEzL", 84 | "colab_type": "code", 85 | "colab": {} 86 | }, 87 | "source": [ 88 | "## 標準化\n", 89 | "scaler = StandardScaler()\n", 90 | "data_std = scaler.fit_transform(data_iris[iris.feature_names])" 91 | ], 92 | "execution_count": 0, 93 | "outputs": [] 94 | }, 95 | { 96 | "cell_type": "code", 97 | "metadata": { 98 | "id": "hKq2PZecE7Qv", 99 | "colab_type": "code", 100 | "colab": {} 101 | }, 102 | "source": [ 103 | "data_std_df = pd.DataFrame(data_std, columns=data_iris.columns[0:4])\n", 104 | "\n", 105 | "# もとのデータ\n", 106 | "print(data_iris.describe())\n", 107 | "\n", 108 | "# 標準化後のデータ\n", 109 | "print(data_std_df.describe())" 110 | ], 111 | "execution_count": 0, 112 | "outputs": [] 113 | }, 114 | { 115 | "cell_type": "code", 116 | "metadata": { 117 | "id": "oKknBSSI2cbv", 118 | "colab_type": "code", 119 | "colab": {} 120 | }, 121 | "source": [ 122 | "pca = PCA(n_components=2)\n", 123 | "pca_transformed = pca.fit_transform(data_std)" 124 | ], 125 | "execution_count": 0, 126 | "outputs": [] 127 | }, 128 | { 129 | "cell_type": "code", 130 | "metadata": { 131 | "id": "zr4jna7_HF38", 132 | "colab_type": "code", 133 | "colab": {} 134 | }, 135 | "source": [ 136 | 
"print(pca_transformed.shape)" 137 | ], 138 | "execution_count": 0, 139 | "outputs": [] 140 | }, 141 | { 142 | "cell_type": "code", 143 | "metadata": { 144 | "id": "EHAuY_xE4fsv", 145 | "colab_type": "code", 146 | "colab": {} 147 | }, 148 | "source": [ 149 | "plt.scatter(pca_transformed[:, 0], pca_transformed[:, 1])" 150 | ], 151 | "execution_count": 0, 152 | "outputs": [] 153 | }, 154 | { 155 | "cell_type": "code", 156 | "metadata": { 157 | "id": "oKP0ctmc4yXv", 158 | "colab_type": "code", 159 | "colab": {} 160 | }, 161 | "source": [ 162 | "plt.scatter(pca_transformed[:, 0], pca_transformed[:, 1], c=data_iris[\"target\"])" 163 | ], 164 | "execution_count": 0, 165 | "outputs": [] 166 | }, 167 | { 168 | "cell_type": "markdown", 169 | "metadata": { 170 | "id": "2D5Tdi0259sT", 171 | "colab_type": "text" 172 | }, 173 | "source": [ 174 | "### 寄与度" 175 | ] 176 | }, 177 | { 178 | "cell_type": "code", 179 | "metadata": { 180 | "id": "N2TPBL0n5Clm", 181 | "colab_type": "code", 182 | "colab": {} 183 | }, 184 | "source": [ 185 | "print(pca.explained_variance_ratio_)" 186 | ], 187 | "execution_count": 0, 188 | "outputs": [] 189 | }, 190 | { 191 | "cell_type": "code", 192 | "metadata": { 193 | "id": "O1ezKZcq6Goo", 194 | "colab_type": "code", 195 | "colab": {} 196 | }, 197 | "source": [ 198 | "print(sum(pca.explained_variance_ratio_))" 199 | ], 200 | "execution_count": 0, 201 | "outputs": [] 202 | }, 203 | { 204 | "cell_type": "code", 205 | "metadata": { 206 | "id": "EE1frz2M6IcN", 207 | "colab_type": "code", 208 | "colab": {} 209 | }, 210 | "source": [ 211 | "" 212 | ], 213 | "execution_count": 0, 214 | "outputs": [] 215 | } 216 | ] 217 | } -------------------------------------------------------------------------------- /Python/05/pca_prefecture.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "name": "Ch05-prefecture-pca.ipynb", 7 | "version": "0.3.2", 
8 | "provenance": [], 9 | "collapsed_sections": [], 10 | "toc_visible": true 11 | }, 12 | "kernelspec": { 13 | "name": "python3", 14 | "display_name": "Python 3" 15 | } 16 | }, 17 | "cells": [ 18 | { 19 | "cell_type": "code", 20 | "metadata": { 21 | "id": "3yZ2JWsv13Yj", 22 | "colab_type": "code", 23 | "colab": {} 24 | }, 25 | "source": [ 26 | "import numpy as np\n", 27 | "import pandas as pd\n", 28 | "import matplotlib.pyplot as plt\n", 29 | "%matplotlib inline\n", 30 | "from mpl_toolkits.mplot3d import Axes3D\n", 31 | "\n", 32 | "from sklearn.decomposition import PCA\n", 33 | "from sklearn.preprocessing import StandardScaler" 34 | ], 35 | "execution_count": 0, 36 | "outputs": [] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "metadata": { 41 | "id": "P8nKuedD6WiW", 42 | "colab_type": "code", 43 | "colab": {} 44 | }, 45 | "source": [ 46 | "data_prefecture = pd.read_csv(\"data_prefecture_category.csv\", encoding='utf-8', index_col=0)" 47 | ], 48 | "execution_count": 0, 49 | "outputs": [] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "metadata": { 54 | "id": "ysoFHIiR6Ydc", 55 | "colab_type": "code", 56 | "colab": {} 57 | }, 58 | "source": [ 59 | "print(data_prefecture.head())" 60 | ], 61 | "execution_count": 0, 62 | "outputs": [] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "metadata": { 67 | "id": "nfoP6S_L6a1E", 68 | "colab_type": "code", 69 | "colab": {} 70 | }, 71 | "source": [ 72 | "# カンマ区切りの文字列を数値に変換\n", 73 | "data_prefecture_float = data_prefecture.apply(lambda x: x.str.replace(',','')).astype(np.float)" 74 | ], 75 | "execution_count": 0, 76 | "outputs": [] 77 | }, 78 | { 79 | "cell_type": "code", 80 | "metadata": { 81 | "id": "cLrqPSPR6cfO", 82 | "colab_type": "code", 83 | "colab": {} 84 | }, 85 | "source": [ 86 | "print(data_prefecture_float.head())" 87 | ], 88 | "execution_count": 0, 89 | "outputs": [] 90 | }, 91 | { 92 | "cell_type": "code", 93 | "metadata": { 94 | "id": "vHLhXKXl6eBq", 95 | "colab_type": "code", 96 | "colab": {} 97 | }, 98 | "source": [ 
99 | "plt.hist(data_prefecture_float[\"食料\"])" 100 | ], 101 | "execution_count": 0, 102 | "outputs": [] 103 | }, 104 | { 105 | "cell_type": "code", 106 | "metadata": { 107 | "id": "k_HpeCwo6fi_", 108 | "colab_type": "code", 109 | "colab": {} 110 | }, 111 | "source": [ 112 | "plt.hist(data_prefecture_float[\"住居\"])" 113 | ], 114 | "execution_count": 0, 115 | "outputs": [] 116 | }, 117 | { 118 | "cell_type": "code", 119 | "metadata": { 120 | "id": "R-znSRyI6hGz", 121 | "colab_type": "code", 122 | "colab": {} 123 | }, 124 | "source": [ 125 | "plt.hist(data_prefecture_float[\"教育\"])" 126 | ], 127 | "execution_count": 0, 128 | "outputs": [] 129 | }, 130 | { 131 | "cell_type": "markdown", 132 | "metadata": { 133 | "id": "cPdr4CGA6lHX", 134 | "colab_type": "text" 135 | }, 136 | "source": [ 137 | "## 主成分分析" 138 | ] 139 | }, 140 | { 141 | "cell_type": "code", 142 | "metadata": { 143 | "id": "OfIoWjir6xRr", 144 | "colab_type": "code", 145 | "colab": {} 146 | }, 147 | "source": [ 148 | "# 標準化\n", 149 | "\n", 150 | "scaler = StandardScaler()\n", 151 | "data_std = scaler.fit_transform(data_prefecture_float)" 152 | ], 153 | "execution_count": 0, 154 | "outputs": [] 155 | }, 156 | { 157 | "cell_type": "code", 158 | "metadata": { 159 | "id": "L7oIaSiW6im3", 160 | "colab_type": "code", 161 | "colab": {} 162 | }, 163 | "source": [ 164 | "pca = PCA(n_components=2)\n", 165 | "pca_transformed = pca.fit_transform(data_std)" 166 | ], 167 | "execution_count": 0, 168 | "outputs": [] 169 | }, 170 | { 171 | "cell_type": "code", 172 | "metadata": { 173 | "id": "wOIjtLG86tt8", 174 | "colab_type": "code", 175 | "colab": {} 176 | }, 177 | "source": [ 178 | "plt.scatter(pca_transformed[:, 0], pca_transformed[:, 1])" 179 | ], 180 | "execution_count": 0, 181 | "outputs": [] 182 | }, 183 | { 184 | "cell_type": "code", 185 | "metadata": { 186 | "id": "SCB9i_hMBqs6", 187 | "colab_type": "code", 188 | "colab": {} 189 | }, 190 | "source": [ 191 | "fig, ax = plt.subplots(figsize=(14, 8))\n", 192 | "\n", 
193 | "plt.scatter(pca_transformed[:, 0], pca_transformed[:, 1])\n", 194 | "for k, v in enumerate(pca_transformed):\n", 195 | " ax.annotate(k, xy=(v[0],v[1]),size=10)" 196 | ], 197 | "execution_count": 0, 198 | "outputs": [] 199 | }, 200 | { 201 | "cell_type": "code", 202 | "metadata": { 203 | "id": "YiLk33xLDVFr", 204 | "colab_type": "code", 205 | "colab": {} 206 | }, 207 | "source": [ 208 | "for i in range(data_prefecture_float.shape[0]):\n", 209 | " print(i, data_prefecture_float.index[i])" 210 | ], 211 | "execution_count": 0, 212 | "outputs": [] 213 | }, 214 | { 215 | "cell_type": "markdown", 216 | "metadata": { 217 | "id": "BkgQjkFB64sH", 218 | "colab_type": "text" 219 | }, 220 | "source": [ 221 | "### 寄与度" 222 | ] 223 | }, 224 | { 225 | "cell_type": "code", 226 | "metadata": { 227 | "id": "lk5wlzgG61GO", 228 | "colab_type": "code", 229 | "colab": {} 230 | }, 231 | "source": [ 232 | "print(pca.explained_variance_ratio_)" 233 | ], 234 | "execution_count": 0, 235 | "outputs": [] 236 | }, 237 | { 238 | "cell_type": "code", 239 | "metadata": { 240 | "id": "TEDtVFMk65tW", 241 | "colab_type": "code", 242 | "colab": {} 243 | }, 244 | "source": [ 245 | "print(sum(pca.explained_variance_ratio_))" 246 | ], 247 | "execution_count": 0, 248 | "outputs": [] 249 | }, 250 | { 251 | "cell_type": "markdown", 252 | "metadata": { 253 | "id": "lLDsFaTT7I6W", 254 | "colab_type": "text" 255 | }, 256 | "source": [ 257 | "### 次元を増やす" 258 | ] 259 | }, 260 | { 261 | "cell_type": "code", 262 | "metadata": { 263 | "id": "A7SvxLRz67UA", 264 | "colab_type": "code", 265 | "colab": {} 266 | }, 267 | "source": [ 268 | "pca2 = PCA(n_components=3)\n", 269 | "pca2_transformed = pca2.fit_transform(data_std)" 270 | ], 271 | "execution_count": 0, 272 | "outputs": [] 273 | }, 274 | { 275 | "cell_type": "code", 276 | "metadata": { 277 | "id": "ECpeAHo16_pw", 278 | "colab_type": "code", 279 | "colab": {} 280 | }, 281 | "source": [ 282 | "print(sum(pca2.explained_variance_ratio_))" 283 | ], 284 | 
"execution_count": 0, 285 | "outputs": [] 286 | }, 287 | { 288 | "cell_type": "code", 289 | "metadata": { 290 | "id": "z9t9BTuz7UG5", 291 | "colab_type": "code", 292 | "colab": {} 293 | }, 294 | "source": [ 295 | "print(pca2.explained_variance_ratio_)" 296 | ], 297 | "execution_count": 0, 298 | "outputs": [] 299 | }, 300 | { 301 | "cell_type": "code", 302 | "metadata": { 303 | "id": "ExQFfXh-8ISK", 304 | "colab_type": "code", 305 | "colab": {} 306 | }, 307 | "source": [ 308 | "fig = plt.figure()\n", 309 | "ax = fig.add_subplot(111, projection='3d')\n", 310 | "ax.scatter3D(pca2_transformed[:, 0], pca2_transformed[:, 1], pca2_transformed[:, 2])\n", 311 | "ax.set_title(\"Scatter Plot\")\n", 312 | "ax.view_init(40, 100)\n", 313 | "\n", 314 | "\n", 315 | "plt.show()" 316 | ], 317 | "execution_count": 0, 318 | "outputs": [] 319 | }, 320 | { 321 | "cell_type": "code", 322 | "metadata": { 323 | "id": "N8JmfSnXzSww", 324 | "colab_type": "code", 325 | "colab": {} 326 | }, 327 | "source": [ 328 | "pca2_transformed" 329 | ], 330 | "execution_count": 0, 331 | "outputs": [] 332 | }, 333 | { 334 | "cell_type": "code", 335 | "metadata": { 336 | "id": "ihqW-C3e00c-", 337 | "colab_type": "code", 338 | "colab": {} 339 | }, 340 | "source": [ 341 | "" 342 | ], 343 | "execution_count": 0, 344 | "outputs": [] 345 | } 346 | ] 347 | } -------------------------------------------------------------------------------- /Python/06/classification.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "name": "classification.ipynb", 7 | "version": "0.3.2", 8 | "provenance": [], 9 | "collapsed_sections": [] 10 | }, 11 | "kernelspec": { 12 | "name": "python3", 13 | "display_name": "Python 3" 14 | } 15 | }, 16 | "cells": [ 17 | { 18 | "cell_type": "code", 19 | "metadata": { 20 | "id": "dj5RSy08UIcH", 21 | "colab_type": "code", 22 | "colab": {} 23 | }, 24 | "source": [ 25 | "import 
numpy as np\n", 26 | "import pandas as pd\n", 27 | "\n", 28 | "from sklearn.datasets import load_iris\n", 29 | "from sklearn.linear_model import LogisticRegression\n", 30 | "\n", 31 | "from sklearn.model_selection import train_test_split\n", 32 | "from sklearn.metrics import accuracy_score\n", 33 | "from sklearn.metrics import roc_auc_score" 34 | ], 35 | "execution_count": 0, 36 | "outputs": [] 37 | }, 38 | { 39 | "cell_type": "markdown", 40 | "metadata": { 41 | "id": "6i4REqNKVDTv", 42 | "colab_type": "text" 43 | }, 44 | "source": [ 45 | "## データ読み込み\n" 46 | ] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "metadata": { 51 | "id": "7gQ4SukTUBM7", 52 | "colab_type": "code", 53 | "colab": {} 54 | }, 55 | "source": [ 56 | "iris = load_iris()\n", 57 | "\n", 58 | "tmp_data = pd.DataFrame(iris.data, columns=iris.feature_names)\n", 59 | "tmp_data[\"target\"] = iris.target\n", 60 | "\n", 61 | "data_iris = tmp_data[tmp_data['target'] <= 1]\n", 62 | "\n", 63 | "x_column_list = ['sepal length (cm)']\n", 64 | "y_column_list = ['target']\n", 65 | "\n", 66 | "X_train, X_test, y_train, y_test = train_test_split(data_iris[x_column_list], \n", 67 | " data_iris[y_column_list], test_size=0.3)" 68 | ], 69 | "execution_count": 0, 70 | "outputs": [] 71 | }, 72 | { 73 | "cell_type": "markdown", 74 | "metadata": { 75 | "id": "oxavKwzVVPTp", 76 | "colab_type": "text" 77 | }, 78 | "source": [ 79 | "## 学習と予測" 80 | ] 81 | }, 82 | { 83 | "cell_type": "code", 84 | "metadata": { 85 | "id": "0Ogf7jEFVK7K", 86 | "colab_type": "code", 87 | "colab": {} 88 | }, 89 | "source": [ 90 | "logit = LogisticRegression()\n", 91 | "\n", 92 | "logit = LogisticRegression()\n", 93 | "logit.fit(X_train, y_train)\n", 94 | "\n", 95 | "y_pred = logit.predict(X_test)" 96 | ], 97 | "execution_count": 0, 98 | "outputs": [] 99 | }, 100 | { 101 | "cell_type": "markdown", 102 | "metadata": { 103 | "id": "Gc1F9yRXVz5G", 104 | "colab_type": "text" 105 | }, 106 | "source": [ 107 | "## 正解率" 108 | ] 109 | }, 110 | { 111 | 
"cell_type": "code", 112 | "metadata": { 113 | "id": "LDFpNB5DVunc", 114 | "colab_type": "code", 115 | "colab": {} 116 | }, 117 | "source": [ 118 | "accuracy_score(y_test, y_pred)" 119 | ], 120 | "execution_count": 0, 121 | "outputs": [] 122 | }, 123 | { 124 | "cell_type": "markdown", 125 | "metadata": { 126 | "id": "a0xBXjdDWAIK", 127 | "colab_type": "text" 128 | }, 129 | "source": [ 130 | "## AUC" 131 | ] 132 | }, 133 | { 134 | "cell_type": "code", 135 | "metadata": { 136 | "id": "jKJMH5oMV99s", 137 | "colab_type": "code", 138 | "colab": {} 139 | }, 140 | "source": [ 141 | "roc_auc_score(y_test, y_pred)" 142 | ], 143 | "execution_count": 0, 144 | "outputs": [] 145 | }, 146 | { 147 | "cell_type": "code", 148 | "metadata": { 149 | "id": "cL_YCijTUhAr", 150 | "colab_type": "code", 151 | "colab": {} 152 | }, 153 | "source": [ 154 | "" 155 | ], 156 | "execution_count": 0, 157 | "outputs": [] 158 | } 159 | ] 160 | } -------------------------------------------------------------------------------- /Python/06/regression.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "name": "regression.ipynb", 7 | "version": "0.3.2", 8 | "provenance": [], 9 | "collapsed_sections": [] 10 | }, 11 | "kernelspec": { 12 | "name": "python3", 13 | "display_name": "Python 3" 14 | } 15 | }, 16 | "cells": [ 17 | { 18 | "cell_type": "code", 19 | "metadata": { 20 | "id": "Tplx-pF0QMTW", 21 | "colab_type": "code", 22 | "colab": {} 23 | }, 24 | "source": [ 25 | "import numpy as np\n", 26 | "import pandas as pd\n", 27 | "\n", 28 | "from sklearn.linear_model import LinearRegression\n", 29 | "from sklearn.datasets import load_boston\n", 30 | "\n", 31 | "from sklearn.model_selection import train_test_split\n", 32 | "from sklearn.metrics import mean_absolute_error\n", 33 | "from sklearn.metrics import mean_squared_error\n", 34 | "from sklearn.metrics import mean_squared_log_error" 
35 | ], 36 | "execution_count": 0, 37 | "outputs": [] 38 | }, 39 | { 40 | "cell_type": "markdown", 41 | "metadata": { 42 | "id": "5RuL6EUjSP6L", 43 | "colab_type": "text" 44 | }, 45 | "source": [ 46 | "## データ読み込み" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "metadata": { 52 | "id": "Ed4iMNSlSMJE", 53 | "colab_type": "code", 54 | "colab": {} 55 | }, 56 | "source": [ 57 | "boston = load_boston()\n", 58 | "data_boston = pd.DataFrame(boston.data, columns=boston.feature_names)\n", 59 | "data_boston['PRICE'] = boston.target\n", 60 | "\n", 61 | "lr_multi = LinearRegression()\n", 62 | "\n", 63 | "x_column_list_for_multi = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', \n", 64 | " 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT']\n", 65 | "y_column_list_for_multi = ['PRICE']\n", 66 | "\n", 67 | "X_train, X_test, y_train, y_test = train_test_split(data_boston[x_column_list_for_multi], \n", 68 | " data_boston[y_column_list_for_multi], test_size=0.3)" 69 | ], 70 | "execution_count": 0, 71 | "outputs": [] 72 | }, 73 | { 74 | "cell_type": "markdown", 75 | "metadata": { 76 | "id": "6lAp0JMISvtD", 77 | "colab_type": "text" 78 | }, 79 | "source": [ 80 | "## 学習と予測" 81 | ] 82 | }, 83 | { 84 | "cell_type": "code", 85 | "metadata": { 86 | "id": "Weqsnyi0SvSZ", 87 | "colab_type": "code", 88 | "colab": {} 89 | }, 90 | "source": [ 91 | "lr_multi.fit(X_train, y_train) \n", 92 | "y_pred = lr_multi.predict(X_test)" 93 | ], 94 | "execution_count": 0, 95 | "outputs": [] 96 | }, 97 | { 98 | "cell_type": "markdown", 99 | "metadata": { 100 | "id": "Ld1yFxecTJdF", 101 | "colab_type": "text" 102 | }, 103 | "source": [ 104 | "## RMSE" 105 | ] 106 | }, 107 | { 108 | "cell_type": "code", 109 | "metadata": { 110 | "id": "NA0uL9a5ZAql", 111 | "colab_type": "code", 112 | "colab": {} 113 | }, 114 | "source": [ 115 | "mean_squared_error(y_test, y_pred)" 116 | ], 117 | "execution_count": 0, 118 | "outputs": [] 119 | }, 120 | { 121 | "cell_type": "code", 122 | "metadata": { 123 | "id": 
"Zg-a3SD_TLPM", 124 | "colab_type": "code", 125 | "colab": {} 126 | }, 127 | "source": [ 128 | "np.sqrt(mean_squared_error(y_test, y_pred))" 129 | ], 130 | "execution_count": 0, 131 | "outputs": [] 132 | }, 133 | { 134 | "cell_type": "markdown", 135 | "metadata": { 136 | "id": "fljMKBwdTBxJ", 137 | "colab_type": "text" 138 | }, 139 | "source": [ 140 | "## MAE" 141 | ] 142 | }, 143 | { 144 | "cell_type": "code", 145 | "metadata": { 146 | "id": "qJqX5ZeGSoQ7", 147 | "colab_type": "code", 148 | "colab": {} 149 | }, 150 | "source": [ 151 | "mean_absolute_error(y_test, y_pred)" 152 | ], 153 | "execution_count": 0, 154 | "outputs": [] 155 | }, 156 | { 157 | "cell_type": "markdown", 158 | "metadata": { 159 | "id": "BgJdnzIoTIV5", 160 | "colab_type": "text" 161 | }, 162 | "source": [ 163 | "## RMSLE" 164 | ] 165 | }, 166 | { 167 | "cell_type": "code", 168 | "metadata": { 169 | "id": "RHvxOfkKZruN", 170 | "colab_type": "code", 171 | "colab": {} 172 | }, 173 | "source": [ 174 | "mean_squared_log_error (y_test, y_pred)" 175 | ], 176 | "execution_count": 0, 177 | "outputs": [] 178 | }, 179 | { 180 | "cell_type": "code", 181 | "metadata": { 182 | "id": "LoKr-oCWTHcp", 183 | "colab_type": "code", 184 | "colab": {} 185 | }, 186 | "source": [ 187 | "np.sqrt(mean_squared_log_error (y_test, y_pred))" 188 | ], 189 | "execution_count": 0, 190 | "outputs": [] 191 | }, 192 | { 193 | "cell_type": "code", 194 | "metadata": { 195 | "id": "m0Lqa0CGTxUF", 196 | "colab_type": "code", 197 | "colab": {} 198 | }, 199 | "source": [ 200 | "" 201 | ], 202 | "execution_count": 0, 203 | "outputs": [] 204 | } 205 | ] 206 | } -------------------------------------------------------------------------------- /Python/07/cnn_mnist.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "name": "Ch07-mnist-cnn.ipynb", 7 | "version": "0.3.2", 8 | "provenance": [], 9 | "collapsed_sections": [] 
10 | }, 11 | "kernelspec": { 12 | "name": "python3", 13 | "display_name": "Python 3" 14 | }, 15 | "accelerator": "GPU" 16 | }, 17 | "cells": [ 18 | { 19 | "cell_type": "code", 20 | "metadata": { 21 | "id": "FSJx3WFbK_mI", 22 | "colab_type": "code", 23 | "colab": {} 24 | }, 25 | "source": [ 26 | "from keras.layers import Conv2D, MaxPool2D, Flatten, Dense\n", 27 | "from keras.models import Sequential\n", 28 | "\n", 29 | "from keras.utils import to_categorical\n", 30 | "import matplotlib.pyplot as plt\n", 31 | "\n", 32 | "from keras.datasets import mnist" 33 | ], 34 | "execution_count": 0, 35 | "outputs": [] 36 | }, 37 | { 38 | "cell_type": "markdown", 39 | "metadata": { 40 | "id": "wVmdIQwbLTlW", 41 | "colab_type": "text" 42 | }, 43 | "source": [ 44 | "## データ読み込み" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "metadata": { 50 | "id": "wBIqFYBaLKpi", 51 | "colab_type": "code", 52 | "colab": {} 53 | }, 54 | "source": [ 55 | "(X_train, y_train), (X_test, y_test) = mnist.load_data()\n", 56 | "\n", 57 | "print(X_train.shape)" 58 | ], 59 | "execution_count": 0, 60 | "outputs": [] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "metadata": { 65 | "id": "3IQ6YmIALOAS", 66 | "colab_type": "code", 67 | "colab": {} 68 | }, 69 | "source": [ 70 | "plt.imshow(X_train[0], cmap='gray')" 71 | ], 72 | "execution_count": 0, 73 | "outputs": [] 74 | }, 75 | { 76 | "cell_type": "code", 77 | "metadata": { 78 | "id": "8jnlir0kLP41", 79 | "colab_type": "code", 80 | "colab": {} 81 | }, 82 | "source": [ 83 | "print(y_train.shape)\n", 84 | "print(y_train[0])" 85 | ], 86 | "execution_count": 0, 87 | "outputs": [] 88 | }, 89 | { 90 | "cell_type": "markdown", 91 | "metadata": { 92 | "id": "niA-WDCeLVHh", 93 | "colab_type": "text" 94 | }, 95 | "source": [ 96 | "### データ整形" 97 | ] 98 | }, 99 | { 100 | "cell_type": "code", 101 | "metadata": { 102 | "id": "agl1i6TjLR05", 103 | "colab_type": "code", 104 | "colab": {} 105 | }, 106 | "source": [ 107 | "# 画像をreshape\n", 108 | "X_train = 
X_train.reshape((60000, 28, 28, 1))\n", 109 | "X_test = X_test.reshape((10000, 28, 28, 1))\n", 110 | "\n", 111 | "# 輝度値を0 ~ 1に入るように正規化\n", 112 | "X_train = X_train.astype('float32')/255\n", 113 | "X_test = X_test.astype('float32')/255\n", 114 | "\n", 115 | "# one hot encoding\n", 116 | "y_train = to_categorical(y_train)\n", 117 | "y_test = to_categorical(y_test)" 118 | ], 119 | "execution_count": 0, 120 | "outputs": [] 121 | }, 122 | { 123 | "cell_type": "markdown", 124 | "metadata": { 125 | "id": "vxpRwBYLLcZ3", 126 | "colab_type": "text" 127 | }, 128 | "source": [ 129 | "## モデル作成" 130 | ] 131 | }, 132 | { 133 | "cell_type": "code", 134 | "metadata": { 135 | "id": "a-pmohDGLaT9", 136 | "colab_type": "code", 137 | "colab": {} 138 | }, 139 | "source": [ 140 | "model = Sequential()\n", 141 | "\n", 142 | "# 畳み込み層\n", 143 | "model.add(Conv2D(32, (3, 3), activation='relu', input_shape=(28, 28, 1)))\n", 144 | " \n", 145 | "# プーリング層\n", 146 | "model.add(MaxPool2D(2, 2))\n", 147 | " \n", 148 | "model.add(Flatten())\n", 149 | "model.add(Dense(32, activation='relu'))\n", 150 | "model.add(Dense(10, activation='softmax'))\n", 151 | "\n", 152 | "model.summary()" 153 | ], 154 | "execution_count": 0, 155 | "outputs": [] 156 | }, 157 | { 158 | "cell_type": "code", 159 | "metadata": { 160 | "id": "BNfazDhRLnt1", 161 | "colab_type": "code", 162 | "colab": {} 163 | }, 164 | "source": [ 165 | "model.compile(optimizer='Adam', loss='categorical_crossentropy', metrics=['accuracy'])\n", 166 | "model.fit(X_train, y_train, epochs=5, batch_size=64)" 167 | ], 168 | "execution_count": 0, 169 | "outputs": [] 170 | }, 171 | { 172 | "cell_type": "code", 173 | "metadata": { 174 | "id": "T9Ucs5GVLud1", 175 | "colab_type": "code", 176 | "colab": {} 177 | }, 178 | "source": [ 179 | "model.evaluate(X_test, y_test)" 180 | ], 181 | "execution_count": 0, 182 | "outputs": [] 183 | }, 184 | { 185 | "cell_type": "code", 186 | "metadata": { 187 | "id": "2Z7oEM5mM3-b", 188 | "colab_type": "code", 189 | 
"colab": {} 190 | }, 191 | "source": [ 192 | "from keras.models import load_model\n", 193 | "\n", 194 | "# modelの保存\n", 195 | "model.save('model.h5') \n", 196 | "\n", 197 | "# modelの読み込み\n", 198 | "model = load_model('model.h5')" 199 | ], 200 | "execution_count": 0, 201 | "outputs": [] 202 | }, 203 | { 204 | "cell_type": "code", 205 | "metadata": { 206 | "id": "w5auAp3RolLM", 207 | "colab_type": "code", 208 | "colab": {} 209 | }, 210 | "source": [ 211 | "" 212 | ], 213 | "execution_count": 0, 214 | "outputs": [] 215 | } 216 | ] 217 | } -------------------------------------------------------------------------------- /Python/07/cnn_temple_shrine.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "name": "Ch07-temple-shrine-cnn.ipynb", 7 | "version": "0.3.2", 8 | "provenance": [], 9 | "collapsed_sections": [] 10 | }, 11 | "kernelspec": { 12 | "name": "python3", 13 | "display_name": "Python 3" 14 | }, 15 | "accelerator": "GPU" 16 | }, 17 | "cells": [ 18 | { 19 | "cell_type": "code", 20 | "metadata": { 21 | "id": "HRoy_vzj83WJ", 22 | "colab_type": "code", 23 | "colab": {} 24 | }, 25 | "source": [ 26 | "from keras.datasets import mnist\n", 27 | "from keras.utils import to_categorical\n", 28 | "\n", 29 | "from keras.layers import Conv2D, MaxPool2D, Flatten, Dense\n", 30 | "from keras.models import Sequential\n", 31 | "from keras.models import load_model\n", 32 | "\n", 33 | "import matplotlib.pyplot as plt\n", 34 | "from PIL import Image\n", 35 | "import os\n", 36 | "import numpy as np\n", 37 | "from sklearn.model_selection import train_test_split" 38 | ], 39 | "execution_count": 0, 40 | "outputs": [] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "metadata": { 45 | "id": "lkcJwKDJ4vuV", 46 | "colab_type": "code", 47 | "colab": {} 48 | }, 49 | "source": [ 50 | "# Google ドライブをマウントするには、このセルを実行してください。\n", 51 | "from google.colab import drive\n", 52 | 
"drive.mount('/content/drive/')" 53 | ], 54 | "execution_count": 0, 55 | "outputs": [] 56 | }, 57 | { 58 | "cell_type": "markdown", 59 | "metadata": { 60 | "id": "gbGo22SlK0E0", 61 | "colab_type": "text" 62 | }, 63 | "source": [ 64 | "## データ読み込み" 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "metadata": { 70 | "id": "rzob7zR3uw9D", 71 | "colab_type": "code", 72 | "colab": {} 73 | }, 74 | "source": [ 75 | "X = []\n", 76 | "Y = []\n", 77 | "image_size = 30\n", 78 | "\n", 79 | "folder_path = \"/content/drive/My Drive/PythonBooks/src/Ch07/images/\"\n", 80 | "file_list = os.listdir(folder_path)" 81 | ], 82 | "execution_count": 0, 83 | "outputs": [] 84 | }, 85 | { 86 | "cell_type": "code", 87 | "metadata": { 88 | "id": "aG-mpkDNKzoJ", 89 | "colab_type": "code", 90 | "colab": {} 91 | }, 92 | "source": [ 93 | "for file in file_list:\n", 94 | " try:\n", 95 | " image = Image.open(folder_path + file)\n", 96 | " except:\n", 97 | " print('error', file)\n", 98 | " continue\n", 99 | " \n", 100 | " image = image.convert(\"RGB\")\n", 101 | " image = image.resize((image_size, image_size))\n", 102 | " data = np.asarray(image)\n", 103 | " X.append(data)\n", 104 | " if 'temple' in file:\n", 105 | " Y.append(0)\n", 106 | " else:\n", 107 | " Y.append(1)\n", 108 | "\n", 109 | "X = np.array(X)\n", 110 | "Y = np.array(Y)" 111 | ], 112 | "execution_count": 0, 113 | "outputs": [] 114 | }, 115 | { 116 | "cell_type": "markdown", 117 | "metadata": { 118 | "id": "hoPt9AcDP5KW", 119 | "colab_type": "text" 120 | }, 121 | "source": [ 122 | "### データ整形" 123 | ] 124 | }, 125 | { 126 | "cell_type": "code", 127 | "metadata": { 128 | "id": "uogTTc6oBsd0", 129 | "colab_type": "code", 130 | "colab": {} 131 | }, 132 | "source": [ 133 | "X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.25)\n", 134 | "# 画像をreshape\n", 135 | "X_train = X_train.reshape(-1, image_size, image_size, 3)\n", 136 | "X_test = X_test.reshape(-1, image_size, image_size, 3)\n", 137 | "\n", 138 | "# 輝度値を0 ~ 
1に入るように正規化\n", 139 | "X_train = X_train.astype('float32')/255\n", 140 | "X_test = X_test.astype('float32')/255\n", 141 | "\n", 142 | "# one hot encoding\n", 143 | "y_train = to_categorical(y_train)\n", 144 | "y_test = to_categorical(y_test)" 145 | ], 146 | "execution_count": 0, 147 | "outputs": [] 148 | }, 149 | { 150 | "cell_type": "markdown", 151 | "metadata": { 152 | "id": "e4VxBZMxQPl6", 153 | "colab_type": "text" 154 | }, 155 | "source": [ 156 | "## モデル作成" 157 | ] 158 | }, 159 | { 160 | "cell_type": "code", 161 | "metadata": { 162 | "id": "rQ3yfGfH_at7", 163 | "colab_type": "code", 164 | "colab": {} 165 | }, 166 | "source": [ 167 | "model_cnn = Sequential()\n", 168 | "model_cnn.add(Conv2D(32, (3,3), activation='relu', input_shape=(image_size, image_size, 3)))\n", 169 | "model_cnn.add(MaxPool2D(2,2))\n", 170 | "model_cnn.add(Conv2D(64, (3,3), activation='relu'))\n", 171 | "model_cnn.add(MaxPool2D(2,2))\n", 172 | "model_cnn.add(Conv2D(128, (3,3), activation='relu'))\n", 173 | "model_cnn.add(MaxPool2D(2,2))\n", 174 | "model_cnn.add(Flatten())\n", 175 | "model_cnn.add(Dense(512, activation='relu')) \n", 176 | "model_cnn.add(Dense(2, activation='softmax'))\n", 177 | "\n", 178 | "model_cnn.summary()" 179 | ], 180 | "execution_count": 0, 181 | "outputs": [] 182 | }, 183 | { 184 | "cell_type": "code", 185 | "metadata": { 186 | "id": "izLnujUXKjXy", 187 | "colab_type": "code", 188 | "colab": {} 189 | }, 190 | "source": [ 191 | "model_cnn.compile(optimizer='Adam', loss='categorical_crossentropy', metrics=['accuracy'])\n", 192 | "model_cnn.fit(X_train, y_train, epochs=10, batch_size=20)\n", 193 | "\n", 194 | "model_cnn.evaluate(X_test, y_test)" 195 | ], 196 | "execution_count": 0, 197 | "outputs": [] 198 | }, 199 | { 200 | "cell_type": "code", 201 | "metadata": { 202 | "id": "Fi-ZodmOJ0I1", 203 | "colab_type": "code", 204 | "colab": {} 205 | }, 206 | "source": [ 207 | "" 208 | ], 209 | "execution_count": 0, 210 | "outputs": [] 211 | } 212 | ] 213 | } 
-------------------------------------------------------------------------------- /Python/07/get_imaeg.py: -------------------------------------------------------------------------------- 1 | #-*- coding:utf-8 -*- 2 | 3 | import urllib.request 4 | from urllib.parse import quote 5 | import httplib2 6 | import json 7 | import requests 8 | 9 | KEY = "" # 取得したAPI Key 10 | ENGINE_ID = "" # 取得した検索エンジンID 11 | 12 | keywords = ["寺", "神社"] 13 | start_num = 0 14 | 15 | def get_urls(keyword, number): 16 | urls = [] 17 | count = 0 18 | 19 | while count < number: 20 | if number - count <= 10: 21 | num_param = str(number - count) 22 | else: 23 | num_param = "10" 24 | 25 | query = "https://www.googleapis.com/customsearch/v1?key=" + KEY + \ 26 | "&cx=" + ENGINE_ID + \ 27 | "&num=" + num_param + \ 28 | "&start=" + str(count + 1) + \ 29 | "&q=" + quote(keyword) + \ 30 | "&searchType=image" # &dateRestrict=y1" 31 | 32 | res = urllib.request.urlopen(query) 33 | data = json.loads(res.read().decode('utf-8')) 34 | 35 | for i in range(len(data["items"])): 36 | urls.append(data["items"][i]["link"]) 37 | 38 | count += 10 39 | 40 | return urls 41 | 42 | def get_images(keyword, number): 43 | urls = get_urls(keyword, number) 44 | 45 | for i in range(len(urls)): 46 | res = requests.get(urls[i], verify=False) 47 | image = res.content 48 | 49 | if keyword == keywords[0]: 50 | filename = "temple" + str(i + start_num) + ".jpg" 51 | else: 52 | filename = "shrine" + str(i + start_num) + ".jpg" 53 | 54 | with open(filename, 'wb') as f: 55 | f.write(image) 56 | 57 | # メイン 58 | for keyword in keywords: 59 | # キーワードごとに取得したい枚数を指定(今回は100) 60 | get_images(keyword, 100) -------------------------------------------------------------------------------- /Python/07/nn_mnist.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "name": "Ch07-mnist-nn.ipynb", 7 | "version": "0.3.2", 8 | 
"provenance": [], 9 | "collapsed_sections": [], 10 | "toc_visible": true 11 | }, 12 | "kernelspec": { 13 | "name": "python3", 14 | "display_name": "Python 3" 15 | }, 16 | "accelerator": "GPU" 17 | }, 18 | "cells": [ 19 | { 20 | "cell_type": "code", 21 | "metadata": { 22 | "id": "HRoy_vzj83WJ", 23 | "colab_type": "code", 24 | "colab": {} 25 | }, 26 | "source": [ 27 | "from keras.layers import Dense\n", 28 | "from keras.models import Sequential\n", 29 | "from keras.utils import to_categorical\n", 30 | "import matplotlib.pyplot as plt\n", 31 | "\n", 32 | "from keras.datasets import mnist" 33 | ], 34 | "execution_count": 0, 35 | "outputs": [] 36 | }, 37 | { 38 | "cell_type": "markdown", 39 | "metadata": { 40 | "id": "gbGo22SlK0E0", 41 | "colab_type": "text" 42 | }, 43 | "source": [ 44 | "## データ読み込み" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "metadata": { 50 | "id": "aG-mpkDNKzoJ", 51 | "colab_type": "code", 52 | "colab": {} 53 | }, 54 | "source": [ 55 | "(X_train, y_train), (X_test, y_test) = mnist.load_data()" 56 | ], 57 | "execution_count": 0, 58 | "outputs": [] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "metadata": { 63 | "id": "XVmcIrW8MSg6", 64 | "colab_type": "code", 65 | "colab": {} 66 | }, 67 | "source": [ 68 | "print(X_train.shape)" 69 | ], 70 | "execution_count": 0, 71 | "outputs": [] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "metadata": { 76 | "id": "4R59WO9Uy4wD", 77 | "colab_type": "code", 78 | "colab": {} 79 | }, 80 | "source": [ 81 | "print(X_train[0])" 82 | ], 83 | "execution_count": 0, 84 | "outputs": [] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "metadata": { 89 | "id": "FND90BkzPWd6", 90 | "colab_type": "code", 91 | "colab": {} 92 | }, 93 | "source": [ 94 | "plt.imshow(X_train[0], cmap='gray')" 95 | ], 96 | "execution_count": 0, 97 | "outputs": [] 98 | }, 99 | { 100 | "cell_type": "code", 101 | "metadata": { 102 | "id": "YpOfARYEP0Uv", 103 | "colab_type": "code", 104 | "colab": {} 105 | }, 106 | "source": [ 107 | 
"plt.imshow(X_train[1], cmap='gray')" 108 | ], 109 | "execution_count": 0, 110 | "outputs": [] 111 | }, 112 | { 113 | "cell_type": "code", 114 | "metadata": { 115 | "id": "z9RVpEK_QG5H", 116 | "colab_type": "code", 117 | "colab": {} 118 | }, 119 | "source": [ 120 | "plt.imshow(X_train[2], cmap='gray')" 121 | ], 122 | "execution_count": 0, 123 | "outputs": [] 124 | }, 125 | { 126 | "cell_type": "code", 127 | "metadata": { 128 | "id": "Ao2B3O_VMT5t", 129 | "colab_type": "code", 130 | "colab": {} 131 | }, 132 | "source": [ 133 | "print(y_train.shape)\n", 134 | "print(y_train[0])" 135 | ], 136 | "execution_count": 0, 137 | "outputs": [] 138 | }, 139 | { 140 | "cell_type": "markdown", 141 | "metadata": { 142 | "id": "hoPt9AcDP5KW", 143 | "colab_type": "text" 144 | }, 145 | "source": [ 146 | "### データ整形" 147 | ] 148 | }, 149 | { 150 | "cell_type": "code", 151 | "metadata": { 152 | "id": "uWs4Ko57J75i", 153 | "colab_type": "code", 154 | "colab": {} 155 | }, 156 | "source": [ 157 | "# 画像を1次元配列にreshape\n", 158 | "X_train = X_train.reshape(60000, 28*28)\n", 159 | "X_test = X_test.reshape(10000, 28*28)" 160 | ], 161 | "execution_count": 0, 162 | "outputs": [] 163 | }, 164 | { 165 | "cell_type": "code", 166 | "metadata": { 167 | "id": "Zw1T3qlYP8LV", 168 | "colab_type": "code", 169 | "colab": {} 170 | }, 171 | "source": [ 172 | "# 輝度値を0 ~ 1に入るように正規化\n", 173 | "X_train = X_train.astype('float32')/255\n", 174 | "X_test = X_test.astype('float32')/255" 175 | ], 176 | "execution_count": 0, 177 | "outputs": [] 178 | }, 179 | { 180 | "cell_type": "code", 181 | "metadata": { 182 | "id": "7VAsaSwvP9sG", 183 | "colab_type": "code", 184 | "colab": {} 185 | }, 186 | "source": [ 187 | "# one hot encoding\n", 188 | "y_train = to_categorical(y_train)\n", 189 | "y_test = to_categorical(y_test)" 190 | ], 191 | "execution_count": 0, 192 | "outputs": [] 193 | }, 194 | { 195 | "cell_type": "markdown", 196 | "metadata": { 197 | "id": "e4VxBZMxQPl6", 198 | "colab_type": "text" 199 | }, 200 | 
"source": [ 201 | "## モデル作成" 202 | ] 203 | }, 204 | { 205 | "cell_type": "code", 206 | "metadata": { 207 | "id": "bW5-du5wQO-B", 208 | "colab_type": "code", 209 | "colab": {} 210 | }, 211 | "source": [ 212 | "model = Sequential()\n", 213 | "model.add(Dense(64, activation='relu', input_dim=28*28))\n", 214 | "model.add(Dense(10, activation='softmax'))\n", 215 | "\n", 216 | "model.summary()" 217 | ], 218 | "execution_count": 0, 219 | "outputs": [] 220 | }, 221 | { 222 | "cell_type": "code", 223 | "metadata": { 224 | "id": "iRszP63XQgrl", 225 | "colab_type": "code", 226 | "colab": {} 227 | }, 228 | "source": [ 229 | "model.compile(optimizer='Adam',\n", 230 | " loss='categorical_crossentropy',\n", 231 | " metrics=['accuracy'])\n", 232 | "\n", 233 | "model.fit(X_train, y_train, epochs=5, batch_size=64)" 234 | ], 235 | "execution_count": 0, 236 | "outputs": [] 237 | }, 238 | { 239 | "cell_type": "code", 240 | "metadata": { 241 | "id": "Hx73rHnyRCMJ", 242 | "colab_type": "code", 243 | "colab": {} 244 | }, 245 | "source": [ 246 | "model.evaluate(X_test, y_test)" 247 | ], 248 | "execution_count": 0, 249 | "outputs": [] 250 | }, 251 | { 252 | "cell_type": "code", 253 | "metadata": { 254 | "id": "sJo1GL3hRR5l", 255 | "colab_type": "code", 256 | "colab": {} 257 | }, 258 | "source": [ 259 | "model.save('model.h5')" 260 | ], 261 | "execution_count": 0, 262 | "outputs": [] 263 | }, 264 | { 265 | "cell_type": "markdown", 266 | "metadata": { 267 | "id": "CggIki-tRgNb", 268 | "colab_type": "text" 269 | }, 270 | "source": [ 271 | "## モデルを複雑に" 272 | ] 273 | }, 274 | { 275 | "cell_type": "code", 276 | "metadata": { 277 | "id": "O5wFwHl_RcAa", 278 | "colab_type": "code", 279 | "colab": {} 280 | }, 281 | "source": [ 282 | "model2 = Sequential()\n", 283 | "model2.add(Dense(512, activation='relu', input_dim=28*28))\n", 284 | "model2.add(Dense(512, activation='relu'))\n", 285 | "model2.add(Dense(10, activation='softmax'))\n", 286 | "\n", 287 | "model2.summary()" 288 | ], 289 | 
"execution_count": 0, 290 | "outputs": [] 291 | }, 292 | { 293 | "cell_type": "code", 294 | "metadata": { 295 | "id": "3ZdPIsRqRvbo", 296 | "colab_type": "code", 297 | "colab": {} 298 | }, 299 | "source": [ 300 | "model2.compile(optimizer='Adam',\n", 301 | " loss='categorical_crossentropy',\n", 302 | " metrics=['accuracy'])\n", 303 | "\n", 304 | "model2.fit(X_train, y_train, epochs=5, batch_size=64)" 305 | ], 306 | "execution_count": 0, 307 | "outputs": [] 308 | }, 309 | { 310 | "cell_type": "code", 311 | "metadata": { 312 | "id": "BhJtZcaiR0zl", 313 | "colab_type": "code", 314 | "colab": {} 315 | }, 316 | "source": [ 317 | "model2.evaluate(X_test, y_test)" 318 | ], 319 | "execution_count": 0, 320 | "outputs": [] 321 | }, 322 | { 323 | "cell_type": "code", 324 | "metadata": { 325 | "id": "ZBgJLIPgR3-T", 326 | "colab_type": "code", 327 | "colab": {} 328 | }, 329 | "source": [ 330 | "" 331 | ], 332 | "execution_count": 0, 333 | "outputs": [] 334 | }, 335 | { 336 | "cell_type": "code", 337 | "metadata": { 338 | "id": "iSOfIy82SfLH", 339 | "colab_type": "code", 340 | "colab": {} 341 | }, 342 | "source": [ 343 | "" 344 | ], 345 | "execution_count": 0, 346 | "outputs": [] 347 | } 348 | ] 349 | } -------------------------------------------------------------------------------- /Python/07/nn_temple_shrine.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "name": "Ch07-temple-shrine-nn.ipynb", 7 | "version": "0.3.2", 8 | "provenance": [], 9 | "collapsed_sections": [] 10 | }, 11 | "kernelspec": { 12 | "name": "python3", 13 | "display_name": "Python 3" 14 | }, 15 | "accelerator": "GPU" 16 | }, 17 | "cells": [ 18 | { 19 | "cell_type": "code", 20 | "metadata": { 21 | "id": "HRoy_vzj83WJ", 22 | "colab_type": "code", 23 | "colab": {} 24 | }, 25 | "source": [ 26 | "from keras.datasets import mnist\n", 27 | "from keras.utils import to_categorical\n", 28 | 
"\n", 29 | "from keras.layers import Dense\n", 30 | "from keras.models import Sequential\n", 31 | "from keras.models import load_model\n", 32 | "\n", 33 | "import matplotlib.pyplot as plt\n", 34 | "from PIL import Image\n", 35 | "import os\n", 36 | "import numpy as np\n", 37 | "from sklearn.model_selection import train_test_split" 38 | ], 39 | "execution_count": 0, 40 | "outputs": [] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "metadata": { 45 | "id": "lkcJwKDJ4vuV", 46 | "colab_type": "code", 47 | "colab": {} 48 | }, 49 | "source": [ 50 | "# Google ドライブをマウントするには、このセルを実行してください。\n", 51 | "from google.colab import drive\n", 52 | "drive.mount('/content/drive/')" 53 | ], 54 | "execution_count": 0, 55 | "outputs": [] 56 | }, 57 | { 58 | "cell_type": "markdown", 59 | "metadata": { 60 | "id": "gbGo22SlK0E0", 61 | "colab_type": "text" 62 | }, 63 | "source": [ 64 | "## データ読み込み" 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "metadata": { 70 | "id": "rzob7zR3uw9D", 71 | "colab_type": "code", 72 | "colab": {} 73 | }, 74 | "source": [ 75 | "X = []\n", 76 | "Y = []\n", 77 | "\n", 78 | "folder_path = \"/content/drive/My Drive/PythonBooks/src/Ch07/images/\"\n", 79 | "file_list = os.listdir(folder_path)" 80 | ], 81 | "execution_count": 0, 82 | "outputs": [] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "metadata": { 87 | "id": "aG-mpkDNKzoJ", 88 | "colab_type": "code", 89 | "colab": {} 90 | }, 91 | "source": [ 92 | "image_size = 30\n", 93 | "\n", 94 | "for file in file_list:\n", 95 | " try:\n", 96 | " image = Image.open(folder_path + file)\n", 97 | " except:\n", 98 | " print('error', file)\n", 99 | " continue\n", 100 | " \n", 101 | " image = image.convert(\"RGB\")\n", 102 | " image = image.resize((image_size, image_size))\n", 103 | " data = np.asarray(image)\n", 104 | " X.append(data)\n", 105 | " if 'temple' in file:\n", 106 | " Y.append(0)\n", 107 | " else:\n", 108 | " Y.append(1)\n", 109 | "\n", 110 | "X = np.array(X)\n", 111 | "Y = np.array(Y)" 112 | ], 113 | 
"execution_count": 0, 114 | "outputs": [] 115 | }, 116 | { 117 | "cell_type": "code", 118 | "metadata": { 119 | "id": "Y_wXrXCzBQi-", 120 | "colab_type": "code", 121 | "colab": {} 122 | }, 123 | "source": [ 124 | "print(X.shape)" 125 | ], 126 | "execution_count": 0, 127 | "outputs": [] 128 | }, 129 | { 130 | "cell_type": "code", 131 | "metadata": { 132 | "id": "gRWme6TSBSzL", 133 | "colab_type": "code", 134 | "colab": {} 135 | }, 136 | "source": [ 137 | "print(Y.shape)" 138 | ], 139 | "execution_count": 0, 140 | "outputs": [] 141 | }, 142 | { 143 | "cell_type": "markdown", 144 | "metadata": { 145 | "id": "hoPt9AcDP5KW", 146 | "colab_type": "text" 147 | }, 148 | "source": [ 149 | "### データ整形" 150 | ] 151 | }, 152 | { 153 | "cell_type": "code", 154 | "metadata": { 155 | "id": "uogTTc6oBsd0", 156 | "colab_type": "code", 157 | "colab": {} 158 | }, 159 | "source": [ 160 | "X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.25)" 161 | ], 162 | "execution_count": 0, 163 | "outputs": [] 164 | }, 165 | { 166 | "cell_type": "code", 167 | "metadata": { 168 | "id": "uWs4Ko57J75i", 169 | "colab_type": "code", 170 | "colab": {} 171 | }, 172 | "source": [ 173 | "# 画像を1次元配列にreshape\n", 174 | "X_train = X_train.reshape(-1, image_size * image_size *3)\n", 175 | "X_test = X_test.reshape(-1, image_size * image_size *3)" 176 | ], 177 | "execution_count": 0, 178 | "outputs": [] 179 | }, 180 | { 181 | "cell_type": "code", 182 | "metadata": { 183 | "id": "Zw1T3qlYP8LV", 184 | "colab_type": "code", 185 | "colab": {} 186 | }, 187 | "source": [ 188 | "# 輝度値を0 ~ 1に入るように正規化\n", 189 | "X_train = X_train.astype('float32')/255\n", 190 | "X_test = X_test.astype('float32')/255" 191 | ], 192 | "execution_count": 0, 193 | "outputs": [] 194 | }, 195 | { 196 | "cell_type": "code", 197 | "metadata": { 198 | "id": "7VAsaSwvP9sG", 199 | "colab_type": "code", 200 | "colab": {} 201 | }, 202 | "source": [ 203 | "# one hot encoding\n", 204 | "y_train = to_categorical(y_train)\n", 205 | 
"y_test = to_categorical(y_test)" 206 | ], 207 | "execution_count": 0, 208 | "outputs": [] 209 | }, 210 | { 211 | "cell_type": "markdown", 212 | "metadata": { 213 | "id": "e4VxBZMxQPl6", 214 | "colab_type": "text" 215 | }, 216 | "source": [ 217 | "## モデル作成" 218 | ] 219 | }, 220 | { 221 | "cell_type": "code", 222 | "metadata": { 223 | "id": "bW5-du5wQO-B", 224 | "colab_type": "code", 225 | "colab": {} 226 | }, 227 | "source": [ 228 | "model = Sequential()\n", 229 | "model.add(Dense(64, activation='relu', input_dim=image_size * image_size *3))\n", 230 | "model.add(Dense(2, activation='softmax'))\n", 231 | "\n", 232 | "model.summary()" 233 | ], 234 | "execution_count": 0, 235 | "outputs": [] 236 | }, 237 | { 238 | "cell_type": "code", 239 | "metadata": { 240 | "id": "iRszP63XQgrl", 241 | "colab_type": "code", 242 | "colab": {} 243 | }, 244 | "source": [ 245 | "model.compile(optimizer='Adam',\n", 246 | " loss='categorical_crossentropy',\n", 247 | " metrics=['accuracy'])\n", 248 | "\n", 249 | "model.fit(X_train, y_train, epochs=20, batch_size=20)" 250 | ], 251 | "execution_count": 0, 252 | "outputs": [] 253 | }, 254 | { 255 | "cell_type": "code", 256 | "metadata": { 257 | "id": "Hx73rHnyRCMJ", 258 | "colab_type": "code", 259 | "colab": {} 260 | }, 261 | "source": [ 262 | "model.evaluate(X_test, y_test)" 263 | ], 264 | "execution_count": 0, 265 | "outputs": [] 266 | }, 267 | { 268 | "cell_type": "code", 269 | "metadata": { 270 | "id": "sJo1GL3hRR5l", 271 | "colab_type": "code", 272 | "colab": {} 273 | }, 274 | "source": [ 275 | "model.save('model.h5')" 276 | ], 277 | "execution_count": 0, 278 | "outputs": [] 279 | } 280 | ] 281 | } -------------------------------------------------------------------------------- /Python/08/collaborative_filtering.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "name": "Ch08-Collaborative-filtering.ipynb", 7 | 
"version": "0.3.2", 8 | "provenance": [], 9 | "collapsed_sections": [], 10 | "toc_visible": true 11 | }, 12 | "kernelspec": { 13 | "name": "python3", 14 | "display_name": "Python 3" 15 | } 16 | }, 17 | "cells": [ 18 | { 19 | "cell_type": "code", 20 | "metadata": { 21 | "id": "GJcUJGK-7KIQ", 22 | "colab_type": "code", 23 | "colab": {} 24 | }, 25 | "source": [ 26 | "import pandas as pd\n", 27 | "import numpy as np\n", 28 | "\n", 29 | "from sklearn.metrics.pairwise import cosine_similarity" 30 | ], 31 | "execution_count": 0, 32 | "outputs": [] 33 | }, 34 | { 35 | "cell_type": "markdown", 36 | "metadata": { 37 | "id": "ZMFk_sBO9SPj", 38 | "colab_type": "text" 39 | }, 40 | "source": [ 41 | "## データ読み込み" 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "metadata": { 47 | "id": "36vbv-1y7b2B", 48 | "colab_type": "code", 49 | "colab": {} 50 | }, 51 | "source": [ 52 | "cols_name = ['user_id','item_id','rating','timestamp']\n", 53 | "data_movie = pd.read_csv('u.data', names=cols_name, sep=\"\\t\")\n", 54 | "print(data_movie.head())" 55 | ], 56 | "execution_count": 0, 57 | "outputs": [] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "metadata": { 62 | "id": "7U5GlXH79PML", 63 | "colab_type": "code", 64 | "colab": {} 65 | }, 66 | "source": [ 67 | "movie_rating = data_movie.pivot(index='user_id', columns='item_id', values='rating').fillna(0).as_matrix()\n", 68 | "print(movie_rating[0:5])\n", 69 | "print(movie_rating.shape)" 70 | ], 71 | "execution_count": 0, 72 | "outputs": [] 73 | }, 74 | { 75 | "cell_type": "markdown", 76 | "metadata": { 77 | "id": "Z5TLFkyWptmk", 78 | "colab_type": "text" 79 | }, 80 | "source": [ 81 | "## コサイン類似度計算" 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "metadata": { 87 | "id": "gCm59y0L9g_-", 88 | "colab_type": "code", 89 | "colab": {} 90 | }, 91 | "source": [ 92 | "cos_sim = cosine_similarity(movie_rating, movie_rating)\n", 93 | "print(cos_sim[:5])\n", 94 | "print(cos_sim.shape)" 95 | ], 96 | "execution_count": 0, 97 | "outputs": [] 98 | 
}, 99 | { 100 | "cell_type": "markdown", 101 | "metadata": { 102 | "id": "nCT49DY4vBNp", 103 | "colab_type": "text" 104 | }, 105 | "source": [ 106 | "## レコメンド" 107 | ] 108 | }, 109 | { 110 | "cell_type": "code", 111 | "metadata": { 112 | "id": "4mW_F4RE9cDB", 113 | "colab_type": "code", 114 | "colab": {} 115 | }, 116 | "source": [ 117 | "# ユーザー1との類似度\n", 118 | "cos_sim_for_user_1 = cos_sim[0]\n", 119 | "# ユーザー1と類似度の高いユーザー10人のインデックスを抽出\n", 120 | "similar_user = np.argsort(cos_sim_for_user_1)[-11:-1]\n", 121 | "print(similar_user)" 122 | ], 123 | "execution_count": 0, 124 | "outputs": [] 125 | }, 126 | { 127 | "cell_type": "code", 128 | "metadata": { 129 | "id": "BQowdy38RBqn", 130 | "colab_type": "code", 131 | "colab": {} 132 | }, 133 | "source": [ 134 | "print(cos_sim_for_user_1[similar_user])" 135 | ], 136 | "execution_count": 0, 137 | "outputs": [] 138 | }, 139 | { 140 | "cell_type": "code", 141 | "metadata": { 142 | "id": "ABhcpLBt9mbv", 143 | "colab_type": "code", 144 | "colab": {} 145 | }, 146 | "source": [ 147 | "# 類似度の高いユーザーの映画評価値\n", 148 | "movie_rating_of_similar_user = movie_rating[similar_user]\n", 149 | "print(movie_rating_of_similar_user)" 150 | ], 151 | "execution_count": 0, 152 | "outputs": [] 153 | }, 154 | { 155 | "cell_type": "code", 156 | "metadata": { 157 | "id": "jfcEIQFvze1o", 158 | "colab_type": "code", 159 | "colab": {} 160 | }, 161 | "source": [ 162 | "# 重みづけされた評価値を計算\n", 163 | "weighted_movie_rating = movie_rating_of_similar_user * cos_sim_for_user_1[similar_user].reshape(-1, 1)\n", 164 | "print(weighted_movie_rating)" 165 | ], 166 | "execution_count": 0, 167 | "outputs": [] 168 | }, 169 | { 170 | "cell_type": "code", 171 | "metadata": { 172 | "id": "Li21_3bb9oTf", 173 | "colab_type": "code", 174 | "colab": {} 175 | }, 176 | "source": [ 177 | "# 各映画のレコメンド値を計算\n", 178 | "mean_weighted_movie_rating = weighted_movie_rating.mean(axis=0)\n", 179 | "print(mean_weighted_movie_rating)" 180 | ], 181 | "execution_count": 0, 182 | "outputs": [] 183 | 
}, 184 | { 185 | "cell_type": "code", 186 | "metadata": { 187 | "id": "d8MqIyjv9_1l", 188 | "colab_type": "code", 189 | "colab": {} 190 | }, 191 | "source": [ 192 | "#ユーザー1の評価と加重平均スコアを列とするデータフレーム作成\n", 193 | "recommend_values = pd.DataFrame({'user_1_score':movie_rating[0], 'recommend_value':mean_weighted_movie_rating})\n", 194 | "print(recommend_values.head())" 195 | ], 196 | "execution_count": 0, 197 | "outputs": [] 198 | }, 199 | { 200 | "cell_type": "code", 201 | "metadata": { 202 | "id": "1bqd0849-CWB", 203 | "colab_type": "code", 204 | "colab": {} 205 | }, 206 | "source": [ 207 | "#未評価のうちスコアの高い上位10件を抽出\n", 208 | "recommend_values[recommend_values['user_1_score'] == 0].sort_values('recommend_value', ascending=False).head(10)" 209 | ], 210 | "execution_count": 0, 211 | "outputs": [] 212 | }, 213 | { 214 | "cell_type": "code", 215 | "metadata": { 216 | "id": "h5C7u5Y9wy8_", 217 | "colab_type": "code", 218 | "colab": {} 219 | }, 220 | "source": [ 221 | "" 222 | ], 223 | "execution_count": 0, 224 | "outputs": [] 225 | } 226 | ] 227 | } -------------------------------------------------------------------------------- /Python/08/word2vec_tweets.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "name": "Ch08_word2vec.ipynb のコピー", 7 | "version": "0.3.2", 8 | "provenance": [], 9 | "collapsed_sections": [], 10 | "toc_visible": true 11 | }, 12 | "kernelspec": { 13 | "name": "python3", 14 | "display_name": "Python 3" 15 | } 16 | }, 17 | "cells": [ 18 | { 19 | "cell_type": "code", 20 | "metadata": { 21 | "id": "4o7bonIQ-7eN", 22 | "colab_type": "code", 23 | "colab": {} 24 | }, 25 | "source": [ 26 | "import json\n", 27 | "import requests\n", 28 | "from requests_oauthlib import OAuth1\n", 29 | "import re\n", 30 | "from google.colab import files\n", 31 | "\n", 32 | "\n", 33 | "# 取得したkeyを定義\n", 34 | "access_token = 'xxxxxxxx'\n", 35 | "access_token_secret 
= 'xxxxxxxx'\n", 36 | "consumer_key = 'xxxxxxxx'\n", 37 | "consumer_key_secret = 'xxxxxxxx'\n", 38 | "\n", 39 | "url = \"https://stream.twitter.com/1.1/statuses/sample.json?language=ja\"\n", 40 | "\n", 41 | "# OAuth で GET\n", 42 | "twitter = OAuth1(consumer_key, consumer_key_secret, access_token, access_token_secret)" 43 | ], 44 | "execution_count": 0, 45 | "outputs": [] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "metadata": { 50 | "id": "nILqdz0jeTaF", 51 | "colab_type": "code", 52 | "colab": {} 53 | }, 54 | "source": [ 55 | "def normalize_text(text):\n", 56 | " text = re.sub(r'https?://[\\w/:%#\\$&\\?\\(\\)~\\.=\\+\\-…]+', \"\", text)\n", 57 | " text = re.sub('RT', \"\", text)\n", 58 | " text = re.sub('お気に入り', \"\", text)\n", 59 | " text = re.sub('まとめ', \"\", text)\n", 60 | " text = re.sub(r'[!-~]', \"\", text)\n", 61 | " text = re.sub(r'[︰-@]', \"\", text)\n", 62 | " text = re.sub('\\u3000',\"\", text)\n", 63 | " text = re.sub('\\t', \"\", text)\n", 64 | " text = re.sub('\\n', \"\", text)\n", 65 | "\n", 66 | " text = text.strip()\n", 67 | " return text" 68 | ], 69 | "execution_count": 0, 70 | "outputs": [] 71 | }, 72 | { 73 | "cell_type": "code", 74 | "metadata": { 75 | "id": "kVb27XwPeVoY", 76 | "colab_type": "code", 77 | "colab": {} 78 | }, 79 | "source": [ 80 | "url = \"https://stream.twitter.com/1.1/statuses/sample.json?language=ja\"\n", 81 | "\n", 82 | "with open('public_text_twitter.tsv','a', encoding='utf-8') as f:\n", 83 | " res = requests.get(url, auth=twitter, stream=True)\n", 84 | " for r in res.iter_lines():\n", 85 | " try:\n", 86 | " r_json = json.loads(r)\n", 87 | " text = r_json['text']\n", 88 | " f.write(normalize_text(text) + '\\n')\n", 89 | " except:\n", 90 | " continue" 91 | ], 92 | "execution_count": 0, 93 | "outputs": [] 94 | }, 95 | { 96 | "cell_type": "code", 97 | "metadata": { 98 | "id": "UYWv6fpGIzj6", 99 | "colab_type": "code", 100 | "colab": {} 101 | }, 102 | "source": [ 103 | "files.download('public_text_twitter.tsv')" 104 | ], 105 
| "execution_count": 0, 106 | "outputs": [] 107 | }, 108 | { 109 | "cell_type": "markdown", 110 | "metadata": { 111 | "id": "A3yyvf-tmW-q", 112 | "colab_type": "text" 113 | }, 114 | "source": [ 115 | "## word2vec 実践" 116 | ] 117 | }, 118 | { 119 | "cell_type": "code", 120 | "metadata": { 121 | "id": "3mpQrwtNeGOu", 122 | "colab_type": "code", 123 | "colab": {} 124 | }, 125 | "source": [ 126 | "# mecabインストール\n", 127 | "!apt install aptitude\n", 128 | "!aptitude install mecab libmecab-dev mecab-ipadic-utf8 git make curl xz-utils file -y\n", 129 | "\n", 130 | "# mecab pythonインストール(pythonでmecabを動かすために必要)\n", 131 | "!pip install mecab-python3==0.7\n", 132 | "\n", 133 | "# neologd辞書インストール\n", 134 | "!git clone --depth 1 https://github.com/neologd/mecab-ipadic-neologd.git\n", 135 | "!echo yes | mecab-ipadic-neologd/bin/install-mecab-ipadic-neologd -n" 136 | ], 137 | "execution_count": 0, 138 | "outputs": [] 139 | }, 140 | { 141 | "cell_type": "markdown", 142 | "metadata": { 143 | "id": "gcy-DRiosnJz", 144 | "colab_type": "text" 145 | }, 146 | "source": [ 147 | "" 148 | ] 149 | }, 150 | { 151 | "cell_type": "code", 152 | "metadata": { 153 | "id": "2NJPcNzrYTKv", 154 | "colab_type": "code", 155 | "colab": {} 156 | }, 157 | "source": [ 158 | "# 辞書変更\n", 159 | "!sed -e \"s!/var/lib/mecab/dic/debian!/usr/lib/x86_64-linux-gnu/mecab/dic/mecab-ipadic-neologd!g\" /etc/mecabrc > /etc/mecabrc.new\n", 160 | "!cp /etc/mecabrc /etc/mecabrc.org\n", 161 | "!cp /etc/mecabrc.new /etc/mecabrc" 162 | ], 163 | "execution_count": 0, 164 | "outputs": [] 165 | }, 166 | { 167 | "cell_type": "code", 168 | "metadata": { 169 | "id": "y-QwAOeovcLI", 170 | "colab_type": "code", 171 | "colab": {} 172 | }, 173 | "source": [ 174 | "import MeCab\n", 175 | "import pandas as pd\n", 176 | "import unicodedata\n", 177 | "from gensim.models import word2vec" 178 | ], 179 | "execution_count": 0, 180 | "outputs": [] 181 | }, 182 | { 183 | "cell_type": "code", 184 | "metadata": { 185 | "id": "PWzcuYrDwtwu", 186 | 
"colab_type": "code", 187 | "colab": {} 188 | }, 189 | "source": [ 190 | "# データ インポート\n", 191 | "df = pd.read_csv('public_text_twitter.tsv', sep='\\t', names=['text'])\n", 192 | "text_lists = df['text'].unique().tolist()\n", 193 | "\n", 194 | "mt = MeCab.Tagger(\"-Ochasen\") " 195 | ], 196 | "execution_count": 0, 197 | "outputs": [] 198 | }, 199 | { 200 | "cell_type": "code", 201 | "metadata": { 202 | "id": "ge1qpQLyJdZJ", 203 | "colab_type": "code", 204 | "colab": {} 205 | }, 206 | "source": [ 207 | "word_pos = ('名詞', '形容詞')\n", 208 | "\n", 209 | "with open('public_text_splited.txt', 'w', encoding='utf-8') as f:\n", 210 | " for text in text_lists:\n", 211 | " tmp_lists = []\n", 212 | " text = unicodedata.normalize('NFKC', str(text))\n", 213 | " \n", 214 | " node = mt.parseToNode(text)\n", 215 | " while node:\n", 216 | " if node.feature.startswith(word_pos) and ',非自立,' not in node.feature:\n", 217 | " tmp_lists.append(node.surface)\n", 218 | " \n", 219 | " node = node.next\n", 220 | " \n", 221 | " f.write(' '.join(tmp_lists) + '\\n')" 222 | ], 223 | "execution_count": 0, 224 | "outputs": [] 225 | }, 226 | { 227 | "cell_type": "code", 228 | "metadata": { 229 | "id": "_sfShPFtw4GV", 230 | "colab_type": "code", 231 | "colab": {} 232 | }, 233 | "source": [ 234 | "sentences = word2vec.LineSentence('public_text_splited.txt')\n", 235 | "model = word2vec.Word2Vec(sentences,\n", 236 | " sg=1, #0: CBOW, 1: skip-gram\n", 237 | " size=200, # ベクトルの次元数\n", 238 | " window=3, # 入力単語からの最大距離\n", 239 | " min_count=5, # 単語の出現回数でフィルタリング\n", 240 | " )" 241 | ], 242 | "execution_count": 0, 243 | "outputs": [] 244 | }, 245 | { 246 | "cell_type": "code", 247 | "metadata": { 248 | "id": "A71Y2Jk5YnMJ", 249 | "colab_type": "code", 250 | "colab": {} 251 | }, 252 | "source": [ 253 | "model.most_similar(positive='人生', topn=20)" 254 | ], 255 | "execution_count": 0, 256 | "outputs": [] 257 | }, 258 | { 259 | "cell_type": "code", 260 | "metadata": { 261 | "id": "rb45klcguFS1", 262 | "colab_type": 
"code", 263 | "colab": {} 264 | }, 265 | "source": [ 266 | "" 267 | ], 268 | "execution_count": 0, 269 | "outputs": [] 270 | } 271 | ] 272 | } -------------------------------------------------------------------------------- /R/03/lm_boston.R: -------------------------------------------------------------------------------- 1 | # install.packages("tidyverse") 2 | # install.packages("GGally") 3 | # install.packages("caret") 4 | 5 | library(MASS) 6 | library(tidyverse) 7 | library(GGally) 8 | library(caret) 9 | 10 | # データ読み込み 11 | data(Boston) 12 | Boston %>% summary() 13 | Boston %>% head() 14 | 15 | # 可視化 16 | Boston %>% ggpairs() 17 | Boston %>% select(1, 2) %>% ggpairs() 18 | 19 | # 線形回帰 20 | # 単回帰 21 | lm_model <- train(data=Boston, medv ~ rm, method="lm") 22 | lm_model %>% summary() 23 | 24 | # 重回帰 25 | lm_multi_model <- train(data=Boston, medv ~ ., method="lm") 26 | 27 | lm_multi_model %>% summary() 28 | 29 | # 予測 30 | train_size = 0.7 31 | train_index <- sample(Boston %>% nrow(), Boston %>% nrow() * train_size) 32 | train_data <- Boston[train_index,] # 訓練データ 33 | test_data <- Boston[-train_index,] # テストデータ 34 | 35 | lm_multi_model2 <- train(data=train_data, medv ~ ., method="lm") 36 | y_pred <- predict(lm_multi_model2, test_data) 37 | y_pred - test_data$medv 38 | 39 | # MAE 40 | ## 単回帰 41 | lm_single_model <- train(data=train_data, medv ~ rm, method="lm") 42 | y_pred <- predict(lm_single_model, test_data) 43 | MAE(y_pred, test_data$medv) 44 | 45 | ## 重回帰 46 | lm_multi_model <- train(data=train_data, medv ~ ., method="lm") 47 | y_pred <- predict(lm_multi_model, test_data) 48 | MAE(y_pred, test_data$medv) 49 | 50 | -------------------------------------------------------------------------------- /R/03/lm_ridge_lasso_boston.R: -------------------------------------------------------------------------------- 1 | # install.packages("tidyverse") 2 | # install.packages("GGally") 3 | # install.packages("caret") 4 | 5 | library(MASS) 6 | library(tidyverse) 7 | 
library(GGally) 8 | library(caret) 9 | 10 | # データ読み込み 11 | data(Boston) 12 | Boston %>% summary() 13 | Boston %>% head() 14 | 15 | # 可視化 16 | Boston %>% ggpairs() 17 | Boston %>% select(1, 2) %>% ggpairs() 18 | 19 | # L1正則化なし 20 | train_size = 0.7 21 | train_index <- sample(Boston %>% nrow(), Boston %>% nrow() * train_size) 22 | train_data <- Boston[train_index,] # 訓練データ 23 | test_data <- Boston[-train_index,] # テストデータ 24 | 25 | lm_multi_model <- train(data=train_data, medv ~ ., method="lm") 26 | y_pred <- predict(lm_multi_model, test_data) 27 | MAE(y_pred, test_data$medv) 28 | 29 | # Lasso回帰 30 | lasso <- train(data=train_data, medv ~ ., method="glmnet", tuneGrid = expand.grid(alpha = 1, lambda = 1)) 31 | y_pred <- predict(lasso, test_data) 32 | MAE(y_pred, test_data$medv) 33 | 34 | 35 | # Ridge回帰 36 | ridge <- train(data=train_data, medv ~ ., method="glmnet", tuneGrid = expand.grid(alpha = 0, lambda = 1)) 37 | y_pred <- predict(ridge, test_data) 38 | MAE(y_pred, test_data$medv) 39 | 40 | -------------------------------------------------------------------------------- /R/03/lm_ridge_lasso_tokyo.R: -------------------------------------------------------------------------------- 1 | # install.packages("tidyverse") 2 | # install.packages("GGally") 3 | # install.packages("caret") 4 | # install.packages("glmnet") 5 | 6 | library(MASS) 7 | library(tidyverse) 8 | library(GGally) 9 | library(caret) 10 | 11 | # データ読み込み 12 | data_tokyo <- read.csv("src/03/13_Tokyo_20171_20184.csv", header = TRUE, encoding = "cp932") 13 | 14 | data_tokyo %>% summary() 15 | data_tokyo %>% head() 16 | 17 | # 整形 18 | data_used_apartment <- data_tokyo %>% filter(種類 == "中古マンション等") 19 | columns_name_list <- c("最寄駅.距離.分.", "間取り", "面積...","建築年", "建物の構造", "建ぺい率...", "容積率...", "市区町村名", "取引価格.総額.") 20 | data_selected_dropna <- data_used_apartment %>% select(columns_name_list) %>% 21 | na.omit() %>% filter(str_detect(建築年, "^平成|昭和")) %>% 22 | filter(str_detect(最寄駅.距離.分., "\\?", negate = TRUE)) 23 | 24 | 
wareki_to_seireki = c(1926-1, 1989-1) 25 | building_year_list <- data_selected_dropna$建築年 26 | 27 | building_age_list <- c() 28 | for (i in 1:(building_year_list %>% length())){ 29 | # 西暦に変換 30 | tmp <- unlist(strsplit(as.character(building_year_list[i]), "成|和|年")) 31 | if (tmp[1] == "平"){ 32 | seireki = wareki_to_seireki[2] + as.integer(tmp[2]) 33 | } 34 | else { 35 | seireki = wareki_to_seireki[1] + as.integer(tmp[2]) 36 | } 37 | # 築年数に変換 38 | building_age = 2018 - seireki 39 | 40 | building_age_list = c(building_age_list, building_age) 41 | } 42 | 43 | data_selected_dropna$築年数 <- building_age_list 44 | data_selected_dropna <- data_selected_dropna[, colnames(data_selected_dropna) != "建築年"] 45 | data_selected_dropna$最寄駅.距離.分. <- as.numeric(data_selected_dropna$最寄駅.距離.分) 46 | data_selected_dropna$面積... <- as.numeric(data_selected_dropna$面積...) 47 | 48 | data_added_dummies <- data_selected_dropna %>% filter(取引価格.総額. < 60000000) 49 | 50 | # L1正則化なし 51 | train_size = 0.7 52 | train_index <- sample(data_added_dummies %>% nrow(), data_added_dummies %>% nrow() * train_size) 53 | train_data <- data_added_dummies[train_index,] # 訓練データ 54 | test_data <- data_added_dummies[-train_index,] # テストデータ 55 | 56 | lm_multi_model <- train(data=train_data, 取引価格.総額. ~ ., method="lm") 57 | y_pred <- predict(lm_multi_model, test_data) 58 | MAE(y_pred, test_data$取引価格.総額.) 59 | 60 | # Lasso回帰 61 | lasso <- train(data=train_data, 取引価格.総額. ~ ., method="glmnet", tuneGrid = expand.grid(alpha = 1, lambda = 1)) 62 | y_pred <- predict(lasso, test_data) 63 | MAE(y_pred, test_data$取引価格.総額.) 64 | 65 | 66 | # Ridge回帰 67 | ridge <- train(data=train_data, 取引価格.総額. ~ ., method="glmnet", tuneGrid = expand.grid(alpha = 0, lambda = 1)) 68 | y_pred <- predict(ridge, test_data) 69 | MAE(y_pred, test_data$取引価格.総額.) 
-------------------------------------------------------------------------------- /R/03/lm_tokyo.R: -------------------------------------------------------------------------------- 1 | # install.packages("tidyverse") 2 | # install.packages("GGally") 3 | # install.packages("caret") 4 | 5 | library(MASS) 6 | library(tidyverse) 7 | library(GGally) 8 | library(caret) 9 | 10 | # データ読み込み 11 | data_tokyo <- read.csv("src/03/13_Tokyo_20171_20184.csv", header = TRUE, encoding = "cp932") 12 | 13 | data_tokyo %>% summary() 14 | data_tokyo %>% head() 15 | 16 | # 整形 17 | data_used_apartment <- data_tokyo %>% filter(種類 == "中古マンション等") 18 | columns_name_list <- c("最寄駅.距離.分.", "間取り", "面積...","建築年", "建物の構造", "建ぺい率...", "容積率...", "市区町村名", "取引価格.総額.") 19 | data_selected_dropna <- data_used_apartment %>% select(columns_name_list) %>% 20 | na.omit() %>% filter(str_detect(建築年, "^平成|昭和")) %>% 21 | filter(str_detect(最寄駅.距離.分., "\\?", negate = TRUE)) 22 | 23 | wareki_to_seireki = c(1926-1, 1989-1) 24 | building_year_list <- data_selected_dropna$建築年 25 | 26 | building_age_list <- c() 27 | for (i in 1:(building_year_list %>% length())){ 28 | # 西暦に変換 29 | tmp <- unlist(strsplit(as.character(building_year_list[i]), "成|和|年")) 30 | if (tmp[1] == "平"){ 31 | seireki = wareki_to_seireki[2] + as.integer(tmp[2]) 32 | } 33 | else { 34 | seireki = wareki_to_seireki[1] + as.integer(tmp[2]) 35 | } 36 | # 築年数に変換 37 | building_age = 2018 - seireki 38 | 39 | building_age_list = c(building_age_list, building_age) 40 | } 41 | 42 | data_selected_dropna$築年数 <- building_age_list 43 | data_selected_dropna <- data_selected_dropna[, colnames(data_selected_dropna) != "建築年"] 44 | data_selected_dropna$最寄駅.距離.分. <- as.numeric(data_selected_dropna$最寄駅.距離.分) 45 | data_selected_dropna$面積... <- as.numeric(data_selected_dropna$面積...) 46 | 47 | data_added_dummies <- data_selected_dropna %>% filter(取引価格.総額. < 60000000) 48 | 49 | # 線形回帰 50 | ## 単回帰 51 | lm_model <- train(data=data_added_dummies, 取引価格.総額. 
~ 面積..., method="lm") 52 | lm_model %>% summary() 53 | 54 | ## 重回帰 55 | lm_multi_model <- train(data=data_added_dummies, 取引価格.総額. ~ ., method="lm") 56 | lm_multi_model %>% summary() 57 | 58 | # 予測 59 | train_size = 0.7 60 | train_index <- sample(data_added_dummies %>% nrow(), data_added_dummies %>% nrow() * train_size) 61 | train_data <- data_added_dummies[train_index,] # 訓練データ 62 | test_data <- data_added_dummies[-train_index,] # テストデータ 63 | 64 | lm_multi_model2 <- train(data=train_data, 取引価格.総額. ~ ., method="lm") 65 | y_pred <- predict(lm_multi_model2, test_data) 66 | y_pred - test_data$取引価格.総額. 67 | 68 | # MAE 69 | ## 単回帰 70 | lm_single_model <- train(data=train_data, 取引価格.総額. ~ 面積..., method="lm") 71 | y_pred <- predict(lm_single_model, test_data) 72 | MAE(y_pred, test_data$取引価格.総額.) 73 | 74 | ## 重回帰 75 | lm_multi_model <- train(data=train_data, 取引価格.総額. ~ ., method="lm") 76 | y_pred <- predict(lm_multi_model, test_data) 77 | MAE(y_pred, test_data$取引価格.総額.) 78 | 79 | -------------------------------------------------------------------------------- /R/04/decisionTree_iris.R: -------------------------------------------------------------------------------- 1 | # install.packages("tidyverse") 2 | # install.packages("GGally") 3 | # install.packages("caret") 4 | # install.packages('e1071') 5 | # install.packages("doParallel") 6 | 7 | library(MASS) 8 | library(tidyverse) 9 | library(caret) 10 | library(doParallel) 11 | 12 | detectCores() 13 | cl <- makePSOCKcluster(4) 14 | registerDoParallel(cl) 15 | 16 | # データ整形 17 | data_iris <- iris %>% 18 | filter(Species != "virginica") %>% select(-Species) %>% 19 | mutate(Species = as.matrix(iris$Species[1:100])) 20 | 21 | # 決定木 22 | # 予測 23 | train_size = 0.7 24 | train_index <- sample(data_iris %>% nrow(), data_iris %>% nrow() * train_size) 25 | train_data <- data_iris[train_index,] # 訓練データ 26 | test_data <- data_iris[-train_index,] # テストデータ 27 | 28 | 29 | decisionTree_model <- train(Species ~ ., data=train_data, 
method="rpart") 30 | y_pred <- predict(decisionTree_model, test_data) 31 | confusionMatrix(data = y_pred, test_data$Species %>% as.factor()) 32 | 33 | -------------------------------------------------------------------------------- /R/04/decisionTree_tweets.R: -------------------------------------------------------------------------------- 1 | # install.packages("tidyverse") 2 | # install.packages("GGally") 3 | # install.packages("caret") 4 | # install.packages('e1071') 5 | # install.packages("doParallel") 6 | 7 | library(MASS) 8 | library(tidyverse) 9 | library(caret) 10 | library(doParallel) 11 | library(RMeCab) 12 | 13 | detectCores() 14 | cl <- makePSOCKcluster(4) 15 | registerDoParallel(cl) 16 | 17 | # データ読み込み 18 | tweets <- read.csv("src/04/tweets.tsv",sep = "\t") %>% na.omit() 19 | tweets %>% dim() 20 | 21 | y <- tweets$X1 22 | text_all <- as.data.frame(tweets$X0) 23 | 24 | # データ整形(tf-idf) 25 | doc_matrix <- docDF(text_all, col = 1, type = 1, pos = c("名詞", "形容詞"), minFreq = 1, weight = "tf*idf*norm") %>% 26 | filter(POS2 %in% c("一般", "固有名詞","自立")) 27 | doc_matrix_t <- doc_matrix[, 4:ncol(doc_matrix)] %>% t() 28 | 29 | rownames(doc_matrix_t) <- c(1:nrow(doc_matrix_t)) 30 | # colnames(doc_matrix_t) <- doc_matrix[, 1] 31 | 32 | doc_matrix_t_1 <- cbind(doc_matrix_t, y) %>% na.omit() 33 | 34 | doc_matrix_t_1[is.nan(doc_matrix_t_1)] <- NA 35 | doc_matrix_df <- doc_matrix_t_1 %>% na.omit() %>% as.data.frame() 36 | 37 | # 決定木 38 | # 予測 39 | train_size = 0.7 40 | train_index <- sample(doc_matrix_df %>% nrow(), doc_matrix_df %>% nrow() * train_size) 41 | train_data <- doc_matrix_df[train_index,] # 訓練データ 42 | test_data <- doc_matrix_df[-train_index,] # テストデータ 43 | 44 | 45 | decisionTree_model <- train(y ~ ., data=train_data, method="rpart") 46 | y_pred <- predict(decisionTree_model, test_data) 47 | confusionMatrix(data = y_pred %>% round() %>% as.factor(), test_data$y %>% as.factor()) 48 | 49 | 
-------------------------------------------------------------------------------- /R/04/logit_iris.R: -------------------------------------------------------------------------------- 1 | # install.packages("tidyverse") 2 | # install.packages("GGally") 3 | # install.packages("caret") 4 | # install.packages('e1071') 5 | 6 | library(MASS) 7 | library(tidyverse) 8 | library(caret) 9 | 10 | data_iris <- iris %>% 11 | filter(Species != "virginica") %>% select(-Species) %>% 12 | mutate(Species = as.matrix(iris$Species[1:100])) 13 | # ロジスティック回帰 14 | ## 単回帰 15 | logit_model <- train(Species ~ Sepal.Length, data=data_iris, method="glm", family=binomial()) 16 | logit_model %>% summary() 17 | 18 | ## 重回帰 19 | logit_multi_model <- train(Species ~ ., data=data_iris, method="glm", family=binomial()) 20 | logit_multi_model %>% summary() 21 | 22 | # 予測 23 | train_size = 0.7 24 | train_index <- sample(data_iris %>% nrow(), data_iris %>% nrow() * train_size) 25 | train_data <- data_iris[train_index,] # 訓練データ 26 | test_data <- data_iris[-train_index,] # テストデータ 27 | 28 | 29 | ## 単回帰 30 | logit_single_model <- train(Species ~ Sepal.Length, data=train_data, method="glm", family=binomial()) 31 | y_pred <- predict(logit_single_model, test_data) 32 | confusionMatrix(data = y_pred, test_data$Species %>% as.factor()) 33 | 34 | 35 | ## 重回帰 36 | logit_multi_model2 <- train(Species ~ ., data=train_data, method="glm", family=binomial()) 37 | y_pred <- predict(logit_multi_model2, test_data) 38 | confusionMatrix(data = y_pred, test_data$Species %>% as.factor()) 39 | 40 | -------------------------------------------------------------------------------- /R/04/logit_tweets.R: -------------------------------------------------------------------------------- 1 | # install.packages("tidyverse") 2 | # install.packages("GGally") 3 | # install.packages("caret") 4 | # install.packages('e1071') 5 | # install.packages("RMeCab", repos = "http://rmecab.jp/R", type = "source") 6 | 7 | library(MASS) 8 | 
library(tidyverse) 9 | library(caret) 10 | library(RMeCab) 11 | 12 | # データ読み込み 13 | tweets <- read.csv("src/04/tweets.tsv",sep = "\t") %>% na.omit() 14 | tweets %>% dim() 15 | 16 | y <- tweets$X1 17 | text_all <- as.data.frame(tweets$X0) 18 | 19 | # データ整形(tf-idf) 20 | doc_matrix <- docDF(text_all, col = 1, type = 1, pos = c("名詞", "形容詞"), minFreq = 1, weight = "tf*idf*norm") %>% 21 | filter(POS2 %in% c("一般", "固有名詞","自立")) 22 | doc_matrix_t <- doc_matrix[, 4:ncol(doc_matrix)] %>% t() 23 | 24 | rownames(doc_matrix_t) <- c(1:nrow(doc_matrix_t)) 25 | # colnames(doc_matrix_t) <- doc_matrix[, 1] 26 | 27 | doc_matrix_t_1 <- cbind(doc_matrix_t, y) %>% na.omit() 28 | 29 | doc_matrix_t_1[is.nan(doc_matrix_t_1)] <- NA 30 | doc_matrix_df <- doc_matrix_t_1 %>% na.omit() %>% as.data.frame() 31 | 32 | 33 | # ロジスティック回帰 34 | # logit_multi_model <- train(y ~ ., data=doc_matrix_df, method="glm", family=binomial()) 35 | # logit_multi_model %>% summary() 36 | 37 | # 予測 38 | train_size = 0.7 39 | train_index <- sample(doc_matrix_df %>% nrow(), doc_matrix_df %>% nrow() * train_size) 40 | train_data <- doc_matrix_df[train_index,] # 訓練データ 41 | test_data <- doc_matrix_df[-train_index,] # テストデータ 42 | 43 | 44 | logit_multi_model2 <- train(y ~ ., data=train_data, method="glm", family=binomial()) 45 | y_pred <- predict(logit_multi_model2, test_data) 46 | confusionMatrix(data = y_pred %>% round() %>% as.factor(), test_data$y %>% as.factor()) 47 | 48 | -------------------------------------------------------------------------------- /R/04/randomForest_iris.R: -------------------------------------------------------------------------------- 1 | # install.packages("tidyverse") 2 | # install.packages("GGally") 3 | # install.packages("caret") 4 | # install.packages('e1071') 5 | # install.packages("doParallel") 6 | 7 | library(MASS) 8 | library(tidyverse) 9 | library(caret) 10 | library(doParallel) 11 | 12 | detectCores() 13 | cl <- makePSOCKcluster(4) 14 | registerDoParallel(cl) 15 | 16 | # データ整形 17 | 
data_iris <- iris %>% 18 | filter(Species != "virginica") %>% select(-Species) %>% 19 | mutate(Species = as.matrix(iris$Species[1:100])) 20 | 21 | # ランダムフォレスト 22 | # 予測 23 | train_size = 0.7 24 | train_index <- sample(data_iris %>% nrow(), data_iris %>% nrow() * train_size) 25 | train_data <- data_iris[train_index,] # 訓練データ 26 | test_data <- data_iris[-train_index,] # テストデータ 27 | 28 | 29 | rf_model <- train(Species ~ ., data=train_data, method="rf") 30 | y_pred <- predict(rf_model, test_data) 31 | confusionMatrix(data = y_pred, test_data$Species %>% as.factor()) 32 | 33 | -------------------------------------------------------------------------------- /R/04/randomForest_tweets.R: -------------------------------------------------------------------------------- 1 | # install.packages("tidyverse") 2 | # install.packages("GGally") 3 | # install.packages("caret") 4 | # install.packages('e1071') 5 | # install.packages("doParallel") 6 | 7 | library(MASS) 8 | library(tidyverse) 9 | library(caret) 10 | library(doParallel) 11 | library(RMeCab) 12 | 13 | detectCores() 14 | cl <- makePSOCKcluster(4) 15 | registerDoParallel(cl) 16 | 17 | # データ読み込み 18 | tweets <- read.csv("src/04/tweets.tsv",sep = "\t") %>% na.omit() 19 | tweets %>% dim() 20 | 21 | y <- tweets$X1 22 | text_all <- as.data.frame(tweets$X0) 23 | 24 | # データ整形(tf-idf) 25 | doc_matrix <- docDF(text_all, col = 1, type = 1, pos = c("名詞", "形容詞"), minFreq = 1, weight = "tf*idf*norm") %>% 26 | filter(POS2 %in% c("一般", "固有名詞","自立")) 27 | doc_matrix_t <- doc_matrix[, 4:ncol(doc_matrix)] %>% t() 28 | 29 | rownames(doc_matrix_t) <- c(1:nrow(doc_matrix_t)) 30 | # colnames(doc_matrix_t) <- doc_matrix[, 1] 31 | 32 | doc_matrix_t_1 <- cbind(doc_matrix_t, y %>% as.factor()) %>% na.omit() 33 | 34 | doc_matrix_t_1[is.nan(doc_matrix_t_1)] <- NA 35 | doc_matrix_df <- doc_matrix_t_1 %>% na.omit() %>% as.data.frame() 36 | 37 | colnames(doc_matrix_df)[ncol(doc_matrix_df)] = "y" 38 | 39 | # 決定木 40 | # 予測 41 | train_size = 0.7 42 | 
train_index <- sample(doc_matrix_df %>% nrow(), doc_matrix_df %>% nrow() * train_size) 43 | train_data <- doc_matrix_df[train_index,] # 訓練データ 44 | test_data <- doc_matrix_df[-train_index,] # テストデータ 45 | 46 | 47 | rf_model <- train(y ~ ., data=train_data, method="rf", tuneLength=4) 48 | y_pred <- predict(rf_model, test_data) 49 | confusionMatrix(data = y_pred %>% round() %>% as.factor(), test_data$y %>% as.factor()) 50 | 51 | -------------------------------------------------------------------------------- /R/05/Kmeans_iris.R: -------------------------------------------------------------------------------- 1 | # install.packages("tidyverse") 2 | # install.packages("GGally") 3 | # install.packages("caret") 4 | # install.packages('e1071') 5 | 6 | library(MASS) 7 | library(tidyverse) 8 | # library(caret) 9 | 10 | data_iris <- iris %>% select(Sepal.Width, Petal.Width) %>% scale() 11 | k_means <- kmeans(data_iris, 2) 12 | k_means %>% summary() 13 | k_means$cluster 14 | 15 | data_kmeans <- cbind(data_iris, k_means$cluster) %>% as.data.frame() 16 | 17 | g <- ggplot(data_kmeans, aes(x=Sepal.Width, y=Petal.Width)) 18 | g <- g + geom_point(aes(colour=V3), size=1, alpha=0.5) 19 | g -------------------------------------------------------------------------------- /R/05/Kmeans_prefecture.R: -------------------------------------------------------------------------------- 1 | # install.packages("tidyverse") 2 | # install.packages("GGally") 3 | # install.packages("caret") 4 | # install.packages('e1071') 5 | 6 | library(MASS) 7 | library(tidyverse) 8 | # library(caret) 9 | 10 | data_prefecture <- read_csv("src/05/data_prefecture_category.csv") 11 | data_prefecture %>% summary() 12 | 13 | data_prefecture_scaled <- data_prefecture %>% select(-都道府県) %>% scale() 14 | k_means <- kmeans(data_prefecture_scaled, 4) 15 | k_means %>% summary() 16 | k_means$cluster 17 | 18 | data_prefecture_kmeans <- data_prefecture %>% mutate("label" = k_means$cluster) 19 | 20 | data_prefecture_kmeans %>% 
filter(label==1) 21 | data_prefecture_kmeans %>% filter(label==2) 22 | data_prefecture_kmeans %>% filter(label==3) 23 | data_prefecture_kmeans %>% filter(label==4) 24 | 25 | data_prefecture_kmeans %>% filter(label==1) %>% summary() 26 | data_prefecture_kmeans %>% filter(label==2) %>% summary() 27 | data_prefecture_kmeans %>% filter(label==3) %>% summary() 28 | data_prefecture_kmeans %>% filter(label==4) %>% summary() 29 | -------------------------------------------------------------------------------- /R/05/data_prefecture_category.csv: -------------------------------------------------------------------------------- 1 | 都道府県,食料,住居,光熱・水道,家具・家事,被服及び,保健医療,交通・通信,教育,教養娯楽,諸雑費 2 | 札幌市,"819,536","279,764","228,330","103,893","129,292","99,902","442,564","124,799","276,976","218,769" 3 | 青森市,"790,368","259,971","295,102","96,173","98,267","115,529","427,590","96,241","245,912","232,403" 4 | 盛岡市,"771,420","246,223","250,260","102,652","142,183","123,152","438,431","144,845","276,140","286,892" 5 | 仙台市,"862,052","240,690","197,006","117,818","116,682","109,467","379,888","150,622","317,874","280,381" 6 | 秋田市,"835,325","226,152","296,036","111,587","127,798","133,474","496,526","111,430","280,440","238,857" 7 | 山形市,"841,537","315,770","285,590","99,357","125,567","104,612","770,941","107,336","302,035","289,043" 8 | 福島市,"950,582","285,711","257,681","126,588","169,182","94,275","665,083","141,012","392,401","276,986" 9 | 水戸市,"877,968","235,274","231,740","127,631","174,481","119,688","695,369","200,251","390,123","322,231" 10 | 宇都宮市,"970,391","294,398","243,081","104,325","171,918","125,397","622,628","175,432","375,213","292,779" 11 | 前橋市,"876,472","149,049","202,882","150,428","166,129","142,103","549,336","113,726","397,195","313,629" 12 | さいたま市,"1,042,267","350,989","216,828","110,043","173,828","174,833","501,966","275,513","330,177","276,978" 13 | 千葉市,"867,636","162,260","153,227","81,768","142,156","87,722","421,253","155,287","329,146","320,532" 14 | 
東京都区部,"943,279","404,843","175,822","112,716","208,975","156,721","417,168","272,696","423,476","254,768" 15 | 横浜市,"926,253","215,616","184,484","124,547","172,798","136,661","517,576","251,826","420,737","275,789" 16 | 新潟市,"842,736","178,061","254,426","116,049","128,177","114,074","606,168","199,170","265,664","316,409" 17 | 富山市,"896,917","307,401","263,618","127,392","122,275","114,880","579,845","91,179","336,369","263,650" 18 | 金沢市,"971,470","220,831","246,180","125,704","167,773","101,640","680,653","245,222","405,272","355,490" 19 | 福井市,"925,413","151,093","249,017","94,646","114,519","99,707","462,830","122,414","328,129","277,653" 20 | 甲府市,"747,397","300,816","214,981","90,925","101,371","104,563","420,691","116,368","323,950","234,201" 21 | 長野市,"786,130","344,086","239,435","109,564","116,436","108,134","519,702","92,604","266,054","289,707" 22 | 岐阜市,"865,541","201,315","239,365","130,079","173,834","135,925","699,940","243,758","414,244","305,166" 23 | 静岡市,"807,241","358,014","204,189","106,298","139,274","109,700","432,415","119,306","316,773","227,907" 24 | 名古屋市,"821,916","249,793","156,478","82,537","139,540","104,044","480,970","107,105","394,293","224,362" 25 | 津市,"863,096","195,647","203,113","125,860","164,073","117,537","517,539","251,968","386,805","251,410" 26 | 大津市,"915,677","108,352","236,832","158,680","141,251","108,875","521,557","180,740","325,487","245,402" 27 | 京都市,"845,226","210,964","232,337","88,931","129,277","92,014","390,179","212,035","358,755","246,851" 28 | 大阪市,"840,018","269,369","177,417","95,044","114,748","138,580","369,889","140,737","317,359","202,192" 29 | 神戸市,"656,924","136,381","103,216","67,591","110,686","54,228","319,734","31,347","208,916","184,317" 30 | 奈良市,"898,884","157,240","272,448","114,845","165,037","144,301","496,535","388,515","399,766","282,591" 31 | 和歌山市,"887,859","244,498","246,528","130,329","152,058","92,863","510,125","144,763","343,537","241,471" 32 | 
鳥取市,"706,962","204,600","194,986","108,323","103,304","86,720","513,462","77,770","230,101","281,468" 33 | 松江市,"727,565","328,050","221,065","93,567","103,611","105,134","545,464","85,915","292,628","281,605" 34 | 岡山市,"765,652","289,496","202,733","96,181","161,001","136,606","502,230","151,293","302,995","233,083" 35 | 広島市,"810,255","219,623","182,511","105,210","127,351","104,142","605,174","181,977","284,268","220,201" 36 | 山口市,"607,019","363,261","177,832","86,593","100,132","108,410","586,591","59,450","298,965","223,511" 37 | 徳島市,"817,065","183,086","211,546","119,732","153,757","113,235","443,341","239,275","362,019","277,219" 38 | 高松市,"809,931","323,569","227,821","119,424","129,374","131,729","615,294","103,593","279,503","243,670" 39 | 松山市,"828,274","197,045","241,818","125,931","159,782","102,157","491,929","208,938","305,368","253,428" 40 | 高知市,"803,052","310,383","225,292","198,099","119,242","102,917","533,892","157,375","309,526","280,036" 41 | 福岡市,"760,638","188,295","156,097","116,400","152,971","96,334","471,238","120,417","355,085","273,449" 42 | 佐賀市,"814,400","262,685","224,972","98,570","140,041","144,157","515,064","144,634","359,726","284,169" 43 | 長崎市,"658,520","308,171","210,173","84,279","115,569","83,159","390,576","88,847","187,986","182,308" 44 | 熊本市,"870,311","311,909","243,256","143,752","152,455","133,442","509,583","223,684","345,740","338,671" 45 | 大分市,"789,001","355,356","207,281","135,103","162,991","100,884","561,382","97,157","451,635","313,381" 46 | 宮崎市,"778,907","222,861","185,008","96,874","122,197","113,690","559,338","131,236","279,217","278,642" 47 | 鹿児島市,"787,120","345,632","198,035","116,358","164,759","121,532","552,727","108,190","298,067","254,743" 48 | 那覇市,"726,160","337,851","211,156","111,406","100,591","102,076","448,672","131,853","237,977","175,467" 49 | 川崎市,"872,136","427,698","158,914","87,568","112,286","94,975","324,322","124,695","292,594","215,952" 50 | 
相模原市,"756,340","290,616","166,843","95,209","112,149","122,466","302,611","112,232","363,026","210,583" 51 | 浜松市,"803,305","198,314","187,801","105,721","127,230","114,431","680,801","128,846","281,603","236,023" 52 | 堺市,"927,069","236,032","257,915","123,388","155,071","129,308","640,550","247,428","419,882","251,675" 53 | 北九州市,"862,432","149,685","206,224","109,226","167,286","146,687","700,366","164,434","270,083","256,207" -------------------------------------------------------------------------------- /R/05/pca_iris.R: -------------------------------------------------------------------------------- 1 | # install.packages("tidyverse") 2 | # install.packages("GGally") 3 | # install.packages("caret") 4 | # install.packages('e1071') 5 | 6 | library(MASS) 7 | library(tidyverse) 8 | # library(caret) 9 | 10 | pcr_model <- prcomp(iris %>% select(-Species), scale=T) 11 | pcr_model %>% summary() 12 | 13 | # plot(pcr_model$x[, 1], pcr_model$x[, 2]) 14 | data_pca <- cbind(pcr_model$x, iris$Species) %>% as.data.frame() 15 | 16 | g <- ggplot(data_pca, aes(x=PC1, y=PC2)) 17 | g <- g + geom_point(aes(colour=V5), size=1, alpha=0.5) 18 | #描画 19 | g -------------------------------------------------------------------------------- /R/05/pca_prefecture.R: -------------------------------------------------------------------------------- 1 | # install.packages("tidyverse") 2 | # install.packages("GGally") 3 | # install.packages("caret") 4 | # install.packages('e1071') 5 | 6 | library(MASS) 7 | library(tidyverse) 8 | # library(caret) 9 | 10 | data_prefecture <- read_csv("src/05/data_prefecture_category.csv") 11 | data_prefecture %>% summary() 12 | 13 | pcr_model <- prcomp(data_prefecture %>% select(-都道府県), scale=T) 14 | pcr_model %>% summary() 15 | 16 | # plot(pcr_model$x[, 1], pcr_model$x[, 2]) 17 | 18 | # plot(x, y) 19 | pt <- identify(pcr_model$x[, 1], pcr_model$x[, 2]) 20 | 21 | -------------------------------------------------------------------------------- /R/06/classification.R: 
-------------------------------------------------------------------------------- 1 | # install.packages("tidyverse") 2 | # install.packages("GGally") 3 | # install.packages("caret") 4 | # install.packages("pROC") 5 | 6 | library(MASS) 7 | library(tidyverse) 8 | library(caret) 9 | library(pROC) 10 | 11 | data_iris <- iris %>% 12 | filter(Species != "virginica") %>% select(-Species) %>% 13 | mutate(Species = as.matrix(iris$Species[1:100])) 14 | 15 | # 予測 16 | train_size = 0.7 17 | train_index <- sample(data_iris %>% nrow(), data_iris %>% nrow() * train_size) 18 | train_data <- data_iris[train_index,] # 訓練データ 19 | test_data <- data_iris[-train_index,] # テストデータ 20 | 21 | ## ロジスティック回帰 22 | logit_multi_model <- train(Species ~ Sepal.Length, data=train_data, method="glm", family=binomial()) 23 | y_pred <- predict(logit_multi_model, test_data) 24 | confusionMatrix(data = y_pred, test_data$Species %>% as.factor()) 25 | 26 | roc(test_data$Species %>% as.factor() %>% as.numeric(), y_pred %>% as.numeric(), plot = TRUE) 27 | 28 | -------------------------------------------------------------------------------- /R/06/regression.R: -------------------------------------------------------------------------------- 1 | # install.packages("tidyverse") 2 | # install.packages("GGally") 3 | # install.packages("caret") 4 | 5 | library(MASS) 6 | library(tidyverse) 7 | library(caret) 8 | 9 | # データ読み込み 10 | data(Boston) 11 | Boston %>% summary() 12 | Boston %>% head() 13 | 14 | # 予測 15 | train_size = 0.7 16 | train_index <- sample(Boston %>% nrow(), Boston %>% nrow() * train_size) 17 | train_data <- Boston[train_index,] # 訓練データ 18 | test_data <- Boston[-train_index,] # テストデータ 19 | 20 | lm_multi_model <- train(data=train_data, medv ~ ., method="lm") 21 | y_pred <- predict(lm_multi_model, test_data) 22 | 23 | # MAE 24 | MAE(y_pred, test_data$medv) 25 | 26 | # RMSE 27 | RMSE(y_pred, test_data$medv) 28 | 29 | # RMSLE 30 | rmsle <- function(y_true, y_pred) 31 | sqrt(mean((log1p(y_true) - 
log1p(y_pred))^2)) 32 | rmsle(y_pred, test_data$medv) 33 | -------------------------------------------------------------------------------- /R/07/cnn_mnist.R: -------------------------------------------------------------------------------- 1 | # install.packages("tidyverse") 2 | # install.packages("GGally") 3 | # install.packages("caret") 4 | # install.packages('e1071') 5 | # devtools::install_github("rstudio/keras") 6 | 7 | library(MASS) 8 | library(tidyverse) 9 | # library(caret) 10 | library(keras) 11 | # install_keras() 12 | 13 | mnist <- dataset_mnist() 14 | x_train <- mnist$train$x 15 | y_train <- mnist$train$y 16 | x_test <- mnist$test$x 17 | y_test <- mnist$test$y 18 | 19 | image_size <- 28 20 | 21 | # reshape 22 | x_train <- array_reshape(x_train, c(nrow(x_train), image_size, image_size, 1)) 23 | x_test <- array_reshape(x_test, c(nrow(x_test), image_size, image_size, 1)) 24 | # rescale 25 | x_train <- x_train / 255 26 | x_test <- x_test / 255 27 | 28 | y_train <- to_categorical(y_train, 10) 29 | y_test <- to_categorical(y_test, 10) 30 | 31 | 32 | model <- keras_model_sequential() %>% 33 | layer_conv_2d(filters = 32, kernel_size = c(3,3), activation = 'relu', 34 | input_shape = c(image_size, image_size, 1)) %>% 35 | layer_max_pooling_2d(pool_size = c(2, 2)) %>% 36 | layer_flatten() %>% 37 | layer_dense(units = 32, activation = 'relu') %>% 38 | layer_dense(units = 10, activation = 'softmax') 39 | 40 | model %>% summary() 41 | 42 | 43 | model %>% compile( 44 | loss = 'categorical_crossentropy', 45 | optimizer = "Adam", 46 | metrics = c('accuracy') 47 | ) 48 | 49 | history <- model %>% fit( 50 | x_train, y_train, 51 | epochs = 5, batch_size = 64 52 | ) 53 | 54 | model %>% evaluate(x_test, y_test) 55 | 56 | -------------------------------------------------------------------------------- /R/07/nn_mnist.R: -------------------------------------------------------------------------------- 1 | # install.packages("tidyverse") 2 | # install.packages("GGally") 3 | # 
install.packages("caret") 4 | # install.packages('e1071') 5 | # devtools::install_github("rstudio/keras") 6 | 7 | library(MASS) 8 | library(tidyverse) 9 | # library(caret) 10 | library(keras) 11 | # install_keras() 12 | 13 | mnist <- dataset_mnist() 14 | x_train <- mnist$train$x 15 | y_train <- mnist$train$y 16 | x_test <- mnist$test$x 17 | y_test <- mnist$test$y 18 | 19 | # reshape 20 | x_train <- array_reshape(x_train, c(nrow(x_train), 28**2)) 21 | x_test <- array_reshape(x_test, c(nrow(x_test), 28**2)) 22 | # rescale 23 | x_train <- x_train / 255 24 | x_test <- x_test / 255 25 | 26 | y_train <- to_categorical(y_train, 10) 27 | y_test <- to_categorical(y_test, 10) 28 | 29 | 30 | model <- keras_model_sequential() 31 | model %>% 32 | layer_dense(units = 512, activation = 'relu', input_shape = c(28**2)) %>% 33 | layer_dense(units = 512, activation = 'relu') %>% 34 | layer_dense(units = 10, activation = 'softmax') 35 | 36 | 37 | model %>% summary() 38 | 39 | 40 | model %>% compile( 41 | loss = 'categorical_crossentropy', 42 | optimizer = "Adam", 43 | metrics = c('accuracy') 44 | ) 45 | 46 | history <- model %>% fit( 47 | x_train, y_train, 48 | epochs = 5, batch_size = 64 49 | ) 50 | 51 | model %>% evaluate(x_test, y_test) 52 | 53 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # PythonBook 2 | 「Pythonと実データで遊んで学ぶ データ分析講座」 サポートページ 3 | 4 | https://www.amazon.co.jp/dp/4863542836 5 | 6 | 7 | # 書籍修正点 8 | ## 2019.12.03更新 9 | * Chapter3 で使用している、国土交通省のAPIですが、URLが変更されました。以下のように、「http://」となっているURLを「https://」と変更してください。 10 | * 第2版では修正済みです。 11 | 12 | ## 2020.06.25更新 13 | * Google Colaboratory 上でデフォルトで使用されている、 scikit-learn ライブラリのバージョンが変更されました 14 | * それに伴い、いくつかの処理で、書籍中で紹介しているものと異なる結果になる可能性があります 15 | * 現時点で判明したものを以下にまとめます(随時追加します) 16 | 17 | ### P134以降で登場する LogisticRegression 18 | * デフォルトで使用される、ソルバーが変更されました 19 | * 公式サイト 
https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html 20 | * ソルバーとは各パラメータを求めるための、最適化アルゴリズムを指します 21 | * 以下のように、solverに `liblinear` を指定すると、書籍中の結果と一致します 22 | 23 | ```python 24 | logit = LogisticRegression(solver='liblinear') 25 | ``` 26 | 27 | * なお、本修正は 第3版では修正済みです。 28 | 29 | 30 | ## 2021.11.11更新 31 | 32 | * Chapter6 P200のRMSE計算について、結果は変わっていませんが、計算順序が定義と異なっているため、以下に修正をお願いいたします。 33 | 34 | ![texclip20211112141247](https://user-images.githubusercontent.com/43558230/141413796-3a7f8c98-0a31-41f7-baf9-92f177411418.png) 35 | 36 | 37 | * Chapter6 P201の「評価指標としてRMEを採用した場合、さまざまな回帰モデルを比較し、その中でMAE値が最も小さいモデルが良い、と判断されます。」のRMEは「MAE」の誤りです。 38 | * Chapter6 P201のMAE計算について、結果は変わっていませんが、計算順序が定義と異なっているため、以下に修正をお願いいたします。 39 | 40 | ![texclip20211112142108](https://user-images.githubusercontent.com/43558230/141414206-9f3fda8c-4352-4f6d-a80d-2101ceff8204.png) 41 | 42 | 43 | --------------------------------------------------------------------------------