├── Python ├── 03 │ ├── lm_boston.ipynb │ ├── lm_ridge_lasso_boston.ipynb │ ├── lm_ridge_lasso_tokyo.ipynb │ └── lm_tokyo.ipynb ├── 04 │ ├── decisionTree_iris.ipynb │ ├── decisionTree_tweets.ipynb │ ├── get_tweets.ipynb │ ├── logit_iris.ipynb │ ├── logit_tweets.ipynb │ ├── randomForest_iris.ipynb │ ├── randomForest_tweets.ipynb │ └── tweets.tsv ├── 05 │ ├── Kmeans_iris.ipynb │ ├── Kmeans_prefecture.ipynb │ ├── data_prefecture_category.csv │ ├── pca_iris.ipynb │ └── pca_prefecture.ipynb ├── 06 │ ├── classification.ipynb │ └── regression.ipynb ├── 07 │ ├── cnn_mnist.ipynb │ ├── cnn_temple_shrine.ipynb │ ├── get_imaeg.py │ ├── nn_mnist.ipynb │ └── nn_temple_shrine.ipynb └── 08 │ ├── collaborative_filtering.ipynb │ └── word2vec_tweets.ipynb ├── R ├── 03 │ ├── lm_boston.R │ ├── lm_ridge_lasso_boston.R │ ├── lm_ridge_lasso_tokyo.R │ └── lm_tokyo.R ├── 04 │ ├── decisionTree_iris.R │ ├── decisionTree_tweets.R │ ├── logit_iris.R │ ├── logit_tweets.R │ ├── randomForest_iris.R │ ├── randomForest_tweets.R │ └── tweets.tsv ├── 05 │ ├── Kmeans_iris.R │ ├── Kmeans_prefecture.R │ ├── data_prefecture_category.csv │ ├── pca_iris.R │ └── pca_prefecture.R ├── 06 │ ├── classification.R │ └── regression.R └── 07 │ ├── cnn_mnist.R │ └── nn_mnist.R └── README.md /Python/03/lm_boston.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "name": "Ch03-boston-lm.ipynb", 7 | "version": "0.3.2", 8 | "provenance": [], 9 | "collapsed_sections": [] 10 | }, 11 | "kernelspec": { 12 | "name": "python3", 13 | "display_name": "Python 3" 14 | }, 15 | "accelerator": "GPU" 16 | }, 17 | "cells": [ 18 | { 19 | "cell_type": "code", 20 | "metadata": { 21 | "id": "o5dgWD9rz4LG", 22 | "colab_type": "code", 23 | "colab": {} 24 | }, 25 | "source": [ 26 | "import numpy as np\n", 27 | "import pandas as pd\n", 28 | "import matplotlib.pyplot as plt\n", 29 | "%matplotlib inline\n", 30 | "import 
seaborn as sns\n", 31 | "from sklearn.linear_model import LinearRegression\n", 32 | "from sklearn.datasets import load_boston" 33 | ], 34 | "execution_count": 0, 35 | "outputs": [] 36 | }, 37 | { 38 | "cell_type": "markdown", 39 | "metadata": { 40 | "id": "GAqEPf-uYZZ5", 41 | "colab_type": "text" 42 | }, 43 | "source": [ 44 | "## データ読み込み" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "metadata": { 50 | "id": "Z38CWl_Fz61A", 51 | "colab_type": "code", 52 | "colab": {} 53 | }, 54 | "source": [ 55 | "boston = load_boston()\n", 56 | "print(boston.DESCR)" 57 | ], 58 | "execution_count": 0, 59 | "outputs": [] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "metadata": { 64 | "id": "2HC7DcOv9Mib", 65 | "colab_type": "code", 66 | "colab": {} 67 | }, 68 | "source": [ 69 | "data_boston = pd.DataFrame(boston.data, columns=boston.feature_names)\n", 70 | "data_boston['PRICE'] = boston.target" 71 | ], 72 | "execution_count": 0, 73 | "outputs": [] 74 | }, 75 | { 76 | "cell_type": "code", 77 | "metadata": { 78 | "id": "x10DS7QWZRiY", 79 | "colab_type": "code", 80 | "colab": {} 81 | }, 82 | "source": [ 83 | "print(data_boston.head())" 84 | ], 85 | "execution_count": 0, 86 | "outputs": [] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "metadata": { 91 | "id": "bDWlr_nRq8v1", 92 | "colab_type": "code", 93 | "colab": {} 94 | }, 95 | "source": [ 96 | "print(data_boston.tail())" 97 | ], 98 | "execution_count": 0, 99 | "outputs": [] 100 | }, 101 | { 102 | "cell_type": "markdown", 103 | "metadata": { 104 | "id": "Sg70gJxoYgt6", 105 | "colab_type": "text" 106 | }, 107 | "source": [ 108 | "## 可視化" 109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "metadata": { 114 | "id": "OyirnZ2a9U8X", 115 | "colab_type": "code", 116 | "colab": {} 117 | }, 118 | "source": [ 119 | "sns.jointplot('RM', 'PRICE', data=data_boston)" 120 | ], 121 | "execution_count": 0, 122 | "outputs": [] 123 | }, 124 | { 125 | "cell_type": "code", 126 | "metadata": { 127 | "id": "TuMKNWCHBS1S", 128 | "colab_type": 
"code", 129 | "colab": {} 130 | }, 131 | "source": [ 132 | "sns.pairplot(data_boston)" 133 | ], 134 | "execution_count": 0, 135 | "outputs": [] 136 | }, 137 | { 138 | "cell_type": "code", 139 | "metadata": { 140 | "id": "xIdkp9Q_B4-Y", 141 | "colab_type": "code", 142 | "colab": {} 143 | }, 144 | "source": [ 145 | "sns.pairplot(data_boston, vars=[\"PRICE\", \"RM\", \"DIS\"])" 146 | ], 147 | "execution_count": 0, 148 | "outputs": [] 149 | }, 150 | { 151 | "cell_type": "markdown", 152 | "metadata": { 153 | "id": "akWwJ5J8yx3f", 154 | "colab_type": "text" 155 | }, 156 | "source": [ 157 | "## 線形回帰を実践" 158 | ] 159 | }, 160 | { 161 | "cell_type": "code", 162 | "metadata": { 163 | "id": "37T2q4X11NIm", 164 | "colab_type": "code", 165 | "colab": {} 166 | }, 167 | "source": [ 168 | "lr = LinearRegression()" 169 | ], 170 | "execution_count": 0, 171 | "outputs": [] 172 | }, 173 | { 174 | "cell_type": "code", 175 | "metadata": { 176 | "id": "UaCgv7VMCQaQ", 177 | "colab_type": "code", 178 | "colab": {} 179 | }, 180 | "source": [ 181 | "x_column_list = ['RM']\n", 182 | "y_column_list = ['PRICE']\n", 183 | "\n", 184 | "data_boston_x = data_boston[x_column_list]\n", 185 | "data_boston_y = data_boston[y_column_list]\n", 186 | "\n", 187 | "lr.fit(data_boston_x, data_boston_y)" 188 | ], 189 | "execution_count": 0, 190 | "outputs": [] 191 | }, 192 | { 193 | "cell_type": "code", 194 | "metadata": { 195 | "id": "CmF-CkZkzBRi", 196 | "colab_type": "code", 197 | "colab": {} 198 | }, 199 | "source": [ 200 | "print(lr.coef_)\n", 201 | "print(lr.intercept_)" 202 | ], 203 | "execution_count": 0, 204 | "outputs": [] 205 | }, 206 | { 207 | "cell_type": "markdown", 208 | "metadata": { 209 | "id": "vb42hci58i5V", 210 | "colab_type": "text" 211 | }, 212 | "source": [ 213 | "### 重回帰分析" 214 | ] 215 | }, 216 | { 217 | "cell_type": "code", 218 | "metadata": { 219 | "id": "d3wJ2V2CAFEK", 220 | "colab_type": "code", 221 | "colab": {} 222 | }, 223 | "source": [ 224 | "lr_multi = LinearRegression()\n", 225 
| "\n", 226 | "x_column_list_for_multi = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT']\n", 227 | "y_column_list_for_multi = ['PRICE']\n", 228 | "\n", 229 | "data_boston_x = data_boston[x_column_list_for_multi]\n", 230 | "data_boston_y = data_boston[y_column_list_for_multi]\n", 231 | "\n", 232 | "lr_multi.fit(data_boston_x, data_boston_y)" 233 | ], 234 | "execution_count": 0, 235 | "outputs": [] 236 | }, 237 | { 238 | "cell_type": "code", 239 | "metadata": { 240 | "id": "QDwGOyzVAc_f", 241 | "colab_type": "code", 242 | "colab": {} 243 | }, 244 | "source": [ 245 | "print(lr_multi.coef_)\n", 246 | "print(lr_multi.intercept_)" 247 | ], 248 | "execution_count": 0, 249 | "outputs": [] 250 | }, 251 | { 252 | "cell_type": "markdown", 253 | "metadata": { 254 | "id": "alpJSV_OY_UP", 255 | "colab_type": "text" 256 | }, 257 | "source": [ 258 | "## 予測" 259 | ] 260 | }, 261 | { 262 | "cell_type": "code", 263 | "metadata": { 264 | "id": "IYDRbFKCZA2n", 265 | "colab_type": "code", 266 | "colab": {} 267 | }, 268 | "source": [ 269 | "from sklearn.model_selection import train_test_split\n", 270 | "\n", 271 | "X_train, X_test, y_train, y_test = train_test_split(data_boston_x, data_boston_y, test_size=0.3)" 272 | ], 273 | "execution_count": 0, 274 | "outputs": [] 275 | }, 276 | { 277 | "cell_type": "code", 278 | "metadata": { 279 | "id": "3tlg7O37brti", 280 | "colab_type": "code", 281 | "colab": {} 282 | }, 283 | "source": [ 284 | "print(X_train.shape)\n", 285 | "print(X_test.shape)\n", 286 | "print(y_train.shape)\n", 287 | "print(y_test.shape)" 288 | ], 289 | "execution_count": 0, 290 | "outputs": [] 291 | }, 292 | { 293 | "cell_type": "code", 294 | "metadata": { 295 | "id": "yDbbRGARejnC", 296 | "colab_type": "code", 297 | "colab": {} 298 | }, 299 | "source": [ 300 | "lr_multi2 = LinearRegression()\n", 301 | "\n", 302 | "lr_multi2.fit(X_train, y_train) \n", 303 | "print(lr_multi2.coef_)\n", 304 | "print(lr_multi2.intercept_)" 305 | 
], 306 | "execution_count": 0, 307 | "outputs": [] 308 | }, 309 | { 310 | "cell_type": "code", 311 | "metadata": { 312 | "id": "8dqcvzMMfrmL", 313 | "colab_type": "code", 314 | "colab": {} 315 | }, 316 | "source": [ 317 | "y_pred = lr_multi2.predict(X_test)" 318 | ], 319 | "execution_count": 0, 320 | "outputs": [] 321 | }, 322 | { 323 | "cell_type": "code", 324 | "metadata": { 325 | "id": "m_K6lCv2g0f9", 326 | "colab_type": "code", 327 | "colab": {} 328 | }, 329 | "source": [ 330 | "print(y_pred - y_test)" 331 | ], 332 | "execution_count": 0, 333 | "outputs": [] 334 | }, 335 | { 336 | "cell_type": "markdown", 337 | "metadata": { 338 | "id": "ZPwNArsCk9JX", 339 | "colab_type": "text" 340 | }, 341 | "source": [ 342 | "## MAE" 343 | ] 344 | }, 345 | { 346 | "cell_type": "code", 347 | "metadata": { 348 | "id": "-TGrKPMGg10f", 349 | "colab_type": "code", 350 | "colab": {} 351 | }, 352 | "source": [ 353 | "from sklearn.metrics import mean_absolute_error" 354 | ], 355 | "execution_count": 0, 356 | "outputs": [] 357 | }, 358 | { 359 | "cell_type": "code", 360 | "metadata": { 361 | "id": "8YeXhiqWhPtI", 362 | "colab_type": "code", 363 | "colab": {} 364 | }, 365 | "source": [ 366 | "x_column_list = ['RM']\n", 367 | "y_column_list = ['PRICE']\n", 368 | "\n", 369 | "X_train, X_test, y_train, y_test = train_test_split(data_boston[x_column_list], data_boston[y_column_list], test_size=0.3)\n", 370 | "\n", 371 | "lr_single = LinearRegression()\n", 372 | "\n", 373 | "lr_single.fit(X_train, y_train) \n", 374 | "y_pred = lr_single.predict(X_test)\n", 375 | "\n", 376 | "print(mean_absolute_error(y_pred, y_test))" 377 | ], 378 | "execution_count": 0, 379 | "outputs": [] 380 | }, 381 | { 382 | "cell_type": "code", 383 | "metadata": { 384 | "id": "vXCexSrUlZ6K", 385 | "colab_type": "code", 386 | "colab": {} 387 | }, 388 | "source": [ 389 | "x_column_list_for_multi = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT']\n", 390 | 
"y_column_list_for_multi = ['PRICE']\n", 391 | "\n", 392 | "X_train, X_test, y_train, y_test = train_test_split(data_boston[x_column_list_for_multi], data_boston[y_column_list_for_multi], test_size=0.3)\n", 393 | "\n", 394 | "lr_multi2 = LinearRegression()\n", 395 | "\n", 396 | "lr_multi2.fit(X_train, y_train) \n", 397 | "y_pred = lr_multi2.predict(X_test)\n", 398 | "\n", 399 | "print(mean_absolute_error(y_pred, y_test))" 400 | ], 401 | "execution_count": 0, 402 | "outputs": [] 403 | }, 404 | { 405 | "cell_type": "code", 406 | "metadata": { 407 | "id": "pbCs-LY9lhs0", 408 | "colab_type": "code", 409 | "colab": {} 410 | }, 411 | "source": [ 412 | "" 413 | ], 414 | "execution_count": 0, 415 | "outputs": [] 416 | }, 417 | { 418 | "cell_type": "code", 419 | "metadata": { 420 | "id": "ZuiyKVZfnetv", 421 | "colab_type": "code", 422 | "colab": {} 423 | }, 424 | "source": [ 425 | "" 426 | ], 427 | "execution_count": 0, 428 | "outputs": [] 429 | } 430 | ] 431 | } -------------------------------------------------------------------------------- /Python/03/lm_ridge_lasso_boston.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "name": "Ch03-boston-ridge-lasso-lm.ipynb", 7 | "version": "0.3.2", 8 | "provenance": [], 9 | "collapsed_sections": [] 10 | }, 11 | "kernelspec": { 12 | "name": "python3", 13 | "display_name": "Python 3" 14 | }, 15 | "accelerator": "GPU" 16 | }, 17 | "cells": [ 18 | { 19 | "cell_type": "code", 20 | "metadata": { 21 | "id": "o5dgWD9rz4LG", 22 | "colab_type": "code", 23 | "colab": {} 24 | }, 25 | "source": [ 26 | "import numpy as np\n", 27 | "import pandas as pd\n", 28 | "import matplotlib.pyplot as plt\n", 29 | "%matplotlib inline\n", 30 | "import seaborn as sns\n", 31 | "from sklearn.linear_model import LinearRegression, Ridge, Lasso\n", 32 | "from sklearn.datasets import load_boston\n", 33 | "from sklearn.metrics import 
mean_absolute_error\n", 34 | "from sklearn.model_selection import train_test_split" 35 | ], 36 | "execution_count": 0, 37 | "outputs": [] 38 | }, 39 | { 40 | "cell_type": "markdown", 41 | "metadata": { 42 | "id": "GAqEPf-uYZZ5", 43 | "colab_type": "text" 44 | }, 45 | "source": [ 46 | "## データ読み込み" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "metadata": { 52 | "id": "Z38CWl_Fz61A", 53 | "colab_type": "code", 54 | "colab": {} 55 | }, 56 | "source": [ 57 | "boston = load_boston()\n", 58 | "data_boston = pd.DataFrame(boston.data, columns=boston.feature_names)\n", 59 | "data_boston['PRICE'] = boston.target\n", 60 | "\n", 61 | "print(data_boston.head())\n", 62 | "print(data_boston.tail())" 63 | ], 64 | "execution_count": 0, 65 | "outputs": [] 66 | }, 67 | { 68 | "cell_type": "markdown", 69 | "metadata": { 70 | "id": "Obrkk6djkrQV", 71 | "colab_type": "text" 72 | }, 73 | "source": [ 74 | "## L1正則化なし" 75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "metadata": { 80 | "id": "Dxo6oyArkuZX", 81 | "colab_type": "code", 82 | "colab": {} 83 | }, 84 | "source": [ 85 | "lr_multi = LinearRegression()\n", 86 | "\n", 87 | "x_column_list_for_multi = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT']\n", 88 | "y_column_list_for_multi = ['PRICE']\n", 89 | "\n", 90 | "lr_multi.fit(data_boston[x_column_list_for_multi], data_boston[y_column_list_for_multi])\n", 91 | "\n", 92 | "print(lr_multi.coef_)\n", 93 | "print(lr_multi.intercept_)" 94 | ], 95 | "execution_count": 0, 96 | "outputs": [] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "metadata": { 101 | "id": "720BakmC4Khb", 102 | "colab_type": "code", 103 | "colab": {} 104 | }, 105 | "source": [ 106 | "X_train, X_test, y_train, y_test = train_test_split(data_boston[x_column_list_for_multi], data_boston[y_column_list_for_multi], test_size=0.3)" 107 | ], 108 | "execution_count": 0, 109 | "outputs": [] 110 | }, 111 | { 112 | "cell_type": "markdown", 113 | "metadata": { 114 | "id": 
"W0BRywgqkLKv", 115 | "colab_type": "text" 116 | }, 117 | "source": [ 118 | "### 予測と**MAE**" 119 | ] 120 | }, 121 | { 122 | "cell_type": "code", 123 | "metadata": { 124 | "id": "q-pBlusmkMt6", 125 | "colab_type": "code", 126 | "colab": {} 127 | }, 128 | "source": [ 129 | "lr_multi2 = LinearRegression()\n", 130 | "\n", 131 | "lr_multi2.fit(X_train, y_train) \n", 132 | "print(lr_multi2.coef_)\n", 133 | "print(lr_multi2.intercept_)\n", 134 | "\n", 135 | "y_pred_lr = lr_multi2.predict(X_test)\n", 136 | "\n", 137 | "# 残差\n", 138 | "# print(y_pred_lr-y_test)\n", 139 | "\n", 140 | "# MAE\n", 141 | "print(mean_absolute_error(y_pred_lr, y_test))" 142 | ], 143 | "execution_count": 0, 144 | "outputs": [] 145 | }, 146 | { 147 | "cell_type": "markdown", 148 | "metadata": { 149 | "id": "aA5RrYWvk-Pj", 150 | "colab_type": "text" 151 | }, 152 | "source": [ 153 | "## Lasso回帰" 154 | ] 155 | }, 156 | { 157 | "cell_type": "code", 158 | "metadata": { 159 | "id": "zu_8ABezlTgd", 160 | "colab_type": "code", 161 | "colab": {} 162 | }, 163 | "source": [ 164 | "lasso = Lasso(alpha=0.001, normalize=True)\n", 165 | "lasso.fit(X_train, y_train) \n", 166 | "print(lasso.coef_)\n", 167 | "print(lasso.intercept_)" 168 | ], 169 | "execution_count": 0, 170 | "outputs": [] 171 | }, 172 | { 173 | "cell_type": "markdown", 174 | "metadata": { 175 | "id": "T1FWbFm8lPDq", 176 | "colab_type": "text" 177 | }, 178 | "source": [ 179 | "### MAE" 180 | ] 181 | }, 182 | { 183 | "cell_type": "code", 184 | "metadata": { 185 | "id": "NMzuTOhXlBtQ", 186 | "colab_type": "code", 187 | "colab": {} 188 | }, 189 | "source": [ 190 | "y_pred_lasso = lasso.predict(X_test)\n", 191 | "\n", 192 | "# 残差\n", 193 | "# print(y_pred_lasso.reshape(-1,1) - y_test)\n", 194 | "\n", 195 | "# MAE\n", 196 | "print(mean_absolute_error(y_pred_lasso, y_test))" 197 | ], 198 | "execution_count": 0, 199 | "outputs": [] 200 | }, 201 | { 202 | "cell_type": "markdown", 203 | "metadata": { 204 | "id": "0zLEcnZFjkrk", 205 | "colab_type": "text" 206 
| }, 207 | "source": [ 208 | "## Ridge回帰" 209 | ] 210 | }, 211 | { 212 | "cell_type": "code", 213 | "metadata": { 214 | "id": "A1XZ4pV3lVxd", 215 | "colab_type": "code", 216 | "colab": {} 217 | }, 218 | "source": [ 219 | "ridge = Ridge(alpha=0.01, normalize=True)\n", 220 | "ridge.fit(X_train, y_train) \n", 221 | "print(ridge.coef_)\n", 222 | "print(ridge.intercept_)" 223 | ], 224 | "execution_count": 0, 225 | "outputs": [] 226 | }, 227 | { 228 | "cell_type": "markdown", 229 | "metadata": { 230 | "id": "Pm6sVXnXlUb-", 231 | "colab_type": "text" 232 | }, 233 | "source": [ 234 | "### MAE" 235 | ] 236 | }, 237 | { 238 | "cell_type": "code", 239 | "metadata": { 240 | "id": "ZuiyKVZfnetv", 241 | "colab_type": "code", 242 | "colab": {} 243 | }, 244 | "source": [ 245 | "y_pred_ridge = ridge.predict(X_test)\n", 246 | "\n", 247 | "# 残差\n", 248 | "# print(y_pred_ridge.reshape(-1,1) - y_test)\n", 249 | "\n", 250 | "# MAE\n", 251 | "print(mean_absolute_error(y_pred_ridge, y_test))" 252 | ], 253 | "execution_count": 0, 254 | "outputs": [] 255 | }, 256 | { 257 | "cell_type": "code", 258 | "metadata": { 259 | "id": "CXzbRDoXx24o", 260 | "colab_type": "code", 261 | "colab": {} 262 | }, 263 | "source": [ 264 | "" 265 | ], 266 | "execution_count": 0, 267 | "outputs": [] 268 | } 269 | ] 270 | } -------------------------------------------------------------------------------- /Python/03/lm_ridge_lasso_tokyo.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "name": "Ch03-tokyo-ridge-lasso-lm.ipynb", 7 | "version": "0.3.2", 8 | "provenance": [], 9 | "collapsed_sections": [] 10 | }, 11 | "kernelspec": { 12 | "name": "python3", 13 | "display_name": "Python 3" 14 | } 15 | }, 16 | "cells": [ 17 | { 18 | "cell_type": "code", 19 | "metadata": { 20 | "id": "hbPe1ouXtDkg", 21 | "colab_type": "code", 22 | "colab": {} 23 | }, 24 | "source": [ 25 | "import numpy as np\n", 26 | 
"import matplotlib.pyplot as plt\n", 27 | "import pandas as pd\n", 28 | "import random\n", 29 | "%matplotlib inline\n", 30 | "import seaborn as sns\n", 31 | "from sklearn.linear_model import LinearRegression, Ridge, Lasso\n", 32 | "\n", 33 | "from sklearn.model_selection import train_test_split\n", 34 | "from sklearn.metrics import mean_absolute_error\n", 35 | "\n", 36 | "import requests\n", 37 | "import json\n", 38 | "import re" 39 | ], 40 | "execution_count": 0, 41 | "outputs": [] 42 | }, 43 | { 44 | "cell_type": "markdown", 45 | "metadata": { 46 | "id": "jB1EII0lhGAW", 47 | "colab_type": "text" 48 | }, 49 | "source": [ 50 | "### CSVファイルからデータ読み込みとデータ整形\n" 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "metadata": { 56 | "id": "bk-9aVfz7CJW", 57 | "colab_type": "code", 58 | "colab": {} 59 | }, 60 | "source": [ 61 | "data_from_csv = pd.read_csv(\"13_Tokyo_20171_20184.csv\", encoding='cp932')\n", 62 | "data_used_apartment = data_from_csv.query('種類 == \"中古マンション等\"')\n", 63 | "\n", 64 | "columns_name_list = [\"最寄駅:距離(分)\", \"間取り\", \"面積(㎡)\",\"建築年\", \"建物の構造\", \"建ぺい率(%)\", \"容積率(%)\", \"市区町村名\", \"取引価格(総額)\"]\n", 65 | "\n", 66 | "data_selected = data_used_apartment[columns_name_list]\n", 67 | "data_selected_dropna = data_selected.dropna(how='any') # 一つでもNANデータを含む行を削除" 68 | ], 69 | "execution_count": 0, 70 | "outputs": [] 71 | }, 72 | { 73 | "cell_type": "code", 74 | "metadata": { 75 | "id": "oibI6rxwQTaX", 76 | "colab_type": "code", 77 | "colab": {} 78 | }, 79 | "source": [ 80 | "# 建築年を築年数に変更\n", 81 | "data_selected_dropna = data_selected_dropna[data_selected_dropna[\"建築年\"].str.match('^平成|昭和')]\n", 82 | "\n", 83 | "wareki_to_seireki = {'昭和': 1926-1, '平成': 1989-1}\n", 84 | "\n", 85 | "building_year_list = data_selected_dropna[\"建築年\"]\n", 86 | "\n", 87 | "building_age_list = []\n", 88 | "for building_year in building_year_list:\n", 89 | " # 昭和○年 → 昭和, ○ に変換、平成○年 → 平成, ○ に変換\n", 90 | " building_year_split = re.search(r'(.+?)([0-9]+|元)年', building_year)\n", 91 | 
" # 西暦に変換\n", 92 | " seireki = wareki_to_seireki[building_year_split.groups()[0]] + int(building_year_split.groups()[1])\n", 93 | " \n", 94 | " building_age = 2018 - seireki # 築年数に変換\n", 95 | " building_age_list.append(building_age)\n", 96 | "\n", 97 | " \n", 98 | "data_selected_dropna[\"築年数\"] = building_age_list # 新しく、築年数列を追加\n", 99 | "# もう使わないので、建築年列は削除\n", 100 | "data_added_building_age = data_selected_dropna.drop(\"建築年\", axis=1)" 101 | ], 102 | "execution_count": 0, 103 | "outputs": [] 104 | }, 105 | { 106 | "cell_type": "code", 107 | "metadata": { 108 | "id": "BoyjtlvLhZG4", 109 | "colab_type": "code", 110 | "colab": {} 111 | }, 112 | "source": [ 113 | "# ダミー変数化しないもの\n", 114 | "columns_name_list = [\"最寄駅:距離(分)\", \"面積(㎡)\",\"築年数\", \"建ぺい率(%)\", \"容積率(%)\", \"取引価格(総額)\"]\n", 115 | "# ダミー変数リスト\n", 116 | "dummy_list = [\"間取り\", \"建物の構造\", \"市区町村名\"]\n", 117 | "\n", 118 | "# ダミー変数を追加\n", 119 | "data_added_dummies = pd.concat([data_added_building_age[columns_name_list],\n", 120 | " pd.get_dummies(data_added_building_age[dummy_list], drop_first=True)], axis=1)\n", 121 | "\n", 122 | "\n", 123 | "# 文字列を数値化\n", 124 | "data_added_dummies[\"面積(㎡)\"] = data_added_dummies[\"面積(㎡)\"].astype(float)\n", 125 | "data_added_dummies = data_added_dummies[~data_added_dummies['最寄駅:距離(分)'].str.contains('\\?')]\n", 126 | "data_added_dummies[\"最寄駅:距離(分)\"] = data_added_dummies[\"最寄駅:距離(分)\"].astype(float)\n", 127 | "\n", 128 | "# 6000万円以下のデータのみ抽出\n", 129 | "data_added_dummies = data_added_dummies[data_added_dummies[\"取引価格(総額)\"] < 60000000]" 130 | ], 131 | "execution_count": 0, 132 | "outputs": [] 133 | }, 134 | { 135 | "cell_type": "code", 136 | "metadata": { 137 | "id": "zq7u9UhLib5q", 138 | "colab_type": "code", 139 | "colab": {} 140 | }, 141 | "source": [ 142 | "print(data_added_dummies.shape)" 143 | ], 144 | "execution_count": 0, 145 | "outputs": [] 146 | }, 147 | { 148 | "cell_type": "markdown", 149 | "metadata": { 150 | "id": "-TkxAlYlt4Zg", 151 | "colab_type": "text" 152 | }, 
153 | "source": [ 154 | "## L1正則化なし" 155 | ] 156 | }, 157 | { 158 | "cell_type": "code", 159 | "metadata": { 160 | "id": "s-AZ2qRlvCeU", 161 | "colab_type": "code", 162 | "colab": {} 163 | }, 164 | "source": [ 165 | "x = data_added_dummies.drop(\"取引価格(総額)\", axis=1)\n", 166 | "y = data_added_dummies[\"取引価格(総額)\"]\n", 167 | "\n", 168 | "X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.3)" 169 | ], 170 | "execution_count": 0, 171 | "outputs": [] 172 | }, 173 | { 174 | "cell_type": "markdown", 175 | "metadata": { 176 | "id": "BLquTeiTAHlI", 177 | "colab_type": "text" 178 | }, 179 | "source": [ 180 | "### 予測とMAE" 181 | ] 182 | }, 183 | { 184 | "cell_type": "code", 185 | "metadata": { 186 | "id": "f8VEXWtRwqUO", 187 | "colab_type": "code", 188 | "colab": {} 189 | }, 190 | "source": [ 191 | "lr_multi = LinearRegression()\n", 192 | "\n", 193 | "lr_multi.fit(X_train, y_train) \n", 194 | "print(lr_multi.coef_)\n", 195 | "print(lr_multi.intercept_)\n", 196 | "\n", 197 | "y_pred_lr = lr_multi.predict(X_test)\n", 198 | "\n", 199 | "# 残差\n", 200 | "# print(y_pred_lr - y_test)\n", 201 | "\n", 202 | "# MAE\n", 203 | "print(mean_absolute_error(y_pred_lr, y_test))" 204 | ], 205 | "execution_count": 0, 206 | "outputs": [] 207 | }, 208 | { 209 | "cell_type": "markdown", 210 | "metadata": { 211 | "id": "b6MSFJDWCWiw", 212 | "colab_type": "text" 213 | }, 214 | "source": [ 215 | "## Lasso回帰" 216 | ] 217 | }, 218 | { 219 | "cell_type": "code", 220 | "metadata": { 221 | "id": "_jOfClMfA7wG", 222 | "colab_type": "code", 223 | "colab": {} 224 | }, 225 | "source": [ 226 | "lasso = Lasso(alpha=1, normalize=True)\n", 227 | "lasso.fit(X_train, y_train) \n", 228 | "print(lasso.coef_)\n", 229 | "print(lasso.intercept_)" 230 | ], 231 | "execution_count": 0, 232 | "outputs": [] 233 | }, 234 | { 235 | "cell_type": "markdown", 236 | "metadata": { 237 | "id": "Z7IX8p9KCbr3", 238 | "colab_type": "text" 239 | }, 240 | "source": [ 241 | "### MAE" 242 | ] 243 | }, 244 | { 245 | 
"cell_type": "code", 246 | "metadata": { 247 | "id": "nSB4v9_s5P04", 248 | "colab_type": "code", 249 | "colab": {} 250 | }, 251 | "source": [ 252 | "y_pred_lasso = lasso.predict(X_test)\n", 253 | "\n", 254 | "# 残差\n", 255 | "# print(y_pred_lasso.reshape(-1,1) - y_test)\n", 256 | "\n", 257 | "# MAE\n", 258 | "print(mean_absolute_error(y_pred_lasso, y_test))" 259 | ], 260 | "execution_count": 0, 261 | "outputs": [] 262 | }, 263 | { 264 | "cell_type": "markdown", 265 | "metadata": { 266 | "id": "yZMnQ46MCX38", 267 | "colab_type": "text" 268 | }, 269 | "source": [ 270 | "## Ridge回帰" 271 | ] 272 | }, 273 | { 274 | "cell_type": "code", 275 | "metadata": { 276 | "id": "siOmINIpCZcP", 277 | "colab_type": "code", 278 | "colab": {} 279 | }, 280 | "source": [ 281 | "ridge = Ridge(alpha=0.1, normalize=True)\n", 282 | "ridge.fit(X_train, y_train) \n", 283 | "print(ridge.coef_)\n", 284 | "print(ridge.intercept_)" 285 | ], 286 | "execution_count": 0, 287 | "outputs": [] 288 | }, 289 | { 290 | "cell_type": "markdown", 291 | "metadata": { 292 | "id": "fdy0CHSSCdMu", 293 | "colab_type": "text" 294 | }, 295 | "source": [ 296 | "### MAE" 297 | ] 298 | }, 299 | { 300 | "cell_type": "code", 301 | "metadata": { 302 | "id": "gaH1r7WqCdwZ", 303 | "colab_type": "code", 304 | "colab": {} 305 | }, 306 | "source": [ 307 | "y_pred_ridge = ridge.predict(X_test)\n", 308 | "\n", 309 | "# 残差\n", 310 | "# print(y_pred_ridge.reshape(-1,1) - y_test)\n", 311 | "\n", 312 | "# MAE\n", 313 | "print(mean_absolute_error(y_pred_ridge, y_test))" 314 | ], 315 | "execution_count": 0, 316 | "outputs": [] 317 | }, 318 | { 319 | "cell_type": "code", 320 | "metadata": { 321 | "id": "-eW2Q8MCTyTj", 322 | "colab_type": "code", 323 | "colab": {} 324 | }, 325 | "source": [ 326 | "" 327 | ], 328 | "execution_count": 0, 329 | "outputs": [] 330 | } 331 | ] 332 | } -------------------------------------------------------------------------------- /Python/03/lm_tokyo.ipynb: 
-------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "name": "Ch03-tokyo-lm.ipynb", 7 | "version": "0.3.2", 8 | "provenance": [], 9 | "collapsed_sections": [], 10 | "toc_visible": true 11 | }, 12 | "kernelspec": { 13 | "name": "python3", 14 | "display_name": "Python 3" 15 | } 16 | }, 17 | "cells": [ 18 | { 19 | "cell_type": "code", 20 | "metadata": { 21 | "id": "hbPe1ouXtDkg", 22 | "colab_type": "code", 23 | "colab": {} 24 | }, 25 | "source": [ 26 | "import numpy as np\n", 27 | "import matplotlib.pyplot as plt\n", 28 | "import pandas as pd\n", 29 | "import random\n", 30 | "%matplotlib inline\n", 31 | "import seaborn as sns\n", 32 | "from sklearn.linear_model import LinearRegression\n", 33 | "from sklearn.model_selection import train_test_split\n", 34 | "\n", 35 | "import requests\n", 36 | "import json\n", 37 | "import re" 38 | ], 39 | "execution_count": 0, 40 | "outputs": [] 41 | }, 42 | { 43 | "cell_type": "markdown", 44 | "metadata": { 45 | "id": "RLSAgxJdJ0rr", 46 | "colab_type": "text" 47 | }, 48 | "source": [ 49 | "### APIでデータ読み込み" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "metadata": { 55 | "id": "kh4ZZr5Itq_u", 56 | "colab_type": "code", 57 | "colab": {} 58 | }, 59 | "source": [ 60 | "url_path = \"https://www.land.mlit.go.jp/webland/api/TradeListSearch?from=20171&to=20185&area=13\"\n", 61 | "request_result = requests.get(url_path)\n", 62 | "data_json = request_result.json()[\"data\"]" 63 | ], 64 | "execution_count": 0, 65 | "outputs": [] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "metadata": { 70 | "id": "haC9urQWKrC0", 71 | "colab_type": "code", 72 | "colab": {} 73 | }, 74 | "source": [ 75 | "print(len(data_json))" 76 | ], 77 | "execution_count": 0, 78 | "outputs": [] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "metadata": { 83 | "id": "sj82Ui5MKu19", 84 | "colab_type": "code", 85 | "colab": {} 86 | }, 87 | "source": [ 88 | 
"print(data_json[0])" 89 | ], 90 | "execution_count": 0, 91 | "outputs": [] 92 | }, 93 | { 94 | "cell_type": "code", 95 | "metadata": { 96 | "id": "7iJVhsm0LS8F", 97 | "colab_type": "code", 98 | "colab": {} 99 | }, 100 | "source": [ 101 | "print(data_json[1000])" 102 | ], 103 | "execution_count": 0, 104 | "outputs": [] 105 | }, 106 | { 107 | "cell_type": "code", 108 | "metadata": { 109 | "id": "BuOMyy7NNdsH", 110 | "colab_type": "code", 111 | "colab": {} 112 | }, 113 | "source": [ 114 | "data_pd = pd.io.json.json_normalize(data_json)\n", 115 | "print(data_pd.shape)" 116 | ], 117 | "execution_count": 0, 118 | "outputs": [] 119 | }, 120 | { 121 | "cell_type": "code", 122 | "metadata": { 123 | "id": "VHVU53mCr-47", 124 | "colab_type": "code", 125 | "colab": {} 126 | }, 127 | "source": [ 128 | "print(data_pd.head(10))" 129 | ], 130 | "execution_count": 0, 131 | "outputs": [] 132 | }, 133 | { 134 | "cell_type": "code", 135 | "metadata": { 136 | "id": "56KmFBgHssAI", 137 | "colab_type": "code", 138 | "colab": {} 139 | }, 140 | "source": [ 141 | "print(data_pd.isnull().sum())" 142 | ], 143 | "execution_count": 0, 144 | "outputs": [] 145 | }, 146 | { 147 | "cell_type": "markdown", 148 | "metadata": { 149 | "id": "jB1EII0lhGAW", 150 | "colab_type": "text" 151 | }, 152 | "source": [ 153 | "### CSVファイルからデータ読み込み" 154 | ] 155 | }, 156 | { 157 | "cell_type": "code", 158 | "metadata": { 159 | "id": "bk-9aVfz7CJW", 160 | "colab_type": "code", 161 | "colab": {} 162 | }, 163 | "source": [ 164 | "data_from_csv = pd.read_csv(\"13_Tokyo_20171_20184.csv\", encoding='cp932')" 165 | ], 166 | "execution_count": 0, 167 | "outputs": [] 168 | }, 169 | { 170 | "cell_type": "code", 171 | "metadata": { 172 | "id": "oibI6rxwQTaX", 173 | "colab_type": "code", 174 | "colab": {} 175 | }, 176 | "source": [ 177 | "print(data_from_csv.shape)" 178 | ], 179 | "execution_count": 0, 180 | "outputs": [] 181 | }, 182 | { 183 | "cell_type": "code", 184 | "metadata": { 185 | "id": "BoyjtlvLhZG4", 186 | 
"colab_type": "code", 187 | "colab": {} 188 | }, 189 | "source": [ 190 | "print(data_from_csv.iloc[0])" 191 | ], 192 | "execution_count": 0, 193 | "outputs": [] 194 | }, 195 | { 196 | "cell_type": "code", 197 | "metadata": { 198 | "id": "zq7u9UhLib5q", 199 | "colab_type": "code", 200 | "colab": {} 201 | }, 202 | "source": [ 203 | "print(data_from_csv.head(10))" 204 | ], 205 | "execution_count": 0, 206 | "outputs": [] 207 | }, 208 | { 209 | "cell_type": "markdown", 210 | "metadata": { 211 | "id": "4ejeo1d0ZWng", 212 | "colab_type": "text" 213 | }, 214 | "source": [ 215 | "### データ整形" 216 | ] 217 | }, 218 | { 219 | "cell_type": "code", 220 | "metadata": { 221 | "id": "-QNqhoSiivt7", 222 | "colab_type": "code", 223 | "colab": {} 224 | }, 225 | "source": [ 226 | "print(data_from_csv[\"種類\"].unique())" 227 | ], 228 | "execution_count": 0, 229 | "outputs": [] 230 | }, 231 | { 232 | "cell_type": "code", 233 | "metadata": { 234 | "id": "PLG4jg1RCnwA", 235 | "colab_type": "code", 236 | "colab": {} 237 | }, 238 | "source": [ 239 | "data_used_apartment = data_from_csv.query('種類 == \"中古マンション等\"')\n", 240 | "print(data_used_apartment.shape)\n", 241 | "print(data_used_apartment.head())\n", 242 | "print(data_used_apartment.iloc[0])" 243 | ], 244 | "execution_count": 0, 245 | "outputs": [] 246 | }, 247 | { 248 | "cell_type": "code", 249 | "metadata": { 250 | "id": "2tDqZZiEEH6U", 251 | "colab_type": "code", 252 | "colab": {} 253 | }, 254 | "source": [ 255 | "print(data_used_apartment.isnull().sum())" 256 | ], 257 | "execution_count": 0, 258 | "outputs": [] 259 | }, 260 | { 261 | "cell_type": "code", 262 | "metadata": { 263 | "id": "7vWpDjsISdNh", 264 | "colab_type": "code", 265 | "colab": {} 266 | }, 267 | "source": [ 268 | "columns_name_list = [\"最寄駅:距離(分)\", \"間取り\", \"面積(㎡)\",\"建築年\", \"建物の構造\", \"建ぺい率(%)\", \"容積率(%)\", \"市区町村名\", \"取引価格(総額)\"]\n", 269 | "\n", 270 | "data_selected = data_used_apartment[columns_name_list]\n", 271 | "print(data_selected.shape)\n", 272 | "\n", 273 
| "data_selected_dropna = data_selected.dropna(how='any') # 一つでもNANデータを含む行を削除\n", 274 | "print(data_selected_dropna.shape)\n", 275 | "print(data_selected_dropna.iloc[0])" 276 | ], 277 | "execution_count": 0, 278 | "outputs": [] 279 | }, 280 | { 281 | "cell_type": "code", 282 | "metadata": { 283 | "id": "SsRWqNUsSdQx", 284 | "colab_type": "code", 285 | "colab": {} 286 | }, 287 | "source": [ 288 | "data_selected_dropna[\"建築年\"].unique()" 289 | ], 290 | "execution_count": 0, 291 | "outputs": [] 292 | }, 293 | { 294 | "cell_type": "code", 295 | "metadata": { 296 | "id": "8gkx6DdCSdTM", 297 | "colab_type": "code", 298 | "colab": {} 299 | }, 300 | "source": [ 301 | "data_selected_dropna = data_selected_dropna[data_selected_dropna[\"建築年\"].str.match('^平成|昭和')]\n", 302 | "\n", 303 | "\n", 304 | "wareki_to_seireki = {'昭和': 1926-1, '平成': 1989-1}\n", 305 | "\n", 306 | "building_year_list = data_selected_dropna[\"建築年\"]\n", 307 | "\n", 308 | "building_age_list = []\n", 309 | "for building_year in building_year_list:\n", 310 | " # 昭和○年 → 昭和, ○ に変換、平成○年 → 平成, ○ に変換\n", 311 | " building_year_split = re.search(r'(.+?)([0-9]+|元)年', building_year)\n", 312 | " # 西暦に変換\n", 313 | " seireki = wareki_to_seireki[building_year_split.groups()[0]] + int(building_year_split.groups()[1])\n", 314 | " \n", 315 | " building_age = 2018 - seireki # 築年数に変換\n", 316 | " building_age_list.append(building_age)\n", 317 | "\n", 318 | " \n", 319 | "data_selected_dropna[\"築年数\"] = building_age_list # 新しく、築年数列を追加\n", 320 | "\n", 321 | "# もう使わないので、建築年列は削除\n", 322 | "data_added_building_age = data_selected_dropna.drop(\"建築年\", axis=1)\n", 323 | "print(data_added_building_age.head())" 324 | ], 325 | "execution_count": 0, 326 | "outputs": [] 327 | }, 328 | { 329 | "cell_type": "code", 330 | "metadata": { 331 | "id": "0twcuPJlfYm2", 332 | "colab_type": "code", 333 | "colab": {} 334 | }, 335 | "source": [ 336 | "# ダミー変数化しないもののリスト\n", 337 | "columns_name_list = [\"最寄駅:距離(分)\", \"面積(㎡)\",\"築年数\", \"建ぺい率(%)\", 
\"容積率(%)\", \"取引価格(総額)\"]\n", 338 | "\n", 339 | "# ダミー変数化するリスト\n", 340 | "dummy_list = [\"間取り\", \"建物の構造\", \"市区町村名\"]\n", 341 | "\n", 342 | "# ダミー変数を追加\n", 343 | "data_added_dummies = pd.concat([data_added_building_age[columns_name_list],\n", 344 | " pd.get_dummies(data_added_building_age[dummy_list], drop_first=True)], axis=1)\n", 345 | "\n", 346 | "print(data_added_dummies.shape)\n", 347 | "print(data_added_dummies.iloc[0])" 348 | ], 349 | "execution_count": 0, 350 | "outputs": [] 351 | }, 352 | { 353 | "cell_type": "code", 354 | "metadata": { 355 | "id": "SHRUOuaxSdV-", 356 | "colab_type": "code", 357 | "colab": {} 358 | }, 359 | "source": [ 360 | "print(data_added_dummies.dtypes)" 361 | ], 362 | "execution_count": 0, 363 | "outputs": [] 364 | }, 365 | { 366 | "cell_type": "code", 367 | "metadata": { 368 | "id": "Fn6U6LvDlDAW", 369 | "colab_type": "code", 370 | "colab": {} 371 | }, 372 | "source": [ 373 | "data_added_dummies[\"面積(㎡)\"] = data_added_dummies[\"面積(㎡)\"].astype(float)\n", 374 | "data_added_dummies = data_added_dummies[~data_added_dummies['最寄駅:距離(分)'].str.contains('\\?')]\n", 375 | "data_added_dummies[\"最寄駅:距離(分)\"] = data_added_dummies[\"最寄駅:距離(分)\"].astype(float)" 376 | ], 377 | "execution_count": 0, 378 | "outputs": [] 379 | }, 380 | { 381 | "cell_type": "code", 382 | "metadata": { 383 | "id": "QyJzbmqgSdY3", 384 | "colab_type": "code", 385 | "colab": {} 386 | }, 387 | "source": [ 388 | "print(data_added_dummies.dtypes)" 389 | ], 390 | "execution_count": 0, 391 | "outputs": [] 392 | }, 393 | { 394 | "cell_type": "markdown", 395 | "metadata": { 396 | "id": "w7p-xIA1Ap8I", 397 | "colab_type": "text" 398 | }, 399 | "source": [ 400 | "## 可視化" 401 | ] 402 | }, 403 | { 404 | "cell_type": "code", 405 | "metadata": { 406 | "id": "VQWx7PTAApQX", 407 | "colab_type": "code", 408 | "colab": {} 409 | }, 410 | "source": [ 411 | "plt.hist(data_added_dummies[\"取引価格(総額)\"])\n", 412 | "plt.show()\n", 413 | "\n", 414 | "tmp_data = 
data_added_dummies[data_added_dummies[\"取引価格(総額)\"] < 60000000]\n", 415 | "print(tmp_data.shape)\n", 416 | "plt.hist(tmp_data[\"取引価格(総額)\"])\n", 417 | "plt.show()" 418 | ], 419 | "execution_count": 0, 420 | "outputs": [] 421 | }, 422 | { 423 | "cell_type": "code", 424 | "metadata": { 425 | "id": "wgFEq8xwDOFw", 426 | "colab_type": "code", 427 | "colab": {} 428 | }, 429 | "source": [ 430 | "data_added_dummies = data_added_dummies[data_added_dummies[\"取引価格(総額)\"] < 60000000]" 431 | ], 432 | "execution_count": 0, 433 | "outputs": [] 434 | }, 435 | { 436 | "cell_type": "markdown", 437 | "metadata": { 438 | "id": "-TkxAlYlt4Zg", 439 | "colab_type": "text" 440 | }, 441 | "source": [ 442 | "## 線形回帰を実践" 443 | ] 444 | }, 445 | { 446 | "cell_type": "code", 447 | "metadata": { 448 | "id": "s-AZ2qRlvCeU", 449 | "colab_type": "code", 450 | "colab": {} 451 | }, 452 | "source": [ 453 | "lr = LinearRegression()\n", 454 | "\n", 455 | "x_column_list = ['面積(㎡)']\n", 456 | "y_column_list = ['取引価格(総額)']\n", 457 | "\n", 458 | "x = data_added_dummies[x_column_list]\n", 459 | "y = data_added_dummies[y_column_list]\n", 460 | "\n", 461 | "lr.fit(x, y)" 462 | ], 463 | "execution_count": 0, 464 | "outputs": [] 465 | }, 466 | { 467 | "cell_type": "code", 468 | "metadata": { 469 | "id": "7rYy2IflwCOF", 470 | "colab_type": "code", 471 | "colab": {} 472 | }, 473 | "source": [ 474 | "print(lr.coef_)\n", 475 | "print(lr.intercept_)" 476 | ], 477 | "execution_count": 0, 478 | "outputs": [] 479 | }, 480 | { 481 | "cell_type": "markdown", 482 | "metadata": { 483 | "id": "hLDyZjIEt_lm", 484 | "colab_type": "text" 485 | }, 486 | "source": [ 487 | "### 重回帰分析" 488 | ] 489 | }, 490 | { 491 | "cell_type": "code", 492 | "metadata": { 493 | "id": "TeyqwMDWwn7P", 494 | "colab_type": "code", 495 | "colab": {} 496 | }, 497 | "source": [ 498 | "x = data_added_dummies.drop(\"取引価格(総額)\", axis=1)\n", 499 | "y = data_added_dummies[\"取引価格(総額)\"]\n", 500 | "\n", 501 | "print(x.head())\n", 502 | "print(y.head())" 503 | 
], 504 | "execution_count": 0, 505 | "outputs": [] 506 | }, 507 | { 508 | "cell_type": "code", 509 | "metadata": { 510 | "id": "f8VEXWtRwqUO", 511 | "colab_type": "code", 512 | "colab": {} 513 | }, 514 | "source": [ 515 | "lr_multi = LinearRegression()\n", 516 | "lr_multi.fit(x, y)\n", 517 | "\n", 518 | "print(lr_multi.coef_)\n", 519 | "print(lr_multi.intercept_)" 520 | ], 521 | "execution_count": 0, 522 | "outputs": [] 523 | }, 524 | { 525 | "cell_type": "code", 526 | "metadata": { 527 | "id": "nSB4v9_s5P04", 528 | "colab_type": "code", 529 | "colab": {} 530 | }, 531 | "source": [ 532 | "for i in range(len(lr_multi.coef_)):\n", 533 | " print(x.columns[i], lr_multi.coef_[i])" 534 | ], 535 | "execution_count": 0, 536 | "outputs": [] 537 | }, 538 | { 539 | "cell_type": "markdown", 540 | "metadata": { 541 | "id": "lVmbQrgXuET8", 542 | "colab_type": "text" 543 | }, 544 | "source": [ 545 | "## 予測" 546 | ] 547 | }, 548 | { 549 | "cell_type": "code", 550 | "metadata": { 551 | "id": "8Be2-0Cs8fpl", 552 | "colab_type": "code", 553 | "colab": {} 554 | }, 555 | "source": [ 556 | "x = data_added_dummies.drop(\"取引価格(総額)\", axis=1)\n", 557 | "y = data_added_dummies[\"取引価格(総額)\"]\n", 558 | "\n", 559 | "X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.3)\n", 560 | "print(X_train.shape)\n", 561 | "print(X_test.shape)\n", 562 | "print(y_train.shape)\n", 563 | "print(y_test.shape)" 564 | ], 565 | "execution_count": 0, 566 | "outputs": [] 567 | }, 568 | { 569 | "cell_type": "code", 570 | "metadata": { 571 | "id": "4NOWZ3gF8iO6", 572 | "colab_type": "code", 573 | "colab": {} 574 | }, 575 | "source": [ 576 | "lr_multi2 = LinearRegression()\n", 577 | "\n", 578 | "lr_multi2.fit(X_train, y_train) \n", 579 | "print(lr_multi2.coef_)\n", 580 | "print(lr_multi2.intercept_)" 581 | ], 582 | "execution_count": 0, 583 | "outputs": [] 584 | }, 585 | { 586 | "cell_type": "code", 587 | "metadata": { 588 | "id": "AiJXsD0O8lce", 589 | "colab_type": "code", 590 | "colab": {} 591 | 
}, 592 | "source": [ 593 | "y_pred = lr_multi2.predict(X_test)\n", 594 | "print(y_pred)" 595 | ], 596 | "execution_count": 0, 597 | "outputs": [] 598 | }, 599 | { 600 | "cell_type": "code", 601 | "metadata": { 602 | "id": "1TXiAr4w9AJ0", 603 | "colab_type": "code", 604 | "colab": {} 605 | }, 606 | "source": [ 607 | "print(y_pred - y_test)" 608 | ], 609 | "execution_count": 0, 610 | "outputs": [] 611 | }, 612 | { 613 | "cell_type": "code", 614 | "metadata": { 615 | "id": "GMF1DnRuJ_pP", 616 | "colab_type": "code", 617 | "colab": {} 618 | }, 619 | "source": [ 620 | "from sklearn.metrics import r2_score\n", 621 | "r2_score(y_test, y_pred)" 622 | ], 623 | "execution_count": 0, 624 | "outputs": [] 625 | }, 626 | { 627 | "cell_type": "markdown", 628 | "metadata": { 629 | "id": "s8Q2afPluIJH", 630 | "colab_type": "text" 631 | }, 632 | "source": [ 633 | "## MAE" 634 | ] 635 | }, 636 | { 637 | "cell_type": "code", 638 | "metadata": { 639 | "id": "Ao8cg_wZO-XX", 640 | "colab_type": "code", 641 | "colab": {} 642 | }, 643 | "source": [ 644 | "from sklearn.metrics import mean_absolute_error\n", 645 | "\n", 646 | "x_column_list = ['面積(㎡)']\n", 647 | "y_column_list = ['取引価格(総額)']\n", 648 | "\n", 649 | "\n", 650 | "X_train, X_test, y_train, y_test = train_test_split(data_added_dummies[x_column_list], data_added_dummies[y_column_list], test_size=0.3)\n", 651 | "\n", 652 | "lr_single = LinearRegression()\n", 653 | "\n", 654 | "lr_single.fit(X_train, y_train) \n", 655 | "y_pred = lr_single.predict(X_test)\n", 656 | "\n", 657 | "print(mean_absolute_error(y_pred, y_test))" 658 | ], 659 | "execution_count": 0, 660 | "outputs": [] 661 | }, 662 | { 663 | "cell_type": "code", 664 | "metadata": { 665 | "id": "WfV7c4oQObxF", 666 | "colab_type": "code", 667 | "colab": {} 668 | }, 669 | "source": [ 670 | "x = data_added_dummies.drop(\"取引価格(総額)\", axis=1)\n", 671 | "y = data_added_dummies[\"取引価格(総額)\"]\n", 672 | "\n", 673 | "X_train, X_test, y_train, y_test = train_test_split(x, y, 
test_size=0.7)\n", 674 | "\n", 675 | "lr_multi2 = LinearRegression()\n", 676 | "\n", 677 | "lr_multi2.fit(X_train, y_train) \n", 678 | "y_pred = lr_multi2.predict(X_test)\n", 679 | "\n", 680 | "print(mean_absolute_error(y_pred, y_test))" 681 | ], 682 | "execution_count": 0, 683 | "outputs": [] 684 | }, 685 | { 686 | "cell_type": "markdown", 687 | "metadata": { 688 | "id": "vR70qGkg-LiC", 689 | "colab_type": "text" 690 | }, 691 | "source": [ 692 | "" 693 | ] 694 | } 695 | ] 696 | } 697 | -------------------------------------------------------------------------------- /Python/04/decisionTree_iris.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "name": "Ch04-iris-DecisionTree.ipynb", 7 | "version": "0.3.2", 8 | "provenance": [], 9 | "collapsed_sections": [], 10 | "toc_visible": true 11 | }, 12 | "kernelspec": { 13 | "name": "python3", 14 | "display_name": "Python 3" 15 | } 16 | }, 17 | "cells": [ 18 | { 19 | "cell_type": "code", 20 | "metadata": { 21 | "id": "kSb7IeHmFcWW", 22 | "colab_type": "code", 23 | "colab": {} 24 | }, 25 | "source": [ 26 | "import numpy as np\n", 27 | "import pandas as pd\n", 28 | "from sklearn.tree import DecisionTreeClassifier\n", 29 | "from sklearn.model_selection import train_test_split\n", 30 | "from sklearn.metrics import accuracy_score\n", 31 | "\n", 32 | "from sklearn.datasets import load_iris" 33 | ], 34 | "execution_count": 0, 35 | "outputs": [] 36 | }, 37 | { 38 | "cell_type": "markdown", 39 | "metadata": { 40 | "id": "lZqQjbcaG-4w", 41 | "colab_type": "text" 42 | }, 43 | "source": [ 44 | "## データ読み込み" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "metadata": { 50 | "id": "hHujZz1fFwJK", 51 | "colab_type": "code", 52 | "colab": {} 53 | }, 54 | "source": [ 55 | "iris = load_iris()\n", 56 | "X, Y = iris.data, iris.target\n", 57 | "\n", 58 | "X_train, X_test, y_train, y_test = train_test_split(X, Y, 
test_size=0.3)" 59 | ], 60 | "execution_count": 0, 61 | "outputs": [] 62 | }, 63 | { 64 | "cell_type": "markdown", 65 | "metadata": { 66 | "id": "CpGVPa-AHbLE", 67 | "colab_type": "text" 68 | }, 69 | "source": [ 70 | "## 決定木を実践" 71 | ] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "metadata": { 76 | "id": "pnWZvrnNHAOB", 77 | "colab_type": "code", 78 | "colab": {} 79 | }, 80 | "source": [ 81 | "clf = DecisionTreeClassifier(max_depth=5)" 82 | ], 83 | "execution_count": 0, 84 | "outputs": [] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "metadata": { 89 | "id": "TRPrDkjSbD9g", 90 | "colab_type": "code", 91 | "colab": {} 92 | }, 93 | "source": [ 94 | "# 学習\n", 95 | "clf.fit(X_train, y_train)\n", 96 | "\n", 97 | "# 評価\n", 98 | "y_pred = clf.predict(X_test)\n", 99 | "print(accuracy_score(y_test, y_pred))" 100 | ], 101 | "execution_count": 0, 102 | "outputs": [] 103 | }, 104 | { 105 | "cell_type": "markdown", 106 | "metadata": { 107 | "id": "3LoiTrkzRn9u", 108 | "colab_type": "text" 109 | }, 110 | "source": [ 111 | "### 可視化" 112 | ] 113 | }, 114 | { 115 | "cell_type": "code", 116 | "metadata": { 117 | "id": "0ou82IgHRF9v", 118 | "colab_type": "code", 119 | "colab": {} 120 | }, 121 | "source": [ 122 | "!sudo apt install graphviz\n", 123 | "!pip install dtreeviz" 124 | ], 125 | "execution_count": 0, 126 | "outputs": [] 127 | }, 128 | { 129 | "cell_type": "code", 130 | "metadata": { 131 | "id": "M57MOkkcRo8H", 132 | "colab_type": "code", 133 | "colab": {} 134 | }, 135 | "source": [ 136 | "from dtreeviz.trees import dtreeviz\n", 137 | "\n", 138 | "viz = dtreeviz(clf, X, Y,\n", 139 | " feature_names = iris.feature_names,\n", 140 | " target_name = 'breed',\n", 141 | " class_names=[str(i) for i in iris.target_names],\n", 142 | " )\n", 143 | "\n", 144 | "display(viz)\n", 145 | "# 保存する場合\n", 146 | "# viz.save(\"tree.svg\")" 147 | ], 148 | "execution_count": 0, 149 | "outputs": [] 150 | }, 151 | { 152 | "cell_type": "code", 153 | "metadata": { 154 | "id": "8Ge1rhSVRuMJ", 155 | 
"colab_type": "code", 156 | "colab": {} 157 | }, 158 | "source": [ 159 | "" 160 | ], 161 | "execution_count": 0, 162 | "outputs": [] 163 | }, 164 | { 165 | "cell_type": "code", 166 | "metadata": { 167 | "id": "qum_tMJkhuwF", 168 | "colab_type": "code", 169 | "colab": {} 170 | }, 171 | "source": [ 172 | "" 173 | ], 174 | "execution_count": 0, 175 | "outputs": [] 176 | } 177 | ] 178 | } -------------------------------------------------------------------------------- /Python/04/decisionTree_tweets.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "name": "Ch04-Tweet-DecisionTree.ipynb", 7 | "version": "0.3.2", 8 | "provenance": [], 9 | "collapsed_sections": [], 10 | "toc_visible": true 11 | }, 12 | "kernelspec": { 13 | "name": "python3", 14 | "display_name": "Python 3" 15 | } 16 | }, 17 | "cells": [ 18 | { 19 | "cell_type": "code", 20 | "metadata": { 21 | "id": "kSb7IeHmFcWW", 22 | "colab_type": "code", 23 | "colab": {} 24 | }, 25 | "source": [ 26 | "import numpy as np\n", 27 | "import pandas as pd\n", 28 | "from sklearn.tree import DecisionTreeClassifier\n", 29 | "from sklearn.model_selection import train_test_split\n", 30 | "from sklearn.metrics import accuracy_score\n", 31 | "from sklearn.feature_extraction.text import TfidfVectorizer\n", 32 | "\n", 33 | "from sklearn.datasets import load_iris" 34 | ], 35 | "execution_count": 0, 36 | "outputs": [] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "metadata": { 41 | "id": "hHujZz1fFwJK", 42 | "colab_type": "code", 43 | "colab": {} 44 | }, 45 | "source": [ 46 | "# mecabインストール\n", 47 | "!apt install aptitude\n", 48 | "!aptitude install mecab libmecab-dev mecab-ipadic-utf8 git make curl xz-utils file -y\n", 49 | "\n", 50 | "# mecab pythonインストール(pythonでmecabを動かすために必要)\n", 51 | "!pip install mecab-python3==0.7\n", 52 | "\n", 53 | "# neologd辞書インストール\n", 54 | "!git clone --depth 1 
https://github.com/neologd/mecab-ipadic-neologd.git\n", 55 | "!echo yes | mecab-ipadic-neologd/bin/install-mecab-ipadic-neologd -n\n", 56 | "\n", 57 | "# 辞書変更\n", 58 | "!sed -e \"s!/var/lib/mecab/dic/debian!/usr/lib/x86_64-linux-gnu/mecab/dic/mecab-ipadic-neologd!g\" /etc/mecabrc > /etc/mecabrc.new\n", 59 | "!cp /etc/mecabrc /etc/mecabrc.org\n", 60 | "!cp /etc/mecabrc.new /etc/mecabrc" 61 | ], 62 | "execution_count": 0, 63 | "outputs": [] 64 | }, 65 | { 66 | "cell_type": "markdown", 67 | "metadata": { 68 | "id": "hqNNWh9YTaAC", 69 | "colab_type": "text" 70 | }, 71 | "source": [ 72 | "## データ読み込み" 73 | ] 74 | }, 75 | { 76 | "cell_type": "code", 77 | "metadata": { 78 | "id": "ATIL0zuqTbt5", 79 | "colab_type": "code", 80 | "colab": {} 81 | }, 82 | "source": [ 83 | "import MeCab\n", 84 | "\n", 85 | "data_tweet = pd.read_csv('tweets.tsv', sep=\"\\t\")\n", 86 | "data_tweet = data_tweet.dropna()\n", 87 | "Y = data_tweet.iloc[:,1].values\n", 88 | "\n", 89 | "print(data_tweet.head())" 90 | ], 91 | "execution_count": 0, 92 | "outputs": [] 93 | }, 94 | { 95 | "cell_type": "code", 96 | "metadata": { 97 | "id": "ThOUKPvPTb2W", 98 | "colab_type": "code", 99 | "colab": {} 100 | }, 101 | "source": [ 102 | "tagger = MeCab.Tagger()\n", 103 | "tagger.parse('')\n", 104 | "\n", 105 | "# 文字列を単語で分割しリストに格納する\n", 106 | "def word_tokenaize(texts):\n", 107 | " node = tagger.parseToNode(texts)\n", 108 | " word_list = []\n", 109 | " while node:\n", 110 | " word_type = node.feature.split(\",\")[0]\n", 111 | " if (word_type == '名詞'):#|(word_type == '形容詞'):\n", 112 | " word = node.feature.split(\",\")[6]\n", 113 | " if word != '*':\n", 114 | " word_list.append(word)\n", 115 | " node = node.next\n", 116 | "\n", 117 | " return word_list" 118 | ], 119 | "execution_count": 0, 120 | "outputs": [] 121 | }, 122 | { 123 | "cell_type": "code", 124 | "metadata": { 125 | "id": "dKukpU80UAFx", 126 | "colab_type": "code", 127 | "colab": {} 128 | }, 129 | "source": [ 130 | "vectorizer = 
TfidfVectorizer(tokenizer=word_tokenaize)\n", 131 | "\n", 132 | "tweet_matrix = vectorizer.fit_transform(data_tweet.iloc[:,0])\n", 133 | "X = tweet_matrix.toarray()\n", 134 | "\n", 135 | "X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3)" 136 | ], 137 | "execution_count": 0, 138 | "outputs": [] 139 | }, 140 | { 141 | "cell_type": "markdown", 142 | "metadata": { 143 | "id": "CpGVPa-AHbLE", 144 | "colab_type": "text" 145 | }, 146 | "source": [ 147 | "## 決定木を実践" 148 | ] 149 | }, 150 | { 151 | "cell_type": "code", 152 | "metadata": { 153 | "id": "pnWZvrnNHAOB", 154 | "colab_type": "code", 155 | "colab": {} 156 | }, 157 | "source": [ 158 | "clf = DecisionTreeClassifier(max_depth = 30)" 159 | ], 160 | "execution_count": 0, 161 | "outputs": [] 162 | }, 163 | { 164 | "cell_type": "code", 165 | "metadata": { 166 | "id": "rZ3TQB9uHdQV", 167 | "colab_type": "code", 168 | "colab": {} 169 | }, 170 | "source": [ 171 | "# 学習\n", 172 | "clf.fit(X_train, y_train)\n", 173 | "\n", 174 | "# 評価\n", 175 | "y_pred = clf.predict(X_test)\n", 176 | "print(accuracy_score(y_test, y_pred))" 177 | ], 178 | "execution_count": 0, 179 | "outputs": [] 180 | }, 181 | { 182 | "cell_type": "markdown", 183 | "metadata": { 184 | "id": "3LoiTrkzRn9u", 185 | "colab_type": "text" 186 | }, 187 | "source": [ 188 | "### 可視化" 189 | ] 190 | }, 191 | { 192 | "cell_type": "code", 193 | "metadata": { 194 | "id": "0ou82IgHRF9v", 195 | "colab_type": "code", 196 | "colab": {} 197 | }, 198 | "source": [ 199 | "!sudo apt install graphviz\n", 200 | "!pip install dtreeviz" 201 | ], 202 | "execution_count": 0, 203 | "outputs": [] 204 | }, 205 | { 206 | "cell_type": "code", 207 | "metadata": { 208 | "id": "M57MOkkcRo8H", 209 | "colab_type": "code", 210 | "colab": {} 211 | }, 212 | "source": [ 213 | "from dtreeviz.trees import dtreeviz\n", 214 | "\n", 215 | "viz = dtreeviz(clf, X, Y,\n", 216 | " feature_names=[i for i in range(X.shape[1])],\n", 217 | " target_name = 'tweet',\n", 218 | " 
class_names=['NP-UR', 'C&R'],\n", 219 | " )\n", 220 | "\n", 221 | "display(viz)\n", 222 | "# 保存する場合\n", 223 | "viz.save(\"tree.svg\")" 224 | ], 225 | "execution_count": 0, 226 | "outputs": [] 227 | }, 228 | { 229 | "cell_type": "code", 230 | "metadata": { 231 | "id": "8Ge1rhSVRuMJ", 232 | "colab_type": "code", 233 | "colab": {} 234 | }, 235 | "source": [ 236 | "words_list = vectorizer.get_feature_names()" 237 | ], 238 | "execution_count": 0, 239 | "outputs": [] 240 | }, 241 | { 242 | "cell_type": "code", 243 | "metadata": { 244 | "id": "IGZ0BlFxdrSQ", 245 | "colab_type": "code", 246 | "colab": {} 247 | }, 248 | "source": [ 249 | "print(words_list[1606])\n", 250 | "print(words_list[1524])" 251 | ], 252 | "execution_count": 0, 253 | "outputs": [] 254 | }, 255 | { 256 | "cell_type": "code", 257 | "metadata": { 258 | "id": "FYf6-GFRhgV8", 259 | "colab_type": "code", 260 | "colab": {} 261 | }, 262 | "source": [ 263 | "" 264 | ], 265 | "execution_count": 0, 266 | "outputs": [] 267 | } 268 | ] 269 | } -------------------------------------------------------------------------------- /Python/04/get_tweets.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "name": "Ch04-Get-Tweet.ipynb", 7 | "version": "0.3.2", 8 | "provenance": [], 9 | "collapsed_sections": [] 10 | }, 11 | "kernelspec": { 12 | "name": "python3", 13 | "display_name": "Python 3" 14 | } 15 | }, 16 | "cells": [ 17 | { 18 | "cell_type": "code", 19 | "metadata": { 20 | "id": "QZ4D9fwktm5J", 21 | "colab_type": "code", 22 | "colab": {} 23 | }, 24 | "source": [ 25 | "#coding:utf-8\n", 26 | "import numpy as np\n", 27 | "import json\n", 28 | "import requests\n", 29 | "from requests_oauthlib import OAuth1Session, OAuth1\n", 30 | "import datetime\n", 31 | "import re\n", 32 | "import time\n", 33 | "\n", 34 | "from google.colab import files" 35 | ], 36 | "execution_count": 0, 37 | "outputs": [] 38 | 
}, 39 | { 40 | "cell_type": "markdown", 41 | "metadata": { 42 | "id": "inx3ZQ1ytwgU", 43 | "colab_type": "text" 44 | }, 45 | "source": [ 46 | "## 認証" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "metadata": { 52 | "id": "Oz54bCWxtv1u", 53 | "colab_type": "code", 54 | "colab": {} 55 | }, 56 | "source": [ 57 | "access_token = 'XXXXXXXXXX'\n", 58 | "access_token_secret = 'XXXXXXXXXX'\n", 59 | "consumer_key = 'XXXXXXXXXX'\n", 60 | "consumer_key_secret = 'XXXXXXXXXX'\n", 61 | "\n", 62 | "# タイムライン取得用のURL\n", 63 | "url = \"https://api.twitter.com/1.1/statuses/user_timeline.json\"\n", 64 | "\n", 65 | "#APIの認証\n", 66 | "twitter = OAuth1Session(consumer_key, consumer_key_secret, access_token, access_token_secret)" 67 | ], 68 | "execution_count": 0, 69 | "outputs": [] 70 | }, 71 | { 72 | "cell_type": "markdown", 73 | "metadata": { 74 | "id": "3U8l4Mg-tyZu", 75 | "colab_type": "text" 76 | }, 77 | "source": [ 78 | "## Np_Ur_ のツイート取得" 79 | ] 80 | }, 81 | { 82 | "cell_type": "code", 83 | "metadata": { 84 | "id": "XF279Mzw02Yh", 85 | "colab_type": "code", 86 | "colab": {} 87 | }, 88 | "source": [ 89 | "def normalize_text(text):\n", 90 | " text = re.sub(r'https?://[\\w/:%#\\$&\\?\\(\\)~\\.=\\+\\-…]+', \"\", text)\n", 91 | " text = re.sub('RT', \"\", text)\n", 92 | " text = re.sub('お気に入り', \"\", text)\n", 93 | " text = re.sub('まとめ', \"\", text)\n", 94 | " text = re.sub(r'[!-~]', \"\", text)\n", 95 | " text = re.sub(r'[︰-@]', \"\", text)\n", 96 | " text = re.sub('\\u3000',\"\", text)\n", 97 | " text = re.sub('\\t', \"\", text)\n", 98 | " text = re.sub('\\n', \"\", text)\n", 99 | " text = text.strip()\n", 100 | " return text" 101 | ], 102 | "execution_count": 0, 103 | "outputs": [] 104 | }, 105 | { 106 | "cell_type": "code", 107 | "metadata": { 108 | "id": "v4TAWF4qttB5", 109 | "colab_type": "code", 110 | "colab": {} 111 | }, 112 | "source": [ 113 | "# パラメータの定義\n", 114 | "params = {'screen_name': 'Np_Ur_',\n", 115 | " 'exclude_replies':True,\n", 116 | " 'include_rts':False,\n",
117 | " 'count':200\n", 118 | " }\n", 119 | "\n", 120 | "f_out = open('np_ur_.tsv','w')\n", 121 | "\n", 122 | "for _ in range(20):\n", 123 | " res = twitter.get(url, params = params)\n", 124 | "\n", 125 | " if res.status_code == 200:\n", 126 | "\n", 127 | " timeline = json.loads(res.text)\n", 128 | " if len(timeline) == 0:\n", 129 | " break\n", 130 | " \n", 131 | " # 各ツイートの本文を表示\n", 132 | " for i in range(len(timeline)):\n", 133 | " #print(len(timeline[i]['text']))\n", 134 | " f_out.write(normalize_text(timeline[i]['text']) + '\\t' + \"0\" + '\\n')\n", 135 | " \n", 136 | " # 一番最後のツイートIDをパラメータmax_idに追加 \n", 137 | " params['max_id'] = timeline[len(timeline) - 1]['id'] - 1\n", 138 | "\n", 139 | "f_out.close()" 140 | ], 141 | "execution_count": 0, 142 | "outputs": [] 143 | }, 144 | { 145 | "cell_type": "code", 146 | "metadata": { 147 | "id": "NIUtJQ2Bt1Wq", 148 | "colab_type": "code", 149 | "colab": {} 150 | }, 151 | "source": [ 152 | "files.download('np_ur_.tsv')" 153 | ], 154 | "execution_count": 0, 155 | "outputs": [] 156 | }, 157 | { 158 | "cell_type": "markdown", 159 | "metadata": { 160 | "id": "AcEcCSiKt-1_", 161 | "colab_type": "text" 162 | }, 163 | "source": [ 164 | "## lucky_CandR のツイート取得" 165 | ] 166 | }, 167 | { 168 | "cell_type": "code", 169 | "metadata": { 170 | "id": "hCI91j-it6IM", 171 | "colab_type": "code", 172 | "colab": {} 173 | }, 174 | "source": [ 175 | "# パラメータの定義\n", 176 | "params = {'screen_name':'lucky_CandR',\n", 177 | " 'exclude_replies':True,\n", 178 | " 'include_rts':False,\n", 179 | " 'count':200\n", 180 | " }\n", 181 | "\n", 182 | "f_out = open('lucky_CandR.tsv','w')\n", 183 | "\n", 184 | "for _ in range(20):\n", 185 | " res = twitter.get(url, params = params)\n", 186 | "\n", 187 | " if res.status_code == 200:\n", 188 | "\n", 189 | " timeline = json.loads(res.text)\n", 190 | " if len(timeline) == 0:\n", 191 | " break\n", 192 | " \n", 193 | " # 各ツイートの本文を表示\n", 194 | " for i in range(len(timeline)):\n", 195 | " 
f_out.write(normalize_text(timeline[i]['text']) + '\\t' + \"1\" + '\\n')\n", 196 | " \n", 197 | " # 一番最後のツイートIDをパラメータmax_idに追加 \n", 198 | " params['max_id'] = timeline[len(timeline) - 1]['id'] - 1\n", 199 | "\n", 200 | "f_out.close()" 201 | ], 202 | "execution_count": 0, 203 | "outputs": [] 204 | }, 205 | { 206 | "cell_type": "code", 207 | "metadata": { 208 | "id": "xO1jONm_uLAm", 209 | "colab_type": "code", 210 | "colab": {} 211 | }, 212 | "source": [ 213 | "files.download('lucky_CandR.tsv')" 214 | ], 215 | "execution_count": 0, 216 | "outputs": [] 217 | }, 218 | { 219 | "cell_type": "code", 220 | "metadata": { 221 | "id": "632Ox_gGuNti", 222 | "colab_type": "code", 223 | "colab": {} 224 | }, 225 | "source": [ 226 | "# データ結合\n", 227 | "import pandas as pd\n", 228 | "\n", 229 | "tsv_files = ['np_ur_.tsv', 'lucky_CandR.tsv']\n", 230 | "list = []\n", 231 | "\n", 232 | "for file in tsv_files:\n", 233 | " list.append(pd.read_csv(file, delimiter='\\t', header=None))\n", 234 | "df = pd.concat(list, sort=False)\n", 235 | "\n", 236 | "df.to_csv( 'tweets.tsv', sep='\\t',index=False)" 237 | ], 238 | "execution_count": 0, 239 | "outputs": [] 240 | }, 241 | { 242 | "cell_type": "code", 243 | "metadata": { 244 | "id": "iUo5GZOG3JuI", 245 | "colab_type": "code", 246 | "colab": {} 247 | }, 248 | "source": [ 249 | "files.download('tweets.tsv')" 250 | ], 251 | "execution_count": 0, 252 | "outputs": [] 253 | }, 254 | { 255 | "cell_type": "code", 256 | "metadata": { 257 | "id": "ZkWD27Pm4D_j", 258 | "colab_type": "code", 259 | "colab": {} 260 | }, 261 | "source": [ 262 | "" 263 | ], 264 | "execution_count": 0, 265 | "outputs": [] 266 | } 267 | ] 268 | } -------------------------------------------------------------------------------- /Python/04/logit_tweets.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "name": "Ch04-Tweet-logit_2.ipynb", 7 | "version": 
"0.3.2", 8 | "provenance": [], 9 | "collapsed_sections": [], 10 | "toc_visible": true 11 | }, 12 | "kernelspec": { 13 | "name": "python3", 14 | "display_name": "Python 3" 15 | } 16 | }, 17 | "cells": [ 18 | { 19 | "cell_type": "code", 20 | "metadata": { 21 | "id": "L_FdzJMz8TR7", 22 | "colab_type": "code", 23 | "colab": {} 24 | }, 25 | "source": [ 26 | "import numpy as np\n", 27 | "import pandas as pd\n", 28 | "import matplotlib.pyplot as plt\n", 29 | "%matplotlib inline\n", 30 | "\n", 31 | "from sklearn.datasets import load_iris\n", 32 | "from sklearn.linear_model import LogisticRegression\n", 33 | "from sklearn.metrics import accuracy_score\n", 34 | "from sklearn.model_selection import train_test_split\n", 35 | "\n", 36 | "from sklearn.feature_extraction.text import TfidfVectorizer" 37 | ], 38 | "execution_count": 0, 39 | "outputs": [] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "metadata": { 44 | "id": "vlvlKilesNjB", 45 | "colab_type": "code", 46 | "colab": {} 47 | }, 48 | "source": [ 49 | "# mecabインストール\n", 50 | "!apt install aptitude\n", 51 | "!aptitude install mecab libmecab-dev mecab-ipadic-utf8 git make curl xz-utils file -y\n", 52 | "\n", 53 | "# mecab pythonインストール(pythonでmecabを動かすために必要)\n", 54 | "!pip install mecab-python3==0.7\n", 55 | "\n", 56 | "# neologd辞書インストール\n", 57 | "!git clone --depth 1 https://github.com/neologd/mecab-ipadic-neologd.git\n", 58 | "!echo yes | mecab-ipadic-neologd/bin/install-mecab-ipadic-neologd -n\n", 59 | "\n", 60 | "# 辞書変更\n", 61 | "!sed -e \"s!/var/lib/mecab/dic/debian!/usr/lib/x86_64-linux-gnu/mecab/dic/mecab-ipadic-neologd!g\" /etc/mecabrc > /etc/mecabrc.new\n", 62 | "!cp /etc/mecabrc /etc/mecabrc.org\n", 63 | "!cp /etc/mecabrc.new /etc/mecabrc\n", 64 | "\n", 65 | "import MeCab" 66 | ], 67 | "execution_count": 0, 68 | "outputs": [] 69 | }, 70 | { 71 | "cell_type": "markdown", 72 | "metadata": { 73 | "id": "G3NXk634sC9Q", 74 | "colab_type": "text" 75 | }, 76 | "source": [ 77 | "## データ読み込み" 78 | ] 79 | }, 80 | { 81 | 
"cell_type": "code", 82 | "metadata": { 83 | "id": "OHO3YkrxiwTe", 84 | "colab_type": "code", 85 | "colab": {} 86 | }, 87 | "source": [ 88 | "data_tweet = pd.read_csv('tweets.tsv', sep=\"\\t\")\n", 89 | "data_tweet = data_tweet.dropna()\n", 90 | "Y = data_tweet.iloc[:,1].values\n", 91 | "\n", 92 | "print(data_tweet.head())" 93 | ], 94 | "execution_count": 0, 95 | "outputs": [] 96 | }, 97 | { 98 | "cell_type": "code", 99 | "metadata": { 100 | "id": "n1oDBaWYsExp", 101 | "colab_type": "code", 102 | "colab": {} 103 | }, 104 | "source": [ 105 | "tagger = MeCab.Tagger()\n", 106 | "tagger.parse('')\n", 107 | "\n", 108 | "# 文字列を単語で分割しリストに格納する\n", 109 | "def word_tokenaize(texts):\n", 110 | " node = tagger.parseToNode(texts)\n", 111 | " word_list = []\n", 112 | " while node:\n", 113 | " word_type = node.feature.split(\",\")[0]\n", 114 | " if (word_type == '名詞')|(word_type == '形容詞'):\n", 115 | " word = node.feature.split(\",\")[6]\n", 116 | " if word != '*':\n", 117 | " word_list.append(word)\n", 118 | " node = node.next\n", 119 | "\n", 120 | " return word_list" 121 | ], 122 | "execution_count": 0, 123 | "outputs": [] 124 | }, 125 | { 126 | "cell_type": "code", 127 | "metadata": { 128 | "id": "yJ-jlDAquhmp", 129 | "colab_type": "code", 130 | "colab": {} 131 | }, 132 | "source": [ 133 | "vectorizer = TfidfVectorizer(tokenizer=word_tokenaize)\n", 134 | "\n", 135 | "tweet_matrix = vectorizer.fit_transform(data_tweet.iloc[:,0])\n", 136 | "X = tweet_matrix.toarray()\n", 137 | "print(X.shape)" 138 | ], 139 | "execution_count": 0, 140 | "outputs": [] 141 | }, 142 | { 143 | "cell_type": "markdown", 144 | "metadata": { 145 | "id": "QIJD4zzzcJ6r", 146 | "colab_type": "text" 147 | }, 148 | "source": [ 149 | "## ロジスティック回帰を実践" 150 | ] 151 | }, 152 | { 153 | "cell_type": "code", 154 | "metadata": { 155 | "id": "Zj9MXim3vasH", 156 | "colab_type": "code", 157 | "colab": {} 158 | }, 159 | "source": [ 160 | "X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2)\n", 161 | 
"\n", 162 | "logit_multi2 = LogisticRegression()\n", 163 | "logit_multi2.fit(X_train, y_train)\n", 164 | "\n", 165 | "print(logit_multi2.coef_)\n", 166 | "print(logit_multi2.intercept_)" 167 | ], 168 | "execution_count": 0, 169 | "outputs": [] 170 | }, 171 | { 172 | "cell_type": "code", 173 | "metadata": { 174 | "id": "kCbO0ucmvzD_", 175 | "colab_type": "code", 176 | "colab": {} 177 | }, 178 | "source": [ 179 | "y_pred = logit_multi2.predict(X_test)\n", 180 | "print(accuracy_score(y_test, y_pred))" 181 | ], 182 | "execution_count": 0, 183 | "outputs": [] 184 | }, 185 | { 186 | "cell_type": "code", 187 | "metadata": { 188 | "id": "VGGJpwP7wS99", 189 | "colab_type": "code", 190 | "colab": {} 191 | }, 192 | "source": [ 193 | "" 194 | ], 195 | "execution_count": 0, 196 | "outputs": [] 197 | } 198 | ] 199 | } -------------------------------------------------------------------------------- /Python/04/randomForest_iris.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "name": "Ch04-iris-RandomForest.ipynb", 7 | "version": "0.3.2", 8 | "provenance": [], 9 | "collapsed_sections": [], 10 | "toc_visible": true 11 | }, 12 | "kernelspec": { 13 | "name": "python3", 14 | "display_name": "Python 3" 15 | } 16 | }, 17 | "cells": [ 18 | { 19 | "cell_type": "code", 20 | "metadata": { 21 | "id": "kSb7IeHmFcWW", 22 | "colab_type": "code", 23 | "colab": {} 24 | }, 25 | "source": [ 26 | "import numpy as np\n", 27 | "import pandas as pd\n", 28 | "import matplotlib.pyplot as plt\n", 29 | "%matplotlib inline\n", 30 | "\n", 31 | "from sklearn.ensemble import RandomForestClassifier\n", 32 | "from sklearn.model_selection import train_test_split\n", 33 | "from sklearn.metrics import accuracy_score\n", 34 | "\n", 35 | "from sklearn.datasets import load_iris" 36 | ], 37 | "execution_count": 0, 38 | "outputs": [] 39 | }, 40 | { 41 | "cell_type": "markdown", 42 | "metadata": { 
43 | "id": "lZqQjbcaG-4w", 44 | "colab_type": "text" 45 | }, 46 | "source": [ 47 | "## データ読み込み" 48 | ] 49 | }, 50 | { 51 | "cell_type": "code", 52 | "metadata": { 53 | "id": "hHujZz1fFwJK", 54 | "colab_type": "code", 55 | "colab": {} 56 | }, 57 | "source": [ 58 | "iris = load_iris()\n", 59 | "X, Y = iris.data, iris.target\n", 60 | "\n", 61 | "X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3)" 62 | ], 63 | "execution_count": 0, 64 | "outputs": [] 65 | }, 66 | { 67 | "cell_type": "markdown", 68 | "metadata": { 69 | "id": "CpGVPa-AHbLE", 70 | "colab_type": "text" 71 | }, 72 | "source": [ 73 | "## ランダムフォレストを実践" 74 | ] 75 | }, 76 | { 77 | "cell_type": "code", 78 | "metadata": { 79 | "id": "pnWZvrnNHAOB", 80 | "colab_type": "code", 81 | "colab": {} 82 | }, 83 | "source": [ 84 | "clf = RandomForestClassifier(n_estimators=10, max_depth=3)" 85 | ], 86 | "execution_count": 0, 87 | "outputs": [] 88 | }, 89 | { 90 | "cell_type": "code", 91 | "metadata": { 92 | "id": "rZ3TQB9uHdQV", 93 | "colab_type": "code", 94 | "colab": {} 95 | }, 96 | "source": [ 97 | "# 学習\n", 98 | "clf.fit(X_train, y_train)\n", 99 | "\n", 100 | "# 評価\n", 101 | "y_pred = clf.predict(X_test)\n", 102 | "print(accuracy_score(y_test, y_pred))" 103 | ], 104 | "execution_count": 0, 105 | "outputs": [] 106 | }, 107 | { 108 | "cell_type": "markdown", 109 | "metadata": { 110 | "id": "3LoiTrkzRn9u", 111 | "colab_type": "text" 112 | }, 113 | "source": [ 114 | "### 可視化" 115 | ] 116 | }, 117 | { 118 | "cell_type": "code", 119 | "metadata": { 120 | "id": "0ou82IgHRF9v", 121 | "colab_type": "code", 122 | "colab": {} 123 | }, 124 | "source": [ 125 | "# 特徴量の重要度\n", 126 | "importances = clf.feature_importances_" 127 | ], 128 | "execution_count": 0, 129 | "outputs": [] 130 | }, 131 | { 132 | "cell_type": "code", 133 | "metadata": { 134 | "id": "UZuZlDh-nX2t", 135 | "colab_type": "code", 136 | "colab": {} 137 | }, 138 | "source": [ 139 | "print(importances)" 140 | ], 141 | "execution_count": 0, 142 | 
"outputs": [] 143 | }, 144 | { 145 | "cell_type": "code", 146 | "metadata": { 147 | "id": "8Ge1rhSVRuMJ", 148 | "colab_type": "code", 149 | "colab": {} 150 | }, 151 | "source": [ 152 | "features = np.array(iris.feature_names)\n", 153 | "\n", 154 | "# プロット\n", 155 | "indices = np.argsort(importances)\n", 156 | "plt.figure(figsize=(6,6))\n", 157 | "plt.barh(range(len(indices)), importances[indices], color='b', align='center')\n", 158 | "plt.yticks(range(len(indices)), features[indices])\n", 159 | "plt.savefig('rf_importance_iris.png')" 160 | ], 161 | "execution_count": 0, 162 | "outputs": [] 163 | }, 164 | { 165 | "cell_type": "code", 166 | "metadata": { 167 | "id": "qum_tMJkhuwF", 168 | "colab_type": "code", 169 | "colab": {} 170 | }, 171 | "source": [ 172 | "features" 173 | ], 174 | "execution_count": 0, 175 | "outputs": [] 176 | }, 177 | { 178 | "cell_type": "code", 179 | "metadata": { 180 | "id": "6XM7QTN0blCc", 181 | "colab_type": "code", 182 | "colab": {} 183 | }, 184 | "source": [ 185 | "" 186 | ], 187 | "execution_count": 0, 188 | "outputs": [] 189 | } 190 | ] 191 | } -------------------------------------------------------------------------------- /Python/04/randomForest_tweets.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "name": "Ch04-Tweet-RandomForest.ipynb", 7 | "version": "0.3.2", 8 | "provenance": [], 9 | "collapsed_sections": [] 10 | }, 11 | "kernelspec": { 12 | "name": "python3", 13 | "display_name": "Python 3" 14 | } 15 | }, 16 | "cells": [ 17 | { 18 | "cell_type": "code", 19 | "metadata": { 20 | "id": "kSb7IeHmFcWW", 21 | "colab_type": "code", 22 | "colab": {} 23 | }, 24 | "source": [ 25 | "import numpy as np\n", 26 | "import pandas as pd\n", 27 | "import matplotlib.pyplot as plt\n", 28 | "%matplotlib inline\n", 29 | "\n", 30 | "from sklearn.ensemble import RandomForestClassifier\n", 31 | "from sklearn.model_selection 
import train_test_split\n", 32 | "from sklearn.metrics import accuracy_score\n", 33 | "from sklearn.feature_extraction.text import TfidfVectorizer" 34 | ], 35 | "execution_count": 0, 36 | "outputs": [] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "metadata": { 41 | "id": "hHujZz1fFwJK", 42 | "colab_type": "code", 43 | "colab": {} 44 | }, 45 | "source": [ 46 | "# mecabインストール\n", 47 | "!apt install aptitude\n", 48 | "!aptitude install mecab libmecab-dev mecab-ipadic-utf8 git make curl xz-utils file -y\n", 49 | "\n", 50 | "# mecab pythonインストール(pythonでmecabを動かすために必要)\n", 51 | "!pip install mecab-python3==0.7\n", 52 | "\n", 53 | "# neologd辞書インストール\n", 54 | "!git clone --depth 1 https://github.com/neologd/mecab-ipadic-neologd.git\n", 55 | "!echo yes | mecab-ipadic-neologd/bin/install-mecab-ipadic-neologd -n\n", 56 | "\n", 57 | "# 辞書変更\n", 58 | "!sed -e \"s!/var/lib/mecab/dic/debian!/usr/lib/x86_64-linux-gnu/mecab/dic/mecab-ipadic-neologd!g\" /etc/mecabrc > /etc/mecabrc.new\n", 59 | "!cp /etc/mecabrc /etc/mecabrc.org\n", 60 | "!cp /etc/mecabrc.new /etc/mecabrc\n", 61 | "\n", 62 | "import MeCab" 63 | ], 64 | "execution_count": 0, 65 | "outputs": [] 66 | }, 67 | { 68 | "cell_type": "markdown", 69 | "metadata": { 70 | "id": "hqNNWh9YTaAC", 71 | "colab_type": "text" 72 | }, 73 | "source": [ 74 | "## データ読み込み" 75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "metadata": { 80 | "id": "ATIL0zuqTbt5", 81 | "colab_type": "code", 82 | "colab": {} 83 | }, 84 | "source": [ 85 | "data_tweet = pd.read_csv('tweets.tsv', sep=\"\\t\")\n", 86 | "data_tweet = data_tweet.dropna()\n", 87 | "Y = data_tweet.iloc[:,1].values\n", 88 | "\n", 89 | "print(data_tweet.head())" 90 | ], 91 | "execution_count": 0, 92 | "outputs": [] 93 | }, 94 | { 95 | "cell_type": "code", 96 | "metadata": { 97 | "id": "ThOUKPvPTb2W", 98 | "colab_type": "code", 99 | "colab": {} 100 | }, 101 | "source": [ 102 | "tagger = MeCab.Tagger()\n", 103 | "tagger.parse('')\n", 104 | "\n", 105 | "# 文字列を単語で分割しリストに格納する\n", 106 | 
"def word_tokenaize(texts):\n", 107 | " node = tagger.parseToNode(texts)\n", 108 | " word_list = []\n", 109 | " while node:\n", 110 | " word_type = node.feature.split(\",\")[0]\n", 111 | " if (word_type == '名詞'):#|(word_type == '形容詞'):\n", 112 | " word = node.feature.split(\",\")[6]\n", 113 | " if word != '*':\n", 114 | " word_list.append(word)\n", 115 | " node = node.next\n", 116 | "\n", 117 | " return word_list" 118 | ], 119 | "execution_count": 0, 120 | "outputs": [] 121 | }, 122 | { 123 | "cell_type": "code", 124 | "metadata": { 125 | "id": "dKukpU80UAFx", 126 | "colab_type": "code", 127 | "colab": {} 128 | }, 129 | "source": [ 130 | "vectorizer = TfidfVectorizer(tokenizer=word_tokenaize)\n", 131 | "\n", 132 | "tweet_matrix = vectorizer.fit_transform(data_tweet.iloc[:,0])\n", 133 | "X = tweet_matrix.toarray()" 134 | ], 135 | "execution_count": 0, 136 | "outputs": [] 137 | }, 138 | { 139 | "cell_type": "code", 140 | "metadata": { 141 | "id": "3aJTcqZYoXsb", 142 | "colab_type": "code", 143 | "colab": {} 144 | }, 145 | "source": [ 146 | "X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3)" 147 | ], 148 | "execution_count": 0, 149 | "outputs": [] 150 | }, 151 | { 152 | "cell_type": "markdown", 153 | "metadata": { 154 | "id": "CpGVPa-AHbLE", 155 | "colab_type": "text" 156 | }, 157 | "source": [ 158 | "## ランダムフォレストを実践" 159 | ] 160 | }, 161 | { 162 | "cell_type": "code", 163 | "metadata": { 164 | "id": "pnWZvrnNHAOB", 165 | "colab_type": "code", 166 | "colab": {} 167 | }, 168 | "source": [ 169 | "clf = RandomForestClassifier(n_estimators= 50, max_depth=20)" 170 | ], 171 | "execution_count": 0, 172 | "outputs": [] 173 | }, 174 | { 175 | "cell_type": "code", 176 | "metadata": { 177 | "id": "rZ3TQB9uHdQV", 178 | "colab_type": "code", 179 | "colab": {} 180 | }, 181 | "source": [ 182 | "# 学習\n", 183 | "clf.fit(X_train, y_train)\n", 184 | "\n", 185 | "# 評価\n", 186 | "y_pred = clf.predict(X_test)\n", 187 | "print(accuracy_score(y_test, y_pred))" 188 | ], 
189 | "execution_count": 0, 190 | "outputs": [] 191 | }, 192 | { 193 | "cell_type": "markdown", 194 | "metadata": { 195 | "id": "3LoiTrkzRn9u", 196 | "colab_type": "text" 197 | }, 198 | "source": [ 199 | "### 可視化" 200 | ] 201 | }, 202 | { 203 | "cell_type": "code", 204 | "metadata": { 205 | "id": "0ou82IgHRF9v", 206 | "colab_type": "code", 207 | "colab": {} 208 | }, 209 | "source": [ 210 | "words_list = vectorizer.get_feature_names()\n", 211 | "\n", 212 | "features = np.array(np.arange(0,len(words_list)))\n", 213 | "# 特徴量の重要度\n", 214 | "importances = clf.feature_importances_" 215 | ], 216 | "execution_count": 0, 217 | "outputs": [] 218 | }, 219 | { 220 | "cell_type": "code", 221 | "metadata": { 222 | "id": "M57MOkkcRo8H", 223 | "colab_type": "code", 224 | "colab": {} 225 | }, 226 | "source": [ 227 | "indices = np.argsort(importances)[-11:]\n", 228 | "plt.figure(figsize=(6,6))\n", 229 | "plt.barh(range(len(indices)), importances[indices], color='b', align='center')\n", 230 | "plt.yticks(range(len(indices)), features[indices])\n", 231 | "#plt.show()\n", 232 | "plt.savefig('rf_importance_tweet.png')" 233 | ], 234 | "execution_count": 0, 235 | "outputs": [] 236 | }, 237 | { 238 | "cell_type": "code", 239 | "metadata": { 240 | "id": "8Ge1rhSVRuMJ", 241 | "colab_type": "code", 242 | "colab": {} 243 | }, 244 | "source": [ 245 | "for i in indices:\n", 246 | " print(i, words_list[i])" 247 | ], 248 | "execution_count": 0, 249 | "outputs": [] 250 | }, 251 | { 252 | "cell_type": "code", 253 | "metadata": { 254 | "id": "IGZ0BlFxdrSQ", 255 | "colab_type": "code", 256 | "colab": {} 257 | }, 258 | "source": [ 259 | "" 260 | ], 261 | "execution_count": 0, 262 | "outputs": [] 263 | }, 264 | { 265 | "cell_type": "code", 266 | "metadata": { 267 | "id": "FYf6-GFRhgV8", 268 | "colab_type": "code", 269 | "colab": {} 270 | }, 271 | "source": [ 272 | "" 273 | ], 274 | "execution_count": 0, 275 | "outputs": [] 276 | } 277 | ] 278 | } 
-------------------------------------------------------------------------------- /Python/05/Kmeans_prefecture.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "name": "Ch05-prefecture-Kmeans.ipynb", 7 | "version": "0.3.2", 8 | "provenance": [], 9 | "collapsed_sections": [] 10 | }, 11 | "kernelspec": { 12 | "name": "python3", 13 | "display_name": "Python 3" 14 | } 15 | }, 16 | "cells": [ 17 | { 18 | "cell_type": "code", 19 | "metadata": { 20 | "id": "fjn-xOmMcOZd", 21 | "colab_type": "code", 22 | "colab": {} 23 | }, 24 | "source": [ 25 | "import numpy as np\n", 26 | "import pandas as pd\n", 27 | "import matplotlib.pyplot as plt\n", 28 | "%matplotlib inline\n", 29 | "\n", 30 | "from sklearn.cluster import KMeans\n", 31 | "from sklearn.preprocessing import StandardScaler" 32 | ], 33 | "execution_count": 0, 34 | "outputs": [] 35 | }, 36 | { 37 | "cell_type": "markdown", 38 | "metadata": { 39 | "id": "6KjYW74wx4yT", 40 | "colab_type": "text" 41 | }, 42 | "source": [ 43 | "## データ読み込み" 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "metadata": { 49 | "id": "ALMzMhn9hn9Y", 50 | "colab_type": "code", 51 | "colab": {} 52 | }, 53 | "source": [ 54 | "data_prefecture = pd.read_csv(\"data_prefecture_category.csv\", encoding='utf-8', index_col=0)" 55 | ], 56 | "execution_count": 0, 57 | "outputs": [] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "metadata": { 62 | "id": "PLw8rXe-huZ2", 63 | "colab_type": "code", 64 | "colab": {} 65 | }, 66 | "source": [ 67 | "print(data_prefecture.head())" 68 | ], 69 | "execution_count": 0, 70 | "outputs": [] 71 | }, 72 | { 73 | "cell_type": "code", 74 | "metadata": { 75 | "id": "lEJwvHBWkqkT", 76 | "colab_type": "code", 77 | "colab": {} 78 | }, 79 | "source": [ 80 | "# カンマ区切りの文字列を数値に変換\n", 81 | "data_prefecture_float = data_prefecture.apply(lambda x: x.str.replace(',','')).astype(np.float)" 82 | ], 83 | 
"execution_count": 0, 84 | "outputs": [] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "metadata": { 89 | "id": "kx_SRyvpkAkX", 90 | "colab_type": "code", 91 | "colab": {} 92 | }, 93 | "source": [ 94 | "print(data_prefecture_float.head())" 95 | ], 96 | "execution_count": 0, 97 | "outputs": [] 98 | }, 99 | { 100 | "cell_type": "markdown", 101 | "metadata": { 102 | "id": "oT_9YoHPyXaN", 103 | "colab_type": "text" 104 | }, 105 | "source": [ 106 | "## 可視化" 107 | ] 108 | }, 109 | { 110 | "cell_type": "code", 111 | "metadata": { 112 | "id": "7DKDD7tnyDWD", 113 | "colab_type": "code", 114 | "colab": {} 115 | }, 116 | "source": [ 117 | "plt.hist(data_prefecture_float[\"食料\"])" 118 | ], 119 | "execution_count": 0, 120 | "outputs": [] 121 | }, 122 | { 123 | "cell_type": "code", 124 | "metadata": { 125 | "id": "BLvnLvHHzZM0", 126 | "colab_type": "code", 127 | "colab": {} 128 | }, 129 | "source": [ 130 | "plt.hist(data_prefecture_float[\"住居\"])" 131 | ], 132 | "execution_count": 0, 133 | "outputs": [] 134 | }, 135 | { 136 | "cell_type": "code", 137 | "metadata": { 138 | "id": "ZntDAgVqzb1T", 139 | "colab_type": "code", 140 | "colab": {} 141 | }, 142 | "source": [ 143 | "plt.hist(data_prefecture_float[\"教育\"])" 144 | ], 145 | "execution_count": 0, 146 | "outputs": [] 147 | }, 148 | { 149 | "cell_type": "markdown", 150 | "metadata": { 151 | "id": "K0v4nSjbzkiv", 152 | "colab_type": "text" 153 | }, 154 | "source": [ 155 | "## K平均法" 156 | ] 157 | }, 158 | { 159 | "cell_type": "code", 160 | "metadata": { 161 | "id": "IRDfZM0UyhNq", 162 | "colab_type": "code", 163 | "colab": {} 164 | }, 165 | "source": [ 166 | "# 標準化\n", 167 | "scaler = StandardScaler()\n", 168 | "data_std = scaler.fit_transform(data_prefecture_float)" 169 | ], 170 | "execution_count": 0, 171 | "outputs": [] 172 | }, 173 | { 174 | "cell_type": "code", 175 | "metadata": { 176 | "id": "B5b6usAMi4_n", 177 | "colab_type": "code", 178 | "colab": {} 179 | }, 180 | "source": [ 181 | "k_means = KMeans(n_clusters=4)\n", 182 
| "k_means.fit(data_std)" 183 | ], 184 | "execution_count": 0, 185 | "outputs": [] 186 | }, 187 | { 188 | "cell_type": "code", 189 | "metadata": { 190 | "id": "Wn-OuwSblxXP", 191 | "colab_type": "code", 192 | "colab": {} 193 | }, 194 | "source": [ 195 | "print(k_means.labels_)" 196 | ], 197 | "execution_count": 0, 198 | "outputs": [] 199 | }, 200 | { 201 | "cell_type": "code", 202 | "metadata": { 203 | "id": "EX6_rR_BlzbB", 204 | "colab_type": "code", 205 | "colab": {} 206 | }, 207 | "source": [ 208 | "data_prefecture_float[\"label\"] = k_means.labels_" 209 | ], 210 | "execution_count": 0, 211 | "outputs": [] 212 | }, 213 | { 214 | "cell_type": "code", 215 | "metadata": { 216 | "id": "CF-wfz-Jl9xp", 217 | "colab_type": "code", 218 | "colab": {} 219 | }, 220 | "source": [ 221 | "print(data_prefecture_float[data_prefecture_float[\"label\"] == 0][\"label\"])\n" 222 | ], 223 | "execution_count": 0, 224 | "outputs": [] 225 | }, 226 | { 227 | "cell_type": "code", 228 | "metadata": { 229 | "id": "AQS_ADfDngAX", 230 | "colab_type": "code", 231 | "colab": {} 232 | }, 233 | "source": [ 234 | "print(data_prefecture_float[data_prefecture_float[\"label\"] == 1][\"label\"])\n", 235 | "print(data_prefecture_float[data_prefecture_float[\"label\"] == 2][\"label\"])\n", 236 | "print(data_prefecture_float[data_prefecture_float[\"label\"] == 3][\"label\"])" 237 | ], 238 | "execution_count": 0, 239 | "outputs": [] 240 | }, 241 | { 242 | "cell_type": "code", 243 | "metadata": { 244 | "id": "OagBxk02mVXC", 245 | "colab_type": "code", 246 | "colab": {} 247 | }, 248 | "source": [ 249 | "# クラスターごとの平均値を計算して、一つのDataFrameに格納する\n", 250 | "k_means_feature = pd.concat([data_prefecture_float[data_prefecture_float[\"label\"] == 0].mean(), \n", 251 | " data_prefecture_float[data_prefecture_float[\"label\"] == 1].mean(),\n", 252 | " data_prefecture_float[data_prefecture_float[\"label\"] == 2].mean(), \n", 253 | " data_prefecture_float[data_prefecture_float[\"label\"] == 3].mean()], axis = 1)" 254 | 
], 255 | "execution_count": 0, 256 | "outputs": [] 257 | }, 258 | { 259 | "cell_type": "code", 260 | "metadata": { 261 | "id": "0DRA0HkEm2pX", 262 | "colab_type": "code", 263 | "colab": {} 264 | }, 265 | "source": [ 266 | "k_means_feature" 267 | ], 268 | "execution_count": 0, 269 | "outputs": [] 270 | }, 271 | { 272 | "cell_type": "code", 273 | "metadata": { 274 | "id": "PaewoBOhqv9U", 275 | "colab_type": "code", 276 | "colab": {} 277 | }, 278 | "source": [ 279 | "" 280 | ], 281 | "execution_count": 0, 282 | "outputs": [] 283 | }, 284 | { 285 | "cell_type": "code", 286 | "metadata": { 287 | "id": "ih00cgwxqx9f", 288 | "colab_type": "code", 289 | "colab": {} 290 | }, 291 | "source": [ 292 | "" 293 | ], 294 | "execution_count": 0, 295 | "outputs": [] 296 | } 297 | ] 298 | } -------------------------------------------------------------------------------- /Python/05/data_prefecture_category.csv: -------------------------------------------------------------------------------- 1 | 都道府県,食料,住居,光熱・水道,家具・家事,被服及び,保健医療,交通・通信,教育,教養娯楽,諸雑費 2 | 札幌市,"819,536","279,764","228,330","103,893","129,292","99,902","442,564","124,799","276,976","218,769" 3 | 青森市,"790,368","259,971","295,102","96,173","98,267","115,529","427,590","96,241","245,912","232,403" 4 | 盛岡市,"771,420","246,223","250,260","102,652","142,183","123,152","438,431","144,845","276,140","286,892" 5 | 仙台市,"862,052","240,690","197,006","117,818","116,682","109,467","379,888","150,622","317,874","280,381" 6 | 秋田市,"835,325","226,152","296,036","111,587","127,798","133,474","496,526","111,430","280,440","238,857" 7 | 山形市,"841,537","315,770","285,590","99,357","125,567","104,612","770,941","107,336","302,035","289,043" 8 | 福島市,"950,582","285,711","257,681","126,588","169,182","94,275","665,083","141,012","392,401","276,986" 9 | 水戸市,"877,968","235,274","231,740","127,631","174,481","119,688","695,369","200,251","390,123","322,231" 10 | 
宇都宮市,"970,391","294,398","243,081","104,325","171,918","125,397","622,628","175,432","375,213","292,779" 11 | 前橋市,"876,472","149,049","202,882","150,428","166,129","142,103","549,336","113,726","397,195","313,629" 12 | さいたま市,"1,042,267","350,989","216,828","110,043","173,828","174,833","501,966","275,513","330,177","276,978" 13 | 千葉市,"867,636","162,260","153,227","81,768","142,156","87,722","421,253","155,287","329,146","320,532" 14 | 東京都区部,"943,279","404,843","175,822","112,716","208,975","156,721","417,168","272,696","423,476","254,768" 15 | 横浜市,"926,253","215,616","184,484","124,547","172,798","136,661","517,576","251,826","420,737","275,789" 16 | 新潟市,"842,736","178,061","254,426","116,049","128,177","114,074","606,168","199,170","265,664","316,409" 17 | 富山市,"896,917","307,401","263,618","127,392","122,275","114,880","579,845","91,179","336,369","263,650" 18 | 金沢市,"971,470","220,831","246,180","125,704","167,773","101,640","680,653","245,222","405,272","355,490" 19 | 福井市,"925,413","151,093","249,017","94,646","114,519","99,707","462,830","122,414","328,129","277,653" 20 | 甲府市,"747,397","300,816","214,981","90,925","101,371","104,563","420,691","116,368","323,950","234,201" 21 | 長野市,"786,130","344,086","239,435","109,564","116,436","108,134","519,702","92,604","266,054","289,707" 22 | 岐阜市,"865,541","201,315","239,365","130,079","173,834","135,925","699,940","243,758","414,244","305,166" 23 | 静岡市,"807,241","358,014","204,189","106,298","139,274","109,700","432,415","119,306","316,773","227,907" 24 | 名古屋市,"821,916","249,793","156,478","82,537","139,540","104,044","480,970","107,105","394,293","224,362" 25 | 津市,"863,096","195,647","203,113","125,860","164,073","117,537","517,539","251,968","386,805","251,410" 26 | 大津市,"915,677","108,352","236,832","158,680","141,251","108,875","521,557","180,740","325,487","245,402" 27 | 京都市,"845,226","210,964","232,337","88,931","129,277","92,014","390,179","212,035","358,755","246,851" 28 | 
大阪市,"840,018","269,369","177,417","95,044","114,748","138,580","369,889","140,737","317,359","202,192" 29 | 神戸市,"656,924","136,381","103,216","67,591","110,686","54,228","319,734","31,347","208,916","184,317" 30 | 奈良市,"898,884","157,240","272,448","114,845","165,037","144,301","496,535","388,515","399,766","282,591" 31 | 和歌山市,"887,859","244,498","246,528","130,329","152,058","92,863","510,125","144,763","343,537","241,471" 32 | 鳥取市,"706,962","204,600","194,986","108,323","103,304","86,720","513,462","77,770","230,101","281,468" 33 | 松江市,"727,565","328,050","221,065","93,567","103,611","105,134","545,464","85,915","292,628","281,605" 34 | 岡山市,"765,652","289,496","202,733","96,181","161,001","136,606","502,230","151,293","302,995","233,083" 35 | 広島市,"810,255","219,623","182,511","105,210","127,351","104,142","605,174","181,977","284,268","220,201" 36 | 山口市,"607,019","363,261","177,832","86,593","100,132","108,410","586,591","59,450","298,965","223,511" 37 | 徳島市,"817,065","183,086","211,546","119,732","153,757","113,235","443,341","239,275","362,019","277,219" 38 | 高松市,"809,931","323,569","227,821","119,424","129,374","131,729","615,294","103,593","279,503","243,670" 39 | 松山市,"828,274","197,045","241,818","125,931","159,782","102,157","491,929","208,938","305,368","253,428" 40 | 高知市,"803,052","310,383","225,292","198,099","119,242","102,917","533,892","157,375","309,526","280,036" 41 | 福岡市,"760,638","188,295","156,097","116,400","152,971","96,334","471,238","120,417","355,085","273,449" 42 | 佐賀市,"814,400","262,685","224,972","98,570","140,041","144,157","515,064","144,634","359,726","284,169" 43 | 長崎市,"658,520","308,171","210,173","84,279","115,569","83,159","390,576","88,847","187,986","182,308" 44 | 熊本市,"870,311","311,909","243,256","143,752","152,455","133,442","509,583","223,684","345,740","338,671" 45 | 大分市,"789,001","355,356","207,281","135,103","162,991","100,884","561,382","97,157","451,635","313,381" 46 | 
宮崎市,"778,907","222,861","185,008","96,874","122,197","113,690","559,338","131,236","279,217","278,642" 47 | 鹿児島市,"787,120","345,632","198,035","116,358","164,759","121,532","552,727","108,190","298,067","254,743" 48 | 那覇市,"726,160","337,851","211,156","111,406","100,591","102,076","448,672","131,853","237,977","175,467" 49 | 川崎市,"872,136","427,698","158,914","87,568","112,286","94,975","324,322","124,695","292,594","215,952" 50 | 相模原市,"756,340","290,616","166,843","95,209","112,149","122,466","302,611","112,232","363,026","210,583" 51 | 浜松市,"803,305","198,314","187,801","105,721","127,230","114,431","680,801","128,846","281,603","236,023" 52 | 堺市,"927,069","236,032","257,915","123,388","155,071","129,308","640,550","247,428","419,882","251,675" 53 | 北九州市,"862,432","149,685","206,224","109,226","167,286","146,687","700,366","164,434","270,083","256,207" -------------------------------------------------------------------------------- /Python/05/pca_iris.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "name": "Ch05-iris-pca.ipynb", 7 | "version": "0.3.2", 8 | "provenance": [], 9 | "collapsed_sections": [] 10 | }, 11 | "kernelspec": { 12 | "name": "python3", 13 | "display_name": "Python 3" 14 | } 15 | }, 16 | "cells": [ 17 | { 18 | "cell_type": "code", 19 | "metadata": { 20 | "id": "1fXQD0FV11Kd", 21 | "colab_type": "code", 22 | "colab": {} 23 | }, 24 | "source": [ 25 | "import numpy as np\n", 26 | "import pandas as pd\n", 27 | "import matplotlib.pyplot as plt\n", 28 | "%matplotlib inline\n", 29 | "\n", 30 | "from sklearn.decomposition import PCA\n", 31 | "from sklearn.preprocessing import StandardScaler\n", 32 | "\n", 33 | "from sklearn.datasets import load_iris" 34 | ], 35 | "execution_count": 0, 36 | "outputs": [] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "metadata": { 41 | "id": "0O3Q1upd2VXx", 42 | "colab_type": "code", 43 | "colab": 
{} 44 | }, 45 | "source": [ 46 | "iris = load_iris()\n", 47 | "\n", 48 | "data_iris = pd.DataFrame(iris.data, columns=iris.feature_names)\n", 49 | "data_iris[\"target\"] = iris.target\n", 50 | "\n", 51 | "print(data_iris.head())\n", 52 | "print(data_iris.shape)" 53 | ], 54 | "execution_count": 0, 55 | "outputs": [] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "metadata": { 60 | "id": "aJ86062Q2XZd", 61 | "colab_type": "code", 62 | "colab": {} 63 | }, 64 | "source": [ 65 | "data_iris.describe()" 66 | ], 67 | "execution_count": 0, 68 | "outputs": [] 69 | }, 70 | { 71 | "cell_type": "markdown", 72 | "metadata": { 73 | "id": "bWeaduLQ2sP5", 74 | "colab_type": "text" 75 | }, 76 | "source": [ 77 | "## 主成分分析" 78 | ] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "metadata": { 83 | "id": "7jmVukQ-HEzL", 84 | "colab_type": "code", 85 | "colab": {} 86 | }, 87 | "source": [ 88 | "## 標準化\n", 89 | "scaler = StandardScaler()\n", 90 | "data_std = scaler.fit_transform(data_iris[iris.feature_names])" 91 | ], 92 | "execution_count": 0, 93 | "outputs": [] 94 | }, 95 | { 96 | "cell_type": "code", 97 | "metadata": { 98 | "id": "hKq2PZecE7Qv", 99 | "colab_type": "code", 100 | "colab": {} 101 | }, 102 | "source": [ 103 | "data_std_df = pd.DataFrame(data_std, columns=data_iris.columns[0:4])\n", 104 | "\n", 105 | "# もとのデータ\n", 106 | "print(data_iris.describe())\n", 107 | "\n", 108 | "# 標準化後のデータ\n", 109 | "print(data_std_df.describe())" 110 | ], 111 | "execution_count": 0, 112 | "outputs": [] 113 | }, 114 | { 115 | "cell_type": "code", 116 | "metadata": { 117 | "id": "oKknBSSI2cbv", 118 | "colab_type": "code", 119 | "colab": {} 120 | }, 121 | "source": [ 122 | "pca = PCA(n_components=2)\n", 123 | "pca_transformed = pca.fit_transform(data_std)" 124 | ], 125 | "execution_count": 0, 126 | "outputs": [] 127 | }, 128 | { 129 | "cell_type": "code", 130 | "metadata": { 131 | "id": "zr4jna7_HF38", 132 | "colab_type": "code", 133 | "colab": {} 134 | }, 135 | "source": [ 136 | 
"print(pca_transformed.shape)" 137 | ], 138 | "execution_count": 0, 139 | "outputs": [] 140 | }, 141 | { 142 | "cell_type": "code", 143 | "metadata": { 144 | "id": "EHAuY_xE4fsv", 145 | "colab_type": "code", 146 | "colab": {} 147 | }, 148 | "source": [ 149 | "plt.scatter(pca_transformed[:, 0], pca_transformed[:, 1])" 150 | ], 151 | "execution_count": 0, 152 | "outputs": [] 153 | }, 154 | { 155 | "cell_type": "code", 156 | "metadata": { 157 | "id": "oKP0ctmc4yXv", 158 | "colab_type": "code", 159 | "colab": {} 160 | }, 161 | "source": [ 162 | "plt.scatter(pca_transformed[:, 0], pca_transformed[:, 1], c=data_iris[\"target\"])" 163 | ], 164 | "execution_count": 0, 165 | "outputs": [] 166 | }, 167 | { 168 | "cell_type": "markdown", 169 | "metadata": { 170 | "id": "2D5Tdi0259sT", 171 | "colab_type": "text" 172 | }, 173 | "source": [ 174 | "### 寄与度" 175 | ] 176 | }, 177 | { 178 | "cell_type": "code", 179 | "metadata": { 180 | "id": "N2TPBL0n5Clm", 181 | "colab_type": "code", 182 | "colab": {} 183 | }, 184 | "source": [ 185 | "print(pca.explained_variance_ratio_)" 186 | ], 187 | "execution_count": 0, 188 | "outputs": [] 189 | }, 190 | { 191 | "cell_type": "code", 192 | "metadata": { 193 | "id": "O1ezKZcq6Goo", 194 | "colab_type": "code", 195 | "colab": {} 196 | }, 197 | "source": [ 198 | "print(sum(pca.explained_variance_ratio_))" 199 | ], 200 | "execution_count": 0, 201 | "outputs": [] 202 | }, 203 | { 204 | "cell_type": "code", 205 | "metadata": { 206 | "id": "EE1frz2M6IcN", 207 | "colab_type": "code", 208 | "colab": {} 209 | }, 210 | "source": [ 211 | "" 212 | ], 213 | "execution_count": 0, 214 | "outputs": [] 215 | } 216 | ] 217 | } -------------------------------------------------------------------------------- /Python/05/pca_prefecture.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "name": "Ch05-prefecture-pca.ipynb", 7 | "version": "0.3.2", 
8 | "provenance": [], 9 | "collapsed_sections": [], 10 | "toc_visible": true 11 | }, 12 | "kernelspec": { 13 | "name": "python3", 14 | "display_name": "Python 3" 15 | } 16 | }, 17 | "cells": [ 18 | { 19 | "cell_type": "code", 20 | "metadata": { 21 | "id": "3yZ2JWsv13Yj", 22 | "colab_type": "code", 23 | "colab": {} 24 | }, 25 | "source": [ 26 | "import numpy as np\n", 27 | "import pandas as pd\n", 28 | "import matplotlib.pyplot as plt\n", 29 | "%matplotlib inline\n", 30 | "from mpl_toolkits.mplot3d import Axes3D\n", 31 | "\n", 32 | "from sklearn.decomposition import PCA\n", 33 | "from sklearn.preprocessing import StandardScaler" 34 | ], 35 | "execution_count": 0, 36 | "outputs": [] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "metadata": { 41 | "id": "P8nKuedD6WiW", 42 | "colab_type": "code", 43 | "colab": {} 44 | }, 45 | "source": [ 46 | "data_prefecture = pd.read_csv(\"data_prefecture_category.csv\", encoding='utf-8', index_col=0)" 47 | ], 48 | "execution_count": 0, 49 | "outputs": [] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "metadata": { 54 | "id": "ysoFHIiR6Ydc", 55 | "colab_type": "code", 56 | "colab": {} 57 | }, 58 | "source": [ 59 | "print(data_prefecture.head())" 60 | ], 61 | "execution_count": 0, 62 | "outputs": [] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "metadata": { 67 | "id": "nfoP6S_L6a1E", 68 | "colab_type": "code", 69 | "colab": {} 70 | }, 71 | "source": [ 72 | "# カンマ区切りの文字列を数値に変換\n", 73 | "data_prefecture_float = data_prefecture.apply(lambda x: x.str.replace(',','')).astype(np.float)" 74 | ], 75 | "execution_count": 0, 76 | "outputs": [] 77 | }, 78 | { 79 | "cell_type": "code", 80 | "metadata": { 81 | "id": "cLrqPSPR6cfO", 82 | "colab_type": "code", 83 | "colab": {} 84 | }, 85 | "source": [ 86 | "print(data_prefecture_float.head())" 87 | ], 88 | "execution_count": 0, 89 | "outputs": [] 90 | }, 91 | { 92 | "cell_type": "code", 93 | "metadata": { 94 | "id": "vHLhXKXl6eBq", 95 | "colab_type": "code", 96 | "colab": {} 97 | }, 98 | "source": [ 
99 | "plt.hist(data_prefecture_float[\"食料\"])" 100 | ], 101 | "execution_count": 0, 102 | "outputs": [] 103 | }, 104 | { 105 | "cell_type": "code", 106 | "metadata": { 107 | "id": "k_HpeCwo6fi_", 108 | "colab_type": "code", 109 | "colab": {} 110 | }, 111 | "source": [ 112 | "plt.hist(data_prefecture_float[\"住居\"])" 113 | ], 114 | "execution_count": 0, 115 | "outputs": [] 116 | }, 117 | { 118 | "cell_type": "code", 119 | "metadata": { 120 | "id": "R-znSRyI6hGz", 121 | "colab_type": "code", 122 | "colab": {} 123 | }, 124 | "source": [ 125 | "plt.hist(data_prefecture_float[\"教育\"])" 126 | ], 127 | "execution_count": 0, 128 | "outputs": [] 129 | }, 130 | { 131 | "cell_type": "markdown", 132 | "metadata": { 133 | "id": "cPdr4CGA6lHX", 134 | "colab_type": "text" 135 | }, 136 | "source": [ 137 | "## 主成分分析" 138 | ] 139 | }, 140 | { 141 | "cell_type": "code", 142 | "metadata": { 143 | "id": "OfIoWjir6xRr", 144 | "colab_type": "code", 145 | "colab": {} 146 | }, 147 | "source": [ 148 | "# 標準化\n", 149 | "\n", 150 | "scaler = StandardScaler()\n", 151 | "data_std = scaler.fit_transform(data_prefecture_float)" 152 | ], 153 | "execution_count": 0, 154 | "outputs": [] 155 | }, 156 | { 157 | "cell_type": "code", 158 | "metadata": { 159 | "id": "L7oIaSiW6im3", 160 | "colab_type": "code", 161 | "colab": {} 162 | }, 163 | "source": [ 164 | "pca = PCA(n_components=2)\n", 165 | "pca_transformed = pca.fit_transform(data_std)" 166 | ], 167 | "execution_count": 0, 168 | "outputs": [] 169 | }, 170 | { 171 | "cell_type": "code", 172 | "metadata": { 173 | "id": "wOIjtLG86tt8", 174 | "colab_type": "code", 175 | "colab": {} 176 | }, 177 | "source": [ 178 | "plt.scatter(pca_transformed[:, 0], pca_transformed[:, 1])" 179 | ], 180 | "execution_count": 0, 181 | "outputs": [] 182 | }, 183 | { 184 | "cell_type": "code", 185 | "metadata": { 186 | "id": "SCB9i_hMBqs6", 187 | "colab_type": "code", 188 | "colab": {} 189 | }, 190 | "source": [ 191 | "fig, ax = plt.subplots(figsize=(14, 8))\n", 192 | "\n", 
193 | "plt.scatter(pca_transformed[:, 0], pca_transformed[:, 1])\n", 194 | "for k, v in enumerate(pca_transformed):\n", 195 | " ax.annotate(k, xy=(v[0],v[1]),size=10)" 196 | ], 197 | "execution_count": 0, 198 | "outputs": [] 199 | }, 200 | { 201 | "cell_type": "code", 202 | "metadata": { 203 | "id": "YiLk33xLDVFr", 204 | "colab_type": "code", 205 | "colab": {} 206 | }, 207 | "source": [ 208 | "for i in range(data_prefecture_float.shape[0]):\n", 209 | " print(i, data_prefecture_float.index[i])" 210 | ], 211 | "execution_count": 0, 212 | "outputs": [] 213 | }, 214 | { 215 | "cell_type": "markdown", 216 | "metadata": { 217 | "id": "BkgQjkFB64sH", 218 | "colab_type": "text" 219 | }, 220 | "source": [ 221 | "### 寄与度" 222 | ] 223 | }, 224 | { 225 | "cell_type": "code", 226 | "metadata": { 227 | "id": "lk5wlzgG61GO", 228 | "colab_type": "code", 229 | "colab": {} 230 | }, 231 | "source": [ 232 | "print(pca.explained_variance_ratio_)" 233 | ], 234 | "execution_count": 0, 235 | "outputs": [] 236 | }, 237 | { 238 | "cell_type": "code", 239 | "metadata": { 240 | "id": "TEDtVFMk65tW", 241 | "colab_type": "code", 242 | "colab": {} 243 | }, 244 | "source": [ 245 | "print(sum(pca.explained_variance_ratio_))" 246 | ], 247 | "execution_count": 0, 248 | "outputs": [] 249 | }, 250 | { 251 | "cell_type": "markdown", 252 | "metadata": { 253 | "id": "lLDsFaTT7I6W", 254 | "colab_type": "text" 255 | }, 256 | "source": [ 257 | "### 次元を増やす" 258 | ] 259 | }, 260 | { 261 | "cell_type": "code", 262 | "metadata": { 263 | "id": "A7SvxLRz67UA", 264 | "colab_type": "code", 265 | "colab": {} 266 | }, 267 | "source": [ 268 | "pca2 = PCA(n_components=3)\n", 269 | "pca2_transformed = pca2.fit_transform(data_std)" 270 | ], 271 | "execution_count": 0, 272 | "outputs": [] 273 | }, 274 | { 275 | "cell_type": "code", 276 | "metadata": { 277 | "id": "ECpeAHo16_pw", 278 | "colab_type": "code", 279 | "colab": {} 280 | }, 281 | "source": [ 282 | "print(sum(pca2.explained_variance_ratio_))" 283 | ], 284 | 
"execution_count": 0, 285 | "outputs": [] 286 | }, 287 | { 288 | "cell_type": "code", 289 | "metadata": { 290 | "id": "z9t9BTuz7UG5", 291 | "colab_type": "code", 292 | "colab": {} 293 | }, 294 | "source": [ 295 | "print(pca2.explained_variance_ratio_)" 296 | ], 297 | "execution_count": 0, 298 | "outputs": [] 299 | }, 300 | { 301 | "cell_type": "code", 302 | "metadata": { 303 | "id": "ExQFfXh-8ISK", 304 | "colab_type": "code", 305 | "colab": {} 306 | }, 307 | "source": [ 308 | "fig = plt.figure()\n", 309 | "ax = fig.add_subplot(111, projection='3d')\n", 310 | "ax.scatter3D(pca2_transformed[:, 0], pca2_transformed[:, 1], pca2_transformed[:, 2])\n", 311 | "ax.set_title(\"Scatter Plot\")\n", 312 | "ax.view_init(40, 100)\n", 313 | "\n", 314 | "\n", 315 | "plt.show()" 316 | ], 317 | "execution_count": 0, 318 | "outputs": [] 319 | }, 320 | { 321 | "cell_type": "code", 322 | "metadata": { 323 | "id": "N8JmfSnXzSww", 324 | "colab_type": "code", 325 | "colab": {} 326 | }, 327 | "source": [ 328 | "pca2_transformed" 329 | ], 330 | "execution_count": 0, 331 | "outputs": [] 332 | }, 333 | { 334 | "cell_type": "code", 335 | "metadata": { 336 | "id": "ihqW-C3e00c-", 337 | "colab_type": "code", 338 | "colab": {} 339 | }, 340 | "source": [ 341 | "" 342 | ], 343 | "execution_count": 0, 344 | "outputs": [] 345 | } 346 | ] 347 | } -------------------------------------------------------------------------------- /Python/06/classification.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "name": "classification.ipynb", 7 | "version": "0.3.2", 8 | "provenance": [], 9 | "collapsed_sections": [] 10 | }, 11 | "kernelspec": { 12 | "name": "python3", 13 | "display_name": "Python 3" 14 | } 15 | }, 16 | "cells": [ 17 | { 18 | "cell_type": "code", 19 | "metadata": { 20 | "id": "dj5RSy08UIcH", 21 | "colab_type": "code", 22 | "colab": {} 23 | }, 24 | "source": [ 25 | "import 
numpy as np\n", 26 | "import pandas as pd\n", 27 | "\n", 28 | "from sklearn.datasets import load_iris\n", 29 | "from sklearn.linear_model import LogisticRegression\n", 30 | "\n", 31 | "from sklearn.model_selection import train_test_split\n", 32 | "from sklearn.metrics import accuracy_score\n", 33 | "from sklearn.metrics import roc_auc_score" 34 | ], 35 | "execution_count": 0, 36 | "outputs": [] 37 | }, 38 | { 39 | "cell_type": "markdown", 40 | "metadata": { 41 | "id": "6i4REqNKVDTv", 42 | "colab_type": "text" 43 | }, 44 | "source": [ 45 | "## データ読み込み\n" 46 | ] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "metadata": { 51 | "id": "7gQ4SukTUBM7", 52 | "colab_type": "code", 53 | "colab": {} 54 | }, 55 | "source": [ 56 | "iris = load_iris()\n", 57 | "\n", 58 | "tmp_data = pd.DataFrame(iris.data, columns=iris.feature_names)\n", 59 | "tmp_data[\"target\"] = iris.target\n", 60 | "\n", 61 | "data_iris = tmp_data[tmp_data['target'] <= 1]\n", 62 | "\n", 63 | "x_column_list = ['sepal length (cm)']\n", 64 | "y_column_list = ['target']\n", 65 | "\n", 66 | "X_train, X_test, y_train, y_test = train_test_split(data_iris[x_column_list], \n", 67 | " data_iris[y_column_list], test_size=0.3)" 68 | ], 69 | "execution_count": 0, 70 | "outputs": [] 71 | }, 72 | { 73 | "cell_type": "markdown", 74 | "metadata": { 75 | "id": "oxavKwzVVPTp", 76 | "colab_type": "text" 77 | }, 78 | "source": [ 79 | "## 学習と予測" 80 | ] 81 | }, 82 | { 83 | "cell_type": "code", 84 | "metadata": { 85 | "id": "0Ogf7jEFVK7K", 86 | "colab_type": "code", 87 | "colab": {} 88 | }, 89 | "source": [ 90 | "logit = LogisticRegression()\n", 91 | "\n", 92 | "logit = LogisticRegression()\n", 93 | "logit.fit(X_train, y_train)\n", 94 | "\n", 95 | "y_pred = logit.predict(X_test)" 96 | ], 97 | "execution_count": 0, 98 | "outputs": [] 99 | }, 100 | { 101 | "cell_type": "markdown", 102 | "metadata": { 103 | "id": "Gc1F9yRXVz5G", 104 | "colab_type": "text" 105 | }, 106 | "source": [ 107 | "## 正解率" 108 | ] 109 | }, 110 | { 111 | 
"cell_type": "code", 112 | "metadata": { 113 | "id": "LDFpNB5DVunc", 114 | "colab_type": "code", 115 | "colab": {} 116 | }, 117 | "source": [ 118 | "accuracy_score(y_test, y_pred)" 119 | ], 120 | "execution_count": 0, 121 | "outputs": [] 122 | }, 123 | { 124 | "cell_type": "markdown", 125 | "metadata": { 126 | "id": "a0xBXjdDWAIK", 127 | "colab_type": "text" 128 | }, 129 | "source": [ 130 | "## AUC" 131 | ] 132 | }, 133 | { 134 | "cell_type": "code", 135 | "metadata": { 136 | "id": "jKJMH5oMV99s", 137 | "colab_type": "code", 138 | "colab": {} 139 | }, 140 | "source": [ 141 | "roc_auc_score(y_test, y_pred)" 142 | ], 143 | "execution_count": 0, 144 | "outputs": [] 145 | }, 146 | { 147 | "cell_type": "code", 148 | "metadata": { 149 | "id": "cL_YCijTUhAr", 150 | "colab_type": "code", 151 | "colab": {} 152 | }, 153 | "source": [ 154 | "" 155 | ], 156 | "execution_count": 0, 157 | "outputs": [] 158 | } 159 | ] 160 | } -------------------------------------------------------------------------------- /Python/06/regression.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "name": "regression.ipynb", 7 | "version": "0.3.2", 8 | "provenance": [], 9 | "collapsed_sections": [] 10 | }, 11 | "kernelspec": { 12 | "name": "python3", 13 | "display_name": "Python 3" 14 | } 15 | }, 16 | "cells": [ 17 | { 18 | "cell_type": "code", 19 | "metadata": { 20 | "id": "Tplx-pF0QMTW", 21 | "colab_type": "code", 22 | "colab": {} 23 | }, 24 | "source": [ 25 | "import numpy as np\n", 26 | "import pandas as pd\n", 27 | "\n", 28 | "from sklearn.linear_model import LinearRegression\n", 29 | "from sklearn.datasets import load_boston\n", 30 | "\n", 31 | "from sklearn.model_selection import train_test_split\n", 32 | "from sklearn.metrics import mean_absolute_error\n", 33 | "from sklearn.metrics import mean_squared_error\n", 34 | "from sklearn.metrics import mean_squared_log_error" 
35 | ], 36 | "execution_count": 0, 37 | "outputs": [] 38 | }, 39 | { 40 | "cell_type": "markdown", 41 | "metadata": { 42 | "id": "5RuL6EUjSP6L", 43 | "colab_type": "text" 44 | }, 45 | "source": [ 46 | "## データ読み込み" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "metadata": { 52 | "id": "Ed4iMNSlSMJE", 53 | "colab_type": "code", 54 | "colab": {} 55 | }, 56 | "source": [ 57 | "boston = load_boston()\n", 58 | "data_boston = pd.DataFrame(boston.data, columns=boston.feature_names)\n", 59 | "data_boston['PRICE'] = boston.target\n", 60 | "\n", 61 | "lr_multi = LinearRegression()\n", 62 | "\n", 63 | "x_column_list_for_multi = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', \n", 64 | " 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT']\n", 65 | "y_column_list_for_multi = ['PRICE']\n", 66 | "\n", 67 | "X_train, X_test, y_train, y_test = train_test_split(data_boston[x_column_list_for_multi], \n", 68 | " data_boston[y_column_list_for_multi], test_size=0.3)" 69 | ], 70 | "execution_count": 0, 71 | "outputs": [] 72 | }, 73 | { 74 | "cell_type": "markdown", 75 | "metadata": { 76 | "id": "6lAp0JMISvtD", 77 | "colab_type": "text" 78 | }, 79 | "source": [ 80 | "## 学習と予測" 81 | ] 82 | }, 83 | { 84 | "cell_type": "code", 85 | "metadata": { 86 | "id": "Weqsnyi0SvSZ", 87 | "colab_type": "code", 88 | "colab": {} 89 | }, 90 | "source": [ 91 | "lr_multi.fit(X_train, y_train) \n", 92 | "y_pred = lr_multi.predict(X_test)" 93 | ], 94 | "execution_count": 0, 95 | "outputs": [] 96 | }, 97 | { 98 | "cell_type": "markdown", 99 | "metadata": { 100 | "id": "Ld1yFxecTJdF", 101 | "colab_type": "text" 102 | }, 103 | "source": [ 104 | "## RMSE" 105 | ] 106 | }, 107 | { 108 | "cell_type": "code", 109 | "metadata": { 110 | "id": "NA0uL9a5ZAql", 111 | "colab_type": "code", 112 | "colab": {} 113 | }, 114 | "source": [ 115 | "mean_squared_error(y_test, y_pred)" 116 | ], 117 | "execution_count": 0, 118 | "outputs": [] 119 | }, 120 | { 121 | "cell_type": "code", 122 | "metadata": { 123 | "id": 
"Zg-a3SD_TLPM", 124 | "colab_type": "code", 125 | "colab": {} 126 | }, 127 | "source": [ 128 | "np.sqrt(mean_squared_error(y_test, y_pred))" 129 | ], 130 | "execution_count": 0, 131 | "outputs": [] 132 | }, 133 | { 134 | "cell_type": "markdown", 135 | "metadata": { 136 | "id": "fljMKBwdTBxJ", 137 | "colab_type": "text" 138 | }, 139 | "source": [ 140 | "## MAE" 141 | ] 142 | }, 143 | { 144 | "cell_type": "code", 145 | "metadata": { 146 | "id": "qJqX5ZeGSoQ7", 147 | "colab_type": "code", 148 | "colab": {} 149 | }, 150 | "source": [ 151 | "mean_absolute_error(y_test, y_pred)" 152 | ], 153 | "execution_count": 0, 154 | "outputs": [] 155 | }, 156 | { 157 | "cell_type": "markdown", 158 | "metadata": { 159 | "id": "BgJdnzIoTIV5", 160 | "colab_type": "text" 161 | }, 162 | "source": [ 163 | "## RMSLE" 164 | ] 165 | }, 166 | { 167 | "cell_type": "code", 168 | "metadata": { 169 | "id": "RHvxOfkKZruN", 170 | "colab_type": "code", 171 | "colab": {} 172 | }, 173 | "source": [ 174 | "mean_squared_log_error (y_test, y_pred)" 175 | ], 176 | "execution_count": 0, 177 | "outputs": [] 178 | }, 179 | { 180 | "cell_type": "code", 181 | "metadata": { 182 | "id": "LoKr-oCWTHcp", 183 | "colab_type": "code", 184 | "colab": {} 185 | }, 186 | "source": [ 187 | "np.sqrt(mean_squared_log_error (y_test, y_pred))" 188 | ], 189 | "execution_count": 0, 190 | "outputs": [] 191 | }, 192 | { 193 | "cell_type": "code", 194 | "metadata": { 195 | "id": "m0Lqa0CGTxUF", 196 | "colab_type": "code", 197 | "colab": {} 198 | }, 199 | "source": [ 200 | "" 201 | ], 202 | "execution_count": 0, 203 | "outputs": [] 204 | } 205 | ] 206 | } -------------------------------------------------------------------------------- /Python/07/cnn_mnist.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "name": "Ch07-mnist-cnn.ipynb", 7 | "version": "0.3.2", 8 | "provenance": [], 9 | "collapsed_sections": [] 
10 | }, 11 | "kernelspec": { 12 | "name": "python3", 13 | "display_name": "Python 3" 14 | }, 15 | "accelerator": "GPU" 16 | }, 17 | "cells": [ 18 | { 19 | "cell_type": "code", 20 | "metadata": { 21 | "id": "FSJx3WFbK_mI", 22 | "colab_type": "code", 23 | "colab": {} 24 | }, 25 | "source": [ 26 | "from keras.layers import Conv2D, MaxPool2D, Flatten, Dense\n", 27 | "from keras.models import Sequential\n", 28 | "\n", 29 | "from keras.utils import to_categorical\n", 30 | "import matplotlib.pyplot as plt\n", 31 | "\n", 32 | "from keras.datasets import mnist" 33 | ], 34 | "execution_count": 0, 35 | "outputs": [] 36 | }, 37 | { 38 | "cell_type": "markdown", 39 | "metadata": { 40 | "id": "wVmdIQwbLTlW", 41 | "colab_type": "text" 42 | }, 43 | "source": [ 44 | "## データ読み込み" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "metadata": { 50 | "id": "wBIqFYBaLKpi", 51 | "colab_type": "code", 52 | "colab": {} 53 | }, 54 | "source": [ 55 | "(X_train, y_train), (X_test, y_test) = mnist.load_data()\n", 56 | "\n", 57 | "print(X_train.shape)" 58 | ], 59 | "execution_count": 0, 60 | "outputs": [] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "metadata": { 65 | "id": "3IQ6YmIALOAS", 66 | "colab_type": "code", 67 | "colab": {} 68 | }, 69 | "source": [ 70 | "plt.imshow(X_train[0], cmap='gray')" 71 | ], 72 | "execution_count": 0, 73 | "outputs": [] 74 | }, 75 | { 76 | "cell_type": "code", 77 | "metadata": { 78 | "id": "8jnlir0kLP41", 79 | "colab_type": "code", 80 | "colab": {} 81 | }, 82 | "source": [ 83 | "print(y_train.shape)\n", 84 | "print(y_train[0])" 85 | ], 86 | "execution_count": 0, 87 | "outputs": [] 88 | }, 89 | { 90 | "cell_type": "markdown", 91 | "metadata": { 92 | "id": "niA-WDCeLVHh", 93 | "colab_type": "text" 94 | }, 95 | "source": [ 96 | "### データ整形" 97 | ] 98 | }, 99 | { 100 | "cell_type": "code", 101 | "metadata": { 102 | "id": "agl1i6TjLR05", 103 | "colab_type": "code", 104 | "colab": {} 105 | }, 106 | "source": [ 107 | "# 画像をreshape\n", 108 | "X_train = 
X_train.reshape((60000, 28, 28, 1))\n", 109 | "X_test = X_test.reshape((10000, 28, 28, 1))\n", 110 | "\n", 111 | "# 輝度値を0 ~ 1に入るように正規化\n", 112 | "X_train = X_train.astype('float32')/255\n", 113 | "X_test = X_test.astype('float32')/255\n", 114 | "\n", 115 | "# one hot encoding\n", 116 | "y_train = to_categorical(y_train)\n", 117 | "y_test = to_categorical(y_test)" 118 | ], 119 | "execution_count": 0, 120 | "outputs": [] 121 | }, 122 | { 123 | "cell_type": "markdown", 124 | "metadata": { 125 | "id": "vxpRwBYLLcZ3", 126 | "colab_type": "text" 127 | }, 128 | "source": [ 129 | "## モデル作成" 130 | ] 131 | }, 132 | { 133 | "cell_type": "code", 134 | "metadata": { 135 | "id": "a-pmohDGLaT9", 136 | "colab_type": "code", 137 | "colab": {} 138 | }, 139 | "source": [ 140 | "model = Sequential()\n", 141 | "\n", 142 | "# 畳み込み層\n", 143 | "model.add(Conv2D(32, (3, 3), activation='relu', input_shape=(28, 28, 1)))\n", 144 | " \n", 145 | "# プーリング層\n", 146 | "model.add(MaxPool2D(2, 2))\n", 147 | " \n", 148 | "model.add(Flatten())\n", 149 | "model.add(Dense(32, activation='relu'))\n", 150 | "model.add(Dense(10, activation='softmax'))\n", 151 | "\n", 152 | "model.summary()" 153 | ], 154 | "execution_count": 0, 155 | "outputs": [] 156 | }, 157 | { 158 | "cell_type": "code", 159 | "metadata": { 160 | "id": "BNfazDhRLnt1", 161 | "colab_type": "code", 162 | "colab": {} 163 | }, 164 | "source": [ 165 | "model.compile(optimizer='Adam', loss='categorical_crossentropy', metrics=['accuracy'])\n", 166 | "model.fit(X_train, y_train, epochs=5, batch_size=64)" 167 | ], 168 | "execution_count": 0, 169 | "outputs": [] 170 | }, 171 | { 172 | "cell_type": "code", 173 | "metadata": { 174 | "id": "T9Ucs5GVLud1", 175 | "colab_type": "code", 176 | "colab": {} 177 | }, 178 | "source": [ 179 | "model.evaluate(X_test, y_test)" 180 | ], 181 | "execution_count": 0, 182 | "outputs": [] 183 | }, 184 | { 185 | "cell_type": "code", 186 | "metadata": { 187 | "id": "2Z7oEM5mM3-b", 188 | "colab_type": "code", 189 | 
"colab": {} 190 | }, 191 | "source": [ 192 | "from keras.models import load_model\n", 193 | "\n", 194 | "# modelの保存\n", 195 | "model.save('model.h5') \n", 196 | "\n", 197 | "# modelの読み込み\n", 198 | "model = load_model('model.h5')" 199 | ], 200 | "execution_count": 0, 201 | "outputs": [] 202 | }, 203 | { 204 | "cell_type": "code", 205 | "metadata": { 206 | "id": "w5auAp3RolLM", 207 | "colab_type": "code", 208 | "colab": {} 209 | }, 210 | "source": [ 211 | "" 212 | ], 213 | "execution_count": 0, 214 | "outputs": [] 215 | } 216 | ] 217 | } -------------------------------------------------------------------------------- /Python/07/cnn_temple_shrine.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "name": "Ch07-temple-shrine-cnn.ipynb", 7 | "version": "0.3.2", 8 | "provenance": [], 9 | "collapsed_sections": [] 10 | }, 11 | "kernelspec": { 12 | "name": "python3", 13 | "display_name": "Python 3" 14 | }, 15 | "accelerator": "GPU" 16 | }, 17 | "cells": [ 18 | { 19 | "cell_type": "code", 20 | "metadata": { 21 | "id": "HRoy_vzj83WJ", 22 | "colab_type": "code", 23 | "colab": {} 24 | }, 25 | "source": [ 26 | "from keras.datasets import mnist\n", 27 | "from keras.utils import to_categorical\n", 28 | "\n", 29 | "from keras.layers import Conv2D, MaxPool2D, Flatten, Dense\n", 30 | "from keras.models import Sequential\n", 31 | "from keras.models import load_model\n", 32 | "\n", 33 | "import matplotlib.pyplot as plt\n", 34 | "from PIL import Image\n", 35 | "import os\n", 36 | "import numpy as np\n", 37 | "from sklearn.model_selection import train_test_split" 38 | ], 39 | "execution_count": 0, 40 | "outputs": [] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "metadata": { 45 | "id": "lkcJwKDJ4vuV", 46 | "colab_type": "code", 47 | "colab": {} 48 | }, 49 | "source": [ 50 | "# Google ドライブをマウントするには、このセルを実行してください。\n", 51 | "from google.colab import drive\n", 52 | 
"drive.mount('/content/drive/')" 53 | ], 54 | "execution_count": 0, 55 | "outputs": [] 56 | }, 57 | { 58 | "cell_type": "markdown", 59 | "metadata": { 60 | "id": "gbGo22SlK0E0", 61 | "colab_type": "text" 62 | }, 63 | "source": [ 64 | "## データ読み込み" 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "metadata": { 70 | "id": "rzob7zR3uw9D", 71 | "colab_type": "code", 72 | "colab": {} 73 | }, 74 | "source": [ 75 | "X = []\n", 76 | "Y = []\n", 77 | "image_size = 30\n", 78 | "\n", 79 | "folder_path = \"/content/drive/My Drive/PythonBooks/src/Ch07/images/\"\n", 80 | "file_list = os.listdir(folder_path)" 81 | ], 82 | "execution_count": 0, 83 | "outputs": [] 84 | }, 85 | { 86 | "cell_type": "code", 87 | "metadata": { 88 | "id": "aG-mpkDNKzoJ", 89 | "colab_type": "code", 90 | "colab": {} 91 | }, 92 | "source": [ 93 | "for file in file_list:\n", 94 | " try:\n", 95 | " image = Image.open(folder_path + file)\n", 96 | " except:\n", 97 | " print('error', file)\n", 98 | " continue\n", 99 | " \n", 100 | " image = image.convert(\"RGB\")\n", 101 | " image = image.resize((image_size, image_size))\n", 102 | " data = np.asarray(image)\n", 103 | " X.append(data)\n", 104 | " if 'temple' in file:\n", 105 | " Y.append(0)\n", 106 | " else:\n", 107 | " Y.append(1)\n", 108 | "\n", 109 | "X = np.array(X)\n", 110 | "Y = np.array(Y)" 111 | ], 112 | "execution_count": 0, 113 | "outputs": [] 114 | }, 115 | { 116 | "cell_type": "markdown", 117 | "metadata": { 118 | "id": "hoPt9AcDP5KW", 119 | "colab_type": "text" 120 | }, 121 | "source": [ 122 | "### データ整形" 123 | ] 124 | }, 125 | { 126 | "cell_type": "code", 127 | "metadata": { 128 | "id": "uogTTc6oBsd0", 129 | "colab_type": "code", 130 | "colab": {} 131 | }, 132 | "source": [ 133 | "X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.25)\n", 134 | "# 画像をreshape\n", 135 | "X_train = X_train.reshape(-1, image_size, image_size, 3)\n", 136 | "X_test = X_test.reshape(-1, image_size, image_size, 3)\n", 137 | "\n", 138 | "# 輝度値を0 ~ 
1に入るように正規化\n", 139 | "X_train = X_train.astype('float32')/255\n", 140 | "X_test = X_test.astype('float32')/255\n", 141 | "\n", 142 | "# one hot encoding\n", 143 | "y_train = to_categorical(y_train)\n", 144 | "y_test = to_categorical(y_test)" 145 | ], 146 | "execution_count": 0, 147 | "outputs": [] 148 | }, 149 | { 150 | "cell_type": "markdown", 151 | "metadata": { 152 | "id": "e4VxBZMxQPl6", 153 | "colab_type": "text" 154 | }, 155 | "source": [ 156 | "## モデル作成" 157 | ] 158 | }, 159 | { 160 | "cell_type": "code", 161 | "metadata": { 162 | "id": "rQ3yfGfH_at7", 163 | "colab_type": "code", 164 | "colab": {} 165 | }, 166 | "source": [ 167 | "model_cnn = Sequential()\n", 168 | "model_cnn.add(Conv2D(32, (3,3), activation='relu', input_shape=(image_size, image_size, 3)))\n", 169 | "model_cnn.add(MaxPool2D(2,2))\n", 170 | "model_cnn.add(Conv2D(64, (3,3), activation='relu'))\n", 171 | "model_cnn.add(MaxPool2D(2,2))\n", 172 | "model_cnn.add(Conv2D(128, (3,3), activation='relu'))\n", 173 | "model_cnn.add(MaxPool2D(2,2))\n", 174 | "model_cnn.add(Flatten())\n", 175 | "model_cnn.add(Dense(512, activation='relu')) \n", 176 | "model_cnn.add(Dense(2, activation='softmax'))\n", 177 | "\n", 178 | "model_cnn.summary()" 179 | ], 180 | "execution_count": 0, 181 | "outputs": [] 182 | }, 183 | { 184 | "cell_type": "code", 185 | "metadata": { 186 | "id": "izLnujUXKjXy", 187 | "colab_type": "code", 188 | "colab": {} 189 | }, 190 | "source": [ 191 | "model_cnn.compile(optimizer='Adam', loss='categorical_crossentropy', metrics=['accuracy'])\n", 192 | "model_cnn.fit(X_train, y_train, epochs=10, batch_size=20)\n", 193 | "\n", 194 | "model_cnn.evaluate(X_test, y_test)" 195 | ], 196 | "execution_count": 0, 197 | "outputs": [] 198 | }, 199 | { 200 | "cell_type": "code", 201 | "metadata": { 202 | "id": "Fi-ZodmOJ0I1", 203 | "colab_type": "code", 204 | "colab": {} 205 | }, 206 | "source": [ 207 | "" 208 | ], 209 | "execution_count": 0, 210 | "outputs": [] 211 | } 212 | ] 213 | } 
-------------------------------------------------------------------------------- /Python/07/get_imaeg.py: -------------------------------------------------------------------------------- 1 | #-*- coding:utf-8 -*- 2 | 3 | import urllib.request 4 | from urllib.parse import quote 5 | import httplib2 6 | import json 7 | import requests 8 | 9 | KEY = "" # 取得したAPI Key 10 | ENGINE_ID = "" # 取得した検索エンジンID 11 | 12 | keywords = ["寺", "神社"] 13 | start_num = 0 14 | 15 | def get_urls(keyword, number): 16 | urls = [] 17 | count = 0 18 | 19 | while count < number: 20 | if number - count <= 10: 21 | num_param = str(number - count) 22 | else: 23 | num_param = "10" 24 | 25 | query = "https://www.googleapis.com/customsearch/v1?key=" + KEY + \ 26 | "&cx=" + ENGINE_ID + \ 27 | "&num=" + num_param + \ 28 | "&start=" + str(count + 1) + \ 29 | "&q=" + quote(keyword) + \ 30 | "&searchType=image" # &dateRestrict=y1" 31 | 32 | res = urllib.request.urlopen(query) 33 | data = json.loads(res.read().decode('utf-8')) 34 | 35 | for i in range(len(data["items"])): 36 | urls.append(data["items"][i]["link"]) 37 | 38 | count += 10 39 | 40 | return urls 41 | 42 | def get_images(keyword, number): 43 | urls = get_urls(keyword, number) 44 | 45 | for i in range(len(urls)): 46 | res = requests.get(urls[i], verify=False) 47 | image = res.content 48 | 49 | if keyword == keywords[0]: 50 | filename = "temple" + str(i + start_num) + ".jpg" 51 | else: 52 | filename = "shrine" + str(i + start_num) + ".jpg" 53 | 54 | with open(filename, 'wb') as f: 55 | f.write(image) 56 | 57 | # メイン 58 | for keyword in keywords: 59 | # キーワードごとに取得したい枚数を指定(今回は100) 60 | get_images(keyword, 100) -------------------------------------------------------------------------------- /Python/07/nn_mnist.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "name": "Ch07-mnist-nn.ipynb", 7 | "version": "0.3.2", 8 | 
"provenance": [], 9 | "collapsed_sections": [], 10 | "toc_visible": true 11 | }, 12 | "kernelspec": { 13 | "name": "python3", 14 | "display_name": "Python 3" 15 | }, 16 | "accelerator": "GPU" 17 | }, 18 | "cells": [ 19 | { 20 | "cell_type": "code", 21 | "metadata": { 22 | "id": "HRoy_vzj83WJ", 23 | "colab_type": "code", 24 | "colab": {} 25 | }, 26 | "source": [ 27 | "from keras.layers import Dense\n", 28 | "from keras.models import Sequential\n", 29 | "from keras.utils import to_categorical\n", 30 | "import matplotlib.pyplot as plt\n", 31 | "\n", 32 | "from keras.datasets import mnist" 33 | ], 34 | "execution_count": 0, 35 | "outputs": [] 36 | }, 37 | { 38 | "cell_type": "markdown", 39 | "metadata": { 40 | "id": "gbGo22SlK0E0", 41 | "colab_type": "text" 42 | }, 43 | "source": [ 44 | "## データ読み込み" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "metadata": { 50 | "id": "aG-mpkDNKzoJ", 51 | "colab_type": "code", 52 | "colab": {} 53 | }, 54 | "source": [ 55 | "(X_train, y_train), (X_test, y_test) = mnist.load_data()" 56 | ], 57 | "execution_count": 0, 58 | "outputs": [] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "metadata": { 63 | "id": "XVmcIrW8MSg6", 64 | "colab_type": "code", 65 | "colab": {} 66 | }, 67 | "source": [ 68 | "print(X_train.shape)" 69 | ], 70 | "execution_count": 0, 71 | "outputs": [] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "metadata": { 76 | "id": "4R59WO9Uy4wD", 77 | "colab_type": "code", 78 | "colab": {} 79 | }, 80 | "source": [ 81 | "print(X_train[0])" 82 | ], 83 | "execution_count": 0, 84 | "outputs": [] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "metadata": { 89 | "id": "FND90BkzPWd6", 90 | "colab_type": "code", 91 | "colab": {} 92 | }, 93 | "source": [ 94 | "plt.imshow(X_train[0], cmap='gray')" 95 | ], 96 | "execution_count": 0, 97 | "outputs": [] 98 | }, 99 | { 100 | "cell_type": "code", 101 | "metadata": { 102 | "id": "YpOfARYEP0Uv", 103 | "colab_type": "code", 104 | "colab": {} 105 | }, 106 | "source": [ 107 | 
"plt.imshow(X_train[1], cmap='gray')" 108 | ], 109 | "execution_count": 0, 110 | "outputs": [] 111 | }, 112 | { 113 | "cell_type": "code", 114 | "metadata": { 115 | "id": "z9RVpEK_QG5H", 116 | "colab_type": "code", 117 | "colab": {} 118 | }, 119 | "source": [ 120 | "plt.imshow(X_train[2], cmap='gray')" 121 | ], 122 | "execution_count": 0, 123 | "outputs": [] 124 | }, 125 | { 126 | "cell_type": "code", 127 | "metadata": { 128 | "id": "Ao2B3O_VMT5t", 129 | "colab_type": "code", 130 | "colab": {} 131 | }, 132 | "source": [ 133 | "print(y_train.shape)\n", 134 | "print(y_train[0])" 135 | ], 136 | "execution_count": 0, 137 | "outputs": [] 138 | }, 139 | { 140 | "cell_type": "markdown", 141 | "metadata": { 142 | "id": "hoPt9AcDP5KW", 143 | "colab_type": "text" 144 | }, 145 | "source": [ 146 | "### データ整形" 147 | ] 148 | }, 149 | { 150 | "cell_type": "code", 151 | "metadata": { 152 | "id": "uWs4Ko57J75i", 153 | "colab_type": "code", 154 | "colab": {} 155 | }, 156 | "source": [ 157 | "# 画像を1次元配列にreshape\n", 158 | "X_train = X_train.reshape(60000, 28*28)\n", 159 | "X_test = X_test.reshape(10000, 28*28)" 160 | ], 161 | "execution_count": 0, 162 | "outputs": [] 163 | }, 164 | { 165 | "cell_type": "code", 166 | "metadata": { 167 | "id": "Zw1T3qlYP8LV", 168 | "colab_type": "code", 169 | "colab": {} 170 | }, 171 | "source": [ 172 | "# 輝度値を0 ~ 1に入るように正規化\n", 173 | "X_train = X_train.astype('float32')/255\n", 174 | "X_test = X_test.astype('float32')/255" 175 | ], 176 | "execution_count": 0, 177 | "outputs": [] 178 | }, 179 | { 180 | "cell_type": "code", 181 | "metadata": { 182 | "id": "7VAsaSwvP9sG", 183 | "colab_type": "code", 184 | "colab": {} 185 | }, 186 | "source": [ 187 | "# one hot encoding\n", 188 | "y_train = to_categorical(y_train)\n", 189 | "y_test = to_categorical(y_test)" 190 | ], 191 | "execution_count": 0, 192 | "outputs": [] 193 | }, 194 | { 195 | "cell_type": "markdown", 196 | "metadata": { 197 | "id": "e4VxBZMxQPl6", 198 | "colab_type": "text" 199 | }, 200 | 
"source": [ 201 | "## モデル作成" 202 | ] 203 | }, 204 | { 205 | "cell_type": "code", 206 | "metadata": { 207 | "id": "bW5-du5wQO-B", 208 | "colab_type": "code", 209 | "colab": {} 210 | }, 211 | "source": [ 212 | "model = Sequential()\n", 213 | "model.add(Dense(64, activation='relu', input_dim=28*28))\n", 214 | "model.add(Dense(10, activation='softmax'))\n", 215 | "\n", 216 | "model.summary()" 217 | ], 218 | "execution_count": 0, 219 | "outputs": [] 220 | }, 221 | { 222 | "cell_type": "code", 223 | "metadata": { 224 | "id": "iRszP63XQgrl", 225 | "colab_type": "code", 226 | "colab": {} 227 | }, 228 | "source": [ 229 | "model.compile(optimizer='Adam',\n", 230 | " loss='categorical_crossentropy',\n", 231 | " metrics=['accuracy'])\n", 232 | "\n", 233 | "model.fit(X_train, y_train, epochs=5, batch_size=64)" 234 | ], 235 | "execution_count": 0, 236 | "outputs": [] 237 | }, 238 | { 239 | "cell_type": "code", 240 | "metadata": { 241 | "id": "Hx73rHnyRCMJ", 242 | "colab_type": "code", 243 | "colab": {} 244 | }, 245 | "source": [ 246 | "model.evaluate(X_test, y_test)" 247 | ], 248 | "execution_count": 0, 249 | "outputs": [] 250 | }, 251 | { 252 | "cell_type": "code", 253 | "metadata": { 254 | "id": "sJo1GL3hRR5l", 255 | "colab_type": "code", 256 | "colab": {} 257 | }, 258 | "source": [ 259 | "model.save('model.h5')" 260 | ], 261 | "execution_count": 0, 262 | "outputs": [] 263 | }, 264 | { 265 | "cell_type": "markdown", 266 | "metadata": { 267 | "id": "CggIki-tRgNb", 268 | "colab_type": "text" 269 | }, 270 | "source": [ 271 | "## モデルを複雑に" 272 | ] 273 | }, 274 | { 275 | "cell_type": "code", 276 | "metadata": { 277 | "id": "O5wFwHl_RcAa", 278 | "colab_type": "code", 279 | "colab": {} 280 | }, 281 | "source": [ 282 | "model2 = Sequential()\n", 283 | "model2.add(Dense(512, activation='relu', input_dim=28*28))\n", 284 | "model2.add(Dense(512, activation='relu'))\n", 285 | "model2.add(Dense(10, activation='softmax'))\n", 286 | "\n", 287 | "model2.summary()" 288 | ], 289 | 
"execution_count": 0, 290 | "outputs": [] 291 | }, 292 | { 293 | "cell_type": "code", 294 | "metadata": { 295 | "id": "3ZdPIsRqRvbo", 296 | "colab_type": "code", 297 | "colab": {} 298 | }, 299 | "source": [ 300 | "model2.compile(optimizer='Adam',\n", 301 | " loss='categorical_crossentropy',\n", 302 | " metrics=['accuracy'])\n", 303 | "\n", 304 | "model2.fit(X_train, y_train, epochs=5, batch_size=64)" 305 | ], 306 | "execution_count": 0, 307 | "outputs": [] 308 | }, 309 | { 310 | "cell_type": "code", 311 | "metadata": { 312 | "id": "BhJtZcaiR0zl", 313 | "colab_type": "code", 314 | "colab": {} 315 | }, 316 | "source": [ 317 | "model2.evaluate(X_test, y_test)" 318 | ], 319 | "execution_count": 0, 320 | "outputs": [] 321 | }, 322 | { 323 | "cell_type": "code", 324 | "metadata": { 325 | "id": "ZBgJLIPgR3-T", 326 | "colab_type": "code", 327 | "colab": {} 328 | }, 329 | "source": [ 330 | "" 331 | ], 332 | "execution_count": 0, 333 | "outputs": [] 334 | }, 335 | { 336 | "cell_type": "code", 337 | "metadata": { 338 | "id": "iSOfIy82SfLH", 339 | "colab_type": "code", 340 | "colab": {} 341 | }, 342 | "source": [ 343 | "" 344 | ], 345 | "execution_count": 0, 346 | "outputs": [] 347 | } 348 | ] 349 | } -------------------------------------------------------------------------------- /Python/07/nn_temple_shrine.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "name": "Ch07-temple-shrine-nn.ipynb", 7 | "version": "0.3.2", 8 | "provenance": [], 9 | "collapsed_sections": [] 10 | }, 11 | "kernelspec": { 12 | "name": "python3", 13 | "display_name": "Python 3" 14 | }, 15 | "accelerator": "GPU" 16 | }, 17 | "cells": [ 18 | { 19 | "cell_type": "code", 20 | "metadata": { 21 | "id": "HRoy_vzj83WJ", 22 | "colab_type": "code", 23 | "colab": {} 24 | }, 25 | "source": [ 26 | "from keras.datasets import mnist\n", 27 | "from keras.utils import to_categorical\n", 28 | 
"\n", 29 | "from keras.layers import Dense\n", 30 | "from keras.models import Sequential\n", 31 | "from keras.models import load_model\n", 32 | "\n", 33 | "import matplotlib.pyplot as plt\n", 34 | "from PIL import Image\n", 35 | "import os\n", 36 | "import numpy as np\n", 37 | "from sklearn.model_selection import train_test_split" 38 | ], 39 | "execution_count": 0, 40 | "outputs": [] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "metadata": { 45 | "id": "lkcJwKDJ4vuV", 46 | "colab_type": "code", 47 | "colab": {} 48 | }, 49 | "source": [ 50 | "# Google ドライブをマウントするには、このセルを実行してください。\n", 51 | "from google.colab import drive\n", 52 | "drive.mount('/content/drive/')" 53 | ], 54 | "execution_count": 0, 55 | "outputs": [] 56 | }, 57 | { 58 | "cell_type": "markdown", 59 | "metadata": { 60 | "id": "gbGo22SlK0E0", 61 | "colab_type": "text" 62 | }, 63 | "source": [ 64 | "## データ読み込み" 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "metadata": { 70 | "id": "rzob7zR3uw9D", 71 | "colab_type": "code", 72 | "colab": {} 73 | }, 74 | "source": [ 75 | "X = []\n", 76 | "Y = []\n", 77 | "\n", 78 | "folder_path = \"/content/drive/My Drive/PythonBooks/src/Ch07/images/\"\n", 79 | "file_list = os.listdir(folder_path)" 80 | ], 81 | "execution_count": 0, 82 | "outputs": [] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "metadata": { 87 | "id": "aG-mpkDNKzoJ", 88 | "colab_type": "code", 89 | "colab": {} 90 | }, 91 | "source": [ 92 | "image_size = 30\n", 93 | "\n", 94 | "for file in file_list:\n", 95 | " try:\n", 96 | " image = Image.open(folder_path + file)\n", 97 | " except:\n", 98 | " print('error', file)\n", 99 | " continue\n", 100 | " \n", 101 | " image = image.convert(\"RGB\")\n", 102 | " image = image.resize((image_size, image_size))\n", 103 | " data = np.asarray(image)\n", 104 | " X.append(data)\n", 105 | " if 'temple' in file:\n", 106 | " Y.append(0)\n", 107 | " else:\n", 108 | " Y.append(1)\n", 109 | "\n", 110 | "X = np.array(X)\n", 111 | "Y = np.array(Y)" 112 | ], 113 | 
"execution_count": 0, 114 | "outputs": [] 115 | }, 116 | { 117 | "cell_type": "code", 118 | "metadata": { 119 | "id": "Y_wXrXCzBQi-", 120 | "colab_type": "code", 121 | "colab": {} 122 | }, 123 | "source": [ 124 | "print(X.shape)" 125 | ], 126 | "execution_count": 0, 127 | "outputs": [] 128 | }, 129 | { 130 | "cell_type": "code", 131 | "metadata": { 132 | "id": "gRWme6TSBSzL", 133 | "colab_type": "code", 134 | "colab": {} 135 | }, 136 | "source": [ 137 | "print(Y.shape)" 138 | ], 139 | "execution_count": 0, 140 | "outputs": [] 141 | }, 142 | { 143 | "cell_type": "markdown", 144 | "metadata": { 145 | "id": "hoPt9AcDP5KW", 146 | "colab_type": "text" 147 | }, 148 | "source": [ 149 | "### データ整形" 150 | ] 151 | }, 152 | { 153 | "cell_type": "code", 154 | "metadata": { 155 | "id": "uogTTc6oBsd0", 156 | "colab_type": "code", 157 | "colab": {} 158 | }, 159 | "source": [ 160 | "X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.25)" 161 | ], 162 | "execution_count": 0, 163 | "outputs": [] 164 | }, 165 | { 166 | "cell_type": "code", 167 | "metadata": { 168 | "id": "uWs4Ko57J75i", 169 | "colab_type": "code", 170 | "colab": {} 171 | }, 172 | "source": [ 173 | "# 画像を1次元配列にreshape\n", 174 | "X_train = X_train.reshape(-1, image_size * image_size *3)\n", 175 | "X_test = X_test.reshape(-1, image_size * image_size *3)" 176 | ], 177 | "execution_count": 0, 178 | "outputs": [] 179 | }, 180 | { 181 | "cell_type": "code", 182 | "metadata": { 183 | "id": "Zw1T3qlYP8LV", 184 | "colab_type": "code", 185 | "colab": {} 186 | }, 187 | "source": [ 188 | "# 輝度値を0 ~ 1に入るように正規化\n", 189 | "X_train = X_train.astype('float32')/255\n", 190 | "X_test = X_test.astype('float32')/255" 191 | ], 192 | "execution_count": 0, 193 | "outputs": [] 194 | }, 195 | { 196 | "cell_type": "code", 197 | "metadata": { 198 | "id": "7VAsaSwvP9sG", 199 | "colab_type": "code", 200 | "colab": {} 201 | }, 202 | "source": [ 203 | "# one hot encoding\n", 204 | "y_train = to_categorical(y_train)\n", 205 | 
"y_test = to_categorical(y_test)" 206 | ], 207 | "execution_count": 0, 208 | "outputs": [] 209 | }, 210 | { 211 | "cell_type": "markdown", 212 | "metadata": { 213 | "id": "e4VxBZMxQPl6", 214 | "colab_type": "text" 215 | }, 216 | "source": [ 217 | "## モデル作成" 218 | ] 219 | }, 220 | { 221 | "cell_type": "code", 222 | "metadata": { 223 | "id": "bW5-du5wQO-B", 224 | "colab_type": "code", 225 | "colab": {} 226 | }, 227 | "source": [ 228 | "model = Sequential()\n", 229 | "model.add(Dense(64, activation='relu', input_dim=image_size * image_size *3))\n", 230 | "model.add(Dense(2, activation='softmax'))\n", 231 | "\n", 232 | "model.summary()" 233 | ], 234 | "execution_count": 0, 235 | "outputs": [] 236 | }, 237 | { 238 | "cell_type": "code", 239 | "metadata": { 240 | "id": "iRszP63XQgrl", 241 | "colab_type": "code", 242 | "colab": {} 243 | }, 244 | "source": [ 245 | "model.compile(optimizer='Adam',\n", 246 | " loss='categorical_crossentropy',\n", 247 | " metrics=['accuracy'])\n", 248 | "\n", 249 | "model.fit(X_train, y_train, epochs=20, batch_size=20)" 250 | ], 251 | "execution_count": 0, 252 | "outputs": [] 253 | }, 254 | { 255 | "cell_type": "code", 256 | "metadata": { 257 | "id": "Hx73rHnyRCMJ", 258 | "colab_type": "code", 259 | "colab": {} 260 | }, 261 | "source": [ 262 | "model.evaluate(X_test, y_test)" 263 | ], 264 | "execution_count": 0, 265 | "outputs": [] 266 | }, 267 | { 268 | "cell_type": "code", 269 | "metadata": { 270 | "id": "sJo1GL3hRR5l", 271 | "colab_type": "code", 272 | "colab": {} 273 | }, 274 | "source": [ 275 | "model.save('model.h5')" 276 | ], 277 | "execution_count": 0, 278 | "outputs": [] 279 | } 280 | ] 281 | } -------------------------------------------------------------------------------- /Python/08/collaborative_filtering.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "name": "Ch08-Collaborative-filtering.ipynb", 7 | 
"version": "0.3.2", 8 | "provenance": [], 9 | "collapsed_sections": [], 10 | "toc_visible": true 11 | }, 12 | "kernelspec": { 13 | "name": "python3", 14 | "display_name": "Python 3" 15 | } 16 | }, 17 | "cells": [ 18 | { 19 | "cell_type": "code", 20 | "metadata": { 21 | "id": "GJcUJGK-7KIQ", 22 | "colab_type": "code", 23 | "colab": {} 24 | }, 25 | "source": [ 26 | "import pandas as pd\n", 27 | "import numpy as np\n", 28 | "\n", 29 | "from sklearn.metrics.pairwise import cosine_similarity" 30 | ], 31 | "execution_count": 0, 32 | "outputs": [] 33 | }, 34 | { 35 | "cell_type": "markdown", 36 | "metadata": { 37 | "id": "ZMFk_sBO9SPj", 38 | "colab_type": "text" 39 | }, 40 | "source": [ 41 | "## データ読み込み" 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "metadata": { 47 | "id": "36vbv-1y7b2B", 48 | "colab_type": "code", 49 | "colab": {} 50 | }, 51 | "source": [ 52 | "cols_name = ['user_id','item_id','rating','timestamp']\n", 53 | "data_movie = pd.read_csv('u.data', names=cols_name, sep=\"\\t\")\n", 54 | "print(data_movie.head())" 55 | ], 56 | "execution_count": 0, 57 | "outputs": [] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "metadata": { 62 | "id": "7U5GlXH79PML", 63 | "colab_type": "code", 64 | "colab": {} 65 | }, 66 | "source": [ 67 | "movie_rating = data_movie.pivot(index='user_id', columns='item_id', values='rating').fillna(0).as_matrix()\n", 68 | "print(movie_rating[0:5])\n", 69 | "print(movie_rating.shape)" 70 | ], 71 | "execution_count": 0, 72 | "outputs": [] 73 | }, 74 | { 75 | "cell_type": "markdown", 76 | "metadata": { 77 | "id": "Z5TLFkyWptmk", 78 | "colab_type": "text" 79 | }, 80 | "source": [ 81 | "## コサイン類似度計算" 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "metadata": { 87 | "id": "gCm59y0L9g_-", 88 | "colab_type": "code", 89 | "colab": {} 90 | }, 91 | "source": [ 92 | "cos_sim = cosine_similarity(movie_rating, movie_rating)\n", 93 | "print(cos_sim[:5])\n", 94 | "print(cos_sim.shape)" 95 | ], 96 | "execution_count": 0, 97 | "outputs": [] 98 | 
}, 99 | { 100 | "cell_type": "markdown", 101 | "metadata": { 102 | "id": "nCT49DY4vBNp", 103 | "colab_type": "text" 104 | }, 105 | "source": [ 106 | "## レコメンド" 107 | ] 108 | }, 109 | { 110 | "cell_type": "code", 111 | "metadata": { 112 | "id": "4mW_F4RE9cDB", 113 | "colab_type": "code", 114 | "colab": {} 115 | }, 116 | "source": [ 117 | "# ユーザー1との類似度\n", 118 | "cos_sim_for_user_1 = cos_sim[0]\n", 119 | "# ユーザー1と類似度の高いユーザー10人のインデックスを抽出\n", 120 | "similar_user = np.argsort(cos_sim_for_user_1)[-11:-1]\n", 121 | "print(similar_user)" 122 | ], 123 | "execution_count": 0, 124 | "outputs": [] 125 | }, 126 | { 127 | "cell_type": "code", 128 | "metadata": { 129 | "id": "BQowdy38RBqn", 130 | "colab_type": "code", 131 | "colab": {} 132 | }, 133 | "source": [ 134 | "print(cos_sim_for_user_1[similar_user])" 135 | ], 136 | "execution_count": 0, 137 | "outputs": [] 138 | }, 139 | { 140 | "cell_type": "code", 141 | "metadata": { 142 | "id": "ABhcpLBt9mbv", 143 | "colab_type": "code", 144 | "colab": {} 145 | }, 146 | "source": [ 147 | "# 類似度の高いユーザーの映画評価値\n", 148 | "movie_rating_of_similar_user = movie_rating[similar_user]\n", 149 | "print(movie_rating_of_similar_user)" 150 | ], 151 | "execution_count": 0, 152 | "outputs": [] 153 | }, 154 | { 155 | "cell_type": "code", 156 | "metadata": { 157 | "id": "jfcEIQFvze1o", 158 | "colab_type": "code", 159 | "colab": {} 160 | }, 161 | "source": [ 162 | "# 重みづけされた評価値を計算\n", 163 | "weighted_movie_rating = movie_rating_of_similar_user * cos_sim_for_user_1[similar_user].reshape(-1, 1)\n", 164 | "print(weighted_movie_rating)" 165 | ], 166 | "execution_count": 0, 167 | "outputs": [] 168 | }, 169 | { 170 | "cell_type": "code", 171 | "metadata": { 172 | "id": "Li21_3bb9oTf", 173 | "colab_type": "code", 174 | "colab": {} 175 | }, 176 | "source": [ 177 | "# 各映画のレコメンド値を計算\n", 178 | "mean_weighted_movie_rating = weighted_movie_rating.mean(axis=0)\n", 179 | "print(mean_weighted_movie_rating)" 180 | ], 181 | "execution_count": 0, 182 | "outputs": [] 183 | 
}, 184 | { 185 | "cell_type": "code", 186 | "metadata": { 187 | "id": "d8MqIyjv9_1l", 188 | "colab_type": "code", 189 | "colab": {} 190 | }, 191 | "source": [ 192 | "#ユーザー1の評価と加重平均スコアを列とするデータフレーム作成\n", 193 | "recommend_values = pd.DataFrame({'user_1_score':movie_rating[0], 'recommend_value':mean_weighted_movie_rating})\n", 194 | "print(recommend_values.head())" 195 | ], 196 | "execution_count": 0, 197 | "outputs": [] 198 | }, 199 | { 200 | "cell_type": "code", 201 | "metadata": { 202 | "id": "1bqd0849-CWB", 203 | "colab_type": "code", 204 | "colab": {} 205 | }, 206 | "source": [ 207 | "#未評価のうちスコアの高い上位10件を抽出\n", 208 | "recommend_values[recommend_values['user_1_score'] == 0].sort_values('recommend_value', ascending=False).head(10)" 209 | ], 210 | "execution_count": 0, 211 | "outputs": [] 212 | }, 213 | { 214 | "cell_type": "code", 215 | "metadata": { 216 | "id": "h5C7u5Y9wy8_", 217 | "colab_type": "code", 218 | "colab": {} 219 | }, 220 | "source": [ 221 | "" 222 | ], 223 | "execution_count": 0, 224 | "outputs": [] 225 | } 226 | ] 227 | } -------------------------------------------------------------------------------- /Python/08/word2vec_tweets.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "name": "Ch08_word2vec.ipynb のコピー", 7 | "version": "0.3.2", 8 | "provenance": [], 9 | "collapsed_sections": [], 10 | "toc_visible": true 11 | }, 12 | "kernelspec": { 13 | "name": "python3", 14 | "display_name": "Python 3" 15 | } 16 | }, 17 | "cells": [ 18 | { 19 | "cell_type": "code", 20 | "metadata": { 21 | "id": "4o7bonIQ-7eN", 22 | "colab_type": "code", 23 | "colab": {} 24 | }, 25 | "source": [ 26 | "import json\n", 27 | "import requests\n", 28 | "from requests_oauthlib import OAuth1\n", 29 | "import re\n", 30 | "from google.colab import files\n", 31 | "\n", 32 | "\n", 33 | "# 取得したkeyを定義\n", 34 | "access_token = 'xxxxxxxx'\n", 35 | "access_token_secret 
= 'xxxxxxxx'\n", 36 | "consumer_key = 'xxxxxxxx'\n", 37 | "consumer_key_secret = 'xxxxxxxx'\n", 38 | "\n", 39 | "url = \"https://stream.twitter.com/1.1/statuses/sample.json?language=ja\"\n", 40 | "\n", 41 | "# OAuth で GET\n", 42 | "twitter = OAuth1(consumer_key, consumer_key_secret, access_token, access_token_secret)" 43 | ], 44 | "execution_count": 0, 45 | "outputs": [] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "metadata": { 50 | "id": "nILqdz0jeTaF", 51 | "colab_type": "code", 52 | "colab": {} 53 | }, 54 | "source": [ 55 | "def normalize_text(text):\n", 56 | " text = re.sub(r'https?://[\\w/:%#\\$&\\?\\(\\)~\\.=\\+\\-…]+', \"\", text)\n", 57 | " text = re.sub('RT', \"\", text)\n", 58 | " text = re.sub('お気に入り', \"\", text)\n", 59 | " text = re.sub('まとめ', \"\", text)\n", 60 | " text = re.sub(r'[!-~]', \"\", text)\n", 61 | " text = re.sub(r'[︰-@]', \"\", text)\n", 62 | " text = re.sub('\\u3000',\"\", text)\n", 63 | " text = re.sub('\\t', \"\", text)\n", 64 | " text = re.sub('\\n', \"\", text)\n", 65 | "\n", 66 | " text = text.strip()\n", 67 | " return text" 68 | ], 69 | "execution_count": 0, 70 | "outputs": [] 71 | }, 72 | { 73 | "cell_type": "code", 74 | "metadata": { 75 | "id": "kVb27XwPeVoY", 76 | "colab_type": "code", 77 | "colab": {} 78 | }, 79 | "source": [ 80 | "url = \"https://stream.twitter.com/1.1/statuses/sample.json?language=ja\"\n", 81 | "\n", 82 | "with open('public_text_twitter.tsv','a', encoding='utf-8') as f:\n", 83 | " res = requests.get(url, auth=twitter, stream=True)\n", 84 | " for r in res.iter_lines():\n", 85 | " try:\n", 86 | " r_json = json.loads(r)\n", 87 | " text = r_json['text']\n", 88 | " f.write(normalize_text(text) + '\\n')\n", 89 | " except:\n", 90 | " continue" 91 | ], 92 | "execution_count": 0, 93 | "outputs": [] 94 | }, 95 | { 96 | "cell_type": "code", 97 | "metadata": { 98 | "id": "UYWv6fpGIzj6", 99 | "colab_type": "code", 100 | "colab": {} 101 | }, 102 | "source": [ 103 | "files.download('public_text_twitter.tsv')" 104 | ], 105 
| "execution_count": 0, 106 | "outputs": [] 107 | }, 108 | { 109 | "cell_type": "markdown", 110 | "metadata": { 111 | "id": "A3yyvf-tmW-q", 112 | "colab_type": "text" 113 | }, 114 | "source": [ 115 | "## word2vec 実践" 116 | ] 117 | }, 118 | { 119 | "cell_type": "code", 120 | "metadata": { 121 | "id": "3mpQrwtNeGOu", 122 | "colab_type": "code", 123 | "colab": {} 124 | }, 125 | "source": [ 126 | "# mecabインストール\n", 127 | "!apt install aptitude\n", 128 | "!aptitude install mecab libmecab-dev mecab-ipadic-utf8 git make curl xz-utils file -y\n", 129 | "\n", 130 | "# mecab pythonインストール(pythonでmecabを動かすために必要)\n", 131 | "!pip install mecab-python3==0.7\n", 132 | "\n", 133 | "# neologd辞書インストール\n", 134 | "!git clone --depth 1 https://github.com/neologd/mecab-ipadic-neologd.git\n", 135 | "!echo yes | mecab-ipadic-neologd/bin/install-mecab-ipadic-neologd -n" 136 | ], 137 | "execution_count": 0, 138 | "outputs": [] 139 | }, 140 | { 141 | "cell_type": "markdown", 142 | "metadata": { 143 | "id": "gcy-DRiosnJz", 144 | "colab_type": "text" 145 | }, 146 | "source": [ 147 | "" 148 | ] 149 | }, 150 | { 151 | "cell_type": "code", 152 | "metadata": { 153 | "id": "2NJPcNzrYTKv", 154 | "colab_type": "code", 155 | "colab": {} 156 | }, 157 | "source": [ 158 | "# 辞書変更\n", 159 | "!sed -e \"s!/var/lib/mecab/dic/debian!/usr/lib/x86_64-linux-gnu/mecab/dic/mecab-ipadic-neologd!g\" /etc/mecabrc > /etc/mecabrc.new\n", 160 | "!cp /etc/mecabrc /etc/mecabrc.org\n", 161 | "!cp /etc/mecabrc.new /etc/mecabrc" 162 | ], 163 | "execution_count": 0, 164 | "outputs": [] 165 | }, 166 | { 167 | "cell_type": "code", 168 | "metadata": { 169 | "id": "y-QwAOeovcLI", 170 | "colab_type": "code", 171 | "colab": {} 172 | }, 173 | "source": [ 174 | "import MeCab\n", 175 | "import pandas as pd\n", 176 | "import unicodedata\n", 177 | "from gensim.models import word2vec" 178 | ], 179 | "execution_count": 0, 180 | "outputs": [] 181 | }, 182 | { 183 | "cell_type": "code", 184 | "metadata": { 185 | "id": "PWzcuYrDwtwu", 186 | 
"colab_type": "code", 187 | "colab": {} 188 | }, 189 | "source": [ 190 | "# データ インポート\n", 191 | "df = pd.read_csv('public_text_twitter.tsv', sep='\\t', names=['text'])\n", 192 | "text_lists = df['text'].unique().tolist()\n", 193 | "\n", 194 | "mt = MeCab.Tagger(\"-Ochasen\") " 195 | ], 196 | "execution_count": 0, 197 | "outputs": [] 198 | }, 199 | { 200 | "cell_type": "code", 201 | "metadata": { 202 | "id": "ge1qpQLyJdZJ", 203 | "colab_type": "code", 204 | "colab": {} 205 | }, 206 | "source": [ 207 | "word_pos = ('名詞', '形容詞')\n", 208 | "\n", 209 | "with open('public_text_splited.txt', 'w', encoding='utf-8') as f:\n", 210 | " for text in text_lists:\n", 211 | " tmp_lists = []\n", 212 | " text = unicodedata.normalize('NFKC', str(text))\n", 213 | " \n", 214 | " node = mt.parseToNode(text)\n", 215 | " while node:\n", 216 | " if node.feature.startswith(word_pos) and ',非自立,' not in node.feature:\n", 217 | " tmp_lists.append(node.surface)\n", 218 | " \n", 219 | " node = node.next\n", 220 | " \n", 221 | " f.write(' '.join(tmp_lists) + '\\n')" 222 | ], 223 | "execution_count": 0, 224 | "outputs": [] 225 | }, 226 | { 227 | "cell_type": "code", 228 | "metadata": { 229 | "id": "_sfShPFtw4GV", 230 | "colab_type": "code", 231 | "colab": {} 232 | }, 233 | "source": [ 234 | "sentences = word2vec.LineSentence('public_text_splited.txt')\n", 235 | "model = word2vec.Word2Vec(sentences,\n", 236 | " sg=1, #0: CBOW, 1: skip-gram\n", 237 | " size=200, # ベクトルの次元数\n", 238 | " window=3, # 入力単語からの最大距離\n", 239 | " min_count=5, # 単語の出現回数でフィルタリング\n", 240 | " )" 241 | ], 242 | "execution_count": 0, 243 | "outputs": [] 244 | }, 245 | { 246 | "cell_type": "code", 247 | "metadata": { 248 | "id": "A71Y2Jk5YnMJ", 249 | "colab_type": "code", 250 | "colab": {} 251 | }, 252 | "source": [ 253 | "model.most_similar(positive='人生', topn=20)" 254 | ], 255 | "execution_count": 0, 256 | "outputs": [] 257 | }, 258 | { 259 | "cell_type": "code", 260 | "metadata": { 261 | "id": "rb45klcguFS1", 262 | "colab_type": 
"code", 263 | "colab": {} 264 | }, 265 | "source": [ 266 | "" 267 | ], 268 | "execution_count": 0, 269 | "outputs": [] 270 | } 271 | ] 272 | } -------------------------------------------------------------------------------- /R/03/lm_boston.R: -------------------------------------------------------------------------------- 1 | # install.packages("tidyverse") 2 | # install.packages("GGally") 3 | # install.packages("caret") 4 | 5 | library(MASS) 6 | library(tidyverse) 7 | library(GGally) 8 | library(caret) 9 | 10 | # データ読み込み 11 | data(Boston) 12 | Boston %>% summary() 13 | Boston %>% head() 14 | 15 | # 可視化 16 | Boston %>% ggpairs() 17 | Boston %>% select(1, 2) %>% ggpairs() 18 | 19 | # 線形回帰 20 | # 単回帰 21 | lm_model <- train(data=Boston, medv ~ rm, method="lm") 22 | lm_model %>% summary() 23 | 24 | # 重回帰 25 | lm_multi_model <- train(data=Boston, medv ~ ., method="lm") 26 | 27 | lm_multi_model %>% summary() 28 | 29 | # 予測 30 | train_size = 0.7 31 | train_index <- sample(Boston %>% nrow(), Boston %>% nrow() * train_size) 32 | train_data <- Boston[train_index,] # 訓練データ 33 | test_data <- Boston[-train_index,] # テストデータ 34 | 35 | lm_multi_model2 <- train(data=train_data, medv ~ ., method="lm") 36 | y_pred <- predict(lm_multi_model2, test_data) 37 | y_pred - test_data$medv 38 | 39 | # MAE 40 | ## 単回帰 41 | lm_single_model <- train(data=train_data, medv ~ rm, method="lm") 42 | y_pred <- predict(lm_single_model, test_data) 43 | MAE(y_pred, test_data$medv) 44 | 45 | ## 重回帰 46 | lm_multi_model <- train(data=train_data, medv ~ ., method="lm") 47 | y_pred <- predict(lm_multi_model, test_data) 48 | MAE(y_pred, test_data$medv) 49 | 50 | -------------------------------------------------------------------------------- /R/03/lm_ridge_lasso_boston.R: -------------------------------------------------------------------------------- 1 | # install.packages("tidyverse") 2 | # install.packages("GGally") 3 | # install.packages("caret") 4 | 5 | library(MASS) 6 | library(tidyverse) 7 | 
library(GGally) 8 | library(caret) 9 | 10 | # データ読み込み 11 | data(Boston) 12 | Boston %>% summary() 13 | Boston %>% head() 14 | 15 | # 可視化 16 | Boston %>% ggpairs() 17 | Boston %>% select(1, 2) %>% ggpairs() 18 | 19 | # L1正則化なし 20 | train_size = 0.7 21 | train_index <- sample(Boston %>% nrow(), Boston %>% nrow() * train_size) 22 | train_data <- Boston[train_index,] # 訓練データ 23 | test_data <- Boston[-train_index,] # テストデータ 24 | 25 | lm_multi_model <- train(data=train_data, medv ~ ., method="lm") 26 | y_pred <- predict(lm_multi_model, test_data) 27 | MAE(y_pred, test_data$medv) 28 | 29 | # Lasso回帰 30 | lasso <- train(data=train_data, medv ~ ., method="glmnet", tuneGrid = expand.grid(alpha = 1, lambda = 1)) 31 | y_pred <- predict(lasso, test_data) 32 | MAE(y_pred, test_data$medv) 33 | 34 | 35 | # Ridge回帰 36 | ridge <- train(data=train_data, medv ~ ., method="glmnet", tuneGrid = expand.grid(alpha = 0, lambda = 1)) 37 | y_pred <- predict(ridge, test_data) 38 | MAE(y_pred, test_data$medv) 39 | 40 | -------------------------------------------------------------------------------- /R/03/lm_ridge_lasso_tokyo.R: -------------------------------------------------------------------------------- 1 | # install.packages("tidyverse") 2 | # install.packages("GGally") 3 | # install.packages("caret") 4 | # install.packages("glmnet") 5 | 6 | library(MASS) 7 | library(tidyverse) 8 | library(GGally) 9 | library(caret) 10 | 11 | # データ読み込み 12 | data_tokyo <- read.csv("src/03/13_Tokyo_20171_20184.csv", header = TRUE, encoding = "cp932") 13 | 14 | data_tokyo %>% summary() 15 | data_tokyo %>% head() 16 | 17 | # 整形 18 | data_used_apartment <- data_tokyo %>% filter(種類 == "中古マンション等") 19 | columns_name_list <- c("最寄駅.距離.分.", "間取り", "面積...","建築年", "建物の構造", "建ぺい率...", "容積率...", "市区町村名", "取引価格.総額.") 20 | data_selected_dropna <- data_used_apartment %>% select(columns_name_list) %>% 21 | na.omit() %>% filter(str_detect(建築年, "^平成|昭和")) %>% 22 | filter(str_detect(最寄駅.距離.分., "\\?", negate = TRUE)) 23 | 24 | 
wareki_to_seireki = c(1926-1, 1989-1) 25 | building_year_list <- data_selected_dropna$建築年 26 | 27 | building_age_list <- c() 28 | for (i in 1:(building_year_list %>% length())){ 29 | # 西暦に変換 30 | tmp <- unlist(strsplit(as.character(building_year_list[i]), "成|和|年")) 31 | if (tmp[1] == "平"){ 32 | seireki = wareki_to_seireki[2] + as.integer(tmp[2]) 33 | } 34 | else { 35 | seireki = wareki_to_seireki[1] + as.integer(tmp[2]) 36 | } 37 | # 築年数に変換 38 | building_age = 2018 - seireki 39 | 40 | building_age_list = c(building_age_list, building_age) 41 | } 42 | 43 | data_selected_dropna$築年数 <- building_age_list 44 | data_selected_dropna <- data_selected_dropna[, colnames(data_selected_dropna) != "建築年"] 45 | data_selected_dropna$最寄駅.距離.分. <- as.numeric(data_selected_dropna$最寄駅.距離.分) 46 | data_selected_dropna$面積... <- as.numeric(data_selected_dropna$面積...) 47 | 48 | data_added_dummies <- data_selected_dropna %>% filter(取引価格.総額. < 60000000) 49 | 50 | # L1正則化なし 51 | train_size = 0.7 52 | train_index <- sample(data_added_dummies %>% nrow(), data_added_dummies %>% nrow() * train_size) 53 | train_data <- data_added_dummies[train_index,] # 訓練データ 54 | test_data <- data_added_dummies[-train_index,] # テストデータ 55 | 56 | lm_multi_model <- train(data=train_data, 取引価格.総額. ~ ., method="lm") 57 | y_pred <- predict(lm_multi_model, test_data) 58 | MAE(y_pred, test_data$取引価格.総額.) 59 | 60 | # Lasso回帰 61 | lasso <- train(data=train_data, 取引価格.総額. ~ ., method="glmnet", tuneGrid = expand.grid(alpha = 1, lambda = 1)) 62 | y_pred <- predict(lasso, test_data) 63 | MAE(y_pred, test_data$取引価格.総額.) 64 | 65 | 66 | # Ridge回帰 67 | ridge <- train(data=train_data, 取引価格.総額. ~ ., method="glmnet", tuneGrid = expand.grid(alpha = 0, lambda = 1)) 68 | y_pred <- predict(ridge, test_data) 69 | MAE(y_pred, test_data$取引価格.総額.) 
-------------------------------------------------------------------------------- /R/03/lm_tokyo.R: -------------------------------------------------------------------------------- 1 | # install.packages("tidyverse") 2 | # install.packages("GGally") 3 | # install.packages("caret") 4 | 5 | library(MASS) 6 | library(tidyverse) 7 | library(GGally) 8 | library(caret) 9 | 10 | # データ読み込み 11 | data_tokyo <- read.csv("src/03/13_Tokyo_20171_20184.csv", header = TRUE, encoding = "cp932") 12 | 13 | data_tokyo %>% summary() 14 | data_tokyo %>% head() 15 | 16 | # 整形 17 | data_used_apartment <- data_tokyo %>% filter(種類 == "中古マンション等") 18 | columns_name_list <- c("最寄駅.距離.分.", "間取り", "面積...","建築年", "建物の構造", "建ぺい率...", "容積率...", "市区町村名", "取引価格.総額.") 19 | data_selected_dropna <- data_used_apartment %>% select(columns_name_list) %>% 20 | na.omit() %>% filter(str_detect(建築年, "^平成|昭和")) %>% 21 | filter(str_detect(最寄駅.距離.分., "\\?", negate = TRUE)) 22 | 23 | wareki_to_seireki = c(1926-1, 1989-1) 24 | building_year_list <- data_selected_dropna$建築年 25 | 26 | building_age_list <- c() 27 | for (i in 1:(building_year_list %>% length())){ 28 | # 西暦に変換 29 | tmp <- unlist(strsplit(as.character(building_year_list[i]), "成|和|年")) 30 | if (tmp[1] == "平"){ 31 | seireki = wareki_to_seireki[2] + as.integer(tmp[2]) 32 | } 33 | else { 34 | seireki = wareki_to_seireki[1] + as.integer(tmp[2]) 35 | } 36 | # 築年数に変換 37 | building_age = 2018 - seireki 38 | 39 | building_age_list = c(building_age_list, building_age) 40 | } 41 | 42 | data_selected_dropna$築年数 <- building_age_list 43 | data_selected_dropna <- data_selected_dropna[, colnames(data_selected_dropna) != "建築年"] 44 | data_selected_dropna$最寄駅.距離.分. <- as.numeric(data_selected_dropna$最寄駅.距離.分) 45 | data_selected_dropna$面積... <- as.numeric(data_selected_dropna$面積...) 46 | 47 | data_added_dummies <- data_selected_dropna %>% filter(取引価格.総額. < 60000000) 48 | 49 | # 線形回帰 50 | ## 単回帰 51 | lm_model <- train(data=data_added_dummies, 取引価格.総額. 
~ 面積..., method="lm") 52 | lm_model %>% summary() 53 | 54 | ## 重回帰 55 | lm_multi_model <- train(data=data_added_dummies, 取引価格.総額. ~ ., method="lm") 56 | lm_multi_model %>% summary() 57 | 58 | # 予測 59 | train_size = 0.7 60 | train_index <- sample(data_added_dummies %>% nrow(), data_added_dummies %>% nrow() * train_size) 61 | train_data <- data_added_dummies[train_index,] # 訓練データ 62 | test_data <- data_added_dummies[-train_index,] # テストデータ 63 | 64 | lm_multi_model2 <- train(data=train_data, 取引価格.総額. ~ ., method="lm") 65 | y_pred <- predict(lm_multi_model2, test_data) 66 | y_pred - test_data$取引価格.総額. 67 | 68 | # MAE 69 | ## 単回帰 70 | lm_single_model <- train(data=train_data, 取引価格.総額. ~ 面積..., method="lm") 71 | y_pred <- predict(lm_single_model, test_data) 72 | MAE(y_pred, test_data$取引価格.総額.) 73 | 74 | ## 重回帰 75 | lm_multi_model <- train(data=train_data, 取引価格.総額. ~ ., method="lm") 76 | y_pred <- predict(lm_multi_model, test_data) 77 | MAE(y_pred, test_data$取引価格.総額.) 78 | 79 | -------------------------------------------------------------------------------- /R/04/decisionTree_iris.R: -------------------------------------------------------------------------------- 1 | # install.packages("tidyverse") 2 | # install.packages("GGally") 3 | # install.packages("caret") 4 | # install.packages('e1071') 5 | # install.packages("doParallel") 6 | 7 | library(MASS) 8 | library(tidyverse) 9 | library(caret) 10 | library(doParallel) 11 | 12 | detectCores() 13 | cl <- makePSOCKcluster(4) 14 | registerDoParallel(cl) 15 | 16 | # データ整形 17 | data_iris <- iris %>% 18 | filter(Species != "virginica") %>% select(-Species) %>% 19 | mutate(Species = as.matrix(iris$Species[1:100])) 20 | 21 | # 決定木 22 | # 予測 23 | train_size = 0.7 24 | train_index <- sample(data_iris %>% nrow(), data_iris %>% nrow() * train_size) 25 | train_data <- data_iris[train_index,] # 訓練データ 26 | test_data <- data_iris[-train_index,] # テストデータ 27 | 28 | 29 | decisionTree_model <- train(Species ~ ., data=train_data, 
method="rpart") 30 | y_pred <- predict(decisionTree_model, test_data) 31 | confusionMatrix(data = y_pred, test_data$Species %>% as.factor()) 32 | 33 | -------------------------------------------------------------------------------- /R/04/decisionTree_tweets.R: -------------------------------------------------------------------------------- 1 | # install.packages("tidyverse") 2 | # install.packages("GGally") 3 | # install.packages("caret") 4 | # install.packages('e1071') 5 | # install.packages("doParallel") 6 | 7 | library(MASS) 8 | library(tidyverse) 9 | library(caret) 10 | library(doParallel) 11 | library(RMeCab) 12 | 13 | detectCores() 14 | cl <- makePSOCKcluster(4) 15 | registerDoParallel(cl) 16 | 17 | # データ読み込み 18 | tweets <- read.csv("src/04/tweets.tsv",sep = "\t") %>% na.omit() 19 | tweets %>% dim() 20 | 21 | y <- tweets$X1 22 | text_all <- as.data.frame(tweets$X0) 23 | 24 | # データ整形(tf-idf) 25 | doc_matrix <- docDF(text_all, col = 1, type = 1, pos = c("名詞", "形容詞"), minFreq = 1, weight = "tf*idf*norm") %>% 26 | filter(POS2 %in% c("一般", "固有名詞","自立")) 27 | doc_matrix_t <- doc_matrix[, 4:ncol(doc_matrix)] %>% t() 28 | 29 | rownames(doc_matrix_t) <- c(1:nrow(doc_matrix_t)) 30 | # colnames(doc_matrix_t) <- doc_matrix[, 1] 31 | 32 | doc_matrix_t_1 <- cbind(doc_matrix_t, y) %>% na.omit() 33 | 34 | doc_matrix_t_1[is.nan(doc_matrix_t_1)] <- NA 35 | doc_matrix_df <- doc_matrix_t_1 %>% na.omit() %>% as.data.frame() 36 | 37 | # 決定木 38 | # 予測 39 | train_size = 0.7 40 | train_index <- sample(doc_matrix_df %>% nrow(), doc_matrix_df %>% nrow() * train_size) 41 | train_data <- doc_matrix_df[train_index,] # 訓練データ 42 | test_data <- doc_matrix_df[-train_index,] # テストデータ 43 | 44 | 45 | decisionTree_model <- train(y ~ ., data=train_data, method="rpart") 46 | y_pred <- predict(decisionTree_model, test_data) 47 | confusionMatrix(data = y_pred %>% round() %>% as.factor(), test_data$y %>% as.factor()) 48 | 49 | 
-------------------------------------------------------------------------------- /R/04/logit_iris.R: -------------------------------------------------------------------------------- 1 | # install.packages("tidyverse") 2 | # install.packages("GGally") 3 | # install.packages("caret") 4 | # install.packages('e1071') 5 | 6 | library(MASS) 7 | library(tidyverse) 8 | library(caret) 9 | 10 | data_iris <- iris %>% 11 | filter(Species != "virginica") %>% select(-Species) %>% 12 | mutate(Species = as.matrix(iris$Species[1:100])) 13 | # ロジスティック回帰 14 | ## 単回帰 15 | logit_model <- train(Species ~ Sepal.Length, data=data_iris, method="glm", family=binomial()) 16 | logit_model %>% summary() 17 | 18 | ## 重回帰 19 | logit_multi_model <- train(Species ~ ., data=data_iris, method="glm", family=binomial()) 20 | logit_multi_model %>% summary() 21 | 22 | # 予測 23 | train_size = 0.7 24 | train_index <- sample(data_iris %>% nrow(), data_iris %>% nrow() * train_size) 25 | train_data <- data_iris[train_index,] # 訓練データ 26 | test_data <- data_iris[-train_index,] # テストデータ 27 | 28 | 29 | ## 単回帰 30 | logit_single_model <- train(Species ~ Sepal.Length, data=train_data, method="glm", family=binomial()) 31 | y_pred <- predict(logit_single_model, test_data) 32 | confusionMatrix(data = y_pred, test_data$Species %>% as.factor()) 33 | 34 | 35 | ## 重回帰 36 | logit_multi_model2 <- train(Species ~ ., data=train_data, method="glm", family=binomial()) 37 | y_pred <- predict(logit_multi_model2, test_data) 38 | confusionMatrix(data = y_pred, test_data$Species %>% as.factor()) 39 | 40 | -------------------------------------------------------------------------------- /R/04/logit_tweets.R: -------------------------------------------------------------------------------- 1 | # install.packages("tidyverse") 2 | # install.packages("GGally") 3 | # install.packages("caret") 4 | # install.packages('e1071') 5 | # install.packages("RMeCab", repos = "http://rmecab.jp/R", type = "source") 6 | 7 | library(MASS) 8 | 
library(tidyverse) 9 | library(caret) 10 | library(RMeCab) 11 | 12 | # データ読み込み 13 | tweets <- read.csv("src/04/tweets.tsv",sep = "\t") %>% na.omit() 14 | tweets %>% dim() 15 | 16 | y <- tweets$X1 17 | text_all <- as.data.frame(tweets$X0) 18 | 19 | # データ整形(tf-idf) 20 | doc_matrix <- docDF(text_all, col = 1, type = 1, pos = c("名詞", "形容詞"), minFreq = 1, weight = "tf*idf*norm") %>% 21 | filter(POS2 %in% c("一般", "固有名詞","自立")) 22 | doc_matrix_t <- doc_matrix[, 4:ncol(doc_matrix)] %>% t() 23 | 24 | rownames(doc_matrix_t) <- c(1:nrow(doc_matrix_t)) 25 | # colnames(doc_matrix_t) <- doc_matrix[, 1] 26 | 27 | doc_matrix_t_1 <- cbind(doc_matrix_t, y) %>% na.omit() 28 | 29 | doc_matrix_t_1[is.nan(doc_matrix_t_1)] <- NA 30 | doc_matrix_df <- doc_matrix_t_1 %>% na.omit() %>% as.data.frame() 31 | 32 | 33 | # ロジスティック回帰 34 | # logit_multi_model <- train(y ~ ., data=doc_matrix_df, method="glm", family=binomial()) 35 | # logit_multi_model %>% summary() 36 | 37 | # 予測 38 | train_size = 0.7 39 | train_index <- sample(doc_matrix_df %>% nrow(), doc_matrix_df %>% nrow() * train_size) 40 | train_data <- doc_matrix_df[train_index,] # 訓練データ 41 | test_data <- doc_matrix_df[-train_index,] # テストデータ 42 | 43 | 44 | logit_multi_model2 <- train(y ~ ., data=train_data, method="glm", family=binomial()) 45 | y_pred <- predict(logit_multi_model2, test_data) 46 | confusionMatrix(data = y_pred %>% round() %>% as.factor(), test_data$y %>% as.factor()) 47 | 48 | -------------------------------------------------------------------------------- /R/04/randomForest_iris.R: -------------------------------------------------------------------------------- 1 | # install.packages("tidyverse") 2 | # install.packages("GGally") 3 | # install.packages("caret") 4 | # install.packages('e1071') 5 | # install.packages("doParallel") 6 | 7 | library(MASS) 8 | library(tidyverse) 9 | library(caret) 10 | library(doParallel) 11 | 12 | detectCores() 13 | cl <- makePSOCKcluster(4) 14 | registerDoParallel(cl) 15 | 16 | # データ整形 17 | 
data_iris <- iris %>% 18 | filter(Species != "virginica") %>% select(-Species) %>% 19 | mutate(Species = as.matrix(iris$Species[1:100])) 20 | 21 | # ランダムフォレスト 22 | # 予測 23 | train_size = 0.7 24 | train_index <- sample(data_iris %>% nrow(), data_iris %>% nrow() * train_size) 25 | train_data <- data_iris[train_index,] # 訓練データ 26 | test_data <- data_iris[-train_index,] # テストデータ 27 | 28 | 29 | rf_model <- train(Species ~ ., data=train_data, method="rf") 30 | y_pred <- predict(rf_model, test_data) 31 | confusionMatrix(data = y_pred, test_data$Species %>% as.factor()) 32 | 33 | -------------------------------------------------------------------------------- /R/04/randomForest_tweets.R: -------------------------------------------------------------------------------- 1 | # install.packages("tidyverse") 2 | # install.packages("GGally") 3 | # install.packages("caret") 4 | # install.packages('e1071') 5 | # install.packages("doParallel") 6 | 7 | library(MASS) 8 | library(tidyverse) 9 | library(caret) 10 | library(doParallel) 11 | library(RMeCab) 12 | 13 | detectCores() 14 | cl <- makePSOCKcluster(4) 15 | registerDoParallel(cl) 16 | 17 | # データ読み込み 18 | tweets <- read.csv("src/04/tweets.tsv",sep = "\t") %>% na.omit() 19 | tweets %>% dim() 20 | 21 | y <- tweets$X1 22 | text_all <- as.data.frame(tweets$X0) 23 | 24 | # データ整形(tf-idf) 25 | doc_matrix <- docDF(text_all, col = 1, type = 1, pos = c("名詞", "形容詞"), minFreq = 1, weight = "tf*idf*norm") %>% 26 | filter(POS2 %in% c("一般", "固有名詞","自立")) 27 | doc_matrix_t <- doc_matrix[, 4:ncol(doc_matrix)] %>% t() 28 | 29 | rownames(doc_matrix_t) <- c(1:nrow(doc_matrix_t)) 30 | # colnames(doc_matrix_t) <- doc_matrix[, 1] 31 | 32 | doc_matrix_t_1 <- cbind(doc_matrix_t, y %>% as.factor()) %>% na.omit() 33 | 34 | doc_matrix_t_1[is.nan(doc_matrix_t_1)] <- NA 35 | doc_matrix_df <- doc_matrix_t_1 %>% na.omit() %>% as.data.frame() 36 | 37 | colnames(doc_matrix_df)[ncol(doc_matrix_df)] = "y" 38 | 39 | # 決定木 40 | # 予測 41 | train_size = 0.7 42 | 
train_index <- sample(doc_matrix_df %>% nrow(), doc_matrix_df %>% nrow() * train_size) 43 | train_data <- doc_matrix_df[train_index,] # 訓練データ 44 | test_data <- doc_matrix_df[-train_index,] # テストデータ 45 | 46 | 47 | rf_model <- train(y ~ ., data=train_data, method="rf", tuneLength=4) 48 | y_pred <- predict(rf_model, test_data) 49 | confusionMatrix(data = y_pred %>% round() %>% as.factor(), test_data$y %>% as.factor()) 50 | 51 | -------------------------------------------------------------------------------- /R/05/Kmeans_iris.R: -------------------------------------------------------------------------------- 1 | # install.packages("tidyverse") 2 | # install.packages("GGally") 3 | # install.packages("caret") 4 | # install.packages('e1071') 5 | 6 | library(MASS) 7 | library(tidyverse) 8 | # library(caret) 9 | 10 | data_iris <- iris %>% select(Sepal.Width, Petal.Width) %>% scale() 11 | k_means <- kmeans(data_iris, 2) 12 | k_means %>% summary() 13 | k_means$cluster 14 | 15 | data_kmeans <- cbind(data_iris, k_means$cluster) %>% as.data.frame() 16 | 17 | g <- ggplot(data_kmeans, aes(x=Sepal.Width, y=Petal.Width)) 18 | g <- g + geom_point(aes(colour=V3), size=1, alpha=0.5) 19 | g -------------------------------------------------------------------------------- /R/05/Kmeans_prefecture.R: -------------------------------------------------------------------------------- 1 | # install.packages("tidyverse") 2 | # install.packages("GGally") 3 | # install.packages("caret") 4 | # install.packages('e1071') 5 | 6 | library(MASS) 7 | library(tidyverse) 8 | # library(caret) 9 | 10 | data_prefecture <- read_csv("src/05/data_prefecture_category.csv") 11 | data_prefecture %>% summary() 12 | 13 | data_prefecture_scaled <- data_prefecture %>% select(-都道府県) %>% scale() 14 | k_means <- kmeans(data_prefecture_scaled, 4) 15 | k_means %>% summary() 16 | k_means$cluster 17 | 18 | data_prefecture_kmeans <- data_prefecture %>% mutate("label" = k_means$cluster) 19 | 20 | data_prefecture_kmeans %>% 
filter(label==1) 21 | data_prefecture_kmeans %>% filter(label==2) 22 | data_prefecture_kmeans %>% filter(label==3) 23 | data_prefecture_kmeans %>% filter(label==4) 24 | 25 | data_prefecture_kmeans %>% filter(label==1) %>% summary() 26 | data_prefecture_kmeans %>% filter(label==2) %>% summary() 27 | data_prefecture_kmeans %>% filter(label==3) %>% summary() 28 | data_prefecture_kmeans %>% filter(label==4) %>% summary() 29 | -------------------------------------------------------------------------------- /R/05/data_prefecture_category.csv: -------------------------------------------------------------------------------- 1 | 都道府県,食料,住居,光熱・水道,家具・家事,被服及び,保健医療,交通・通信,教育,教養娯楽,諸雑費 2 | 札幌市,"819,536","279,764","228,330","103,893","129,292","99,902","442,564","124,799","276,976","218,769" 3 | 青森市,"790,368","259,971","295,102","96,173","98,267","115,529","427,590","96,241","245,912","232,403" 4 | 盛岡市,"771,420","246,223","250,260","102,652","142,183","123,152","438,431","144,845","276,140","286,892" 5 | 仙台市,"862,052","240,690","197,006","117,818","116,682","109,467","379,888","150,622","317,874","280,381" 6 | 秋田市,"835,325","226,152","296,036","111,587","127,798","133,474","496,526","111,430","280,440","238,857" 7 | 山形市,"841,537","315,770","285,590","99,357","125,567","104,612","770,941","107,336","302,035","289,043" 8 | 福島市,"950,582","285,711","257,681","126,588","169,182","94,275","665,083","141,012","392,401","276,986" 9 | 水戸市,"877,968","235,274","231,740","127,631","174,481","119,688","695,369","200,251","390,123","322,231" 10 | 宇都宮市,"970,391","294,398","243,081","104,325","171,918","125,397","622,628","175,432","375,213","292,779" 11 | 前橋市,"876,472","149,049","202,882","150,428","166,129","142,103","549,336","113,726","397,195","313,629" 12 | さいたま市,"1,042,267","350,989","216,828","110,043","173,828","174,833","501,966","275,513","330,177","276,978" 13 | 千葉市,"867,636","162,260","153,227","81,768","142,156","87,722","421,253","155,287","329,146","320,532" 14 | 
東京都区部,"943,279","404,843","175,822","112,716","208,975","156,721","417,168","272,696","423,476","254,768" 15 | 横浜市,"926,253","215,616","184,484","124,547","172,798","136,661","517,576","251,826","420,737","275,789" 16 | 新潟市,"842,736","178,061","254,426","116,049","128,177","114,074","606,168","199,170","265,664","316,409" 17 | 富山市,"896,917","307,401","263,618","127,392","122,275","114,880","579,845","91,179","336,369","263,650" 18 | 金沢市,"971,470","220,831","246,180","125,704","167,773","101,640","680,653","245,222","405,272","355,490" 19 | 福井市,"925,413","151,093","249,017","94,646","114,519","99,707","462,830","122,414","328,129","277,653" 20 | 甲府市,"747,397","300,816","214,981","90,925","101,371","104,563","420,691","116,368","323,950","234,201" 21 | 長野市,"786,130","344,086","239,435","109,564","116,436","108,134","519,702","92,604","266,054","289,707" 22 | 岐阜市,"865,541","201,315","239,365","130,079","173,834","135,925","699,940","243,758","414,244","305,166" 23 | 静岡市,"807,241","358,014","204,189","106,298","139,274","109,700","432,415","119,306","316,773","227,907" 24 | 名古屋市,"821,916","249,793","156,478","82,537","139,540","104,044","480,970","107,105","394,293","224,362" 25 | 津市,"863,096","195,647","203,113","125,860","164,073","117,537","517,539","251,968","386,805","251,410" 26 | 大津市,"915,677","108,352","236,832","158,680","141,251","108,875","521,557","180,740","325,487","245,402" 27 | 京都市,"845,226","210,964","232,337","88,931","129,277","92,014","390,179","212,035","358,755","246,851" 28 | 大阪市,"840,018","269,369","177,417","95,044","114,748","138,580","369,889","140,737","317,359","202,192" 29 | 神戸市,"656,924","136,381","103,216","67,591","110,686","54,228","319,734","31,347","208,916","184,317" 30 | 奈良市,"898,884","157,240","272,448","114,845","165,037","144,301","496,535","388,515","399,766","282,591" 31 | 和歌山市,"887,859","244,498","246,528","130,329","152,058","92,863","510,125","144,763","343,537","241,471" 32 | 
鳥取市,"706,962","204,600","194,986","108,323","103,304","86,720","513,462","77,770","230,101","281,468" 33 | 松江市,"727,565","328,050","221,065","93,567","103,611","105,134","545,464","85,915","292,628","281,605" 34 | 岡山市,"765,652","289,496","202,733","96,181","161,001","136,606","502,230","151,293","302,995","233,083" 35 | 広島市,"810,255","219,623","182,511","105,210","127,351","104,142","605,174","181,977","284,268","220,201" 36 | 山口市,"607,019","363,261","177,832","86,593","100,132","108,410","586,591","59,450","298,965","223,511" 37 | 徳島市,"817,065","183,086","211,546","119,732","153,757","113,235","443,341","239,275","362,019","277,219" 38 | 高松市,"809,931","323,569","227,821","119,424","129,374","131,729","615,294","103,593","279,503","243,670" 39 | 松山市,"828,274","197,045","241,818","125,931","159,782","102,157","491,929","208,938","305,368","253,428" 40 | 高知市,"803,052","310,383","225,292","198,099","119,242","102,917","533,892","157,375","309,526","280,036" 41 | 福岡市,"760,638","188,295","156,097","116,400","152,971","96,334","471,238","120,417","355,085","273,449" 42 | 佐賀市,"814,400","262,685","224,972","98,570","140,041","144,157","515,064","144,634","359,726","284,169" 43 | 長崎市,"658,520","308,171","210,173","84,279","115,569","83,159","390,576","88,847","187,986","182,308" 44 | 熊本市,"870,311","311,909","243,256","143,752","152,455","133,442","509,583","223,684","345,740","338,671" 45 | 大分市,"789,001","355,356","207,281","135,103","162,991","100,884","561,382","97,157","451,635","313,381" 46 | 宮崎市,"778,907","222,861","185,008","96,874","122,197","113,690","559,338","131,236","279,217","278,642" 47 | 鹿児島市,"787,120","345,632","198,035","116,358","164,759","121,532","552,727","108,190","298,067","254,743" 48 | 那覇市,"726,160","337,851","211,156","111,406","100,591","102,076","448,672","131,853","237,977","175,467" 49 | 川崎市,"872,136","427,698","158,914","87,568","112,286","94,975","324,322","124,695","292,594","215,952" 50 | 
相模原市,"756,340","290,616","166,843","95,209","112,149","122,466","302,611","112,232","363,026","210,583" 51 | 浜松市,"803,305","198,314","187,801","105,721","127,230","114,431","680,801","128,846","281,603","236,023" 52 | 堺市,"927,069","236,032","257,915","123,388","155,071","129,308","640,550","247,428","419,882","251,675" 53 | 北九州市,"862,432","149,685","206,224","109,226","167,286","146,687","700,366","164,434","270,083","256,207" -------------------------------------------------------------------------------- /R/05/pca_iris.R: -------------------------------------------------------------------------------- 1 | # install.packages("tidyverse") 2 | # install.packages("GGally") 3 | # install.packages("caret") 4 | # install.packages('e1071') 5 | 6 | library(MASS) 7 | library(tidyverse) 8 | # library(caret) 9 | 10 | pcr_model <- prcomp(iris %>% select(-Species), scale=T) 11 | pcr_model %>% summary() 12 | 13 | # plot(pcr_model$x[, 1], pcr_model$x[, 2]) 14 | data_pca <- cbind(pcr_model$x, iris$Species) %>% as.data.frame() 15 | 16 | g <- ggplot(data_pca, aes(x=PC1, y=PC2)) 17 | g <- g + geom_point(aes(colour=V5), size=1, alpha=0.5) 18 | #描画 19 | g -------------------------------------------------------------------------------- /R/05/pca_prefecture.R: -------------------------------------------------------------------------------- 1 | # install.packages("tidyverse") 2 | # install.packages("GGally") 3 | # install.packages("caret") 4 | # install.packages('e1071') 5 | 6 | library(MASS) 7 | library(tidyverse) 8 | # library(caret) 9 | 10 | data_prefecture <- read_csv("src/05/data_prefecture_category.csv") 11 | data_prefecture %>% summary() 12 | 13 | pcr_model <- prcomp(data_prefecture %>% select(-都道府県), scale=T) 14 | pcr_model %>% summary() 15 | 16 | # plot(pcr_model$x[, 1], pcr_model$x[, 2]) 17 | 18 | # plot(x, y) 19 | pt <- identify(pcr_model$x[, 1], pcr_model$x[, 2]) 20 | 21 | -------------------------------------------------------------------------------- /R/06/classification.R: 
-------------------------------------------------------------------------------- 1 | # install.packages("tidyverse") 2 | # install.packages("GGally") 3 | # install.packages("caret") 4 | # install.packages("pROC") 5 | 6 | library(MASS) 7 | library(tidyverse) 8 | library(caret) 9 | library(pROC) 10 | 11 | data_iris <- iris %>% 12 | filter(Species != "virginica") %>% select(-Species) %>% 13 | mutate(Species = as.matrix(iris$Species[1:100])) 14 | 15 | # 予測 16 | train_size = 0.7 17 | train_index <- sample(data_iris %>% nrow(), data_iris %>% nrow() * train_size) 18 | train_data <- data_iris[train_index,] # 訓練データ 19 | test_data <- data_iris[-train_index,] # テストデータ 20 | 21 | ## ロジスティック回帰 22 | logit_multi_model <- train(Species ~ Sepal.Length, data=train_data, method="glm", family=binomial()) 23 | y_pred <- predict(logit_multi_model, test_data) 24 | confusionMatrix(data = y_pred, test_data$Species %>% as.factor()) 25 | 26 | roc(test_data$Species %>% as.factor() %>% as.numeric(), y_pred %>% as.numeric(), plot = TRUE) 27 | 28 | -------------------------------------------------------------------------------- /R/06/regression.R: -------------------------------------------------------------------------------- 1 | # install.packages("tidyverse") 2 | # install.packages("GGally") 3 | # install.packages("caret") 4 | 5 | library(MASS) 6 | library(tidyverse) 7 | library(caret) 8 | 9 | # データ読み込み 10 | data(Boston) 11 | Boston %>% summary() 12 | Boston %>% head() 13 | 14 | # 予測 15 | train_size = 0.7 16 | train_index <- sample(Boston %>% nrow(), Boston %>% nrow() * train_size) 17 | train_data <- Boston[train_index,] # 訓練データ 18 | test_data <- Boston[-train_index,] # テストデータ 19 | 20 | lm_multi_model <- train(data=train_data, medv ~ ., method="lm") 21 | y_pred <- predict(lm_multi_model, test_data) 22 | 23 | # MAE 24 | MAE(y_pred, test_data$medv) 25 | 26 | # RMSE 27 | RMSE(y_pred, test_data$medv) 28 | 29 | # RMSLE 30 | rmsle <- function(y_true, y_pred) 31 | sqrt(mean((log1p(y_true) - 
log1p(y_pred))^2)) 32 | rmsle(y_pred, test_data$medv) 33 | -------------------------------------------------------------------------------- /R/07/cnn_mnist.R: -------------------------------------------------------------------------------- 1 | # install.packages("tidyverse") 2 | # install.packages("GGally") 3 | # install.packages("caret") 4 | # install.packages('e1071') 5 | # devtools::install_github("rstudio/keras") 6 | 7 | library(MASS) 8 | library(tidyverse) 9 | # library(caret) 10 | library(keras) 11 | # install_keras() 12 | 13 | mnist <- dataset_mnist() 14 | x_train <- mnist$train$x 15 | y_train <- mnist$train$y 16 | x_test <- mnist$test$x 17 | y_test <- mnist$test$y 18 | 19 | image_size <- 28 20 | 21 | # reshape 22 | x_train <- array_reshape(x_train, c(nrow(x_train), image_size, image_size, 1)) 23 | x_test <- array_reshape(x_test, c(nrow(x_test), image_size, image_size, 1)) 24 | # rescale 25 | x_train <- x_train / 255 26 | x_test <- x_test / 255 27 | 28 | y_train <- to_categorical(y_train, 10) 29 | y_test <- to_categorical(y_test, 10) 30 | 31 | 32 | model <- keras_model_sequential() %>% 33 | layer_conv_2d(filters = 32, kernel_size = c(3,3), activation = 'relu', 34 | input_shape = c(image_size, image_size, 1)) %>% 35 | layer_max_pooling_2d(pool_size = c(2, 2)) %>% 36 | layer_flatten() %>% 37 | layer_dense(units = 32, activation = 'relu') %>% 38 | layer_dense(units = 10, activation = 'softmax') 39 | 40 | model %>% summary() 41 | 42 | 43 | model %>% compile( 44 | loss = 'categorical_crossentropy', 45 | optimizer = "Adam", 46 | metrics = c('accuracy') 47 | ) 48 | 49 | history <- model %>% fit( 50 | x_train, y_train, 51 | epochs = 5, batch_size = 64 52 | ) 53 | 54 | model %>% evaluate(x_test, y_test) 55 | 56 | -------------------------------------------------------------------------------- /R/07/nn_mnist.R: -------------------------------------------------------------------------------- 1 | # install.packages("tidyverse") 2 | # install.packages("GGally") 3 | # 
install.packages("caret") 4 | # install.packages('e1071') 5 | # devtools::install_github("rstudio/keras") 6 | 7 | library(MASS) 8 | library(tidyverse) 9 | # library(caret) 10 | library(keras) 11 | # install_keras() 12 | 13 | mnist <- dataset_mnist() 14 | x_train <- mnist$train$x 15 | y_train <- mnist$train$y 16 | x_test <- mnist$test$x 17 | y_test <- mnist$test$y 18 | 19 | # reshape 20 | x_train <- array_reshape(x_train, c(nrow(x_train), 28**2)) 21 | x_test <- array_reshape(x_test, c(nrow(x_test), 28**2)) 22 | # rescale 23 | x_train <- x_train / 255 24 | x_test <- x_test / 255 25 | 26 | y_train <- to_categorical(y_train, 10) 27 | y_test <- to_categorical(y_test, 10) 28 | 29 | 30 | model <- keras_model_sequential() 31 | model %>% 32 | layer_dense(units = 512, activation = 'relu', input_shape = c(28**2)) %>% 33 | layer_dense(units = 512, activation = 'relu') %>% 34 | layer_dense(units = 10, activation = 'softmax') 35 | 36 | 37 | model %>% summary() 38 | 39 | 40 | model %>% compile( 41 | loss = 'categorical_crossentropy', 42 | optimizer = "Adam", 43 | metrics = c('accuracy') 44 | ) 45 | 46 | history <- model %>% fit( 47 | x_train, y_train, 48 | epochs = 5, batch_size = 64 49 | ) 50 | 51 | model %>% evaluate(x_test, y_test) 52 | 53 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # PythonBook 2 | 「Pythonと実データで遊んで学ぶ データ分析講座」 サポートページ 3 | 4 | https://www.amazon.co.jp/dp/4863542836 5 | 6 | 7 | # 書籍修正点 8 | ## 2019.12.03更新 9 | * Chapter3 で使用している、国土交通省のAPIですが、URLが変更されました。以下のように、「http://」となっているURLを「https://」と変更してください。 10 | * 第2版では修正済みです。 11 | 12 | ## 2020.06.25更新 13 | * Google Colaboratory 上でデフォルトで使用されている、 scikit-learn ライブラリのバージョンが変更されました 14 | * それに伴い、いくつかの処理で、書籍中で紹介しているものと異なる結果になる可能性があります 15 | * 現時点で判明したものを以下にまとめます(随時追加します) 16 | 17 | ### P134以降で登場する LogisticRegression 18 | * デフォルトで使用される、ソルバーが変更されました 19 | * 公式サイト 
https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html 20 | * ソルバーとは各パラメータを求めるための、最適化アルゴリズムを指します 21 | * 以下のように、solverに `liblinear` を指定すると、書籍中の結果と一致します 22 | 23 | ```python 24 | logit = LogisticRegression(solver='liblinear') 25 | ``` 26 | 27 | * なお、本修正は 第3版では修正済みです。 28 | 29 | 30 | ## 2021.11.11更新 31 | 32 | * Chapter6 P200のRMSE計算について、結果は変わっていませんが、計算順序が定義と異なっているため、以下に修正をお願いいたします。 33 | 34 | ![texclip20211112141247](https://user-images.githubusercontent.com/43558230/141413796-3a7f8c98-0a31-41f7-baf9-92f177411418.png) 35 | 36 | 37 | * Chapter6 P201の「評価指標としてRMEを採用した場合、さまざまな回帰モデルを比較し、その中でMAE値が最も小さいモデルが良い、と判断されます。」のRMEは「MAE」の誤りです。 38 | * Chapter6 P201のMAE計算について、結果は変わっていませんが、計算順序が定義と異なっているため、以下に修正をお願いいたします。 39 | 40 | ![texclip20211112142108](https://user-images.githubusercontent.com/43558230/141414206-9f3fda8c-4352-4f6d-a80d-2101ceff8204.png) 41 | 42 | 43 | --------------------------------------------------------------------------------