├── README.md ├── Yelp_Dataset_-_Restaurant_Recommender.ipynb ├── .ipynb_checkpoints ├── Yelp_Dataset_Data_Preprocessing-checkpoint.ipynb └── Yelp_Dataset_-_NLP-checkpoint.ipynb └── Yelp_Dataset_-_NLP.ipynb /README.md: -------------------------------------------------------------------------------- 1 | # Yelp Challenge (Foodie Challenge) 2 | Yelp dataset challenge: NLP; sentiment analysis; Restaurant Recommendation System 3 | 4 | This project uses Yelp Dataset Challenge data containing restaurant comments from Yelp users all over the U.S. and business information of restaurants registered on Yelp. 5 | 6 | In this project, I selected comments from 2016 to 2018 to train and test my model. Altogether, the raw data is about 1 GB in size. There are three main sections of this project: 7 | * data preprocessing 8 | * NLP and sentiment analysis (Naive Bayes Classifier) 9 | * recommendation system 10 | -------------------------------------------------------------------------------- /Yelp_Dataset_-_Restaurant_Recommender.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Yelp Data Challenge - Restaurant Recommender\n", 8 | "\n", 9 | "BitTiger DS501\n", 10 | "\n", 11 | "Nov 2017" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 1, 17 | "metadata": { 18 | "collapsed": true 19 | }, 20 | "outputs": [], 21 | "source": [ 22 | "import pandas as pd\n", 23 | "import numpy as np\n", 24 | "import matplotlib.pyplot as plt\n", 25 | "import pandas as pd\n", 26 | "from scipy import sparse\n", 27 | "from sklearn.metrics.pairwise import cosine_similarity\n", 28 | "% matplotlib inline\n", 29 | "plt.style.use(\"ggplot\")" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": 2, 35 | "metadata": { 36 | "collapsed": true 37 | }, 38 | "outputs": [], 39 | "source": [ 40 | "df = 
pd.read_csv('dataset/last_2_years_restaurant_reviews.csv')" 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": 3, 46 | "metadata": {}, 47 | "outputs": [ 48 | { 49 | "data": { 50 | "text/html": [ 51 | "
\n", 52 | "\n", 53 | " \n", 54 | " \n", 55 | " \n", 56 | " \n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | "
business_idnamecategoriesavg_starscooldatefunnyreview_idstarstextusefuluser_id
0--9e1ONYQuAa-CB_Rrw7TwDelmonico Steakhouse['Steakhouses', 'Cajun/Creole', 'Restaurants']4.012016-05-1700Qc1THNHSapDL7cv-ZzW5g5What can I say.. Wowzers! Probably one of the ...04LxKRRIikhr65GfPDW626w
1--9e1ONYQuAa-CB_Rrw7TwDelmonico Steakhouse['Steakhouses', 'Cajun/Creole', 'Restaurants']4.002017-01-200L8lo5SKXfZRlbn1bpPiC9w5Went here for guys weekend. Unbelievable. Ravi...0nT8zgjoc-PbdBoQsFEXFLw
2--9e1ONYQuAa-CB_Rrw7TwDelmonico Steakhouse['Steakhouses', 'Cajun/Creole', 'Restaurants']4.0522016-09-25306eUT3IwwWPP3CZkAhxqOIw5One word my friends: tableside!!! Yes, tablesi...567RlyCglsIzhBn081inwvcg
3--9e1ONYQuAa-CB_Rrw7TwDelmonico Steakhouse['Steakhouses', 'Cajun/Creole', 'Restaurants']4.012017-02-1203cnTdE45VrsS0o4cVhfGog3Located inside my favorite hotel Venetian, Del...1rOIrilMC7VFwFVBeQNiKMw
4--9e1ONYQuAa-CB_Rrw7TwDelmonico Steakhouse['Steakhouses', 'Cajun/Creole', 'Restaurants']4.002016-10-300tYrSbjX3QgZGBZuQ3n8g6w5After the most incredible service, delicious m...2PiWlV_UC_-SXqyxQM9fAtw
\n", 148 | "
" 149 | ], 150 | "text/plain": [ 151 | " business_id name \\\n", 152 | "0 --9e1ONYQuAa-CB_Rrw7Tw Delmonico Steakhouse \n", 153 | "1 --9e1ONYQuAa-CB_Rrw7Tw Delmonico Steakhouse \n", 154 | "2 --9e1ONYQuAa-CB_Rrw7Tw Delmonico Steakhouse \n", 155 | "3 --9e1ONYQuAa-CB_Rrw7Tw Delmonico Steakhouse \n", 156 | "4 --9e1ONYQuAa-CB_Rrw7Tw Delmonico Steakhouse \n", 157 | "\n", 158 | " categories avg_stars cool \\\n", 159 | "0 ['Steakhouses', 'Cajun/Creole', 'Restaurants'] 4.0 1 \n", 160 | "1 ['Steakhouses', 'Cajun/Creole', 'Restaurants'] 4.0 0 \n", 161 | "2 ['Steakhouses', 'Cajun/Creole', 'Restaurants'] 4.0 52 \n", 162 | "3 ['Steakhouses', 'Cajun/Creole', 'Restaurants'] 4.0 1 \n", 163 | "4 ['Steakhouses', 'Cajun/Creole', 'Restaurants'] 4.0 0 \n", 164 | "\n", 165 | " date funny review_id stars \\\n", 166 | "0 2016-05-17 0 0Qc1THNHSapDL7cv-ZzW5g 5 \n", 167 | "1 2017-01-20 0 L8lo5SKXfZRlbn1bpPiC9w 5 \n", 168 | "2 2016-09-25 30 6eUT3IwwWPP3CZkAhxqOIw 5 \n", 169 | "3 2017-02-12 0 3cnTdE45VrsS0o4cVhfGog 3 \n", 170 | "4 2016-10-30 0 tYrSbjX3QgZGBZuQ3n8g6w 5 \n", 171 | "\n", 172 | " text useful \\\n", 173 | "0 What can I say.. Wowzers! Probably one of the ... 0 \n", 174 | "1 Went here for guys weekend. Unbelievable. Ravi... 0 \n", 175 | "2 One word my friends: tableside!!! Yes, tablesi... 56 \n", 176 | "3 Located inside my favorite hotel Venetian, Del... 1 \n", 177 | "4 After the most incredible service, delicious m... 2 \n", 178 | "\n", 179 | " user_id \n", 180 | "0 4LxKRRIikhr65GfPDW626w \n", 181 | "1 nT8zgjoc-PbdBoQsFEXFLw \n", 182 | "2 7RlyCglsIzhBn081inwvcg \n", 183 | "3 rOIrilMC7VFwFVBeQNiKMw \n", 184 | "4 PiWlV_UC_-SXqyxQM9fAtw " 185 | ] 186 | }, 187 | "execution_count": 3, 188 | "metadata": {}, 189 | "output_type": "execute_result" 190 | } 191 | ], 192 | "source": [ 193 | "df.head()" 194 | ] 195 | }, 196 | { 197 | "cell_type": "markdown", 198 | "metadata": {}, 199 | "source": [ 200 | "## 1. 
Clean data and get rating data " 201 | ] 202 | }, 203 | { 204 | "cell_type": "markdown", 205 | "metadata": {}, 206 | "source": [ 207 | "#### Select relevant columns in the original dataframe" 208 | ] 209 | }, 210 | { 211 | "cell_type": "code", 212 | "execution_count": 4, 213 | "metadata": {}, 214 | "outputs": [ 215 | { 216 | "name": "stdout", 217 | "output_type": "stream", 218 | "text": [ 219 | "\n", 220 | "Index: 136735 entries, ---udAKDsn0yQXmzbWQNSw to zzvqVZTYs5VKxPc-IkRQ4A\n", 221 | "Columns: 4268 entries, --9e1ONYQuAa-CB_Rrw7Tw to zwNC-Ow4eIMan2__bS9-rg\n", 222 | "dtypes: int64(4268)\n", 223 | "memory usage: 4.3+ GB\n" 224 | ] 225 | } 226 | ], 227 | "source": [ 228 | "# Get business_id, user_id, stars for recommender\n", 229 | "df_utility = pd.pivot_table(data=df, \n", 230 | " values='stars', \n", 231 | " index='user_id', \n", 232 | " columns='business_id', \n", 233 | " fill_value=0)\n", 234 | "\n", 235 | "df_utility.info()" 236 | ] 237 | }, 238 | { 239 | "cell_type": "markdown", 240 | "metadata": {}, 241 | "source": [ 242 | "#### Many users haven't given many reviews; exclude these users from the item-item similarity recommender\n", 243 | "\n", 244 | "**Q**: How do we recommend to these users anyway?" 245 | ] 246 | }, 247 | { 248 | "cell_type": "markdown", 249 | "metadata": { 250 | "collapsed": true 251 | }, 252 | "source": [ 253 | "**A**: We can use item-item collaborative filtering, treating each business as an item and each reviewer as a user. We first calculate the similarity between businesses and then recommend based on it. 
Or we can use U-V decomposition to build recommender" 254 | ] 255 | }, 256 | { 257 | "cell_type": "markdown", 258 | "metadata": {}, 259 | "source": [ 260 | "#### Create utility matrix from records" 261 | ] 262 | }, 263 | { 264 | "cell_type": "code", 265 | "execution_count": 14, 266 | "metadata": {}, 267 | "outputs": [ 268 | { 269 | "data": { 270 | "text/plain": [ 271 | "<136735x4268 sparse matrix of type ''\n", 272 | "\twith 0 stored elements in LInked List format>" 273 | ] 274 | }, 275 | "execution_count": 14, 276 | "metadata": {}, 277 | "output_type": "execute_result" 278 | } 279 | ], 280 | "source": [ 281 | "num_users = df.user_id.value_counts()\n", 282 | "num_business = df.business_id.value_counts()\n", 283 | "#print(num_users,num_business)\n", 284 | "stars_mat = sparse.lil_matrix((num_users.shape[0], num_business.shape[0]))\n", 285 | "stars_mat" 286 | ] 287 | }, 288 | { 289 | "cell_type": "code", 290 | "execution_count": 17, 291 | "metadata": { 292 | "collapsed": true 293 | }, 294 | "outputs": [], 295 | "source": [ 296 | "df.set_index('user_id', inplace = True)" 297 | ] 298 | }, 299 | { 300 | "cell_type": "markdown", 301 | "metadata": {}, 302 | "source": [ 303 | "## 2. 
Item-Item similarity recommender" 304 | ] 305 | }, 306 | { 307 | "cell_type": "markdown", 308 | "metadata": {}, 309 | "source": [ 310 | "### Let's reuse the ItemItemRecommender class derived from previous exercise\n", 311 | "\n", 312 | "Hint: we need to make modification to accommodate the dense numpy array" 313 | ] 314 | }, 315 | { 316 | "cell_type": "code", 317 | "execution_count": 5, 318 | "metadata": {}, 319 | "outputs": [ 320 | { 321 | "data": { 322 | "text/plain": [ 323 | "(136735, 4268)" 324 | ] 325 | }, 326 | "execution_count": 5, 327 | "metadata": {}, 328 | "output_type": "execute_result" 329 | } 330 | ], 331 | "source": [ 332 | "df.set_index('user_id', inplace = True)\n", 333 | "utility_mat = df_utility.as_matrix()\n", 334 | "utility_mat.shape" 335 | ] 336 | }, 337 | { 338 | "cell_type": "code", 339 | "execution_count": 7, 340 | "metadata": {}, 341 | "outputs": [], 342 | "source": [ 343 | "# Item-Item Similarity Matrix\n", 344 | "item_sim_mat = cosine_similarity(utility_mat.T)\n", 345 | "\n", 346 | "least_to_most_sim_indexes = np.argsort(item_sim_mat, axis=1)\n", 347 | "\n", 348 | "# Neighborhoods\n", 349 | "neighborhood_size = 75\n", 350 | "neighborhoods = least_to_most_sim_indexes[:, -neighborhood_size:]" 351 | ] 352 | }, 353 | { 354 | "cell_type": "code", 355 | "execution_count": 8, 356 | "metadata": {}, 357 | "outputs": [ 358 | { 359 | "data": { 360 | "text/plain": [ 361 | "(4268, 75)" 362 | ] 363 | }, 364 | "execution_count": 8, 365 | "metadata": {}, 366 | "output_type": "execute_result" 367 | } 368 | ], 369 | "source": [ 370 | "neighborhoods.shape\n", 371 | "#" 372 | ] 373 | }, 374 | { 375 | "cell_type": "code", 376 | "execution_count": 38, 377 | "metadata": { 378 | "collapsed": true 379 | }, 380 | "outputs": [], 381 | "source": [ 382 | "# Let's pick a lucky user\n", 383 | "user_id = 3" 384 | ] 385 | }, 386 | { 387 | "cell_type": "code", 388 | "execution_count": 39, 389 | "metadata": {}, 390 | "outputs": [ 391 | { 392 | "name": "stdout", 393 | 
"output_type": "stream", 394 | "text": [ 395 | "[ 3. 4. 3. 4. 3. 3. 4.\n", 396 | " 3. 3. 3.55785356 3. 4. 3. 3.\n", 397 | " 4. 3.47318803 3. 3. 4. 4. 3.\n", 398 | " 3. 3. 4. 4. 4. 3. 3.\n", 399 | " 3. 3. 3. 3. 3. 4. 4.\n", 400 | " 3. 3. 4. 3. 3. 4. 4.\n", 401 | " 4. 3. 4. 3. 3. 4. 3.\n", 402 | " 3. 3. 4. 3. 3. 4. 3.\n", 403 | " 3. 3. 3. 3. 3. 3. 4.\n", 404 | " 3. 3. 4. 3. 3. 4. 3.\n", 405 | " 3. 3. 4. 3. 3.49174105 3. 3.\n", 406 | " 4. 3. 3. 3. 4. 4. 3.\n", 407 | " 3. 3. 4. 3. 3. 3. 4.\n", 408 | " 3. 3. 3. 3. 3. 4. 3.\n", 409 | " 3. 4. 3. 3. 4. 3. 3.\n", 410 | " 3. 3. 3. 3. 4. 3.88590038\n", 411 | " 3. 3.52389887 3. 3. 4. 3. 3.\n", 412 | " 4. 3. 4. 3. 3. 3. 3.\n", 413 | " 3.3187534 3. 3. 3. 4. 4. 3.\n", 414 | " 3. 3. 3. 3.49332573 4. 4. 4.\n", 415 | " 3. 3. 3. 3. 4. 3. 4.\n", 416 | " 3. 3.35313823 3. 3. 3. 3.\n", 417 | " 3.36801672 3. 3. 4. 4. 4. 4.\n", 418 | " 3. 3. 4. 3. 3. 3. 4.\n", 419 | " 3. 4. 3. 3. 4. 3. 4.\n", 420 | " 4. 3. 3. 3. 4. 4. 3.\n", 421 | " 4. 3. 3. 3. 3. 4. 3.\n", 422 | " 3. 3. 4. 3. 4. 3. 3.\n", 423 | " 3. ]\n", 424 | "[ 0. 0. 0. ..., 0. 0. 
0.]\n", 425 | "Execution time: 0.049621 seconds\n" 426 | ] 427 | } 428 | ], 429 | "source": [ 430 | "from time import time\n", 431 | "n_users = utility_mat.shape[0]\n", 432 | "n_items = utility_mat.shape[1]\n", 433 | "\n", 434 | "start_time = time()\n", 435 | "items_rated_by_this_user = utility_mat[user_id].nonzero()[0]\n", 436 | "# Just initializing so we have somewhere to put rating preds\n", 437 | "out = np.zeros(n_items)\n", 438 | "for item_to_rate in range(n_items):\n", 439 | " relevant_items = np.intersect1d(neighborhoods[item_to_rate],\n", 440 | " items_rated_by_this_user,\n", 441 | " assume_unique=True) # assume_unique speeds up intersection op\n", 442 | " #print(neighborhoods[item_to_rate])\n", 443 | " #print(items_rated_by_this_user)\n", 444 | " if len(relevant_items) != 0:\n", 445 | " #print(relevant_items)\n", 446 | " #print(utility_mat[user_id, relevant_items], '*', item_sim_mat[item_to_rate, relevant_items])\n", 447 | " out[item_to_rate] = sum(utility_mat[user_id, relevant_items] * \\\n", 448 | " item_sim_mat[item_to_rate, relevant_items]) / \\\n", 449 | " item_sim_mat[item_to_rate, relevant_items].sum()\n", 450 | " else:\n", 451 | " out[item_to_rate] = np.nan\n", 452 | "\n", 453 | "\n", 454 | "print(out[~np.isnan(out)])\n", 455 | "pred_ratings = np.nan_to_num(out)\n", 456 | "print (pred_ratings)\n", 457 | "print(\"Execution time: %f seconds\" % (time()-start_time))" 458 | ] 459 | }, 460 | { 461 | "cell_type": "code", 462 | "execution_count": 46, 463 | "metadata": {}, 464 | "outputs": [ 465 | { 466 | "data": { 467 | "text/plain": [ 468 | "[44, 3604, 1264, 29, 918, 2372, 2893, 3476, 484, 3908]" 469 | ] 470 | }, 471 | "execution_count": 46, 472 | "metadata": {}, 473 | "output_type": "execute_result" 474 | } 475 | ], 476 | "source": [ 477 | "# Recommend n movies\n", 478 | "n = 10\n", 479 | "\n", 480 | "# Get item indexes sorted by predicted rating\n", 481 | "item_index_sorted_by_pred_rating = list(np.argsort(pred_ratings))\n", 482 | "\n", 483 | "# Find 
items that have been rated by user\n", 484 | "items_rated_by_this_user = utility_mat[user_id].nonzero()[0]\n", 485 | "\n", 486 | "# We want to exclude the items that have been rated by user\n", 487 | "unrated_items_by_pred_rating_it = [item for item in item_index_sorted_by_pred_rating\n", 488 | " if item not in items_rated_by_this_user]\n", 489 | "\n", 490 | "unrated_items_by_pred_rating_it[-n:]" 491 | ] 492 | }, 493 | { 494 | "cell_type": "code", 495 | "execution_count": 51, 496 | "metadata": {}, 497 | "outputs": [ 498 | { 499 | "name": "stdout", 500 | "output_type": "stream", 501 | "text": [ 502 | "recommend business name: Beer Park\n", 503 | "recommend business name: Leticia's Mexican Cocina\n", 504 | "recommend business name: Squeeze In\n", 505 | "recommend business name: Jaburritos\n", 506 | "recommend business name: Yong Kang Street\n", 507 | "recommend business name: Estiatorio Milos\n", 508 | "recommend business name: Off The Strip at the LINQ\n", 509 | "recommend business name: Gallagher's Steakhouse\n", 510 | "recommend business name: Lobster ME\n", 511 | "recommend business name: Pin-Up Pizza\n" 512 | ] 513 | } 514 | ], 515 | "source": [ 516 | "recommend_business_id = df_utility.columns[unrated_items_by_pred_rating_it[-n:]].values\n", 517 | "for business in recommend_business_id:\n", 518 | " print(\"recommend business name: \", df.loc[df.business_id == business].name.values[0])" 519 | ] 520 | }, 521 | { 522 | "cell_type": "markdown", 523 | "metadata": {}, 524 | "source": [ 525 | "## 3. 
Matrix Factorization recommender\n", 526 | "\n", 527 | "Take a look at Graphlab Create examples" 528 | ] 529 | }, 530 | { 531 | "cell_type": "code", 532 | "execution_count": 40, 533 | "metadata": {}, 534 | "outputs": [ 535 | { 536 | "name": "stdout", 537 | "output_type": "stream", 538 | "text": [ 539 | "9.89994527306\n", 540 | "2.46531960038\n" 541 | ] 542 | } 543 | ], 544 | "source": [ 545 | "from sklearn.decomposition import TruncatedSVD\n", 546 | "\n", 547 | "def fit_uvd(M,k):\n", 548 | " # use TruncatedSVD to realize UVD\n", 549 | " svd = TruncatedSVD(n_components=k, n_iter=7, random_state=0)\n", 550 | " svd.fit(M)\n", 551 | "\n", 552 | " V = svd.components_\n", 553 | " U = svd.transform(M) # effectively, it's doing: U = M.dot(V.T)\n", 554 | " # we can ignore svd.singular_values_ for our purpose\n", 555 | " \n", 556 | " # why we can do this?\n", 557 | " # recall: \n", 558 | " # SVD start from u*s*v=M => u*s=M*v.T, where M*v.T is our transformation above to get U in UVD\n", 559 | " # so the above U is effectively u*s in SVD\n", 560 | " # that's why U*V = u*s*v = M our original matrix\n", 561 | " # there are many ways to understand it!\n", 562 | " # here we by-passed singular values.\n", 563 | " \n", 564 | " return U,V\n", 565 | "\n", 566 | "# decompose\n", 567 | "U,V = fit_uvd(utility_mat,200)\n", 568 | "\n", 569 | "# reconstruct\n", 570 | "ratings_mat_fitted = U.dot(V) # U*V\n", 571 | "\n", 572 | "# calculate errs\n", 573 | "errs = np.array((utility_mat-ratings_mat_fitted).flatten()).squeeze()\n", 574 | "mask = np.array(utility_mat.flatten()).squeeze()>0\n", 575 | "\n", 576 | "mse = np.mean(errs[mask]**2)\n", 577 | "average_abs_err = abs(errs[mask]).mean()\n", 578 | "print (mse)\n", 579 | "print (average_abs_err)" 580 | ] 581 | }, 582 | { 583 | "cell_type": "code", 584 | "execution_count": 52, 585 | "metadata": {}, 586 | "outputs": [ 587 | { 588 | "data": { 589 | "text/plain": [ 590 | "[3908, 484, 3476, 2893, 2372, 918, 29, 1264, 3604, 44]" 591 | ] 592 | }, 593 
| "execution_count": 52, 594 | "metadata": {}, 595 | "output_type": "execute_result" 596 | } 597 | ], 598 | "source": [ 599 | "# get recommendations for one user\n", 600 | "user_id = 3\n", 601 | "n = 10\n", 602 | "\n", 603 | "pred_ratings = ratings_mat_fitted[user_id,:]\n", 604 | "item_index_sorted_by_pred_rating = list(np.argsort(pred_ratings))[::-1]\n", 605 | "\n", 606 | "items_rated_by_this_user = utility_mat[user_id].nonzero()[0]\n", 607 | "\n", 608 | "unrated_items_by_pred_rating_uv = [item for item in item_index_sorted_by_pred_rating\n", 609 | " if item not in items_rated_by_this_user]\n", 610 | "\n", 611 | "unrated_items_by_pred_rating_uv[:n]" 612 | ] 613 | }, 614 | { 615 | "cell_type": "code", 616 | "execution_count": 53, 617 | "metadata": {}, 618 | "outputs": [ 619 | { 620 | "name": "stdout", 621 | "output_type": "stream", 622 | "text": [ 623 | "recommend business name: Culinary Dropout\n", 624 | "recommend business name: Shake Shack\n", 625 | "recommend business name: CUT by Wolfgang Puck\n", 626 | "recommend business name: Nine Fine Irishmen\n", 627 | "recommend business name: BabyStacks Cafe\n", 628 | "recommend business name: Dog Haus\n", 629 | "recommend business name: Rí Rá Irish Pub\n", 630 | "recommend business name: The Bootlegger Italian Bistro\n", 631 | "recommend business name: Scarpetta\n", 632 | "recommend business name: Delmonico Steakhouse\n" 633 | ] 634 | } 635 | ], 636 | "source": [ 637 | "recommend_business_id = df_utility.columns[unrated_items_by_pred_rating_uv[-n:]].values\n", 638 | "for business in recommend_business_id:\n", 639 | " print(\"recommend business name: \", df.loc[df.business_id == business].name.values[0])" 640 | ] 641 | }, 642 | { 643 | "cell_type": "markdown", 644 | "metadata": {}, 645 | "source": [ 646 | "## 4. 
Other recommenders (optional)\n", 647 | "\n", 648 | "What are other ways you can build a better recommender?\n", 649 | "\n", 650 | "* Other features (have you noticed there are other features in the Yelp dataset, e.g. tips, etc.?)\n", 651 | "* Popularity-based\n", 652 | "* Content-based\n", 653 | "* Hybrid" 654 | ] 655 | }, 656 | { 657 | "cell_type": "markdown", 658 | "metadata": { 659 | "collapsed": true 660 | }, 661 | "source": [ 662 | "## Note\n", 663 | "> Increasing neighborhood_size to 200 or even larger may improve the item-item recommender\n", 664 | "\n", 665 | "> Increasing the number of latent factors from 200 to 500 may improve the U-V decomposition recommender" 666 | ] 667 | }, 668 | { 669 | "cell_type": "code", 670 | "execution_count": null, 671 | "metadata": { 672 | "collapsed": true 673 | }, 674 | "outputs": [], 675 | "source": [] 676 | } 677 | ], 678 | "metadata": { 679 | "anaconda-cloud": {}, 680 | "kernelspec": { 681 | "display_name": "Python 3", 682 | "language": "python", 683 | "name": "python3" 684 | }, 685 | "language_info": { 686 | "codemirror_mode": { 687 | "name": "ipython", 688 | "version": 3 689 | }, 690 | "file_extension": ".py", 691 | "mimetype": "text/x-python", 692 | "name": "python", 693 | "nbconvert_exporter": "python", 694 | "pygments_lexer": "ipython3", 695 | "version": "3.6.0" 696 | } 697 | }, 698 | "nbformat": 4, 699 | "nbformat_minor": 1 700 | } 701 | -------------------------------------------------------------------------------- /.ipynb_checkpoints/Yelp_Dataset_Data_Preprocessing-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Yelp Data Challenge - Data Preprocessing" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "## Dataset Introduction\n", 15 | "\n", 16 | "[Yelp Dataset Challenge](https://www.yelp.com/dataset_challenge)\n", 17 
| "\n", 18 | "The Challenge Dataset:\n", 19 | "\n", 20 | " 4.1M reviews and 947K tips by 1M users for 144K businesses\n", 21 | " 1.1M business attributes, e.g., hours, parking availability, ambience.\n", 22 | " Aggregated check-ins over time for each of the 125K businesses\n", 23 | " 200,000 pictures from the included businesses\n", 24 | "\n", 25 | "Cities:\n", 26 | "\n", 27 | " U.K.: Edinburgh\n", 28 | " Germany: Karlsruhe\n", 29 | " Canada: Montreal and Waterloo\n", 30 | " U.S.: Pittsburgh, Charlotte, Urbana-Champaign, Phoenix, Las Vegas, Madison, Cleveland\n", 31 | "\n", 32 | "Files:\n", 33 | "\n", 34 | " yelp_academic_dataset_business.json\n", 35 | " yelp_academic_dataset_checkin.json\n", 36 | " yelp_academic_dataset_review.json\n", 37 | " yelp_academic_dataset_tip.json\n", 38 | " yelp_academic_dataset_user.json\n", 39 | "\n", 40 | "Notes on the Dataset\n", 41 | "\n", 42 | " Each file is composed of a single object type, one json-object per-line.\n", 43 | " Take a look at some examples to get you started: https://github.com/Yelp/dataset-examples.\n", 44 | "\n" 45 | ] 46 | }, 47 | { 48 | "cell_type": "markdown", 49 | "metadata": {}, 50 | "source": [ 51 | "## Read data from file and load to Pandas DataFrame\n", 52 | "\n", 53 | "**Warning**: Loading all the 1.8 GB data into Pandas at a time takes long time and a lot of memory!" 
54 | ] 55 | }, 56 | { 57 | "cell_type": "code", 58 | "execution_count": 2, 59 | "metadata": { 60 | "collapsed": true 61 | }, 62 | "outputs": [], 63 | "source": [ 64 | "import json\n", 65 | "import pandas as pd" 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": 3, 71 | "metadata": { 72 | "collapsed": true 73 | }, 74 | "outputs": [], 75 | "source": [ 76 | "file_business, file_checkin, file_review, file_tip, file_user = [\n", 77 | " 'dataset/business.json',\n", 78 | " 'dataset/checkin.json',\n", 79 | " 'dataset/review_after2016.json',\n", 80 | " 'dataset/tip.json',\n", 81 | " 'dataset/user.json'\n", 82 | "]" 83 | ] 84 | }, 85 | { 86 | "cell_type": "markdown", 87 | "metadata": {}, 88 | "source": [ 89 | "#### Business Data" 90 | ] 91 | }, 92 | { 93 | "cell_type": "code", 94 | "execution_count": null, 95 | "metadata": { 96 | "collapsed": true 97 | }, 98 | "outputs": [], 99 | "source": [ 100 | "with open(file_business) as f:\n", 101 | " df_business = pd.DataFrame(json.loads(line) for line in f)" 102 | ] 103 | }, 104 | { 105 | "cell_type": "code", 106 | "execution_count": null, 107 | "metadata": { 108 | "collapsed": true 109 | }, 110 | "outputs": [], 111 | "source": [ 112 | "df_business.head(2)" 113 | ] 114 | }, 115 | { 116 | "cell_type": "code", 117 | "execution_count": null, 118 | "metadata": { 119 | "collapsed": true 120 | }, 121 | "outputs": [], 122 | "source": [ 123 | "df_business.info()" 124 | ] 125 | }, 126 | { 127 | "cell_type": "markdown", 128 | "metadata": {}, 129 | "source": [ 130 | "#### Checkin Data" 131 | ] 132 | }, 133 | { 134 | "cell_type": "code", 135 | "execution_count": null, 136 | "metadata": { 137 | "collapsed": true 138 | }, 139 | "outputs": [], 140 | "source": [ 141 | "# with open(file_checkin) as f:\n", 142 | "# df_checkin = pd.DataFrame(json.loads(line) for line in f)\n", 143 | "# df_checkin.head(2)" 144 | ] 145 | }, 146 | { 147 | "cell_type": "markdown", 148 | "metadata": {}, 149 | "source": [ 150 | "#### Review Data" 151 | ] 
152 | }, 153 | { 154 | "cell_type": "code", 155 | "execution_count": null, 156 | "metadata": { 157 | "collapsed": true 158 | }, 159 | "outputs": [], 160 | "source": [ 161 | "# with open(file_review) as f:\n", 162 | "# df_review = pd.DataFrame(json.loads(line) for line in f)\n", 163 | "# df_review.head(2)" 164 | ] 165 | }, 166 | { 167 | "cell_type": "markdown", 168 | "metadata": {}, 169 | "source": [ 170 | "#### Tip Data" 171 | ] 172 | }, 173 | { 174 | "cell_type": "code", 175 | "execution_count": null, 176 | "metadata": { 177 | "collapsed": true 178 | }, 179 | "outputs": [], 180 | "source": [ 181 | "# with open(file_tip) as f:\n", 182 | "# df_tip = pd.DataFrame(json.loads(line) for line in f)\n", 183 | "# df_tip.head(2)" 184 | ] 185 | }, 186 | { 187 | "cell_type": "markdown", 188 | "metadata": {}, 189 | "source": [ 190 | "#### User Data" 191 | ] 192 | }, 193 | { 194 | "cell_type": "code", 195 | "execution_count": null, 196 | "metadata": { 197 | "collapsed": true 198 | }, 199 | "outputs": [], 200 | "source": [ 201 | "# with open(file_user) as f:\n", 202 | "# df_user = pd.DataFrame(json.loads(line) for line in f)\n", 203 | "# df_user.head(2)" 204 | ] 205 | }, 206 | { 207 | "cell_type": "markdown", 208 | "metadata": {}, 209 | "source": [ 210 | "## Filter data by city and category" 211 | ] 212 | }, 213 | { 214 | "cell_type": "markdown", 215 | "metadata": {}, 216 | "source": [ 217 | "#### Create filters/masks\n", 218 | "\n", 219 | "* create filters that selects business \n", 220 | " * that are located in \"Las Vegas\"\n", 221 | " * that contains \"Restaurants\" in their category (first filter null categories)" 222 | ] 223 | }, 224 | { 225 | "cell_type": "code", 226 | "execution_count": null, 227 | "metadata": { 228 | "collapsed": true 229 | }, 230 | "outputs": [], 231 | "source": [ 232 | "temp = df_business[df_business['city'] == 'Las Vegas'].dropna(subset = ['categories'])\n", 233 | "df_filtered = temp[temp['categories'].apply(lambda x: True if 'Restaurants' in x else 
False)]\n", 234 | "del temp" 235 | ] 236 | }, 237 | { 238 | "cell_type": "markdown", 239 | "metadata": {}, 240 | "source": [ 241 | "#### Keep relevant columns\n", 242 | "\n", 243 | "* only keep some useful columns\n", 244 | " * business_id\n", 245 | " * name\n", 246 | " * categories\n", 247 | " * stars" 248 | ] 249 | }, 250 | { 251 | "cell_type": "code", 252 | "execution_count": null, 253 | "metadata": { 254 | "collapsed": true 255 | }, 256 | "outputs": [], 257 | "source": [ 258 | "selected_features = [u'business_id', u'name', u'categories', u'stars']" 259 | ] 260 | }, 261 | { 262 | "cell_type": "code", 263 | "execution_count": null, 264 | "metadata": { 265 | "collapsed": true 266 | }, 267 | "outputs": [], 268 | "source": [ 269 | "# Make a DataFrame that contains only the abovementioned columns, and name it as df_selected_business\n", 270 | "df_selected_business = df_filtered[selected_features]" 271 | ] 272 | }, 273 | { 274 | "cell_type": "code", 275 | "execution_count": null, 276 | "metadata": { 277 | "collapsed": true 278 | }, 279 | "outputs": [], 280 | "source": [ 281 | "# Rename the column name \"stars\" to \"avg_stars\" to avoid naming conflicts with review dataset\n", 282 | "df_selected_business.rename(columns={\"stars\":\"avg_stars\"}, inplace = True)" 283 | ] 284 | }, 285 | { 286 | "cell_type": "code", 287 | "execution_count": null, 288 | "metadata": { 289 | "collapsed": true 290 | }, 291 | "outputs": [], 292 | "source": [ 293 | "# Inspect your DataFrame\n", 294 | "df_selected_business.head()" 295 | ] 296 | }, 297 | { 298 | "cell_type": "markdown", 299 | "metadata": {}, 300 | "source": [ 301 | "#### Save results to csv files" 302 | ] 303 | }, 304 | { 305 | "cell_type": "code", 306 | "execution_count": null, 307 | "metadata": { 308 | "collapsed": true 309 | }, 310 | "outputs": [], 311 | "source": [ 312 | "# Save to ./data/selected_business.csv for your next task\n", 313 | "df_selected_business.to_csv('dataset/selected_business.csv', index = False)" 314 | ] 
315 | }, 316 | { 317 | "cell_type": "code", 318 | "execution_count": 15, 319 | "metadata": { 320 | "collapsed": true 321 | }, 322 | "outputs": [], 323 | "source": [ 324 | "# Try reload the csv file to check if everything works fine\n", 325 | "df_selected_business = pd.read_csv('dataset/selected_business.csv')" 326 | ] 327 | }, 328 | { 329 | "cell_type": "markdown", 330 | "metadata": {}, 331 | "source": [ 332 | "### Use the \"business_id\" column to filter review data\n", 333 | "\n", 334 | "* We want to make a DataFrame that contains all of, and only, the reviews about the business entities we just obtained" 335 | ] 336 | }, 337 | { 338 | "cell_type": "markdown", 339 | "metadata": {}, 340 | "source": [ 341 | "#### Load review dataset" 342 | ] 343 | }, 344 | { 345 | "cell_type": "code", 346 | "execution_count": 4, 347 | "metadata": {}, 348 | "outputs": [ 349 | { 350 | "data": { 351 | "text/html": [ 352 | "
\n", 353 | "\n", 354 | " \n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | " \n", 372 | " \n", 373 | " \n", 374 | " \n", 375 | " \n", 376 | " \n", 377 | " \n", 378 | " \n", 379 | " \n", 380 | " \n", 381 | " \n", 382 | " \n", 383 | " \n", 384 | " \n", 385 | " \n", 386 | " \n", 387 | " \n", 388 | " \n", 389 | " \n", 390 | " \n", 391 | " \n", 392 | " \n", 393 | " \n", 394 | "
business_idcooldatefunnyreview_idstarstextusefuluser_id
0uYHaNptLzDLoV_JZ_MuzUA02016-07-120VfBHSwC5Vz_pbFluy07i9Q5My girlfriend and I stayed here for 3 nights a...0cjpdDjZyprfyDG3RlkVG3w
1uYHaNptLzDLoV_JZ_MuzUA02016-10-0203zRpneRKDsOPq92tq7ybAA3If you need an inexpensive place to stay for a...0bjTcT8Ty4cJZhEOEo01FGA
\n", 395 | "
" 396 | ], 397 | "text/plain": [ 398 | " business_id cool date funny review_id \\\n", 399 | "0 uYHaNptLzDLoV_JZ_MuzUA 0 2016-07-12 0 VfBHSwC5Vz_pbFluy07i9Q \n", 400 | "1 uYHaNptLzDLoV_JZ_MuzUA 0 2016-10-02 0 3zRpneRKDsOPq92tq7ybAA \n", 401 | "\n", 402 | " stars text useful \\\n", 403 | "0 5 My girlfriend and I stayed here for 3 nights a... 0 \n", 404 | "1 3 If you need an inexpensive place to stay for a... 0 \n", 405 | "\n", 406 | " user_id \n", 407 | "0 cjpdDjZyprfyDG3RlkVG3w \n", 408 | "1 bjTcT8Ty4cJZhEOEo01FGA " 409 | ] 410 | }, 411 | "execution_count": 4, 412 | "metadata": {}, 413 | "output_type": "execute_result" 414 | } 415 | ], 416 | "source": [ 417 | "with open(file_review) as f:\n", 418 | " df_review = pd.DataFrame(json.loads(line) for line in f)\n", 419 | "df_review.head(2)" 420 | ] 421 | }, 422 | { 423 | "cell_type": "markdown", 424 | "metadata": {}, 425 | "source": [ 426 | "#### Prepare dataframes to be joined, - on business_id" 427 | ] 428 | }, 429 | { 430 | "cell_type": "code", 431 | "execution_count": 16, 432 | "metadata": {}, 433 | "outputs": [ 434 | { 435 | "data": { 436 | "text/html": [ 437 | "
\n", 438 | "\n", 439 | " \n", 440 | " \n", 441 | " \n", 442 | " \n", 443 | " \n", 444 | " \n", 445 | " \n", 446 | " \n", 447 | " \n", 448 | " \n", 449 | " \n", 450 | " \n", 451 | " \n", 452 | " \n", 453 | " \n", 454 | " \n", 455 | " \n", 456 | " \n", 457 | " \n", 458 | " \n", 459 | " \n", 460 | " \n", 461 | " \n", 462 | " \n", 463 | " \n", 464 | " \n", 465 | " \n", 466 | " \n", 467 | "
namecategoriesavg_stars
business_id
LDMCrFlGIFUN6L-FEFgzWgEl Pollo Loco['Restaurants', 'American (Traditional)', 'Mex...3.0
QTH_XGh4rWYdd0fTW-tUDwBaja Fresh Mexican Grill['Mexican', 'Restaurants']3.5
\n", 468 | "
" 469 | ], 470 | "text/plain": [ 471 | " name \\\n", 472 | "business_id \n", 473 | "LDMCrFlGIFUN6L-FEFgzWg El Pollo Loco \n", 474 | "QTH_XGh4rWYdd0fTW-tUDw Baja Fresh Mexican Grill \n", 475 | "\n", 476 | " categories \\\n", 477 | "business_id \n", 478 | "LDMCrFlGIFUN6L-FEFgzWg ['Restaurants', 'American (Traditional)', 'Mex... \n", 479 | "QTH_XGh4rWYdd0fTW-tUDw ['Mexican', 'Restaurants'] \n", 480 | "\n", 481 | " avg_stars \n", 482 | "business_id \n", 483 | "LDMCrFlGIFUN6L-FEFgzWg 3.0 \n", 484 | "QTH_XGh4rWYdd0fTW-tUDw 3.5 " 485 | ] 486 | }, 487 | "execution_count": 16, 488 | "metadata": {}, 489 | "output_type": "execute_result" 490 | } 491 | ], 492 | "source": [ 493 | "# Prepare the business dataframe and set index to column \"business_id\", and name it as df_left\n", 494 | "df_left = df_selected_business.set_index('business_id')\n", 495 | "df_left.head(2)" 496 | ] 497 | }, 498 | { 499 | "cell_type": "code", 500 | "execution_count": 7, 501 | "metadata": {}, 502 | "outputs": [ 503 | { 504 | "data": { 505 | "text/html": [ 506 | "
\n", 507 | "\n", 508 | " \n", 509 | " \n", 510 | " \n", 511 | " \n", 512 | " \n", 513 | " \n", 514 | " \n", 515 | " \n", 516 | " \n", 517 | " \n", 518 | " \n", 519 | " \n", 520 | " \n", 521 | " \n", 522 | " \n", 523 | " \n", 524 | " \n", 525 | " \n", 526 | " \n", 527 | " \n", 528 | " \n", 529 | " \n", 530 | " \n", 531 | " \n", 532 | " \n", 533 | " \n", 534 | " \n", 535 | " \n", 536 | " \n", 537 | " \n", 538 | " \n", 539 | " \n", 540 | " \n", 541 | " \n", 542 | " \n", 543 | " \n", 544 | " \n", 545 | " \n", 546 | " \n", 547 | " \n", 548 | " \n", 549 | " \n", 550 | " \n", 551 | " \n", 552 | " \n", 553 | " \n", 554 | " \n", 555 | " \n", 556 | "
cooldatefunnyreview_idstarstextusefuluser_id
business_id
uYHaNptLzDLoV_JZ_MuzUA02016-07-120VfBHSwC5Vz_pbFluy07i9Q5My girlfriend and I stayed here for 3 nights a...0cjpdDjZyprfyDG3RlkVG3w
uYHaNptLzDLoV_JZ_MuzUA02016-10-0203zRpneRKDsOPq92tq7ybAA3If you need an inexpensive place to stay for a...0bjTcT8Ty4cJZhEOEo01FGA
\n", 557 | "
" 558 | ], 559 | "text/plain": [ 560 | " cool date funny review_id \\\n", 561 | "business_id \n", 562 | "uYHaNptLzDLoV_JZ_MuzUA 0 2016-07-12 0 VfBHSwC5Vz_pbFluy07i9Q \n", 563 | "uYHaNptLzDLoV_JZ_MuzUA 0 2016-10-02 0 3zRpneRKDsOPq92tq7ybAA \n", 564 | "\n", 565 | " stars \\\n", 566 | "business_id \n", 567 | "uYHaNptLzDLoV_JZ_MuzUA 5 \n", 568 | "uYHaNptLzDLoV_JZ_MuzUA 3 \n", 569 | "\n", 570 | " text \\\n", 571 | "business_id \n", 572 | "uYHaNptLzDLoV_JZ_MuzUA My girlfriend and I stayed here for 3 nights a... \n", 573 | "uYHaNptLzDLoV_JZ_MuzUA If you need an inexpensive place to stay for a... \n", 574 | "\n", 575 | " useful user_id \n", 576 | "business_id \n", 577 | "uYHaNptLzDLoV_JZ_MuzUA 0 cjpdDjZyprfyDG3RlkVG3w \n", 578 | "uYHaNptLzDLoV_JZ_MuzUA 0 bjTcT8Ty4cJZhEOEo01FGA " 579 | ] 580 | }, 581 | "execution_count": 7, 582 | "metadata": {}, 583 | "output_type": "execute_result" 584 | } 585 | ], 586 | "source": [ 587 | "# Prepare the review dataframe and set index to column \"business_id\", and name it as df_right\n", 588 | "df_right = df_review.set_index('business_id')\n", 589 | "df_right.head(2)" 590 | ] 591 | }, 592 | { 593 | "cell_type": "markdown", 594 | "metadata": {}, 595 | "source": [ 596 | "#### Join! and reset index" 597 | ] 598 | }, 599 | { 600 | "cell_type": "code", 601 | "execution_count": 19, 602 | "metadata": {}, 603 | "outputs": [ 604 | { 605 | "name": "stdout", 606 | "output_type": "stream", 607 | "text": [ 608 | "(286608, 11)\n", 609 | "(1690348, 8)\n", 610 | "(5682, 3)\n" 611 | ] 612 | } 613 | ], 614 | "source": [ 615 | "# Join df_left and df_right. 
What type of join?\n", 616 | "df_joined = df_left.join(df_right, how = \"inner\")\n", 617 | "print(df_joined.shape)\n", 618 | "print(df_right.shape)\n", 619 | "print(df_left.shape)" 620 | ] 621 | }, 622 | { 623 | "cell_type": "code", 624 | "execution_count": 20, 625 | "metadata": { 626 | "collapsed": true 627 | }, 628 | "outputs": [], 629 | "source": [ 630 | "# You may want to reset the index \n", 631 | "df_joined.reset_index(inplace = True)" 632 | ] 633 | }, 634 | { 635 | "cell_type": "markdown", 636 | "metadata": {}, 637 | "source": [ 638 | "#### We further filter data by date, e.g. keep comments from the last 2 years\n", 639 | "\n", 640 | "* Otherwise your laptop may crash from running out of memory when running machine learning algorithms\n", 641 | "* Purposefully ignore reviews that were made too long ago" 642 | ] 643 | }, 644 | { 645 | "cell_type": "code", 646 | "execution_count": 21, 647 | "metadata": { 648 | "collapsed": true, 649 | "scrolled": true 650 | }, 651 | "outputs": [], 652 | "source": [ 653 | "# Make a filter that selects dates after 2016-01-20\n", 654 | "cond_last_years = df_joined['date'] > u'2016-01-20'" 655 | ] 656 | }, 657 | { 658 | "cell_type": "code", 659 | "execution_count": 22, 660 | "metadata": { 661 | "collapsed": true 662 | }, 663 | "outputs": [], 664 | "source": [ 665 | "# Filter the joined DataFrame; the result is stored back in df_joined\n", 666 | "df_joined = df_joined[cond_last_years]" 667 | ] 668 | }, 669 | { 670 | "cell_type": "markdown", 671 | "metadata": {}, 672 | "source": [ 673 | "#### Take a glance at the final dataset\n", 674 | "\n", 675 | "* Do more EDA here as you like!" 
676 | ] 677 | }, 678 | { 679 | "cell_type": "code", 680 | "execution_count": 23, 681 | "metadata": { 682 | "collapsed": true 683 | }, 684 | "outputs": [], 685 | "source": [ 686 | "import matplotlib.pyplot as plt\n", 687 | "\n", 688 | "% matplotlib inline" 689 | ] 690 | }, 691 | { 692 | "cell_type": "code", 693 | "execution_count": 33, 694 | "metadata": {}, 695 | "outputs": [ 696 | { 697 | "data": { 698 | "text/plain": [ 699 | "" 700 | ] 701 | }, 702 | "execution_count": 33, 703 | "metadata": {}, 704 | "output_type": "execute_result" 705 | }, 706 | { 707 | "data": { 708 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAY4AAAD8CAYAAABgmUMCAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMS4wLCBo\ndHRwOi8vbWF0cGxvdGxpYi5vcmcvpW3flQAAFQlJREFUeJzt3X+w3XV95/HnyyAorTVBgmWS0Bts\nxso6dc2myK5dt1sqv9oS3Ck7OE7NUNrsrmB13U4N2ilOu87obivKtMs2LlkD64KIWrILLka0pTuz\n/AiK/IqYFFm4JiVxg+AWlQXf+8f53HK83CTnm9xzz7m5z8fMmfP9fr6fc77vfOZcXnx/p6qQJGlQ\nLxp1AZKk+cXgkCR1YnBIkjoxOCRJnRgckqRODA5JUicGhySpE4NDktSJwSFJ6uSoURcwDMcff3xN\nTEyMugxJmlfuvvvub1fV0oP1OyKDY2Jigm3bto26DEmaV5L870H6uatKktSJwSFJ6sTgkCR1MrTg\nSLIpyZ4k98+w7HeSVJLj23ySXJFkZ5J7k6zu67suyY72WjeseiVJgxnmFscngLOmNyZZAbwZeLSv\n+WxgVXutB65sfY8DLgPeAJwKXJZkyRBrliQdxNCCo6puA/bNsOhy4HeB/idIrQWurp7bgcVJTgTO\nBLZW1b6qegLYygxhJEmaO3N6jCPJucC3qupr0xYtAx7rm59sbftrn+m71yfZlmTb3r17Z7FqSVK/\nOQuOJMcC7wd+f6bFM7TVAdpf2Fi1sarWVNWapUsPev2KJOkQzeUWx6uAlcDXkjwCLAe+kuQn6W1J\nrOjruxzYdYB2SdKIzFlwVNV9VXVCVU1U1QS9UFhdVX8DbAHe3s6uOg14sqp2A7cAZyRZ0g6Kn9Ha\nhmpiw03DXoUkzVvDPB33WuB/Aa9OMpnkogN0vxl4GNgJfBx4B0BV7QP+ELirvf6gtUmSRmRo96qq\nqrceZPlE33QBF++n3yZg06wWJ0k6ZF45LknqxOCQJHVicEiSOjE4JEmdGBySpE4MDklSJwaHJKkT\ng0OS1InBIUnqxOCQJHVicEiSOjE4JEmdGBySpE4MDklSJwaHJKkTg0OS1InBIUnqxOCQJHVicEiS\nOjE4JEmdDC04kmxKsifJ/X1t/z7J15Pcm+RzSRb3Lbs0yc4kDyU5s6/9rNa2M8mGYdUrSRrMMLc4\nPgGcNa1tK/DaqvpZ4BvApQBJTgEuAP5e+8x/SLIoySLgT4GzgVOAt7a+kqQRGVpwVNVtwL5pbV+o\nqmfb7O3A8ja9Friuqn5QVd8EdgKnttfOqnq4qp4Brmt9JUkjMspjHL8BfL5NLwMe61s22dr21y5J\nGpGR
BEeS9wPPAp+capqhWx2gfabvXJ9kW5Jte/funZ1CJUkvMOfBkWQd8CvA26pqKgQmgRV93ZYD\nuw7Q/gJVtbGq1lTVmqVLl85+4ZIkYI6DI8lZwHuBc6vq6b5FW4ALkhyTZCWwCrgTuAtYlWRlkqPp\nHUDfMpc1S5J+1FHD+uIk1wK/AByfZBK4jN5ZVMcAW5MA3F5V/7KqHkhyPfAgvV1YF1fVc+17LgFu\nARYBm6rqgWHVLEk6uKEFR1W9dYbmqw7Q/4PAB2dovxm4eRZLkyQdBq8clyR1YnBIkjoxOCRJnRgc\nkqRODA5JUicGhySpE4NDktSJwSFJ6sTgkCR1YnBIkjoxOCRJnRgckqRODA5JUicGhySpE4NDktSJ\nwSFJ6sTgkCR1YnBIkjoxOCRJnRgckqRODA5JUidDC44km5LsSXJ/X9txSbYm2dHel7T2JLkiyc4k\n9yZZ3feZda3/jiTrhlWvJGkww9zi+ARw1rS2DcCtVbUKuLXNA5wNrGqv9cCV0Asa4DLgDcCpwGVT\nYSNJGo2hBUdV3Qbsm9a8FtjcpjcD5/W1X109twOLk5wInAlsrap9VfUEsJUXhpEkaQ7N9TGOV1bV\nboD2fkJrXwY81tdvsrXtr12SNCLjcnA8M7TVAdpf+AXJ+iTbkmzbu3fvrBYnSXreXAfH420XFO19\nT2ufBFb09VsO7DpA+wtU1caqWlNVa5YuXTrrhUuSeuY6OLYAU2dGrQNu7Gt/ezu76jTgybYr6xbg\njCRL2kHxM1qbJGlEjhrWFye5FvgF4Pgkk/TOjvoQcH2Si4BHgfNb95uBc4CdwNPAhQBVtS/JHwJ3\ntX5/UFXTD7hLkubQ0IKjqt66n0Wnz9C3gIv38z2bgE2zWJok6TCMy8FxSdI8YXBIkjoxOCRJnRgc\nkqRODA5JUicGhySpE4NDktSJwSFJ6sTgkCR1YnBIkjoxOCRJnRgckqROBgqOJK8ddiGSpPlh0C2O\n/5jkziTvSLJ4qBVJksbaQMFRVT8PvI3e0/i2JfmvSd481MokSWNp4GMcVbUD+D3gvcA/Aa5I8vUk\n/2xYxUmSxs+gxzh+NsnlwHbgF4FfrarXtOnLh1ifJGnMDPoEwD8BPg68r6q+N9VYVbuS/N5QKpMk\njaVBg+Mc4HtV9RxAkhcBL6mqp6vqmqFVJ0kaO4Me4/gi8NK++WNbmyRpgRk0OF5SVf93aqZNHzuc\nkiRJ42zQ4PjbJKunZpL8A+B7B+h/QEn+dZIHktyf5NokL0myMskdSXYk+VSSo1vfY9r8zrZ84lDX\nK0k6fIMGx7uBTyf5qyR/BXwKuORQVphkGfDbwJqqei2wCLgA+DBweVWtAp4ALmofuQh4oqp+mt4Z\nXB8+lPVKkmbHoBcA3gX8DPCvgHcAr6mquw9jvUcBL01yFL1dXrvpndp7Q1u+GTivTa9t87TlpyfJ\nYaxbknQYBj2rCuDngIn2mdcnoaqu7rrCqvpWkj8CHqW3u+sLwN3Ad6rq2dZtEljWppcBj7XPPpvk\nSeAVwLe7rluSdPgGCo4k1wCvAu4BnmvNBXQOjiRL6G1FrAS+A3waOHuGrjX1kQMs6//e9cB6gJNO\nOqlrWZKkAQ26xbEGOKWqXvAf7EPwS8A3q2ovQJLPAv8IWJzkqLbVsRzY1fpP0rtH1mTbtfVyYN/0\nL62qjcBGgDVr1sxGnZKkGQx6cPx+4CdnaZ2PAqclObYdqzgdeBD4MvBrrc864MY2vaXN05Z/aZYC\nTJJ0CAbd4jgeeDDJncAPphqr6tyuK6yqO5LcAHwFeBb4Kr0thZuA65L829Z2VfvIVcA1SXbS29K4\noOs6JUmzZ9Dg+MBsrrSqLgMum9b8MHDqDH2/D5w/m+uXJB26gYKjqv4yyU8Bq6rqi0mOpXf9hSRp\ngRn0tuq/Re8aij9rTcuAPx9WUZKk8TXowfGLgTcCT8HfPdTphGEVJU
kaX4MGxw+q6pmpmXZarGc2\nSdICNGhw/GWS99G7Tcib6V2099+GV5YkaVwNGhwbgL3AfcC/AG6m9/xxSdICM+hZVT+k9+jYjw+3\nHEnSuBv0XlXfZIZjGlV18qxXJEkaa13uVTXlJfQuyDtu9suRJI27QZ/H8X/6Xt+qqo/Se36GJGmB\nGXRX1eq+2RfR2wJ52VAqkiSNtUF3Vf1x3/SzwCPAP5/1aiRJY2/Qs6r+6bALkSTND4PuqnrPgZZX\n1UdmpxxJ0rjrclbVz9F7qBLArwK30Z4FLklaOLo8yGl1VX0XIMkHgE9X1W8OqzBJ0nga9JYjJwHP\n9M0/A0zMejWSpLE36BbHNcCdST5H7wrytwBXD60qSdLYGvSsqg8m+Tzwj1vThVX11eGVJUkaV4Pu\nqgI4Fniqqj4GTCZZOaSaJEljbNBHx14GvBe4tDW9GPgvwypKkjS+Bt3ieAtwLvC3AFW1i8O45UiS\nxUluSPL1JNuT/MMkxyXZmmRHe1/S+ibJFUl2Jrl32u1PJElzbNDgeKaqinZr9SQ/dpjr/RjwP6rq\nZ4DXAdvpPSzq1qpaBdza5gHOBla113rgysNctyTpMAwaHNcn+TNgcZLfAr7IIT7UKclPAG8CrgKo\nqmeq6jvAWmBz67YZOK9NrwWurp7bWw0nHsq6u5jYcBMTG24a9mokad4Z9KyqP2rPGn8KeDXw+1W1\n9RDXeTK9x9D+5ySvA+4G3gW8sqp2t/XtTnJC67+MH71CfbK17T7E9UuSDsNBgyPJIuCWqvol4FDD\nYvo6VwPvrKo7knyM53dLzVjCDG0veBphkvX0dmVx0kknzUKZkqSZHHRXVVU9Bzyd5OWztM5JYLKq\n7mjzN9ALksendkG19z19/Vf0fX45sGuGOjdW1ZqqWrN06dJZKlWSNN2gV45/H7gvyVbamVUAVfXb\nXVdYVX+T5LEkr66qh4DTgQfbax3wofZ+Y/vIFuCSJNcBbwCenNqlJUmae4MGx03tNVveCXwyydHA\nw8CF9LZ+rk9yEfAoveeaA9wMnAPsBJ5ufSVJI3LA4EhyUlU9WlWbD9Svq6q6h96t2qc7fYa+BVw8\nm+uXJB26gx3j+POpiSSfGXItkqR54GDB0X9G08nDLESSND8cLDhqP9OSpAXqYAfHX5fkKXpbHi9t\n07T5qqqfGGp1kqSxc8DgqKpFc1WIJGl+6PI8DkmSDA5JUjcGhySpE4NDktSJwSFJ6sTgkCR1YnBI\nkjoxOCRJnRgckqRODA5JUicGhySpE4NDktSJwSFJ6sTgkCR1YnBIkjoxOCRJnYwsOJIsSvLVJP+9\nza9MckeSHUk+leTo1n5Mm9/Zlk+MqmZJ0mi3ON4FbO+b/zBweVWtAp4ALmrtFwFPVNVPA5e3fpKk\nERlJcCRZDvwy8J/afIBfBG5oXTYD57XptW2etvz01l+SNAKj2uL4KPC7wA/b/CuA71TVs21+EljW\nppcBjwG05U+2/pKkEZjz4EjyK8Ceqrq7v3mGrjXAsv7vXZ9kW5Jte/funYVKJUkzGcUWxxuBc5M8\nAlxHbxfVR4HFSY5qfZYDu9r0JLACoC1/ObBv+pdW1caqWlNVa5YuXTrcf4EkLWBzHhxVdWlVLa+q\nCeAC4EtV9Tbgy8CvtW7rgBvb9JY2T1v+pap6wRaHJGlujNN1HO8F3pNkJ71jGFe19quAV7T29wAb\n5rKoiQ03zeXqJGnsHXXwLsNTVX8B/EWbfhg4dYY+3wfOn9PCJEn7NU5bHJKkecDgkCR1YnBIkjox\nOCRJnRgckqRODA5JUicGhySpE4NDktSJwSFJ6sTgkCR1YnBIkjoxOCRJnRgckqRODA5JUicGxwAm\nNtzkczkkqTE4JEmdGBySpE4MDklSJwaHJKkTg0OS1InBIUnqZM6DI8mKJF9Osj3JA0ne1dqPS7I1\nyY72vqS1J8kVSXYmuTfJ6rmuWZ
L0vFFscTwL/Juqeg1wGnBxklOADcCtVbUKuLXNA5wNrGqv9cCV\nc19yj9dySNIIgqOqdlfVV9r0d4HtwDJgLbC5ddsMnNem1wJXV8/twOIkJ85x2ZKkZqTHOJJMAK8H\n7gBeWVW7oRcuwAmt2zLgsb6PTbY2SdIIjCw4kvw48Bng3VX11IG6ztBWM3zf+iTbkmzbu3fvbJUp\nSZpmJMGR5MX0QuOTVfXZ1vz41C6o9r6ntU8CK/o+vhzYNf07q2pjVa2pqjVLly4dXvGStMCN4qyq\nAFcB26vqI32LtgDr2vQ64Ma+9re3s6tOA56c2qUlSZp7R41gnW8Efh24L8k9re19wIeA65NcBDwK\nnN+W3QycA+wEngYunNtyJUn95jw4qup/MvNxC4DTZ+hfwMVDLUqSNDCvHO/IZ3NIWugMDklSJwaH\nJKkTg0OS1InBIUnqZBSn4x4R+g+QP/KhXx5hJZI0t9zikCR1YnBIkjoxOGaB13VIWkgMDklSJwaH\nJKkTg0OS1InBMUu8h5WkhcLgkCR1YnDMsv6tDrdAJB2JvHJ8CAwMSUcytzgkSZ0YHEPmQXNJRxqD\nYwQME0nzmcExRwwKSUcKD47Poenh4a3ZJc1H82aLI8lZSR5KsjPJhlHXM2zuzpI0rubFFkeSRcCf\nAm8GJoG7kmypqgdHW9ns2V9I7K/dLRRJozIvggM4FdhZVQ8DJLkOWAscMcHR1fRAmQoSd39JGrb5\nEhzLgMf65ieBN4yolrE005ZJ111dMwXN9CCamu/vO0hYTWy4adaDrL+WmeqSNBypqlHXcFBJzgfO\nrKrfbPO/DpxaVe/s67MeWN9mXw08dBirPB749mF8fqFwnAbjOA3GcRrMMMfpp6pq6cE6zZctjklg\nRd/8cmBXf4eq2ghsnI2VJdlWVWtm47uOZI7TYBynwThOgxmHcZovZ1XdBaxKsjLJ0cAFwJYR1yRJ\nC9K82OKoqmeTXALcAiwCNlXVAyMuS5IWpHkRHABVdTNw8xytblZ2eS0AjtNgHKfBOE6DGfk4zYuD\n45Kk8TFfjnFIksaEwdFnod3W5GCSPJLkviT3JNnW2o5LsjXJjva+pLUnyRVt7O5Nsnq01Q9Xkk1J\n9iS5v6+t89gkWdf670iybhT/lmHazzh9IMm32u/qniTn9C27tI3TQ0nO7Gs/ov82k6xI8uUk25M8\nkORdrX08f1NV5au3u24R8NfAycDRwNeAU0Zd14jH5BHg+Glt/w7Y0KY3AB9u0+cAnwcCnAbcMer6\nhzw2bwJWA/cf6tgAxwEPt/clbXrJqP9tczBOHwB+Z4a+p7S/u2OAle3vcdFC+NsETgRWt+mXAd9o\n4zGWvym3OJ73d7c1qapngKnbmuhHrQU2t+nNwHl97VdXz+3A4iQnjqLAuVBVtwH7pjV3HZszga1V\nta+qngC2AmcNv/q5s59x2p+1wHVV9YOq+iawk97f5RH/t1lVu6vqK236u8B2enfMGMvflMHxvJlu\na7JsRLWMiwK+kOTudmU+wCurajf0fuzACa3d8es+Ngt5zC5pu1g2Te1+wXECIMkE8HrgDsb0N2Vw\nPC8ztC30U87eWFWrgbOBi5O86QB9Hb/929/YLNQxuxJ4FfD3gd3AH7f2BT9OSX4c+Azw7qp66kBd\nZ2ibs7EyOJ530NuaLDRVtau97wE+R2+XweNTu6Da+57W3fHrPjYLcsyq6vGqeq6qfgh8nN7vChb4\nOCV5Mb3Q+GRVfbY1j+VvyuB4nrc16ZPkx5K8bGoaOAO4n96YTJ2psQ64sU1vAd7ezvY4DXhyahN7\nAek6NrcAZyRZ0nbXnNHajmjTjn29hd7vCnrjdEGSY5KsBFYBd7IA/jaTBLgK2F5VH+lbNJ6/qVGf\nTTBOL3pnKnyD3hkc7x91PSMei5Ppnb3yNeCBqfEAXgHcCuxo78e19tB72NZfA/cBa0b9bxjy+FxL
\nbzfL/6P3f3kXHcrYAL9B7yDwTuDCUf+75micrmnjcC+9/wCe2Nf//W2cHgLO7ms/ov82gZ+nt0vp\nXuCe9jpnXH9TXjkuSerEXVWSpE4MDklSJwaHJKkTg0OS1InBIUnqxOCQJHVicEiSOjE4JEmd/H9V\npfa3H13wnAAAAABJRU5ErkJggg==\n", 709 | "text/plain": [ 710 | "" 711 | ] 712 | }, 713 | "metadata": {}, 714 | "output_type": "display_data" 715 | } 716 | ], 717 | "source": [ 718 | "# e.g. calculate counts of reviews per business entity, and plot it\n", 719 | "df_joined['business_id'].value_counts().plot.hist(bins = 200)\n", 720 | "plt.show" 721 | ] 722 | }, 723 | { 724 | "cell_type": "markdown", 725 | "metadata": {}, 726 | "source": [ 727 | "## Save your preprocessed dataset to csv file\n", 728 | "\n", 729 | "* Respect your laptop's hard work! You don't want to make it run everything again." 730 | ] 731 | }, 732 | { 733 | "cell_type": "code", 734 | "execution_count": 31, 735 | "metadata": { 736 | "collapsed": true 737 | }, 738 | "outputs": [], 739 | "source": [ 740 | "# Save to ./data/last_2_years_restaurant_reviews.csv for your next task\n", 741 | "df_joined.to_csv('dataset/last_2_years_restaurant_reviews.csv', index = False)" 742 | ] 743 | }, 744 | { 745 | "cell_type": "code", 746 | "execution_count": null, 747 | "metadata": { 748 | "collapsed": true 749 | }, 750 | "outputs": [], 751 | "source": [] 752 | } 753 | ], 754 | "metadata": { 755 | "anaconda-cloud": {}, 756 | "kernelspec": { 757 | "display_name": "Python 3", 758 | "language": "python", 759 | "name": "python3" 760 | }, 761 | "language_info": { 762 | "codemirror_mode": { 763 | "name": "ipython", 764 | "version": 3 765 | }, 766 | "file_extension": ".py", 767 | "mimetype": "text/x-python", 768 | "name": "python", 769 | "nbconvert_exporter": "python", 770 | "pygments_lexer": "ipython3", 771 | "version": "3.5.5" 772 | } 773 | }, 774 | "nbformat": 4, 775 | "nbformat_minor": 1 776 | } 777 | -------------------------------------------------------------------------------- /.ipynb_checkpoints/Yelp_Dataset_-_NLP-checkpoint.ipynb: 
-------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Yelp Data Challenge - NLP" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 8, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "import pandas as pd\n", 17 | "import numpy as np\n", 18 | "\n", 19 | "import matplotlib.pyplot as plt\n", 20 | "%matplotlib inline" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": 3, 26 | "metadata": {}, 27 | "outputs": [], 28 | "source": [ 29 | "df = pd.read_csv('../dataset/2016_restaurant_reviews.csv')" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": 4, 35 | "metadata": {}, 36 | "outputs": [ 37 | { 38 | "data": { 39 | "text/html": [ 40 | "
\n", 41 | "\n", 54 | "\n", 55 | " \n", 56 | " \n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | "
business_idnamecategoriesavg_starscooldatefunnyreview_idstarstextusefuluser_id
0--9e1ONYQuAa-CB_Rrw7TwDelmonico Steakhouse['Cajun/Creole', 'Steakhouses', 'Restaurants']4.002016-03-3106SgvNWJltnZhW7duJgZ42w5This is mine and my fiancé's favorite steakhou...0oFyOUOeGTRZhFPF9uTqrTQ
1--9e1ONYQuAa-CB_Rrw7TwDelmonico Steakhouse['Cajun/Creole', 'Steakhouses', 'Restaurants']4.002016-02-100UxFpgng8dPMWOj99653k5Q5Truly Fantastic! Best Steak ever. Service was...0aVOGlN9fZ-BXcbtj6dbf0g
2--9e1ONYQuAa-CB_Rrw7TwDelmonico Steakhouse['Cajun/Creole', 'Steakhouses', 'Restaurants']4.002016-07-130GN7KnAaxJDrYgfJzgsvmkA5We decided to give Delmonico's a try because w...0C6kw0Rny7jZAGjTj0MWA3Q
3--9e1ONYQuAa-CB_Rrw7TwDelmonico Steakhouse['Cajun/Creole', 'Steakhouses', 'Restaurants']4.002016-04-010i7xD3FY-EaF9O08QL69l5w5Absolutely impressed with this restaurant. The...0tTifjrXlRrUme-4c0UW9Bw
4--9e1ONYQuAa-CB_Rrw7TwDelmonico Steakhouse['Cajun/Creole', 'Steakhouses', 'Restaurants']4.002016-10-150ZqMUHOJg9lGOmfqQ7RXj7A1This was supposed to be a very special dinner ...0kOll36Me-rM9NsqoDnejKA
\n", 150 | "
" 151 | ], 152 | "text/plain": [ 153 | " business_id name \\\n", 154 | "0 --9e1ONYQuAa-CB_Rrw7Tw Delmonico Steakhouse \n", 155 | "1 --9e1ONYQuAa-CB_Rrw7Tw Delmonico Steakhouse \n", 156 | "2 --9e1ONYQuAa-CB_Rrw7Tw Delmonico Steakhouse \n", 157 | "3 --9e1ONYQuAa-CB_Rrw7Tw Delmonico Steakhouse \n", 158 | "4 --9e1ONYQuAa-CB_Rrw7Tw Delmonico Steakhouse \n", 159 | "\n", 160 | " categories avg_stars cool \\\n", 161 | "0 ['Cajun/Creole', 'Steakhouses', 'Restaurants'] 4.0 0 \n", 162 | "1 ['Cajun/Creole', 'Steakhouses', 'Restaurants'] 4.0 0 \n", 163 | "2 ['Cajun/Creole', 'Steakhouses', 'Restaurants'] 4.0 0 \n", 164 | "3 ['Cajun/Creole', 'Steakhouses', 'Restaurants'] 4.0 0 \n", 165 | "4 ['Cajun/Creole', 'Steakhouses', 'Restaurants'] 4.0 0 \n", 166 | "\n", 167 | " date funny review_id stars \\\n", 168 | "0 2016-03-31 0 6SgvNWJltnZhW7duJgZ42w 5 \n", 169 | "1 2016-02-10 0 UxFpgng8dPMWOj99653k5Q 5 \n", 170 | "2 2016-07-13 0 GN7KnAaxJDrYgfJzgsvmkA 5 \n", 171 | "3 2016-04-01 0 i7xD3FY-EaF9O08QL69l5w 5 \n", 172 | "4 2016-10-15 0 ZqMUHOJg9lGOmfqQ7RXj7A 1 \n", 173 | "\n", 174 | " text useful \\\n", 175 | "0 This is mine and my fiancé's favorite steakhou... 0 \n", 176 | "1 Truly Fantastic! Best Steak ever. Service was... 0 \n", 177 | "2 We decided to give Delmonico's a try because w... 0 \n", 178 | "3 Absolutely impressed with this restaurant. The... 0 \n", 179 | "4 This was supposed to be a very special dinner ... 
0 \n", 180 | "\n", 181 | " user_id \n", 182 | "0 oFyOUOeGTRZhFPF9uTqrTQ \n", 183 | "1 aVOGlN9fZ-BXcbtj6dbf0g \n", 184 | "2 C6kw0Rny7jZAGjTj0MWA3Q \n", 185 | "3 tTifjrXlRrUme-4c0UW9Bw \n", 186 | "4 kOll36Me-rM9NsqoDnejKA " 187 | ] 188 | }, 189 | "execution_count": 4, 190 | "metadata": {}, 191 | "output_type": "execute_result" 192 | } 193 | ], 194 | "source": [ 195 | "df.head()" 196 | ] 197 | }, 198 | { 199 | "cell_type": "markdown", 200 | "metadata": {}, 201 | "source": [ 202 | "### Define feature variables, here is the text of the review" 203 | ] 204 | }, 205 | { 206 | "cell_type": "code", 207 | "execution_count": 5, 208 | "metadata": {}, 209 | "outputs": [], 210 | "source": [ 211 | "# Take the values of the column that contains review text data, save to a variable named \"documents\"\n", 212 | "documents = [t for i,t in enumerate(df.text)]" 213 | ] 214 | }, 215 | { 216 | "cell_type": "code", 217 | "execution_count": 6, 218 | "metadata": {}, 219 | "outputs": [ 220 | { 221 | "data": { 222 | "text/plain": [ 223 | "178080" 224 | ] 225 | }, 226 | "execution_count": 6, 227 | "metadata": {}, 228 | "output_type": "execute_result" 229 | } 230 | ], 231 | "source": [ 232 | "# inspect your documents, e.g. check the size, take a peek at elements of the numpy array\n", 233 | "len(documents)" 234 | ] 235 | }, 236 | { 237 | "cell_type": "markdown", 238 | "metadata": {}, 239 | "source": [ 240 | "### Define the target variable (any categorical variable that may be meaningful)" 241 | ] 242 | }, 243 | { 244 | "cell_type": "markdown", 245 | "metadata": {}, 246 | "source": [ 247 | "#### For example, I am interested in perfect (5 stars) and imperfect (1-4 stars) rating" 248 | ] 249 | }, 250 | { 251 | "cell_type": "code", 252 | "execution_count": 9, 253 | "metadata": {}, 254 | "outputs": [ 255 | { 256 | "data": { 257 | "image/png": "\n", 258 | "text/plain": [ 259 | "
" 260 | ] 261 | }, 262 | "metadata": {}, 263 | "output_type": "display_data" 264 | } 265 | ], 266 | "source": [ 267 | "# Make a column and take the values, save to a variable named \"target\"\n", 268 | "df.target = df.stars\n", 269 | "\n", 270 | "plt.hist(df.target)\n", 271 | "plt.show()" 272 | ] 273 | }, 274 | { 275 | "cell_type": "markdown", 276 | "metadata": {}, 277 | "source": [ 278 | "## Let's create training dataset and test dataset" 279 | ] 280 | }, 281 | { 282 | "cell_type": "code", 283 | "execution_count": 10, 284 | "metadata": {}, 285 | "outputs": [ 286 | { 287 | "data": { 288 | "text/plain": [ 289 | "count 178080.000000\n", 290 | "mean 0.462809\n", 291 | "std 0.498616\n", 292 | "min 0.000000\n", 293 | "25% 0.000000\n", 294 | "50% 0.000000\n", 295 | "75% 1.000000\n", 296 | "max 1.000000\n", 297 | "Name: stars, dtype: float64" 298 | ] 299 | }, 300 | "execution_count": 10, 301 | "metadata": {}, 302 | "output_type": "execute_result" 303 | } 304 | ], 305 | "source": [ 306 | "# df.target = df.stars.apply(lambda x: 1 if x > 4 else 0)\n", 307 | "# df.target.describe()" 308 | ] 309 | }, 310 | { 311 | "cell_type": "code", 312 | "execution_count": 26, 313 | "metadata": {}, 314 | "outputs": [ 315 | { 316 | "name": "stderr", 317 | "output_type": "stream", 318 | "text": [ 319 | "/Users/jessie/anaconda/lib/python3.6/site-packages/sklearn/cross_validation.py:41: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. 
This module will be removed in 0.20.\n", 320 | " \"This module will be removed in 0.20.\", DeprecationWarning)\n" 321 | ] 322 | } 323 | ], 324 | "source": [ 325 | "from sklearn.cross_validation import train_test_split" 326 | ] 327 | }, 328 | { 329 | "cell_type": "code", 330 | "execution_count": 27, 331 | "metadata": { 332 | "collapsed": true 333 | }, 334 | "outputs": [], 335 | "source": [ 336 | "# Split to documents_train, documents_test, target_train, target_test\n", 337 | "documents_train, documents_test, target_train, target_test = train_test_split(\n", 338 | " documents, df.stars, test_size=0.33, random_state=42)" 339 | ] 340 | }, 341 | { 342 | "cell_type": "markdown", 343 | "metadata": {}, 344 | "source": [ 345 | "## Let's get NLP representation of the documents" 346 | ] 347 | }, 348 | { 349 | "cell_type": "code", 350 | "execution_count": 29, 351 | "metadata": { 352 | "collapsed": true 353 | }, 354 | "outputs": [], 355 | "source": [ 356 | "from sklearn.feature_extraction.text import TfidfVectorizer" 357 | ] 358 | }, 359 | { 360 | "cell_type": "code", 361 | "execution_count": 30, 362 | "metadata": {}, 363 | "outputs": [], 364 | "source": [ 365 | "# Create TfidfVectorizer, and name it vectorizer\n", 366 | "vectorizer = TfidfVectorizer(stop_words = 'english', max_features=5000)" 367 | ] 368 | }, 369 | { 370 | "cell_type": "code", 371 | "execution_count": 37, 372 | "metadata": {}, 373 | "outputs": [], 374 | "source": [ 375 | "# Train the model with your training data\n", 376 | "vectors_train = vectorizer.fit_transform(documents_train)" 377 | ] 378 | }, 379 | { 380 | "cell_type": "code", 381 | "execution_count": 38, 382 | "metadata": {}, 383 | "outputs": [], 384 | "source": [ 385 | "# Get the vocab of your tfidf\n", 386 | "words = vectorizer.get_feature_names()" 387 | ] 388 | }, 389 | { 390 | "cell_type": "code", 391 | "execution_count": 34, 392 | "metadata": {}, 393 | "outputs": [], 394 | "source": [ 395 | "# Use the trained model to transform your test data\n", 
396 | "vectors_test = vectorizer.transform(documents_test).toarray()" 397 | ] 398 | }, 399 | { 400 | "cell_type": "markdown", 401 | "metadata": {}, 402 | "source": [ 403 | "## Similar review search engine" 404 | ] 405 | }, 406 | { 407 | "cell_type": "code", 408 | "execution_count": 88, 409 | "metadata": {}, 410 | "outputs": [], 411 | "source": [ 412 | "import numpy as np\n", 413 | "\n", 414 | "# We will need these helper methods pretty soon\n", 415 | "\n", 416 | "def get_top_values(lst, n, labels):\n", 417 | " '''\n", 418 | " INPUT: LIST, INTEGER, LIST\n", 419 | " OUTPUT: LIST\n", 420 | "\n", 421 | " Given a list of values, find the indices with the highest n values.\n", 422 | " Return the labels for each of these indices.\n", 423 | "\n", 424 | " e.g.\n", 425 | " lst = [7, 3, 2, 4, 1]\n", 426 | " n = 2\n", 427 | " labels = [\"cat\", \"dog\", \"mouse\", \"pig\", \"rabbit\"]\n", 428 | " output: [\"cat\", \"pig\"]\n", 429 | " '''\n", 430 | " return [labels[i] for i in np.argsort(lst)[::-1][:n]] # np.argsort by default sorts values in ascending order\n", 431 | "\n", 432 | "def get_bottom_values(lst, n, labels):\n", 433 | " '''\n", 434 | " INPUT: LIST, INTEGER, LIST\n", 435 | " OUTPUT: LIST\n", 436 | "\n", 437 | " Given a list of values, find the indices with the lowest n values.\n", 438 | " Return the labels for each of these indices.\n", 439 | "\n", 440 | " e.g.\n", 441 | " lst = [7, 3, 2, 4, 1]\n", 442 | " n = 2\n", 443 | " labels = [\"cat\", \"dog\", \"mouse\", \"pig\", \"rabbit\"]\n", 444 | " output: [\"rabbit\", \"mouse\"]\n", 445 | " '''\n", 446 | " return [labels[i] for i in np.argsort(lst)[::1][:n]] \n" 447 | ] 448 | }, 449 | { 450 | "cell_type": "code", 451 | "execution_count": 36, 452 | "metadata": { 453 | "collapsed": true 454 | }, 455 | "outputs": [], 456 | "source": [ 457 | "# Let's use cosine similarity\n", 458 | "from sklearn.metrics.pairwise import cosine_similarity" 459 | ] 460 | }, 461 | { 462 | "cell_type": "code", 463 | "execution_count": 43, 464 | 
"metadata": {}, 465 | "outputs": [ 466 | { 467 | "name": "stderr", 468 | "output_type": "stream", 469 | "text": [ 470 | "/Users/jessie/anaconda/lib/python3.6/site-packages/ipykernel/__main__.py:2: VisibleDeprecationWarning: converting an array with ndim > 0 to an index will result in an error in the future\n", 471 | " from ipykernel import kernelapp as app\n" 472 | ] 473 | } 474 | ], 475 | "source": [ 476 | "# Draw an arbitrary review from test (unseen in training) documents\n", 477 | "doc_test = documents_test[np.random.randint(len(documents_test), size=1)]" 478 | ] 479 | }, 480 | { 481 | "cell_type": "code", 482 | "execution_count": 50, 483 | "metadata": {}, 484 | "outputs": [], 485 | "source": [ 486 | "# Transform the drawn review(s) to vector(s)\n", 487 | "doc_test_vector = vectorizer.transform([doc_test]).toarray()" 488 | ] 489 | }, 490 | { 491 | "cell_type": "code", 492 | "execution_count": 51, 493 | "metadata": {}, 494 | "outputs": [], 495 | "source": [ 496 | "# Calculate the similarity score(s) between vector(s) and training vectors\n", 497 | "similarity_scores = cosine_similarity(doc_test_vector, vectors_train.toarray())" 498 | ] 499 | }, 500 | { 501 | "cell_type": "code", 502 | "execution_count": 61, 503 | "metadata": {}, 504 | "outputs": [], 505 | "source": [ 506 | "# Let's find the top n most similar reviews\n", 507 | "n = 2\n", 508 | "#np.shape(similarity_scores[0])\n", 509 | "searched_result = get_top_values(similarity_scores[0], n, documents_train)" 510 | ] 511 | }, 512 | { 513 | "cell_type": "code", 514 | "execution_count": 63, 515 | "metadata": {}, 516 | "outputs": [ 517 | { 518 | "name": "stdout", 519 | "output_type": "stream", 520 | "text": [ 521 | "Our search query:\n", 522 | "Had to come check this place out, I'd never had a fresh White Castle burger and I was not disappointed. I got the 4 cheeseburgers with fries and a drink, it tasted fresh and was nice and hot. 
The onions were a tasty addition and even though I could only finish three of the burgers and half the fries, it was really tasty! Would definitely go back.\n" 523 | ] 524 | } 525 | ], 526 | "source": [ 527 | "print ('Our search query:')\n", 528 | "print (doc_test)" 529 | ] 530 | }, 531 | { 532 | "cell_type": "code", 533 | "execution_count": 64, 534 | "metadata": {}, 535 | "outputs": [ 536 | { 537 | "name": "stdout", 538 | "output_type": "stream", 539 | "text": [ 540 | "Most 5 similar reviews:\n", 541 | "[\"After walking up and down the Strip twice I was craving some White Castle almost as much as Roldy and Kumar!\\n\\nFrom the outside it looked like a tiny standalone but actually the location is inside the Casino Royale.\\n\\nGot the #7 Castle Pack, which was 10 sliders, two regular fries and two drinks.\\n\\nService was good as they got my order out quickly in spite of the place being packed but for my first time eating White Castle burgers at a White Castle I was rather disappointed. \\n\\nThese certainly did not hit the spot! I know sliders don't have much meat but these were practically meatless! And the onions could've tasted a whole lot better. The Fries were meh too as they were a bit soggy.\\n\\nNot gonna lie, I enjoyed the microwave White Castle burgers you get from the grocery better...\", 'After a night of dancing, my friends and I went to White Castle. No regrets.\\n\\nI have had White Castle burgers in the past. From the freezer section at the supermarket. I have always wonder if they are really that, uh, good, in real life. After Tao, we stopped at White Castle and bought the 20 piece combo with fries and chicken rings. Yes, chicken rings. \\n\\nThe verdict: White Castle burgers taste just like the burgers you get out of the freezer aisle. They were pretty damn satisfying and tasty. The fries were good and the chicken rings were definitely questionable. 
I would definitely go back!', \"Been wanting to venture out to White Castle for about a decade now. Ever since Harold and Kumar made their voyage, I've been anticipating my own. Had a few days off from work and figured it was time to try these burgers fresh, instead of the ones from the frozen food section that I have once in a while.\\n\\nWalking down the strip towards White Castle was awesome and exciting but when I finally pulled out my first slider my excitement faded a bit (not much, it's Vegas!). The bread was very soggy and they tasted about the same as the frozen ones to me. Though these had more cheese. Was expecting a fresh White Castle patty to be tastier. Maybe this just isn't a great location. I'm definitely trying White Castle again if I ever encounter another chain.\\n\\nIf you happen to stop by a White Castle I would firstly advise you not to settle for this and turn away. Vegas has so many better options! What? Sorry! If you're on some quest like me or just dgaf than I'd recommend asking for a toasted bun if you choose to eat here. It is a cool experience to have if you are a fan of the Harold and Kumar movies, but beyond that these burgers are kinda average (both frozen ones and these).\\n\\nWhile the burger itself wasn't what i was hoping for, I was still glad I made the drive to White Castle. It was an experience I knew I wanted to have and though the burgers weren't satisfying, the overall experience was fun. Oh yeah and the fries were meh too. Though crispy. Throw some Cajun Seasoning on them sumbitches or something! Thank God for Ketchup. \\n\\nPros:\\n+ Staff was nice\\n\\nCons:\\n- Bun was very soggy\\n- Fries were bland\\n- Location could have been cleaner (several dirty tables)\", \"For over 30 years I've been hearing about White Castle sliders from those that lived in, or traveled to parts of the country that offered them. 
Then, after Harold and Kumar made their movie, I truly hoped that one day I would travel to a place in the mid west or eastern parts that had a White Castle Burger joint, or even better, I wished that White Castle would open a store here in Southern California. Sadly, no luck either way. Then voila! I saw them in the frozen section of my local super market. I bought a pack, took them home and immediately nuked a few in the microwave. Not bad I thought to myself. They had a unique flavor with kind of an onion tang. Nevertheless, these were frozen burgers. I knew the freshly made ones had to be better, so I withheld final judgement, and continued to hope for a real experience at a White Castle location.\\n\\nFast forward several years and I'm walking the strip in Las Vegas and the White Castle sign hit my like a ton of bricks! There it was! Boom! A real deal White Castle burger joint right here in Las Vegas, and right in front of me! I dragged the wife into the place and immediately placed an order for sliders, fries and a drink. So let me get this out of the way first, I don't like krinkle fries. And, the White Castle krinkle fries were no different than any other krinkle fries I've tried. So what did I think of the sliders? They were really tasty and much better than the frozen ones, as I expected them to be. These are truly unique sliders. They are steamed with the onion on the patties. I think that's part of what makes them distinctive.\\n\\nNow, if I'm only judging the slider, in the context of the fast food world of large or popular burger chains, I give it 5 stars considering the under $2 price. I don't smoke weed, but if I did, I could see myself eating a bag of these little morsels after smoking a bowl, just like Harold and Kumar did in the movie. \\n\\nThe slider is the only thing that stood out. Again, I didn't care for the fries, and everything else was just okay. Service was decent, drink fountain was nothing special. 
Over all, 4 stars for White Castle, mostly because of their famous sliders.\", '西元1921年在堪薩斯州成立的 White Castle ,可是在美國無人不知無人不曉,距今將近一個世紀歷史的 速食 Fast Food 品牌,在近年來也到 賭城 Las Vegas 的賭城大道上開設分店,以全天候二十四小時的開業時間來提供給來自四面八方慕名而來的顧客。比起一般漢堡尺寸小了許多的特色小漢堡(又稱做Slider)不僅是陪伴著許多美國人成長的風味也是美國文化的縮影,在以其為主題的電影 Harold and Kumar Go to White Castle 中就可以感受其風潮與熱度,也是全美第一個以冷凍方式銷售到外州的漢堡。現在不用舟車勞頓前往東岸、也不用屈就於冷凍食物,趁著來到賭城的機會就可以品嚐到這個全美國資歷最深的經典速食風味。\\n\\n作為一個經典的速食品牌,相關的特色與紀念產品相信是許多粉絲們不可錯過的,而料理區就在點餐臺的後方,可以看到這個小漢堡特殊的料理方式,並不是採用一般香煎或火烤方式來料理漢堡肉,而是一種結合蒸煮同時又帶有煎的方式,並讓熱度與蒸氣把洋蔥的香甜味傳達到肉片上。\\n\\n一個小漢堡對一個成年男性來說,不用幾口就可以經鬆解決,特殊的烹煮方式讓品質尚可的肉質吃起來還挺有水分的,加上融化的起司帶來香濃的奶香與洋蔥的甜味,吃起來確實是挺有特色也是居住在加州的居民很少有機會品嚐到的口味。單吃小漢堡說起來並不難吃,但在當前眾多速食漢堡店都強調著食材品質和創新口味的衝擊下,相比起來食感薄弱了不少,但依靠著各種醬料的搭配來調整口位到也是挺有趣的。\\n\\n下次來到賭城,無論是早起的鳥兒、飢腸轆轆的食客、宵夜時間睡不著的夜貓子,都可以來到這間二十四小時經營的經典漢堡店,依照自己的食量來享用這個吃起來感覺不會太有負擔、卻是在美國速食業具有重高地位的可愛小漢堡。']\n" 542 | ] 543 | } 544 | ], 545 | "source": [ 546 | "print ('Most %s similar reviews:' % n)\n", 547 | "print (searched_result)" 548 | ] 549 | }, 550 | { 551 | "cell_type": "markdown", 552 | "metadata": {}, 553 | "source": [ 554 | "## Classifying positive/negative review" 555 | ] 556 | }, 557 | { 558 | "cell_type": "markdown", 559 | "metadata": {}, 560 | "source": [ 561 | "#### Naive-Bayes Classifier" 562 | ] 563 | }, 564 | { 565 | "cell_type": "code", 566 | "execution_count": 65, 567 | "metadata": {}, 568 | "outputs": [ 569 | { 570 | "data": { 571 | "text/plain": [ 572 | "MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)" 573 | ] 574 | }, 575 | "execution_count": 65, 576 | "metadata": {}, 577 | "output_type": "execute_result" 578 | } 579 | ], 580 | "source": [ 581 | "# Build a Naive-Bayes Classifier\n", 582 | "\n", 583 | "from sklearn.naive_bayes import MultinomialNB\n", 584 | "\n", 585 | "nbf = MultinomialNB()\n", 586 | "nbf.fit(vectors_train.toarray(),target_train)" 587 | ] 588 | }, 589 | { 590 | "cell_type": "code", 591 | "execution_count": 67, 592 | "metadata": {}, 593 | "outputs": [ 594 | { 595 | "data": { 596 | "text/plain": [ 
# Build a Logistic Regression Classifier
from sklearn.linear_model import LogisticRegression

# Fit directly on the sparse TF-IDF matrix. The previous version rebound
# `vectors_train` to a dense array, which made this cell non-idempotent
# (a second run fails because ndarray has no `.toarray()`) and broke the
# earlier cells that still call `vectors_train.toarray()`.
lg = LogisticRegression()
lg.fit(vectors_train, target_train)
"source": [ 683 | "# Get score for training set\n", 684 | "lg.score(vectors_train,target_train)" 685 | ] 686 | }, 687 | { 688 | "cell_type": "code", 689 | "execution_count": 71, 690 | "metadata": { 691 | "scrolled": true 692 | }, 693 | "outputs": [ 694 | { 695 | "data": { 696 | "text/plain": [ 697 | "0.83342260310658811" 698 | ] 699 | }, 700 | "execution_count": 71, 701 | "metadata": {}, 702 | "output_type": "execute_result" 703 | } 704 | ], 705 | "source": [ 706 | "# Get score for test set\n", 707 | "lg.score(vectors_test,target_test)" 708 | ] 709 | }, 710 | { 711 | "cell_type": "markdown", 712 | "metadata": {}, 713 | "source": [ 714 | "#### Q: What are the key features(words) that make the positive prediction?" 715 | ] 716 | }, 717 | { 718 | "cell_type": "code", 719 | "execution_count": 86, 720 | "metadata": {}, 721 | "outputs": [ 722 | { 723 | "data": { 724 | "text/plain": [ 725 | "['worst',\n", 726 | " 'horrible',\n", 727 | " 'ok',\n", 728 | " 'bland',\n", 729 | " 'amazing',\n", 730 | " 'mediocre',\n", 731 | " 'disappointing',\n", 732 | " 'rude',\n", 733 | " 'best',\n", 734 | " 'terrible',\n", 735 | " 'okay',\n", 736 | " 'slow',\n", 737 | " 'lacked',\n", 738 | " 'average',\n", 739 | " 'poor',\n", 740 | " 'lacking',\n", 741 | " 'overpriced',\n", 742 | " 'meh',\n", 743 | " 'poisoning',\n", 744 | " 'tasteless']" 745 | ] 746 | }, 747 | "execution_count": 86, 748 | "metadata": {}, 749 | "output_type": "execute_result" 750 | } 751 | ], 752 | "source": [ 753 | "# Let's find it out by ranking\n", 754 | "n = 20\n", 755 | "get_top_values(lg.coef_[0],n, words)" 756 | ] 757 | }, 758 | { 759 | "cell_type": "markdown", 760 | "metadata": {}, 761 | "source": [ 762 | "A: (insert your comments here)" 763 | ] 764 | }, 765 | { 766 | "cell_type": "markdown", 767 | "metadata": {}, 768 | "source": [ 769 | "#### Q: What are the key features(words) that make the negative prediction?" 
770 | ] 771 | }, 772 | { 773 | "cell_type": "code", 774 | "execution_count": 90, 775 | "metadata": {}, 776 | "outputs": [ 777 | { 778 | "data": { 779 | "text/plain": [ 780 | "['worst',\n", 781 | " 'horrible',\n", 782 | " 'ok',\n", 783 | " 'bland',\n", 784 | " 'mediocre',\n", 785 | " 'disappointing',\n", 786 | " 'rude',\n", 787 | " 'terrible',\n", 788 | " 'okay',\n", 789 | " 'slow',\n", 790 | " 'lacked',\n", 791 | " 'average',\n", 792 | " 'poor',\n", 793 | " 'lacking',\n", 794 | " 'overpriced',\n", 795 | " 'meh',\n", 796 | " 'poisoning',\n", 797 | " 'tasteless',\n", 798 | " 'alright',\n", 799 | " 'unfortunately']" 800 | ] 801 | }, 802 | "execution_count": 90, 803 | "metadata": {}, 804 | "output_type": "execute_result" 805 | } 806 | ], 807 | "source": [ 808 | "# Let's find it out by ranking\n", 809 | "n = 20\n", 810 | "get_bottom_values(lg.coef_[0], n, words)" 811 | ] 812 | }, 813 | { 814 | "cell_type": "markdown", 815 | "metadata": {}, 816 | "source": [ 817 | "A: (insert your comments here)" 818 | ] 819 | }, 820 | { 821 | "cell_type": "markdown", 822 | "metadata": {}, 823 | "source": [ 824 | "#### Random Forest Classifier" 825 | ] 826 | }, 827 | { 828 | "cell_type": "code", 829 | "execution_count": 91, 830 | "metadata": {}, 831 | "outputs": [ 832 | { 833 | "data": { 834 | "text/plain": [ 835 | "RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',\n", 836 | " max_depth=None, max_features='auto', max_leaf_nodes=None,\n", 837 | " min_impurity_decrease=0.0, min_impurity_split=None,\n", 838 | " min_samples_leaf=20, min_samples_split=2,\n", 839 | " min_weight_fraction_leaf=0.0, n_estimators=5, n_jobs=-1,\n", 840 | " oob_score=False, random_state=None, verbose=0,\n", 841 | " warm_start=False)" 842 | ] 843 | }, 844 | "execution_count": 91, 845 | "metadata": {}, 846 | "output_type": "execute_result" 847 | } 848 | ], 849 | "source": [ 850 | "# Build a Random Forest Classifier\n", 851 | "\n", 852 | "from sklearn.ensemble import RandomForestClassifier\n", 
853 | "\n", 854 | "rfc = RandomForestClassifier(n_estimators = 5, min_samples_leaf = 20, n_jobs = -1)\n", 855 | "rfc.fit(vectors_train,target_train)" 856 | ] 857 | }, 858 | { 859 | "cell_type": "code", 860 | "execution_count": 92, 861 | "metadata": {}, 862 | "outputs": [ 863 | { 864 | "data": { 865 | "text/plain": [ 866 | "0.80161627255890078" 867 | ] 868 | }, 869 | "execution_count": 92, 870 | "metadata": {}, 871 | "output_type": "execute_result" 872 | } 873 | ], 874 | "source": [ 875 | "# Get score for training set\n", 876 | "rfc.score(vectors_train,target_train)" 877 | ] 878 | }, 879 | { 880 | "cell_type": "code", 881 | "execution_count": 93, 882 | "metadata": {}, 883 | "outputs": [ 884 | { 885 | "data": { 886 | "text/plain": [ 887 | "0.77830853819835377" 888 | ] 889 | }, 890 | "execution_count": 93, 891 | "metadata": {}, 892 | "output_type": "execute_result" 893 | } 894 | ], 895 | "source": [ 896 | "# Get score for test set\n", 897 | "rfc.score(vectors_test,target_test)" 898 | ] 899 | }, 900 | { 901 | "cell_type": "markdown", 902 | "metadata": {}, 903 | "source": [ 904 | "#### Q: What do you see from the training score and the test score?" 905 | ] 906 | }, 907 | { 908 | "cell_type": "markdown", 909 | "metadata": {}, 910 | "source": [ 911 | "A: With only 5 estimators the forest is too small, and its prediction score is even lower than logistic regression's. Increasing the number of estimators will make training take longer, but the result is expected to improve. The test score (0.778) is only slightly lower than the training score (0.802), which suggests the model is not overfitting significantly." 
912 | ] 913 | }, 914 | { 915 | "cell_type": "markdown", 916 | "metadata": {}, 917 | "source": [ 918 | "### Important features (words) by inspecting the RFC model" 919 | ] 920 | }, 921 | { 922 | "cell_type": "code", 923 | "execution_count": 94, 924 | "metadata": {}, 925 | "outputs": [ 926 | { 927 | "data": { 928 | "text/plain": [ 929 | "['amazing',\n", 930 | " 'delicious',\n", 931 | " 'like',\n", 932 | " 'best',\n", 933 | " 'great',\n", 934 | " 'asked',\n", 935 | " 'awesome',\n", 936 | " 'bad',\n", 937 | " 'vegas',\n", 938 | " 'decent',\n", 939 | " 'didn',\n", 940 | " 'worst',\n", 941 | " 'okay',\n", 942 | " 'love',\n", 943 | " 'ok',\n", 944 | " 'order',\n", 945 | " 'bland',\n", 946 | " 'rude',\n", 947 | " 'definitely',\n", 948 | " 'dry']" 949 | ] 950 | }, 951 | "execution_count": 94, 952 | "metadata": {}, 953 | "output_type": "execute_result" 954 | } 955 | ], 956 | "source": [ 957 | "n = 20\n", 958 | "get_top_values(rfc.feature_importances_,n,words)" 959 | ] 960 | }, 961 | { 962 | "cell_type": "markdown", 963 | "metadata": {}, 964 | "source": [ 965 | "## TODO: Use cross validation to evaluate your classifiers\n", 966 | "\n", 967 | "[sklearn cross validation](http://scikit-learn.org/stable/modules/cross_validation.html)" 968 | ] 969 | }, 970 | { 971 | "cell_type": "code", 972 | "execution_count": 11, 973 | "metadata": {}, 974 | "outputs": [ 975 | { 976 | "ename": "NameError", 977 | "evalue": "name 'lg' is not defined", 978 | "output_type": "error", 979 | "traceback": [ 980 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", 981 | "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", 982 | "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0msklearn\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmodel_selection\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mcross_val_score\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2\u001b[0m 
\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 3\u001b[0;31m cv_scores = cross_val_score(lg,\n\u001b[0m\u001b[1;32m 4\u001b[0m \u001b[0mvectors_train\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0mtarget_train\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 983 | "\u001b[0;31mNameError\u001b[0m: name 'lg' is not defined" 984 | ] 985 | } 986 | ], 987 | "source": [ 988 | "from sklearn.model_selection import cross_val_score\n", 989 | "\n", 990 | "cv_scores = cross_val_score(lg,\n", 991 | " vectors_train,\n", 992 | " target_train,\n", 993 | " cv = 5,\n", 994 | " scoring=\"accuracy\")\n", 995 | "cv_scores" 996 | ] 997 | }, 998 | { 999 | "cell_type": "markdown", 1000 | "metadata": {}, 1001 | "source": [ 1002 | "## TODO: Use grid search to find best predictable classifier\n", 1003 | "\n", 1004 | "\n", 1005 | "[sklearn grid search tutorial (with cross validation)](http://scikit-learn.org/stable/modules/grid_search.html#grid-search)\n", 1006 | "\n", 1007 | "[sklearn grid search documentation (with cross validation)](http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html#sklearn.model_selection.GridSearchCV)" 1008 | ] 1009 | }, 1010 | { 1011 | "cell_type": "code", 1012 | "execution_count": null, 1013 | "metadata": { 1014 | "collapsed": true 1015 | }, 1016 | "outputs": [], 1017 | "source": [ 1018 | "# To be implemented\n" 1019 | ] 1020 | }, 1021 | { 1022 | "cell_type": "code", 1023 | "execution_count": null, 1024 | "metadata": { 1025 | "collapsed": true 1026 | }, 1027 | "outputs": [], 1028 | "source": [] 1029 | } 1030 | ], 1031 | "metadata": { 1032 | "anaconda-cloud": {}, 1033 | "kernelspec": { 1034 | "display_name": "Python 3", 1035 | "language": "python", 1036 | "name": "python3" 1037 | }, 1038 | "language_info": { 1039 | "codemirror_mode": { 1040 | "name": "ipython", 1041 | "version": 3 1042 | }, 1043 | "file_extension": ".py", 1044 | "mimetype": "text/x-python", 1045 | 
"name": "python", 1046 | "nbconvert_exporter": "python", 1047 | "pygments_lexer": "ipython3", 1048 | "version": "3.5.5" 1049 | } 1050 | }, 1051 | "nbformat": 4, 1052 | "nbformat_minor": 1 1053 | } 1054 | -------------------------------------------------------------------------------- /Yelp_Dataset_-_NLP.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Yelp Data Challenge - NLP" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "import pandas as pd\n", 17 | "import numpy as np\n", 18 | "\n", 19 | "import matplotlib.pyplot as plt\n", 20 | "%matplotlib inline" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": 2, 26 | "metadata": {}, 27 | "outputs": [], 28 | "source": [ 29 | "df = pd.read_csv('../dataset/2016_restaurant_reviews.csv')" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": 3, 35 | "metadata": {}, 36 | "outputs": [ 37 | { 38 | "data": { 39 | "text/html": [ 40 | "
\n", 41 | "\n", 54 | "\n", 55 | " \n", 56 | " \n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | "
business_idnamecategoriesavg_starscooldatefunnyreview_idstarstextusefuluser_id
0--9e1ONYQuAa-CB_Rrw7TwDelmonico Steakhouse['Cajun/Creole', 'Steakhouses', 'Restaurants']4.002016-03-3106SgvNWJltnZhW7duJgZ42w5This is mine and my fiancé's favorite steakhou...0oFyOUOeGTRZhFPF9uTqrTQ
1--9e1ONYQuAa-CB_Rrw7TwDelmonico Steakhouse['Cajun/Creole', 'Steakhouses', 'Restaurants']4.002016-02-100UxFpgng8dPMWOj99653k5Q5Truly Fantastic! Best Steak ever. Service was...0aVOGlN9fZ-BXcbtj6dbf0g
2--9e1ONYQuAa-CB_Rrw7TwDelmonico Steakhouse['Cajun/Creole', 'Steakhouses', 'Restaurants']4.002016-07-130GN7KnAaxJDrYgfJzgsvmkA5We decided to give Delmonico's a try because w...0C6kw0Rny7jZAGjTj0MWA3Q
3--9e1ONYQuAa-CB_Rrw7TwDelmonico Steakhouse['Cajun/Creole', 'Steakhouses', 'Restaurants']4.002016-04-010i7xD3FY-EaF9O08QL69l5w5Absolutely impressed with this restaurant. The...0tTifjrXlRrUme-4c0UW9Bw
4--9e1ONYQuAa-CB_Rrw7TwDelmonico Steakhouse['Cajun/Creole', 'Steakhouses', 'Restaurants']4.002016-10-150ZqMUHOJg9lGOmfqQ7RXj7A1This was supposed to be a very special dinner ...0kOll36Me-rM9NsqoDnejKA
\n", 150 | "
" 151 | ], 152 | "text/plain": [ 153 | " business_id name \\\n", 154 | "0 --9e1ONYQuAa-CB_Rrw7Tw Delmonico Steakhouse \n", 155 | "1 --9e1ONYQuAa-CB_Rrw7Tw Delmonico Steakhouse \n", 156 | "2 --9e1ONYQuAa-CB_Rrw7Tw Delmonico Steakhouse \n", 157 | "3 --9e1ONYQuAa-CB_Rrw7Tw Delmonico Steakhouse \n", 158 | "4 --9e1ONYQuAa-CB_Rrw7Tw Delmonico Steakhouse \n", 159 | "\n", 160 | " categories avg_stars cool \\\n", 161 | "0 ['Cajun/Creole', 'Steakhouses', 'Restaurants'] 4.0 0 \n", 162 | "1 ['Cajun/Creole', 'Steakhouses', 'Restaurants'] 4.0 0 \n", 163 | "2 ['Cajun/Creole', 'Steakhouses', 'Restaurants'] 4.0 0 \n", 164 | "3 ['Cajun/Creole', 'Steakhouses', 'Restaurants'] 4.0 0 \n", 165 | "4 ['Cajun/Creole', 'Steakhouses', 'Restaurants'] 4.0 0 \n", 166 | "\n", 167 | " date funny review_id stars \\\n", 168 | "0 2016-03-31 0 6SgvNWJltnZhW7duJgZ42w 5 \n", 169 | "1 2016-02-10 0 UxFpgng8dPMWOj99653k5Q 5 \n", 170 | "2 2016-07-13 0 GN7KnAaxJDrYgfJzgsvmkA 5 \n", 171 | "3 2016-04-01 0 i7xD3FY-EaF9O08QL69l5w 5 \n", 172 | "4 2016-10-15 0 ZqMUHOJg9lGOmfqQ7RXj7A 1 \n", 173 | "\n", 174 | " text useful \\\n", 175 | "0 This is mine and my fiancé's favorite steakhou... 0 \n", 176 | "1 Truly Fantastic! Best Steak ever. Service was... 0 \n", 177 | "2 We decided to give Delmonico's a try because w... 0 \n", 178 | "3 Absolutely impressed with this restaurant. The... 0 \n", 179 | "4 This was supposed to be a very special dinner ... 
# Collect the review text column into a plain Python list named "documents"
documents = df['text'].tolist()
check the size, take a peek at elements of the numpy array\n", 233 | "len(documents)" 234 | ] 235 | }, 236 | { 237 | "cell_type": "markdown", 238 | "metadata": {}, 239 | "source": [ 240 | "### Define the target variable (any categorical variable that may be meaningful)" 241 | ] 242 | }, 243 | { 244 | "cell_type": "markdown", 245 | "metadata": {}, 246 | "source": [ 247 | "#### For example, I am interested in perfect (5 stars) and imperfect (1-4 stars) rating" 248 | ] 249 | }, 250 | { 251 | "cell_type": "code", 252 | "execution_count": 6, 253 | "metadata": {}, 254 | "outputs": [ 255 | { 256 | "name": "stderr", 257 | "output_type": "stream", 258 | "text": [ 259 | "/Users/jessie/anaconda3/lib/python3.5/site-packages/ipykernel_launcher.py:2: UserWarning: Pandas doesn't allow columns to be created via a new attribute name - see https://pandas.pydata.org/pandas-docs/stable/indexing.html#attribute-access\n", 260 | " \n" 261 | ] 262 | }, 263 | { 264 | "data": { 265 | "image/png": "\n", 266 | "text/plain": [ 267 | "
# Make a "target" column holding the star rating.
# Item assignment is required here: `df.target = ...` only sets a Python
# attribute on the DataFrame object (pandas emits a UserWarning) and does
# not create a column.
df['target'] = df['stars']

plt.hist(df['target'])
plt.show()
# `sklearn.cross_validation` was deprecated in 0.18 and removed in 0.20;
# `train_test_split` lives in `sklearn.model_selection`.
from sklearn.model_selection import train_test_split
def get_top_values(lst, n, labels):
    '''
    INPUT: LIST, INTEGER, LIST
    OUTPUT: LIST

    Given a list of values, find the indices with the highest n values.
    Return the labels for each of these indices, ordered from the highest
    value to the lowest.

    e.g.
    lst = [7, 3, 2, 4, 1]
    n = 2
    labels = ["cat", "dog", "mouse", "pig", "rabbit"]
    output: ["cat", "pig"]
    '''
    # np.argsort sorts in ascending order, so reverse before taking the first n.
    return [labels[i] for i in np.argsort(lst)[::-1][:n]]


def get_bottom_values(lst, n, labels):
    '''
    INPUT: LIST, INTEGER, LIST
    OUTPUT: LIST

    Given a list of values, find the indices with the lowest n values.
    Return the labels for each of these indices, ordered from the lowest
    value upwards.

    e.g.
    lst = [7, 3, 2, 4, 1]
    n = 2
    labels = ["cat", "dog", "mouse", "pig", "rabbit"]
    output: ["rabbit", "mouse"]
    '''
    # np.argsort is already ascending, so the first n indices are the lowest.
    return [labels[i] for i in np.argsort(lst)[:n]]
# Draw an arbitrary review from test (unseen in training) documents
doc_test = documents_test[np.random.randint(len(documents_test))]

# Transform the drawn review(s) to vector(s)
doc_test_vector = vectorizer.transform([doc_test]).toarray()

# Calculate the similarity score(s) between vector(s) and training vectors.
# cosine_similarity accepts sparse matrices, so the (large) training matrix
# is passed as-is instead of being densified with .toarray().
similarity_scores = cosine_similarity(doc_test_vector, vectors_train)

# Let's find the top n most similar reviews
n = 2
searched_result = get_top_values(similarity_scores[0], n, documents_train)
If you don't pace yourself, you'll find yourself face down again in Margaritaville.\n", 499 | "\n", 500 | "Fortunately dinner came before that could happen! I had the salmon with bourbon sauce. It came with rice and a generous amount of green beans. This was a good meal. The salmon was a bit dry as they left it cooking to get the bourbon sauce to caramelize but they offset it by including a nice amount of tartare sauce. The combination worked perfectly.\n", 501 | "\n", 502 | "By now I was both sated and buzzed. At this point, the floor show started. The show consisted of two people sort of dancing with stilts. It was cheesy but fun. We lustily sang along with the chorus then had a piece of key lime pie. Everyone left with a big grin.\n" 503 | ] 504 | } 505 | ], 506 | "source": [ 507 | "print ('Our search query:')\n", 508 | "print (doc_test)" 509 | ] 510 | }, 511 | { 512 | "cell_type": "code", 513 | "execution_count": 26, 514 | "metadata": {}, 515 | "outputs": [ 516 | { 517 | "name": "stdout", 518 | "output_type": "stream", 519 | "text": [ 520 | "Most 2 similar reviews:\n", 521 | "[\"Order the cocktail called 'the viale', it is one of the best bourbon drinks I've ever had.\", 'Staff was on point, food was good, cocktails were terrible. 
def make_confusion_matrix_relative(confusion_matrix):
    """Row-normalise a confusion matrix.

    Each row (one actual class) is divided by its own total, so every cell
    holds the fraction of that class's samples that received the column's
    predicted label.

    The totals are taken from the matrix itself rather than from a global
    `target_test`, so the function works for any number of classes and does
    not depend on notebook state. Rows with no samples are returned as all
    zeros (the previous `np.empty` buffer left them uninitialised).

    Parameters
    ----------
    confusion_matrix : array-like, 2-D
        Counts with actual classes on the rows and predictions on the columns.

    Returns
    -------
    numpy.ndarray of float, same shape as the input.
    """
    counts = np.asarray(confusion_matrix, dtype=float)
    row_totals = counts.sum(axis=1, keepdims=True)

    relative_confusion_matrix = np.zeros_like(counts)
    # `where` skips empty rows, avoiding a division by zero and leaving zeros there.
    np.divide(counts, row_totals, out=relative_confusion_matrix, where=row_totals > 0)

    return relative_confusion_matrix
595 | "\n", 596 | "# http://www.wenda.io/questions/4330313/heatmap-with-text-in-each-cell-with-matplotlibs-pyplot.html\n", 597 | "# http://stackoverflow.com/questions/20520246/create-heatmap-using-pandas-timeseries\n", 598 | "# http://sebastianraschka.com/Articles/heatmaps_in_r.html\n", 599 | "# http://code.activestate.com/recipes/578175-hierarchical-clustering-heatmap-python/\n", 600 | "def plot_confusion_matrix(confusion_matrix=[[]], title='CM', savefilename=''):\n", 601 | " rcm = make_confusion_matrix_relative(confusion_matrix)\n", 602 | " #plt.imshow(rcm, vmin=0, vmax=1, interpolation='nearest')\n", 603 | " c = plt.pcolor(rcm, edgecolors='k', linewidths=4, cmap='jet', vmin=0.0, vmax=1.0)\n", 604 | " plt.title(title)\n", 605 | " plt.colorbar()\n", 606 | " plt.ylabel('Actual Label')\n", 607 | " plt.xlabel('Predicted Label')\n", 608 | " plt.xticks(0.5 + np.arange(5), np.arange(1,6))\n", 609 | " plt.yticks(0.5 + np.arange(5), np.arange(1,6))\n", 610 | "\n", 611 | " def show_values(pc, fmt=\"%.2f\", **kw):\n", 612 | " pc.update_scalarmappable()\n", 613 | " ax = pc.axes\n", 614 | " for p, color, value in zip(pc.get_paths(), pc.get_facecolors(), pc.get_array()):\n", 615 | " x, y = p.vertices[:-2, :].mean(0)\n", 616 | " if sum(color[:2] > 0.3) >= 2:\n", 617 | " color = (0.0, 0.0, 0.0)\n", 618 | " else:\n", 619 | " color = (1.0, 1.0, 1.0)\n", 620 | " ax.text(x, y, fmt % value, ha=\"center\", va=\"center\", color=color, **kw)\n", 621 | " \n", 622 | " show_values(c)\n", 623 | "\n", 624 | " if savefilename:\n", 625 | " plt.savefig(savefilename, bbox_inches='tight')\n", 626 | " \n", 627 | " return plt.show()" 628 | ] 629 | }, 630 | { 631 | "cell_type": "code", 632 | "execution_count": 30, 633 | "metadata": {}, 634 | "outputs": [ 635 | { 636 | "data": { 637 | "text/plain": [ 638 | "0.6258245119978544" 639 | ] 640 | }, 641 | "execution_count": 30, 642 | "metadata": {}, 643 | "output_type": "execute_result" 644 | } 645 | ], 646 | "source": [ 647 | "# Get score for training 
# Get score for training set
# MultinomialNB accepts scipy sparse input, so the matrix is scored as-is;
# calling .toarray() here would fail if vectors_train were already dense.
nbf.score(vectors_train, target_train)

# Confusion matrix of the Naive-Bayes predictions on the test set
from sklearn import metrics
from sklearn.metrics import confusion_matrix, classification_report, precision_score, recall_score, roc_auc_score
multinomial_confusion_matrix = confusion_matrix(target_test, nbf_predict_test)
print(make_confusion_matrix_relative(multinomial_confusion_matrix))
plot_confusion_matrix(multinomial_confusion_matrix, 'Multinomial Naive Bayes Confusion Matrix', savefilename='MultinomialCM.png')

# Build a Logistic Regression Classifier
from sklearn.linear_model import LogisticRegression

# vectors_train is deliberately NOT rebound to vectors_train.toarray():
# overwriting the name made this cell non-idempotent and broke every earlier
# cell that calls vectors_train.toarray() when the notebook is re-run.
# LogisticRegression fits on the sparse matrix directly.
lg = LogisticRegression()
lg.fit(vectors_train, target_train)
"lg.score(vectors_train,target_train)" 746 | ] 747 | }, 748 | { 749 | "cell_type": "code", 750 | "execution_count": 45, 751 | "metadata": { 752 | "scrolled": true 753 | }, 754 | "outputs": [ 755 | { 756 | "data": { 757 | "text/plain": [ 758 | "0.6602174689876972" 759 | ] 760 | }, 761 | "execution_count": 45, 762 | "metadata": {}, 763 | "output_type": "execute_result" 764 | } 765 | ], 766 | "source": [ 767 | "# Get score for test set\n", 768 | "lg.score(vectors_test,target_test)" 769 | ] 770 | }, 771 | { 772 | "cell_type": "markdown", 773 | "metadata": {}, 774 | "source": [ 775 | "#### Q: What are the key features (words) that drive the negative (1-star) prediction?" 776 | ] 777 | }, 778 | { 779 | "cell_type": "code", 780 | "execution_count": 46, 781 | "metadata": {}, 782 | "outputs": [ 783 | { 784 | "data": { 785 | "text/plain": [ 786 | "['worst',\n", 787 | " 'horrible',\n", 788 | " 'terrible',\n", 789 | " 'poisoning',\n", 790 | " 'zero',\n", 791 | " 'disgusting',\n", 792 | " 'awful',\n", 793 | " 'waste',\n", 794 | " 'joke',\n", 795 | " 'sick',\n", 796 | " 'rude',\n", 797 | " 'worse',\n", 798 | " 'sucks',\n", 799 | " 'money',\n", 800 | " 'nasty',\n", 801 | " 'garbage',\n", 802 | " 'poor',\n", 803 | " 'disappointment',\n", 804 | " 'rip',\n", 805 | " 'shitty']" 806 | ] 807 | }, 808 | "execution_count": 46, 809 | "metadata": {}, 810 | "output_type": "execute_result" 811 | } 812 | ], 813 | "source": [ 814 | "# Let's find it out by ranking\n", 815 | "n = 20\n", 816 | "get_top_values(lg.coef_[0],n, words)" 817 | ] 818 | }, 819 | { 820 | "cell_type": "markdown", 821 | "metadata": {}, 822 | "source": [ 823 | "#### Q: What are the key features (words) that push a review away from the 1-star class, i.e. toward a positive prediction?" 
# Let's find it out by ranking
# Lowest coefficients of the first class: the recorded output lists positive
# words such as "delicious" and "amazing".
n = 20
get_bottom_values(lg.coef_[0], n, words)

# Build a Random Forest Classifier

from sklearn.ensemble import RandomForestClassifier

# A fixed seed makes bootstrap sampling and feature sub-sampling repeatable,
# so the scores, confusion matrix and feature ranking below can be reproduced
# on Restart & Run All. The outputs recorded in this notebook came from an
# unseeded run, so the numbers will shift slightly the first time this is
# re-executed.
RFC_RANDOM_STATE = 42

rfc = RandomForestClassifier(
    n_estimators=5,
    min_samples_leaf=20,
    n_jobs=-1,
    random_state=RFC_RANDOM_STATE,
)
rfc.fit(vectors_train, target_train)

# Get score for training set
rfc.score(vectors_train, target_train)
" 952 | ] 953 | }, 954 | "metadata": {}, 955 | "output_type": "display_data" 956 | } 957 | ], 958 | "source": [ 959 | "# Get score for test set\n", 960 | "rfc_predict_test = rfc.predict(vectors_test)\n", 961 | "rfc_confusion_matrix = confusion_matrix(target_test, rfc_predict_test)\n", 962 | "print(make_confusion_matrix_relative(rfc_confusion_matrix))\n", 963 | "plot_confusion_matrix(rfc_confusion_matrix, 'Random Forest Confusion Matrix', savefilename='rfcCM.png')" 964 | ] 965 | }, 966 | { 967 | "cell_type": "markdown", 968 | "metadata": {}, 969 | "source": [ 970 | "__Imbalanced Data__\n", 971 | "\n", 972 | "1. The number of estimators may not be enough. \n", 973 | "2. We can see that the classifier tends to classify samples into the majority classes. We can try to downsample the 5- and 1-star reviews or oversample the 2-, 3-, and 4-star reviews." 974 | ] 975 | }, 976 | { 977 | "cell_type": "markdown", 978 | "metadata": {}, 979 | "source": [ 980 | "### Important features (words) by inspecting the RFC model" 981 | ] 982 | }, 983 | { 984 | "cell_type": "code", 985 | "execution_count": 51, 986 | "metadata": {}, 987 | "outputs": [ 988 | { 989 | "data": { 990 | "text/plain": [ 991 | "['amazing',\n", 992 | " 'good',\n", 993 | " 'best',\n", 994 | " 'great',\n", 995 | " 'worst',\n", 996 | " 'delicious',\n", 997 | " 'terrible',\n", 998 | " 'told',\n", 999 | " 'bad',\n", 1000 | " 'love',\n", 1001 | " 'ok',\n", 1002 | " 'awesome',\n", 1003 | " 'said',\n", 1004 | " 'rude',\n", 1005 | " 'wasn',\n", 1006 | " 'slow',\n", 1007 | " 'average',\n", 1008 | " 'asked',\n", 1009 | " 'perfect',\n", 1010 | " 'little']" 1011 | ] 1012 | }, 1013 | "execution_count": 51, 1014 | "metadata": {}, 1015 | "output_type": "execute_result" 1016 | } 1017 | ], 1018 | "source": [ 1019 | "n = 20\n", 1020 | "get_top_values(rfc.feature_importances_,n,words)" 1021 | ] 1022 | }, 1023 | { 1024 | "cell_type": "markdown", 1025 | "metadata": {}, 1026 | "source": [ 1027 | "## Use cross validation to evaluate classifiers\n", 1028 | "\n", 
1029 | "[sklearn cross validation](http://scikit-learn.org/stable/modules/cross_validation.html)" 1030 | ] 1031 | }, 1032 | { 1033 | "cell_type": "code", 1034 | "execution_count": 52, 1035 | "metadata": {}, 1036 | "outputs": [ 1037 | { 1038 | "data": { 1039 | "text/plain": [ 1040 | "array([0.56541234, 0.56958471, 0.56939904, 0.57342218, 0.57308692])" 1041 | ] 1042 | }, 1043 | "execution_count": 52, 1044 | "metadata": {}, 1045 | "output_type": "execute_result" 1046 | } 1047 | ], 1048 | "source": [ 1049 | "from sklearn.model_selection import cross_val_score\n", 1050 | "\n", 1051 | "cv_scores = cross_val_score(rfc,\n", 1052 | " vectors_train,\n", 1053 | " target_train,\n", 1054 | " cv = 5,\n", 1055 | " scoring=\"accuracy\")\n", 1056 | "cv_scores" 1057 | ] 1058 | }, 1059 | { 1060 | "cell_type": "markdown", 1061 | "metadata": {}, 1062 | "source": [ 1063 | "## TODO: Use grid search to find best predictable classifier\n", 1064 | "\n", 1065 | "\n", 1066 | "[sklearn grid search tutorial (with cross validation)](http://scikit-learn.org/stable/modules/grid_search.html#grid-search)\n", 1067 | "\n", 1068 | "[sklearn grid search documentation (with cross validation)](http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html#sklearn.model_selection.GridSearchCV)" 1069 | ] 1070 | }, 1071 | { 1072 | "cell_type": "code", 1073 | "execution_count": null, 1074 | "metadata": {}, 1075 | "outputs": [], 1076 | "source": [ 1077 | "# To be implemented\n" 1078 | ] 1079 | }, 1080 | { 1081 | "cell_type": "code", 1082 | "execution_count": null, 1083 | "metadata": {}, 1084 | "outputs": [], 1085 | "source": [] 1086 | } 1087 | ], 1088 | "metadata": { 1089 | "anaconda-cloud": {}, 1090 | "kernelspec": { 1091 | "display_name": "Python 3", 1092 | "language": "python", 1093 | "name": "python3" 1094 | }, 1095 | "language_info": { 1096 | "codemirror_mode": { 1097 | "name": "ipython", 1098 | "version": 3 1099 | }, 1100 | "file_extension": ".py", 1101 | "mimetype": 
"text/x-python", 1102 | "name": "python", 1103 | "nbconvert_exporter": "python", 1104 | "pygments_lexer": "ipython3", 1105 | "version": "3.5.5" 1106 | } 1107 | }, 1108 | "nbformat": 4, 1109 | "nbformat_minor": 1 1110 | } 1111 | --------------------------------------------------------------------------------