├── Image Classification with Keras.ipynb ├── Image Processing.ipynb ├── KNN ├── K Nearest Neighbors Project.ipynb └── KNN_Project_Data ├── Logistic Regression ├── Logistic Regression Project .ipynb └── advertising.csv ├── Machine Learning Projects └── Linear Regression │ ├── Ecommerce Customers.csv │ └── Linear Regression .ipynb ├── NLP ├── Yelp Business Rating Prediction_NLP.ipynb └── yelp.csv ├── README.md ├── Random Forest ├── Decision Trees and Random Forest Project .ipynb └── loan_data.csv ├── SVM ├── DS_Meetup_6_22_17.pptx ├── SVM_Breast_Cancer.ipynb └── Support Vector Machines_Iris.ipynb └── k means clustering ├── College_Data └── K Means Clustering-Universities Data.ipynb /Image Classification with Keras.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Traffic Sign Recognition Project" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "import numpy as np\n", 17 | "from skimage import color, exposure, transform\n", 18 | "\n", 19 | "NUM_CLASSES = 43\n", 20 | "IMG_SIZE = 48\n" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": 2, 26 | "metadata": {}, 27 | "outputs": [], 28 | "source": [ 29 | "def preprocess_img(img):\n", 30 | " # Histogram normalization in v channel\n", 31 | " hsv = color.rgb2hsv(img)\n", 32 | " hsv[:, :, 2] = exposure.equalize_hist(hsv[:, :, 2])\n", 33 | " img = color.hsv2rgb(hsv)\n", 34 | "\n", 35 | " # central square crop\n", 36 | " min_side = min(img.shape[:-1])\n", 37 | " centre = img.shape[0] // 2, img.shape[1] // 2\n", 38 | " img = img[centre[0] - min_side // 2:centre[0] + min_side // 2,\n", 39 | " centre[1] - min_side // 2:centre[1] + min_side // 2,\n", 40 | " :]\n", 41 | "\n", 42 | " # rescale to standard size\n", 43 | " img = transform.resize(img, (IMG_SIZE, IMG_SIZE))\n", 44 | "\n", 45 | " # 
roll color axis to axis 0\n", 46 | " img = np.rollaxis(img, -1)\n", 47 | "\n", 48 | " return img" 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": 7, 54 | "metadata": {}, 55 | "outputs": [], 56 | "source": [ 57 | "from skimage import io\n", 58 | "import os\n", 59 | "import glob" 60 | ] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "execution_count": 8, 65 | "metadata": {}, 66 | "outputs": [ 67 | { 68 | "name": "stderr", 69 | "output_type": "stream", 70 | "text": [ 71 | "/Users/amy/anaconda3/envs/TensorFlow/lib/python2.7/site-packages/skimage/transform/_warps.py:84: UserWarning: The default mode, 'constant', will be changed to 'reflect' in skimage 0.15.\n", 72 | " warn(\"The default mode, 'constant', will be changed to 'reflect' in \"\n" 73 | ] 74 | } 75 | ], 76 | "source": [ 77 | "def get_class(img_path):\n", 78 | " return int(img_path.split('/')[-2])\n", 79 | "\n", 80 | "root_dir = 'GTSRB/Final_Training/Images/'\n", 81 | "imgs = []\n", 82 | "labels = []\n", 83 | "\n", 84 | "all_img_paths = glob.glob(os.path.join(root_dir, '*/*.ppm'))\n", 85 | "np.random.shuffle(all_img_paths)\n", 86 | "for img_path in all_img_paths:\n", 87 | " img = preprocess_img(io.imread(img_path))\n", 88 | " label = get_class(img_path)\n", 89 | " imgs.append(img)\n", 90 | " labels.append(label)\n", 91 | "\n", 92 | "X = np.array(imgs, dtype='float32')\n", 93 | "# Make one hot targets\n", 94 | "Y = np.eye(NUM_CLASSES, dtype='uint8')[labels]" 95 | ] 96 | }, 97 | { 98 | "cell_type": "markdown", 99 | "metadata": { 100 | "collapsed": true 101 | }, 102 | "source": [ 103 | "# Model" 104 | ] 105 | }, 106 | { 107 | "cell_type": "code", 108 | "execution_count": 5, 109 | "metadata": {}, 110 | "outputs": [], 111 | "source": [ 112 | "import tensorflow as tf" 113 | ] 114 | }, 115 | { 116 | "cell_type": "code", 117 | "execution_count": 12, 118 | "metadata": {}, 119 | "outputs": [], 120 | "source": [ 121 | "from keras.models import Sequential\n", 122 | "from keras.layers.core import 
Dense, Dropout, Activation, Flatten\n", 123 | "from keras.layers.convolutional import Conv2D\n", 124 | "from keras.layers.pooling import MaxPooling2D\n", 125 | "from keras.optimizers import SGD\n", 126 | "from keras import backend as K\n", 127 | "K.set_image_data_format('channels_first')" 128 | ] 129 | }, 130 | { 131 | "cell_type": "code", 132 | "execution_count": 13, 133 | "metadata": {}, 134 | "outputs": [], 135 | "source": [ 136 | "def cnn_model():\n", 137 | " model = Sequential()\n", 138 | "\n", 139 | " model.add(Conv2D(32, (3, 3), padding='same',\n", 140 | " input_shape=(3, IMG_SIZE, IMG_SIZE),\n", 141 | " activation='relu'))\n", 142 | " model.add(Conv2D(32, (3, 3), activation='relu'))\n", 143 | " model.add(MaxPooling2D(pool_size=(2, 2)))\n", 144 | " model.add(Dropout(0.2))\n", 145 | "\n", 146 | " model.add(Conv2D(64, (3, 3), padding='same',\n", 147 | " activation='relu'))\n", 148 | " model.add(Conv2D(64, (3, 3), activation='relu'))\n", 149 | " model.add(MaxPooling2D(pool_size=(2, 2)))\n", 150 | " model.add(Dropout(0.2))\n", 151 | "\n", 152 | " model.add(Conv2D(128, (3, 3), padding='same',\n", 153 | " activation='relu'))\n", 154 | " model.add(Conv2D(128, (3, 3), activation='relu'))\n", 155 | " model.add(MaxPooling2D(pool_size=(2, 2)))\n", 156 | " model.add(Dropout(0.2))\n", 157 | "\n", 158 | " model.add(Flatten())\n", 159 | " model.add(Dense(512, activation='relu'))\n", 160 | " model.add(Dropout(0.5))\n", 161 | " model.add(Dense(NUM_CLASSES, activation='softmax'))\n", 162 | " return model" 163 | ] 164 | }, 165 | { 166 | "cell_type": "markdown", 167 | "metadata": { 168 | "collapsed": true 169 | }, 170 | "source": [ 171 | "## Configuring the learning algorithm \n", 172 | " \n", 173 | " * **Loss** function we want to optimize. We cannot use error percentage as it is not continuous and thus non differentiable. 
We therefore use a proxy for it: categorical_crossentropy \n", 174 | " \n", 175 | "* **Optimizer** : We use standard stochastic gradient descent with Nesterov momentum \n", 176 | "\n", 177 | "* **Metric** : Since we are dealing with a classification problem, our metric is accuracy\n", 178 | " \n", 179 | " " 180 | ] 181 | }, 182 | { 183 | "cell_type": "code", 184 | "execution_count": 15, 185 | "metadata": {}, 186 | "outputs": [], 187 | "source": [ 188 | "from keras.optimizers import SGD\n", 189 | "model = cnn_model()" 190 | ] 191 | }, 192 | { 193 | "cell_type": "markdown", 194 | "metadata": {}, 195 | "source": [ 196 | "** Let's train the model using SGD + momentum **" 197 | ] 198 | }, 199 | { 200 | "cell_type": "code", 201 | "execution_count": 19, 202 | "metadata": {}, 203 | "outputs": [], 204 | "source": [ 205 | "lr = 0.01\n", 206 | "sgd = SGD(lr=lr, decay=1e-6, momentum=0.9, nesterov=True)\n", 207 | "model.compile(loss='categorical_crossentropy',optimizer=sgd,metrics=['accuracy'])" 208 | ] 209 | }, 210 | { 211 | "cell_type": "markdown", 212 | "metadata": {}, 213 | "source": [ 214 | "## Training\n", 215 | "\n", 216 | "The model will iterate over the training set in batches of size batch_size. Gradients will be computed and updates will be made to the weights\n", 217 | "\n", 218 | "**Epoch**: one iteration over the entire training set \n", 219 | "\n", 220 | "**Learning rate scheduler**: Decaying learning rate over the epochs\n", 221 | "\n", 222 | "**Model checkpoint**: save the model with best validation accuracy. 
The network might start overfitting after certain number of epochs\n", 223 | "\n", 224 | "Training is run until the loss converges to a constant" 225 | ] 226 | }, 227 | { 228 | "cell_type": "code", 229 | "execution_count": 22, 230 | "metadata": {}, 231 | "outputs": [], 232 | "source": [ 233 | "from keras.callbacks import LearningRateScheduler, ModelCheckpoint" 234 | ] 235 | }, 236 | { 237 | "cell_type": "code", 238 | "execution_count": 23, 239 | "metadata": {}, 240 | "outputs": [ 241 | { 242 | "name": "stdout", 243 | "output_type": "stream", 244 | "text": [ 245 | "Train on 31367 samples, validate on 7842 samples\n", 246 | "Epoch 1/30\n", 247 | "31367/31367 [==============================] - 615s 20ms/step - loss: 1.6899 - acc: 0.5185 - val_loss: 0.2242 - val_acc: 0.9364\n", 248 | "Epoch 2/30\n", 249 | "31367/31367 [==============================] - 631s 20ms/step - loss: 0.2773 - acc: 0.9149 - val_loss: 0.0824 - val_acc: 0.9759\n", 250 | "Epoch 3/30\n", 251 | "31367/31367 [==============================] - 646s 21ms/step - loss: 0.1594 - acc: 0.9522 - val_loss: 0.0600 - val_acc: 0.9829\n", 252 | "Epoch 4/30\n", 253 | "31367/31367 [==============================] - 633s 20ms/step - loss: 0.1194 - acc: 0.9629 - val_loss: 0.0401 - val_acc: 0.9884\n", 254 | "Epoch 5/30\n", 255 | "31367/31367 [==============================] - 621s 20ms/step - loss: 0.0954 - acc: 0.9709 - val_loss: 0.0485 - val_acc: 0.9847\n", 256 | "Epoch 6/30\n", 257 | "31367/31367 [==============================] - 627s 20ms/step - loss: 0.0744 - acc: 0.9773 - val_loss: 0.0281 - val_acc: 0.9927\n", 258 | "Epoch 7/30\n", 259 | "31367/31367 [==============================] - 619s 20ms/step - loss: 0.0611 - acc: 0.9812 - val_loss: 0.0307 - val_acc: 0.9904\n", 260 | "Epoch 8/30\n", 261 | "31367/31367 [==============================] - 619s 20ms/step - loss: 0.0592 - acc: 0.9811 - val_loss: 0.0255 - val_acc: 0.9929\n", 262 | "Epoch 9/30\n", 263 | "31367/31367 [==============================] - 622s 
20ms/step - loss: 0.0438 - acc: 0.9867 - val_loss: 0.0227 - val_acc: 0.9932\n", 264 | "Epoch 10/30\n", 265 | "31367/31367 [==============================] - 620s 20ms/step - loss: 0.0494 - acc: 0.9855 - val_loss: 0.0299 - val_acc: 0.9915\n", 266 | "Epoch 11/30\n", 267 | "31367/31367 [==============================] - 615s 20ms/step - loss: 0.0271 - acc: 0.9916 - val_loss: 0.0167 - val_acc: 0.9954\n", 268 | "Epoch 12/30\n", 269 | "31367/31367 [==============================] - 876s 28ms/step - loss: 0.0154 - acc: 0.9952 - val_loss: 0.0142 - val_acc: 0.9962\n", 270 | "Epoch 13/30\n", 271 | "31367/31367 [==============================] - 696s 22ms/step - loss: 0.0129 - acc: 0.9955 - val_loss: 0.0143 - val_acc: 0.9959\n", 272 | "Epoch 14/30\n", 273 | "31367/31367 [==============================] - 608s 19ms/step - loss: 0.0110 - acc: 0.9967 - val_loss: 0.0145 - val_acc: 0.9959\n", 274 | "Epoch 15/30\n", 275 | "31367/31367 [==============================] - 608s 19ms/step - loss: 0.0116 - acc: 0.9965 - val_loss: 0.0133 - val_acc: 0.9963\n", 276 | "Epoch 16/30\n", 277 | "31367/31367 [==============================] - 607s 19ms/step - loss: 0.0090 - acc: 0.9972 - val_loss: 0.0128 - val_acc: 0.9964\n", 278 | "Epoch 17/30\n", 279 | "31367/31367 [==============================] - 624s 20ms/step - loss: 0.0085 - acc: 0.9975 - val_loss: 0.0134 - val_acc: 0.9964\n", 280 | "Epoch 18/30\n", 281 | "31367/31367 [==============================] - 1693s 54ms/step - loss: 0.0080 - acc: 0.9971 - val_loss: 0.0127 - val_acc: 0.9969\n", 282 | "Epoch 19/30\n", 283 | "31367/31367 [==============================] - 608s 19ms/step - loss: 0.0072 - acc: 0.9977 - val_loss: 0.0130 - val_acc: 0.9964\n", 284 | "Epoch 20/30\n", 285 | "31367/31367 [==============================] - 607s 19ms/step - loss: 0.0068 - acc: 0.9981 - val_loss: 0.0131 - val_acc: 0.9966\n", 286 | "Epoch 21/30\n", 287 | "31367/31367 [==============================] - 608s 19ms/step - loss: 0.0056 - acc: 0.9984 - val_loss: 
0.0128 - val_acc: 0.9967\n", 288 | "Epoch 22/30\n", 289 | "31367/31367 [==============================] - 609s 19ms/step - loss: 0.0073 - acc: 0.9979 - val_loss: 0.0128 - val_acc: 0.9967\n", 290 | "Epoch 23/30\n", 291 | "31367/31367 [==============================] - 607s 19ms/step - loss: 0.0065 - acc: 0.9981 - val_loss: 0.0126 - val_acc: 0.9966\n", 292 | "Epoch 24/30\n", 293 | "31367/31367 [==============================] - 607s 19ms/step - loss: 0.0060 - acc: 0.9978 - val_loss: 0.0126 - val_acc: 0.9967\n", 294 | "Epoch 25/30\n", 295 | "31367/31367 [==============================] - 608s 19ms/step - loss: 0.0062 - acc: 0.9980 - val_loss: 0.0126 - val_acc: 0.9967\n", 296 | "Epoch 26/30\n", 297 | "31367/31367 [==============================] - 607s 19ms/step - loss: 0.0069 - acc: 0.9976 - val_loss: 0.0125 - val_acc: 0.9967\n", 298 | "Epoch 27/30\n", 299 | "31367/31367 [==============================] - 618s 20ms/step - loss: 0.0061 - acc: 0.9979 - val_loss: 0.0125 - val_acc: 0.9967\n", 300 | "Epoch 28/30\n", 301 | "31367/31367 [==============================] - 621s 20ms/step - loss: 0.0061 - acc: 0.9981 - val_loss: 0.0124 - val_acc: 0.9967\n", 302 | "Epoch 29/30\n", 303 | "31367/31367 [==============================] - 632s 20ms/step - loss: 0.0055 - acc: 0.9980 - val_loss: 0.0126 - val_acc: 0.9966\n", 304 | "Epoch 30/30\n", 305 | "31367/31367 [==============================] - 611s 19ms/step - loss: 0.0063 - acc: 0.9980 - val_loss: 0.0126 - val_acc: 0.9967\n" 306 | ] 307 | }, 308 | { 309 | "data": { 310 | "text/plain": [ 311 | "" 312 | ] 313 | }, 314 | "execution_count": 23, 315 | "metadata": {}, 316 | "output_type": "execute_result" 317 | } 318 | ], 319 | "source": [ 320 | "def lr_schedule(epoch):\n", 321 | " return lr * (0.1 ** int(epoch / 10))\n", 322 | "\n", 323 | "batch_size = 32\n", 324 | "epochs = 30\n", 325 | "\n", 326 | "model.fit(X, Y,\n", 327 | " batch_size=batch_size,\n", 328 | " epochs=epochs,\n", 329 | " validation_split=0.2,\n", 330 | " 
callbacks=[LearningRateScheduler(lr_schedule),\n", 331 | " ModelCheckpoint('model.h5', save_best_only=True)]\n", 332 | " )" 333 | ] 334 | }, 335 | { 336 | "cell_type": "markdown", 337 | "metadata": {}, 338 | "source": [ 339 | "## Evaluating the Model" 340 | ] 341 | }, 342 | { 343 | "cell_type": "code", 344 | "execution_count": 28, 345 | "metadata": {}, 346 | "outputs": [], 347 | "source": [ 348 | "import pandas as pd\n", 349 | "test = pd.read_csv('/Users/amy/Desktop/Keras_Project/GTSRB/GT-final_test.csv', sep=';')\n" 350 | ] 351 | }, 352 | { 353 | "cell_type": "code", 354 | "execution_count": 29, 355 | "metadata": {}, 356 | "outputs": [], 357 | "source": [ 358 | "# Loading test dataset\n", 359 | "X_test = []\n", 360 | "y_test = []\n", 361 | "i = 0\n", 362 | "for file_name, class_id in zip(list(test['Filename']), list(test['ClassId'])):\n", 363 | " img_path = os.path.join('GTSRB/Final_Test/Images/', file_name)\n", 364 | " X_test.append(preprocess_img(io.imread(img_path)))\n", 365 | " y_test.append(class_id)\n", 366 | "\n", 367 | "X_test = np.array(X_test)\n", 368 | "y_test = np.array(y_test)" 369 | ] 370 | }, 371 | { 372 | "cell_type": "code", 373 | "execution_count": 30, 374 | "metadata": {}, 375 | "outputs": [ 376 | { 377 | "name": "stdout", 378 | "output_type": "stream", 379 | "text": [ 380 | "Test accuracy = 0\n" 381 | ] 382 | } 383 | ], 384 | "source": [ 385 | "# predict and evaluate; np.mean yields the float fraction of correct predictions (sum/size floor-divides to 0 on this Python 2 kernel)\n", 386 | "y_pred = model.predict_classes(X_test)\n", 387 | "acc = np.mean(y_pred == y_test)\n", 388 | "print(\"Test accuracy = {}\".format(acc))" 389 | ] 390 | }, 391 | { 392 | "cell_type": "code", 393 | "execution_count": null, 394 | "metadata": {}, 395 | "outputs": [], 396 | "source": [] 397 | } 398 | ], 399 | "metadata": { 400 | "kernelspec": { 401 | "display_name": "Python 2", 402 | "language": "python", 403 | "name": "python2" 404 | }, 405 | "language_info": { 406 | "codemirror_mode": { 407 | "name": "ipython", 408 | "version": 2 409 | }, 410 | 
"file_extension": ".py", 411 | "mimetype": "text/x-python", 412 | "name": "python", 413 | "nbconvert_exporter": "python", 414 | "pygments_lexer": "ipython2", 415 | "version": "2.7.14" 416 | } 417 | }, 418 | "nbformat": 4, 419 | "nbformat_minor": 2 420 | } 421 | -------------------------------------------------------------------------------- /NLP/Yelp Business Rating Prediction_NLP.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Natural Language Processing Project\n", 8 | "\n", 9 | "Classifying Yelp Reviews into 1 star or 5 star categories based off the text content in the reviews. \n", 10 | "\n", 11 | "Dataset from [Yelp Review Data Set from Kaggle]\n", 12 | "\n", 13 | "Each observation in this dataset is a review of a particular business by a particular user.\n", 14 | "\n", 15 | "The \"stars\" column is the number of stars (1 through 5) assigned by the reviewer to the business. (Higher stars is better.) In other words, it is the rating of the business by the person who wrote the review.\n", 16 | "\n", 17 | "The \"cool\" column is the number of \"cool\" votes this review received from other Yelp users. \n", 18 | "\n", 19 | "All reviews start with 0 \"cool\" votes, and there is no limit to how many \"cool\" votes a review can receive. 
In other words, it is a rating of the review itself, not a rating of the business.\n", 20 | "\n", 21 | "The \"useful\" and \"funny\" columns are similar to the \"cool\" column.\n" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": 1, 27 | "metadata": { 28 | "collapsed": true 29 | }, 30 | "outputs": [], 31 | "source": [ 32 | "import pandas as pd\n", 33 | "import numpy as np\n", 34 | "import matplotlib.pyplot as plt\n", 35 | "import seaborn as sns" 36 | ] 37 | }, 38 | { 39 | "cell_type": "markdown", 40 | "metadata": {}, 41 | "source": [ 42 | "## The Data" 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": 2, 48 | "metadata": { 49 | "collapsed": false 50 | }, 51 | "outputs": [], 52 | "source": [ 53 | "yelp=pd.read_csv('yelp.csv')" 54 | ] 55 | }, 56 | { 57 | "cell_type": "code", 58 | "execution_count": 14, 59 | "metadata": { 60 | "collapsed": false 61 | }, 62 | "outputs": [ 63 | { 64 | "data": { 65 | "text/html": [ 66 | "
\n", 67 | "\n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | "
business_iddatereview_idstarstexttypeuser_idcoolusefulfunnytext length
09yKzy9PApeiPPOUJEtnvkg2011-01-26fWKvX83p0-ka4JS3dc6E5A5My wife took me here on my birthday for breakf...reviewrLtl8ZkDX5vH5nAx9C3q5Q250889
1ZRJwVLyzEJq1VAihDhYiow2011-07-27IjZ33sJrzXqU-0X6U8NwyA5I have no idea why some people give bad review...review0a2KyEL0d3Yb1V6aivbIuQ0001345
26oRAC4uyJCsJl1X0WZpVSA2012-06-14IESLBzqUCLdSzSqm0eCSxQ4love the gyro plate. Rice is so good and I als...review0hT2KtfLiobPvh6cDC8JQg01076
3_1QQZuf4zZOyFCvXc0o6Vg2010-05-27G-WvGaISbqqaMHlNnByodA5Rosie, Dakota, and I LOVE Chaparral Dog Park!!...reviewuZetl9T0NcROGOyFfughhg120419
46ozycU1RpktNG2-1BroVtw2012-01-051uJFq2r5QfJG_6ExMRCaGw5General Manager Scott Petello is a good egg!!!...reviewvYmM4KTsC8ZfQBg-j5MWkw000469
\n", 157 | "
" 158 | ], 159 | "text/plain": [ 160 | " business_id date review_id stars \\\n", 161 | "0 9yKzy9PApeiPPOUJEtnvkg 2011-01-26 fWKvX83p0-ka4JS3dc6E5A 5 \n", 162 | "1 ZRJwVLyzEJq1VAihDhYiow 2011-07-27 IjZ33sJrzXqU-0X6U8NwyA 5 \n", 163 | "2 6oRAC4uyJCsJl1X0WZpVSA 2012-06-14 IESLBzqUCLdSzSqm0eCSxQ 4 \n", 164 | "3 _1QQZuf4zZOyFCvXc0o6Vg 2010-05-27 G-WvGaISbqqaMHlNnByodA 5 \n", 165 | "4 6ozycU1RpktNG2-1BroVtw 2012-01-05 1uJFq2r5QfJG_6ExMRCaGw 5 \n", 166 | "\n", 167 | " text type \\\n", 168 | "0 My wife took me here on my birthday for breakf... review \n", 169 | "1 I have no idea why some people give bad review... review \n", 170 | "2 love the gyro plate. Rice is so good and I als... review \n", 171 | "3 Rosie, Dakota, and I LOVE Chaparral Dog Park!!... review \n", 172 | "4 General Manager Scott Petello is a good egg!!!... review \n", 173 | "\n", 174 | " user_id cool useful funny text length \n", 175 | "0 rLtl8ZkDX5vH5nAx9C3q5Q 2 5 0 889 \n", 176 | "1 0a2KyEL0d3Yb1V6aivbIuQ 0 0 0 1345 \n", 177 | "2 0hT2KtfLiobPvh6cDC8JQg 0 1 0 76 \n", 178 | "3 uZetl9T0NcROGOyFfughhg 1 2 0 419 \n", 179 | "4 vYmM4KTsC8ZfQBg-j5MWkw 0 0 0 469 " 180 | ] 181 | }, 182 | "execution_count": 14, 183 | "metadata": {}, 184 | "output_type": "execute_result" 185 | } 186 | ], 187 | "source": [ 188 | "yelp.head()" 189 | ] 190 | }, 191 | { 192 | "cell_type": "code", 193 | "execution_count": 6, 194 | "metadata": { 195 | "collapsed": false 196 | }, 197 | "outputs": [ 198 | { 199 | "name": "stdout", 200 | "output_type": "stream", 201 | "text": [ 202 | "\n", 203 | "RangeIndex: 10000 entries, 0 to 9999\n", 204 | "Data columns (total 10 columns):\n", 205 | "business_id 10000 non-null object\n", 206 | "date 10000 non-null object\n", 207 | "review_id 10000 non-null object\n", 208 | "stars 10000 non-null int64\n", 209 | "text 10000 non-null object\n", 210 | "type 10000 non-null object\n", 211 | "user_id 10000 non-null object\n", 212 | "cool 10000 non-null int64\n", 213 | "useful 10000 non-null int64\n", 214 | "funny 
10000 non-null int64\n", 215 | "dtypes: int64(4), object(6)\n", 216 | "memory usage: 781.3+ KB\n" 217 | ] 218 | } 219 | ], 220 | "source": [ 221 | "yelp.info()" 222 | ] 223 | }, 224 | { 225 | "cell_type": "code", 226 | "execution_count": 8, 227 | "metadata": { 228 | "collapsed": false 229 | }, 230 | "outputs": [ 231 | { 232 | "data": { 233 | "text/html": [ 234 | "
\n", 235 | "\n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | " \n", 260 | " \n", 261 | " \n", 262 | " \n", 263 | " \n", 264 | " \n", 265 | " \n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | " \n", 273 | " \n", 274 | " \n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | " \n", 289 | " \n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | " \n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | " \n", 300 | " \n", 301 | " \n", 302 | " \n", 303 | "
starscoolusefulfunny
count10000.00000010000.00000010000.00000010000.000000
mean3.7775000.8768001.4093000.701300
std1.2146362.0678612.3366471.907942
min1.0000000.0000000.0000000.000000
25%3.0000000.0000000.0000000.000000
50%4.0000000.0000001.0000000.000000
75%5.0000001.0000002.0000001.000000
max5.00000077.00000076.00000057.000000
\n", 304 | "
" 305 | ], 306 | "text/plain": [ 307 | " stars cool useful funny\n", 308 | "count 10000.000000 10000.000000 10000.000000 10000.000000\n", 309 | "mean 3.777500 0.876800 1.409300 0.701300\n", 310 | "std 1.214636 2.067861 2.336647 1.907942\n", 311 | "min 1.000000 0.000000 0.000000 0.000000\n", 312 | "25% 3.000000 0.000000 0.000000 0.000000\n", 313 | "50% 4.000000 0.000000 1.000000 0.000000\n", 314 | "75% 5.000000 1.000000 2.000000 1.000000\n", 315 | "max 5.000000 77.000000 76.000000 57.000000" 316 | ] 317 | }, 318 | "execution_count": 8, 319 | "metadata": {}, 320 | "output_type": "execute_result" 321 | } 322 | ], 323 | "source": [ 324 | "yelp.describe()" 325 | ] 326 | }, 327 | { 328 | "cell_type": "markdown", 329 | "metadata": {}, 330 | "source": [ 331 | "**Creating a new column called \"text length\" which is the number of characters in the text column.**" 332 | ] 333 | }, 334 | { 335 | "cell_type": "code", 336 | "execution_count": 3, 337 | "metadata": { 338 | "collapsed": false 339 | }, 340 | "outputs": [], 341 | "source": [ 342 | "yelp['text length']=yelp['text'].apply(len)" 343 | ] 344 | }, 345 | { 346 | "cell_type": "markdown", 347 | "metadata": {}, 348 | "source": [ 349 | "# EDA" 350 | ] 351 | }, 352 | { 353 | "cell_type": "markdown", 354 | "metadata": {}, 355 | "source": [ 356 | "**Using FacetGrid from the seaborn library to create a grid of 5 histograms of text length based off of the star ratings**" 357 | ] 358 | }, 359 | { 360 | "cell_type": "code", 361 | "execution_count": 4, 362 | "metadata": { 363 | "collapsed": false 364 | }, 365 | "outputs": [ 366 | { 367 | "data": { 368 | "text/plain": [ 369 | "" 370 | ] 371 | }, 372 | "execution_count": 4, 373 | "metadata": {}, 374 | "output_type": "execute_result" 375 | }, 376 | { 377 | "data": { 378 | "image/png": 
"iVBORw0KGgoAAAANSUhEUgAABDAAAADQCAYAAADxn5GHAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAGN9JREFUeJzt3X+UpXV9H/D3wooruhJMF61JrG3VT0gaNGoUowRqQyw2\nHqKptfXEXyhajg2xxRoT0MYUK8Zgo1g1Zwn+iJrYaBGlxahB/EG01h9EiOZj8EdzTtK0aEFoUBTY\n/nHvssM6szt7987c7+y8Xufs2TvPvfM87zt33zO7n/0+z92ya9euAAAAAIzssEUHAAAAANgfAwwA\nAABgeAYYAAAAwPAMMAAAAIDhGWAAAAAAwzPAAAAAAIa3ddEBWDtV9bIkH+rujy3o+HdJ8v4k/767\nr1hEBpjVIvtTVc9NcmaSXUk+neR53f2d9c4Bs1hwd85I8ovTD/9rkhd1t/eLZ8NY9N/dphmen+TJ\n3X3SojLALBb88+eiJCck+Zvpppd198XrnWMzsALj0HZiksMXceCqqiRXJPnJRRwf5mAh/amqByX5\nt5l057hMvk8/f71zwEFYVHf+bpKzkjwiyY9l0qGT1zsHHKSF/d0tSarqR5L8yqKODwdpkf35iSQ/\n1d0Pmf4yvFgjVmAcAqrqB5O8Pcndk9yeyf/cPijJw5NcWFVPTHKvJC9PcmSSozP5X6k/qKo3J/n+\nJA9I8qJMin9yktuSXNLdL9vrWC9P8k/2ivD27n7VXtueneRVSV4wp6cJa2LA/tyS5IzuvnH6OVcn\nud/cnjDMyWjd6e6vVtWx3f3dqvr+JEcluWG+zxrmY7T+TB931yS/neQlSZ4xtycLczZaf6rq7pn8\nXW1nVd0vycWZrMC4fZ7PmwkrMA4Nz05yaXc/PJMiPqa735rJ0vPndPfVmSypfU53P3T6+Jcu+fxv\ndPexST6f5JTufnAm/3P1wKratvRA3X32ksni7l97Dy/S3S/q7vesxZOFORuqP939P7v7Q0lSVTuS\n/Kskl6zFE4eDNFR3po/7blWdnuQrSf5Xkqvm/qxhPobrT5JXJLkoyVfn/Fxh3kbrz72TXJ7ktCTH\nZ3IqybPn/aSZsALj0PChJP+lqn48k3N+X7fMY34hyc9W1ZMzKdY9ltz336e//2WSb1XVlUkuTXJO\nd3976U4OYAUGbBRD9qeqfiDJZUl+xzVkGNSQ3enunVX1piRvSvJrSX71QJ8YrIOh+lNVJye5X3f/\nm6o6afanBetiqP5091eSPHHJ51yQ5OlJds7w3NgPA4xDQHdfOT1n8WeTPCXJM/O95/1+LMmHM7ku\nxR8leceS+7413c+tVfXITJZSPT7JJ6rqxO7+0pJjnZ3k7LV5JrD+RuxPVf1wJhfAvaC7z5/tmcHa\nGq07VfVDmfwD7MrpPn8/yRmzP0NYO6P1J8m/SPKjVXVVJv/Qu09VvbO7nzLbM4S1M1p/qurHkjyo\nu9893bQlyXdnenLsl1NIDgFV9RtJntbdb8lkuflDp3fdmmRrVd0rk/PCXtrd/y3Jz2SZC9xMp5gf\nSfLR7n5hki8kqXV4CrAwo/WnqrYn+UAm/wtgeMGwRutOJte8eHtVfV9VbUnyT5N8fIb9wJobrT/d\nfVp3H9vdD0nynCSfNrxgVKP1J5OBxW9V1dHTd2F8bibXwWANGGAcGi5I8vPTqfnF2fM/Tu9P8sYk\nP5zkwiR/WlWfS3JMkiOnF5y5Q3d/LsknklxTVZ9N8rVMlrDDoWy0/jwnk3MpX1hVV01//foM+4G1\nNlR3uvuaTM7h/+Mkf5Lk5iSGgIxqqP7ABjNUf7r785n8/LkykyHIVd39ezM8L1Zhy65d3h4dAAAA\nGJsVGAAAAMDwDDAAAACA4RlgAAAAAMMzwAAAAACGt3XRAZZz3XU37fPKokcffWSuv/7m9YqzXyPl\nkWVlI+VZTZYdO7ZvmWXf+jM7WVY2Uh792WOkPLKsbKQ8+rPHS
HlkWdlIedaqP/vrzmqPvZ5GyiPL\n8kbKkuw/z2q7syFXYGzd+j1v47tQI+WRZWUj5VlklpG+DslYeWRZ2Uh59GePkfLIsrKR8ujPHiPl\nkWVlI+XRnz1GyiPL8kbKkswvz4YcYAAAAACbiwEGAAAAMDwDDAAAAGB4BhgAAADA8AwwAAAAgOEZ\nYAAAAADD27roAAAAAKzsCWddMpf9XPTix85lP7AoVmAAAAAAwzPAAAAAAIZngAEAAAAMzwADAAAA\nGJ4BBgAAADA8AwwAAABgeAYYAAAAwPAMMAAAAIDhGWAAAAAAwzPAAAAAAIZngAEAAAAMzwADAAAA\nGJ4BBgAAADC8rYsOAADAoe8JZ10yl/1c9OLHzmU/AGw8VmAAAAAAwzPAAAAAAIZngAEAAAAMzwAD\nAAAAGN6qLuJZVY9M8sruPqmqfjzJpUn+fHr3G7r7nVV1epLnJbk1ybndfWlV3S3J25Ick+SmJM/o\n7uvm/iwAAACAQ9p+BxhV9aIkT0vyN9NND0vy6u4+f8lj7pPkzCQPT7Itycer6oNJzkhydXf/WlX9\n8yTnJPml+T4FAAAA4FC3mhUYX07ypCS/O/34YUmqqk7NZBXGC5I8IsmV3X1Lkluq6tokxyV5TJLf\nmH7eZUleMsfsAAAAwCax3wFGd7+7qu6/ZNOnklzY3Z+pqrOT/LskVyX55pLH3JTkqCT3XLJ997b9\nOvroI7N16+H7fMyOHdtXs6t1M1IeWVY2Up61yqI/B0eWlY2UR3/2GCmPLCsbKc8i+zMP88y/GV6X\nWYyUJRkrz1pkWa/uJPqzHmRZ2TzyrOoaGHu5uLtv2H07yQVJPppkaZrtSW5IcuOS7bu37df119+8\nz/t37Nie66676QAir62R8siyspHyrCbLrAXXn9nJsrKR8ujPHiPlkWVlI+VZZH/mZV5fy432uqyX\nkbIkY+VZq/6sV3cS/Vlrsqxsf3lW251Z3oXkD6vqEdPb/yjJZzJZlXFCVW2rqqOSHJvkmiRXJnn8\n9LGnJPnYDMcDAAAANrlZVmCckeSCqvpukr9O8tzuvrGqXpvJgOKwJGd397er6g1J3lJVH0/ynSRP\nnVdwAAAAYPNY1QCju7+W5Pjp7c8mefQyj9mZZOde225O8uSDTgkAAABsarOcQgIAAACwrgwwAAAA\ngOEZYAAAAADDM8AAAAAAhmeAAQAAAAzPAAMAAAAYngEGAAAAMDwDDAAAAGB4BhgAAADA8AwwAAAA\ngOEZYAAAAADDM8AAAAAAhmeAAQAAAAzPAAMAAAAYngEGAAAAMDwDDAAAAGB4BhgAAADA8AwwAAAA\ngOEZYAAAAADDM8AAAAAAhmeAAQAAAAzPAAMAAAAYngEGAAAAMDwDDAAAAGB4BhgAAADA8LYuOgAA\nAKzWaeddPpf9vO/8U+eyHwDWjxUYAAAAwPAMMAAAAIDhGWAAAAAAwzPAAAAAAIZngAEAAAAMzwAD\nAAAAGN6q3ka1qh6Z5JXdfVJVPSDJm5PsSnJNkud39+1VdXqS5yW5Ncm53X1pVd0tyduSHJPkpiTP\n6O7r1uB5AAAAAIew/a7AqKoXJbkwybbpplcnOae7T0iyJcmpVXWfJGcmeXSSxyV5RVXdNckZSa6e\nPvatSc6Z/1MAAAAADnWrOYXky0metOTjhyX5yPT2ZUl+OskjklzZ3bd09zeTXJvkuCSPSfL+vR4L\nAAAAcED2ewpJd7+7qu6/ZNOW7t41vX1TkqOS3DPJN5c8Zrntu7ft19FHH5mtWw/f52N27Ni+ml2t\nm5HyyLKykfKsVRb9OTiyrGykPPqzx0h5ZFnZSHkW2Z/RbIbXZRYjZUnGyrMWWdazO/PMf6i/LrOS\nZWXzyLOqa2Ds5fYlt7cnuSHJjdPb+9q+e9t+XX/9zfu8f8eO7bnuuptWGXftjZRHlpWNlGc1WWYt\nuP7MTpaVjZRHf/YYKY8sK
xspzyL7M6KN9Lqsl5GyJGPlWav+rGd35vW13Givy3qRZWX7y7Pa7szy\nLiSfq6qTprdPSfKxJJ9KckJVbauqo5Icm8kFPq9M8vi9HgsAAABwQGZZgXFWkp1VdUSSLyZ5V3ff\nVlWvzWRAcViSs7v721X1hiRvqaqPJ/lOkqfOKzgAAACrd9p5l89lP+87/9S57AcO1KoGGN39tSTH\nT29/KcmJyzxmZ5Kde227OcmTDzolAAAAsKnNcgoJAAAAwLoywAAAAACGZ4ABAAAADM8AAwAAABie\nAQYAAAAwPAMMAAAAYHgGGAAAAMDwDDAAAACA4RlgAAAAAMMzwAAAAACGZ4ABAAAADM8AAwAAABie\nAQYAAAAwPAMMAAAAYHgGGAAAAMDwDDAAAACA4RlgAAAAAMMzwAAAAACGZ4ABAAAADM8AAwAAABie\nAQYAAAAwPAMMAAAAYHgGGAAAAMDwDDAAAACA4RlgAAAAAMMzwAAAAACGZ4ABAAAADG/rogMAwEbx\nhLMumct+LnrxY+eyHwCAzcQKDAAAAGB4BhgAAADA8AwwAAAAgOEZYAAAAADDM8AAAAAAhjfzu5BU\n1WeT3Dj98KtJXp7kzUl2JbkmyfO7+/aqOj3J85LcmuTc7r70oBIDAAAAm85MA4yq2pZkS3eftGTb\ne5Oc091XVNUbk5xaVZ9IcmaShyfZluTjVfXB7r7l4KMDAAAAm8WsKzAenOTIqvrAdB+/muRhST4y\nvf+yJD+T5LYkV04HFrdU1bVJjkvyP/a186OPPjJbtx6+zwA7dmyfMfraGCmPLCsbKc9aZdGfgyPL\nykbKs8j+zMNp510+l/287/xTN8XrMouRsiRj5dno/ZmnzfC6zGKkLMlYedYiy0bsTnLovy6zkmVl\n88gz6wDj5iS/meTCJA/MZGCxpbt3Te+/KclRSe6Z5JtLPm/39n26/vqb93n/jh3bc911Nx146jUy\nUh5ZVjZSntVkmbXg+jM7WVY2Up5F9mdEG+l1WS8jZUnGyqM/d7aRXpf1MlKWZKw8a9WfjdidRH+W\nI8vK9pdntd2ZdYDxpSTXTgcWX6qqb2SyAmO37UluyOQaGduX2Q4AAACwarO+C8lpSc5Pkqq6byYr\nLT5QVSdN7z8lyceSfCrJCVW1raqOSnJsJhf4BAAAAFi1WVdg/E6SN1fVxzN515HTknw9yc6qOiLJ\nF5O8q7tvq6rXZjLMOCzJ2d397TnkBgCAmT3hrEsOeh8Xvfixc0gCwGrNNMDo7u8keeoyd524zGN3\nJtk5y3EAAAAAktlPIQEAAABYNwYYAAAAwPAMMAAAAIDhGWAAAAAAwzPAAAAAAIY369uoAgAAsAl5\nG2IWxQADNqHTzrv8oPfhhw4AALCenEICAAAADM8AAwAAABieAQYAAAAwPAMMAAAAYHgu4gnMZB4X\nAk2S951/6lz2AwAAHNo27ADjYP/x5B0UAAAAYONwCgkAAAAwPAMMAAAAYHgGGAAAAMDwNuw1MA6W\na2gAAADAxmEFBgAAADC8TbsCAwA2uiecdclB78OKQpidtxQHWF9WYAAAAADDM8AAAAAAhmeAAQAA\nAAzPAAMAAAAYngEGAAAAMDzvQgIslHdRAAAAVsMAAwAAgHXlbYiZhVNIAAAAgOFZgTGjeUwMLXsH\nAACA1bECAwAAABieFRjAhuccSpid/sDizeOC1onVvcChzwBjgQ72L41+SAEAAJuZd7TbXJxCAgAA\nAAxvzVdgVNVhSV6f5MFJbknynO6+dq2PC3CgTPBhdvoDizePU8KcDgaMbD1OIfm5JNu6+1FVdXyS\n85P4zjgHTkEB4FDiehyweK7HwWbk58/GsR4DjMckeX+SdPcnq+rh63BMVmFeRV2kg/3h6O1wmbdD\noVd782ecjWak1SB+zrBZjfRn3z9OWS/zGgDOw7z+vI7U5STZsmvXrrntbDlVdWGSd3f3ZdO
P/yLJ\n3+vuW9f0wAAAAMAhYz0u4nljku1Lj2l4AQAAAByI9RhgXJnk8UkyvQbG1etwTAAAAOAQsh7XwLg4\nyclV9cdJtiR51jocEwAAADiErPk1MAAAAAAO1nqcQgIAAABwUAwwAAAAgOEZYAAAAADDW4+LeM5N\nVR2W5PVJHpzkliTP6e5r1/iYj0zyyu4+qaoekOTNSXYluSbJ87v79qo6Pcnzktya5NzuvrSq7pbk\nbUmOSXJTkmd093UzZrhLkouS3D/JXZOcm+QLi8gyzXN4kp1Janr8f5nk24vKM810TJLPJDl5eqyF\nZKmqz2by1sFJ8tUkL19UlmWy6Y/+rJRJf/afTX8W3B/d2W8W/dlzvIV3Z5pDf/adSX9Wl01/9Ge5\nTJu2PxttBcbPJdnW3Y9K8uIk56/lwarqRUkuTLJtuunVSc7p7hMyeUeVU6vqPknOTPLoJI9L8oqq\numuSM5JcPX3sW5OccxBRfiHJN6b7+sdJXrfALEnyhCTp7kdP9/XyReaZfoP77STfmm5aSJaq2pZk\nS3efNP31rEVlWYH+6M/30J9V05/F90d3Vs6iP1MDdSfRnxXpzwHRH/25k83en402wHhMkvcnSXd/\nMsnD1/h4X07ypCUfPyzJR6a3L0vy00kekeTK7r6lu7+Z5Nokxy3NuuSxs/qDJC+Z3t6SydRqUVnS\n3e9J8tzph38nyQ2LzJPkN5O8MclfTT9eVJYHJzmyqj5QVZdX1fELzLIc/dGf5ejP6ujPgvujO/uk\nP3uM0p1Ef/ZFf1ZPf/Rnb5u6PxttgHHPJN9c8vFtVbVmp8F097uTfHfJpi3dvft9Z29KctQymZbb\nvnvbrDn+X3ffVFXbk7wrk8nUQrIsyXRrVb0lyQVJ3r6oPFX1zCTXdfcfLtm8qK/NzZl8Q3lcJkvL\nFvZ1WYH+6M+d6M8B0Z8B+qM7K9KfqVG6M82iP8vQnwOmP/pzB/3ZeAOMG5NsX/LxYd196zoe//Yl\nt7dnMn3bO9Ny23dvm1lV/VCSDyf53e5+xyKz7Nbdz0jyoEzOCbvbgvKcluTkqroiyUMyWXp0zIKy\nfCnJ27p7V3d/Kck3ktx7QVmWoz/6szf9WT39GaQ/urMs/VnZQv+86s+y9OfA6I/+LLXp+7PRBhhX\nJnl8kkyXp1y9zsf/XFWdNL19SpKPJflUkhOqaltVHZXk2EwuVnJH1iWPnUlV3TvJB5L8cndftMgs\n0zxPq6pfmX54cybfTD69iDzd/VPdfWJ3n5TkqiRPT3LZgr42p2V6XmJV3TeTieIHFvU6LUN/9OdO\n9OeA6M+C+6M7+6Q/K1vk93v9WYb+HDD90Z876E+yZdeuXft7zDBqz1V4j8vkXKhndfefrfEx75/k\n97v7+KraPXE7IskXk5ze3bfV5Iqqz81kIPQfuvvdVXVkkrck+dtJvpPkqd391zNmeE2SpyRZ+lx/\nKclr1zvLNM/dk7wpyX2S3CXJedMM6/612SvXFZksXbp9EVmq6ohMrrh7v0yuuvvLSb6+iCwr5NOf\nPfTne3NdEf3ZVz792WMh/dGdfWbQnzsf7/5ZcHemOfRn/7muiP7sL5/+7KE/d851RTZhfzbUAAMA\nAADYnDbaKSQAAADAJmSAAQAAAAzPAAMAAAAYngEGAAAAMDwDDAAAAGB4BhgbTFUdVVXvmfFzH1FV\nr1xm+zOr6s0HHW6FY63F/mEW+gOz0x+Ynf7A7PSHpQwwNp6jkzxkxs/9kST3nmOWUY4Fq6U/MDv9\ngdnpD8xOf7jD1kUH4IC9Nsl9q+ri7n5iVT09yQsyGUZ9JsnzMynPZUn+QZLbknwuyalJfj3JParq\n7O5++XI7r6qfSPIfkxyZ5OtJntfdX62qK5J8KskJSXYk+cXuvqyqfjDJ2zP5xnJ1khOnx73jWEn+\nMskDpvu4X5I/6u7T5/tlgVXRH5id/sDs9Admpz/cwQq
MjefMJH81Le+PJjk9yU9290OS/J8kL+zu\nzyZ5Y5JXJbkgyRu6+6okL03y3n2U94gkFyZ5anc/NMn5SXYuecgR3f2oJP86ybnTba9J8s7uPi7J\nu5L8QHffsMyx7pfkSUmOTXLKNDusN/2B2ekPzE5/YHb6wx2swNjY/mGSByb5ZFUlyRFJPju979wk\nn07yrSRPW+X+HpTk7yd573R/SXLPJfe/f/r7NUnuNb19cpJnJkl3X1xVN6yw74929/9Nkqr6cpK/\ntcpMsFb0B2anPzA7/YHZ6c8mZ4CxsR2e5D9395lJUlX3yJ7X9PuSbJ/+ulcmy6FWs7+vTKeZqarD\nc+fzuL49/X1Xki3T27dldSt5bl1ye+nnw6LoD8xOf2B2+gOz059NzikkG8+t2VPSK5I8saqOqaot\nSd6QyflgSfKfkrwuyeunv/b+3OX8WZJ7VdUJ049PS/KO/eT5YJKnJklVnZLJN47VHAsWQX9gdvoD\ns9MfmJ3+cAcDjI3nfyf5i6r6cHf/SZKXJbk8yZ9m8nqeV1X/LJOlUK9J8ltJHjTd9qkkx1fVecvt\nuLtvSfLkJOdX1eeTPCPJs/eT5wVJfr6qPpfkKUl2L6Ha57FgQfQHZqc/MDv9gdnpD3fYsmvXrkVn\nYAOrqjOTfKi7v1BVD02ys7sftuhcsBHoD8xOf2B2+gOz05/FssSFg/XnSX6vqm7P5Bwxbw8Eq6c/\nMDv9gdnpD8xOfxbICgwAAABgeK6BAQAAAAzPAAMAAAAYngEGAAAAMDwDDAAAAGB4BhgAAADA8P4/\n2727l8S5yEgAAAAASUVORK5CYII=\n", 379 | "text/plain": [ 380 | "" 381 | ] 382 | }, 383 | "metadata": {}, 384 | "output_type": "display_data" 385 | } 386 | ], 387 | "source": [ 388 | "%matplotlib inline\n", 389 | "g=sns.FacetGrid(yelp,col='stars')\n", 390 | "g.map(plt.hist,'text length')" 391 | ] 392 | }, 393 | { 394 | "cell_type": "markdown", 395 | "metadata": {}, 396 | "source": [ 397 | "**Creating a boxplot of text length for each star category.**" 398 | ] 399 | }, 400 | { 401 | "cell_type": "code", 402 | "execution_count": 5, 403 | "metadata": { 404 | "collapsed": false 405 | }, 406 | "outputs": [ 407 | { 408 | "data": { 409 | "text/plain": [ 410 | "" 411 | ] 412 | }, 413 | "execution_count": 5, 414 | "metadata": {}, 415 | "output_type": "execute_result" 416 | }, 417 | { 418 | "data": { 419 | "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAYgAAAEFCAYAAAD5bXAgAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAH2RJREFUeJzt3XuUXWWZ5/FvVZK65mKuZmxR1zT6JHEEkVZoQiQ9Sugw\nakZp6IUIaZhWyOCgq3tEDcFlegWxg501A600E7uFiMZBEHAYkCwVEOKFiHE6GPIwOM2EASGVa1Xq\nltRl/tjn1Kkq36qcqpx99t5n/z5rZeWpfc6p89RO5Tz7vez3rRscHERERGS0+qQTEBGRdFKBEBGR\nIBUIEREJUoEQEZEgFQgREQmamnQCldTW1qEpWSIiEzR//oy60HG1IEREJEgFQkREglQgREQkSAVC\nRESCVCBERCRIBUJEqmbPnt3s2bM76TSkTLFOczWzXwHthS//BbgJuBMYBJ4FrnX3ATP7OHA10Ads\ncPeHzKwZuBtYAHQAq929Lc58pXKKHwKLFi1JOJPk6VyUbN26BYD167+ccCbJy8LvRWwtCDNrAurc\nfXnhz5XAJmCduy8D6oBVZrYQuA5YClwA3GxmjcAaYFfhuVuAdXHlKpX34IP38eCD9yWdRiroXET2\n7NnNSy/t5aWX9qoVQVQsiwUzreLsYjodaDGzbWb2YzM7GzgTeKLw+CPA+4H3ANvdvdfdjwAvAKcB\n5wI/GPVcyYA9e3bj/hzuz+X+g0DnomT4h2HaPxjjlpViGWcXUxfwFeDrwFuJPuTr3L14t3MHMAuY\nCRwZ9rrQ8eKxcc2e3cLUqVMqkrxM3qZNDwzFDz/8AMuWnZVgNsnSuSjZt++1EfH8+TMSzCZZf/M3\ndw/F99xzN7fddluC2YwtzgLxPPBCoSA8b2YHiFoQRTOAw0RjFDNOcLx4bFyHDnVVIG05WUeOtI+I\n29o6EswmWceP94+I83wuRsvzuXjttX0j4qTPxVjFOs4upquAvwMwszcQtQi2mdnywuMrgSeBp4Fl\nZtZkZrOAxUQD2NuBC0c9VyRTVq26KBjn0YIFrw/GeTRv3rxgnDZxFoh/BF5nZk8B/52oYHwKWG9m\nPwMagHvd/VXgVqIC8GPgBnfvAW4H3l54/SeA9THmKhXU0tIajPNo0aIlmC3GbHGqZ6tUw6WXXhGM\n8ygr5yK2LiZ3PwZ8NPDQeYHnbgY2jzrWBVwcT3YSpzPOOBP354bivMt7y0F+36JFS5g/f8FQnFa6\nUU4qbufOZ4Kx5Nvwqb6a9gtNTU00NTUlnca4amo/CJE0Kn4YpvlKUaqrOM21GKf1d0MtCKk4DcyW\n6D6IEv1elGSlNaUWhFRccWC2GOfZ6A+CPJ8P/V5kjwqExCLvV4gSpt+LyKpVF7Fx44ahOK3UxSSx\nWLRoia4SUbfKaHv3vsjevS8mnUbisjL9WS2ICsrC6oxSXepWGenBB78HwIoVF57gmbUvCxcMKhAV\npNkqEpKFD4Jq2LbtYbq7u4bivBeJLHxOqIupQjRbRcai7rZIsfUwOpb0UoGokKxMWxNJSn9/fzCW\n9FKBEJGqmDVrVjCW9FKBqBDNVhEZ35w5c4NxXmVhf24ViAopLr41f/4C9TeLBOgiaqQsbEWrWUwV\n1N5+5MRPEskpTfktKU5qKcZpPR9qQVTItm0P09vbS29vL9u2PZx0OiKptGrVRWo9kJ1JLSoQFaIp\nfCInpim/2aICUSEDA/3BWCQLg5FSXVkZj1GBqJCmpuZgLLJ16xa2bt2SdBqSIosWLaG5uYXm5pZU\nt6hUICrk2LFjwVjyrbgxzEsv7VUrQobs2bOb7u4uuru7Uv17oQJRIfPmzQvGkm/DWw5qRUiRBqlz\n5tJLrwjGkm/79+8PxiJZoAJRIYsWLeGUU97EKae8KdV9ilJdallKiAapc+jSS69Q60FGUMtSskwF\nooI0x1tEyqExCBHJzAeBSIgKhMRCN4eJjE1jEJJrWVipshqy8
kFQLbpwiGRlUotWc5WKy8pKldWw\naNESGhoah+K8077t2aIWhFSc+t1L9uzZzbFjvRw71pv7K2ft216SlTvsVSBEYqQ7qUt04VCSlXOh\nAiEVd8YZZwbjPNKd1JJlKhBScTt3PhOM80h3UpdowL4kK+ci1kFqM1sAPAOcD/QBdwKDwLPAte4+\nYGYfB64uPL7B3R8ys2bgbmAB0AGsdve2OHOVyunq6gzGeXTppVewceOGoTjPtOVoSXEWUzFOq9ha\nEGY2DbgD6C4c2gSsc/dlQB2wyswWAtcBS4ELgJvNrBFYA+wqPHcLsC6uPEXilJXpjNVyxhln5r7b\nMUvibEF8BfgH4POFr88EnijEjwArgH5gu7v3Ar1m9gJwGnAusHHYc2+MMU+psJaW1mCcV3lvOQxX\n7HJcseLChDNJVnEWUzFO68VDLAXCzP4CaHP3R82sWCDq3H2wEHcAs4CZwJFhLw0dLx47odmzW5g6\ndcpJZi8na/Xqy1m7du1QPH/+jIQzStb8+WclnUIq7Nq1a+j+mFdffZF3vOMdCWeUnE2bHhiKH374\nAZYtS+fvSFwtiKuAQTN7P/BOom6iBcMenwEcBtoL8XjHi8dO6NChrpPLWipi4cK3DPU1L1z4Ftra\nOhLOSNLgrru+OSL+7Gfz2zFw5Ej7iDjp/yNjXcTFUiDc/b3F2MweB64BbjGz5e7+OLASeAx4GrjJ\nzJqARmAx0QD2duDCwuMrgSfjyFPik+aZGSJSnmpOc/1rYL2Z/QxoAO5191eBW4kKwI+BG9y9B7gd\neLuZPQV8AlhfxTylArT0uYyWlamd1ZCVcbrY12Jy9+XDvjwv8PhmYPOoY13AxfFmJnEqLh+gIiHy\n+1atumho+nOai6VulJNYbN26JfdLS8hIWVleohqyMv1ZBUIqLisLkUl1HTx4IBjnVU9PDz09PUmn\nMS4VCKk4LVAnIe3tR4JxHu3Zs5u2tn20te1L9UWUCoRUnBaok5D6+inBOI+ychGlAlFB2i0rogXq\nJOTss88JxnmUlYsoFYgK0jabkeFLS2iZCSl65ZWXg3EeZeUiSgWiQrRbVklxm82GhsZUz9CQ6tIq\nvyVLl743GKeNCkSFaApfibbZFBlfVvZMUYGQilOxlJCs3D1cDVlpTalAVIiWESjJyi+/VJf+j2RP\n7EttiIhANDbV3NwyFOdZVlpTakFUiLpVSrLyyy/VtWfPbrq7u+ju7sr92NTwXfXSvMOeCkSFqFul\nRF0JEqKLqJKsDFKri6lChq+pkvb1VeKmzelFaoNaEBVy9OjRYJxX2pxeRlPLsiQr50IFokKmT58e\njPNq585nUt10FklSccC+ubkl1a1sFYgKaWpqCsZ5pLvKJSQrC9RVQ1YG7FUgKkQzd0o0GDmSFnGM\n7Nv3WjDOo6z8H1GBqJCsTFuT6tMijpGBgYFgLOmlAlEhWZm2Vg1ZGYCrBnW3SUhW/o+oQEjFFae5\nmi1O9QBcNWSlK6Ea6uvrg7Gkl/6VKiQrVwTVsmrVRToP6AbK4RYseH0wzqOsDNjrRrkK0c1hI+kc\nRDo62oNxHp166tt46aW9Q3GeaUe5HNJVs4zW3t4ejPPopz99MhjnkXaUk1zT1M7I1KnTgnEeHTt2\nLBjnUVa25VWBqCBNZyzRuYh85CMXB+M8GhwcDMZ5tGjREhobG2lsTPe2vCoQFaLpjCU6FxJSV1cX\njPNoz57d9Pb20tub7m15VSAqRNMZS3QuSh588HvBOI9mzpwVjPMoK/9HVCAqRNMZRaRcWfm8UIGQ\nitM9ISVnn31OMM6j9vYjwTiPDh48EIzTpqz7ICya4D8PGOo4dPefxJWUSK145ZWXg3EeaZC6pLOz\nMxinzQkLhJn9N2Al8Fug+K86CPzbGPPKHM13Lxndv5rmWRpxy0pXgkhIOS2I9wF/6O4TmrhsZlOA\nzYARFZRrgB7gzsLXzwLXu
vuAmX0cuBroAza4+0Nm1gzcDSwAOoDV7t42kRyq6ciRw8E4j/ShKCF1\ndXVDLYe8z2KaMmUK/f39Q3FalTMGsRdonsT3/iCAuy8F1gE3AZuAde6+jKi7apWZLQSuA5YCFwA3\nm1kjsAbYVXjulsL3kAzQ8hIl2iek5PTTzwjGefSGN/xBME6bMVsQZvYNoiv9qcD/MrOfEF3hA+Du\nV433jd39ATN7qPDlm4HDwPuBJwrHHgFWAP3AdnfvBXrN7AXgNOBcYOOw5954oh9m9uwWpk5NRzWe\nP39G0ikk5siRIyPiPJ+LZcuW4v7cUJznc3H48MERcZ7PxZo117B27dqhOK3nYrwupscLfz8ReKys\nESZ37zOzu4APA38GnO/uxdd2ALOAmcDwKQ2h48Vj4zp0qKuctGJRX18/tAlKfX09bW0dieWSNnk+\nF48+um1EfM45+R26+93vfjcizvPvxeHDXSPipM/FWAVqzALh7ncBmNnn3f3m4Y+Z2ZfKfWN3X21m\nnwV+wciuqhlErYr2Qjze8eKx1GpsbKK7u2sozrOWlpahmRktLS0JZ5OsrKzaKdWVlYkc43UxfZlo\ngPhDZvbWUa85G1g73jc2s8uBNxaKSxcwAPzSzJa7++NEM6MeA54GbjKzJqARWEw0gL0duLDw+Eog\n1cs/mi3i17/+1VCcZ8eP9wXjPJo+ffrQhcP06dMTziZZTU3N9Pb2DsWSfuMNUt9H1L3UWfi7+OdR\n4N+V8b2/B5xRGLt4FPg0cC2w3sx+BjQA97r7q8CtRAXgx8AN7t4D3A683cyeAj4BrJ/4j1c97nuC\ncR4Nn5WR5hkaUl2a3VaSlZtJx+ti2gHsMLP73X3CU1HcvRO4JPDQeYHnbiaaEjv8WBeQ7+UvM0qt\nqZKjR48G4zzq6+sLxnm0Y8fPR8Rp7WIqZ5rrb8ys38wOFP4U4x1m9s7YM8yIVas+EozzSK2pkuHd\nSnnvYtKd1CWPP/6jYJw25RSIJ4CL3H2uu88FPgB8n6jb56txJifZ1Nd3PBiLSCQrxbKcAvFv3P2B\n4hfu/ghwmrvvZHI30NWk++//bjDOo+J039FxHmmBuhLtB5E95Sy1cdjMriZa9qIeuAw4aFHnslaD\nLTh+/HgwzqOsXB1Vg/rdS2bOnDW0DE3e94NoaGjk2LHeoTityvmAvww4H3gF+L/AcuCKwrHPxZZZ\nxsydOy8Y55HORUlxvZ3RseTb0qXLgnHanLAF4e4vE90FPdptlU8nu6688hNs3LhhKM6z971vBd/5\nzt1DsQiou224F154PhinTTnLfV8AbADmMHI/iH8dY16Zs3fviyPitE5bq4adO58ZEa9YcWGC2Uha\nqOuxZN++14Jx2pQzBnEb8FdEdzfn+191HKP3HtaHogDU1dUzODgwFItAdsamyikQ+939oRM/Ld8G\nBvqDcR6tWnXRUHdbmu8SrY7BMWLJs6yMTZVTIJ40s03AD4g2/AG05ehoM2fOoq1t31AsAtDQ0DC0\n/lBDQ0PC2YhMTDlt3vcAZwCfJ1oPaT3wxRhzyqQ5c+YG4zzaunVLMM6jc85ZFoxFsqCcWUx/Uo1E\nsk7dKiWvvvq7YJxHzz77z8FY8i0rW46WM4vpzcDXgbcAy4BvA1e5+4uxZpYxmsVUkpUBuGo4cGB/\nMJZ8mz59xtBNg9Onp3M3OSivi+kO4BbgKPAasJVoj2gZZvQspjwb3tee9353LX0uIVlZ+rycAjHP\n3bcBuPtgYWnumfGmJVmmO6lLVCAkJCtL85RTILrN7I0U5uiZ2blAb6xZZZCW+y555ZWXg3Ee9fT0\nBGORLChnmutfAQ8Bf2hmvya6ozq0EVCuvfbaq8FYRCSrTtiCKOws926ifaivAE5195+P/6r8eeyx\nHwZjEZGsGrMFYWbfYIxbP80Md78qtqwk0173utdx+PDhoVhEsmm8LqbHq5WE1JasDMCJyPj
GLBDu\nflc1E8m6rNz4Ug2dnZ3BWESyRctLVoiW2hCRWqMCUSEqECJSa05YIMzs84FjX4onnewavv5S3tdi\nEpHaMN4spi8DC4APmdlbhz00DTgLWBtzbpmyaNESzBYPxSIiWTfeLKb7gCXA+4Anhh3vA/4mzqTS\n4J57vsWOHb+Y0Gva29sB+Mxnrpvw+7373WdxySWXTfh1km719fUMDAwMxSJZMt4sph3ADjPb6e4j\n1ik2sz8D/nfcyWVNf39trlw60WLZ2Ng4tElOY2PjhAtmLRVL7cMsWVbOUhvfN7OvuvstZjYHuB14\nK3BvvKkl65JLLpvwh1Txg/CWW26NI6XMmD59xlCBSPNSxtWgAiFZVk6BeBdwq5n9lGhM4mvAR2PN\nSlJlMsVyzZorARVLkSwrp0DUAceBlkI8UPgjMqa8txxEakE5o2a/AV4E/oho9tIfA0/HmJOIiKRA\nOS2Ile6+sxDvB/7czC4+0YvMbBrwT0RblTYCG4DdwJ1EiwA+C1zr7gNm9nHgaqIZUhvc/SEzawbu\nJurW6gBWu3vbBH42ERE5CWW1IMzsBjPbYmYzzewLwINlvO5jwAF3Xwb8KfD3wCZgXeFYHbDKzBYC\n1wFLgQuAm82sEVgD7Co8dwuwbqI/nIiITF45BeKrQCvRYHUfcCrw9TJe913gxkJcV3jtmZTuqXgE\neD/wHmC7u/e6+xHgBeA04FzgB6OeKyIiVVJOF9OZ7v4uM1vp7l1mthrYdaIXuftRADObQTQldh3w\nFXcvzvXrAGYR7W99ZNhLQ8eLx8Y1e3YLU6cmt5LqlClRvZ0/XwO0OhdhOh8lOhclaT0X5RSIQTNr\noLR50DzG2EhoNDM7Bbgf+Jq7f9vMNg57eAZwGGgvxOMdLx4b16FDXeWkFZv+/mhyV1tbR6J5pIHO\nRZjOR4nORUnS52KsAlVOgfivwA+BhWb2X4APU8ZSG2b2emAb8El3/1Hh8E4zW+7ujwMrgceIZkTd\nZGZNRIPZi4kGsLcDFxYeXwk8WUauIrGazBIsw+X5rnLJnhMWCHffYma/BP4EmAJ8cPTSG2NYC8wG\nbjSz4ljEp4huumsAngPudfd+M7uVqADUAze4e4+Z3Q7cZWZPAcfQzXmSQdOmTRvaVW/atGkJZ1NZ\n1SyWKpTJOGGBMLP73P0ioimqxWM/cvf3jfc6d/8UUUEY7bzAczcDm0cd6wJOOJ1WpJomc1f5VVdF\n1zZ33KFNGiVbxlvu+37gdOANZvZ/Rr3mpbgTE6kVtdZyKJposdyzZzcbN24A4Prr12lZ/AwYrwWx\nGphDNAYxvC3YB7wWZ1IitWTmzBNOwMuF4QVBxSEbxlvuu51oJtGq6qUjIrVs5syZSacgE1DOLCYR\nkYqYNq0h6RRkArTFlYiIBKkFISJykmp1yq9aECIiEqQWhIjISZrolN81a64csW97WndeVAtCRKTK\nbr/9G8E4bVQgREQkSF1MIiIJmDt3XtIpnJBaECIiEqQCISIiQSoQIiISpAIhIiJBKhAiIhKkAiEi\nIkEqECIiEqQCISIiQSoQIiISpAIhIiJBKhAiIhKkAiEiIkEqECIiEqQCISIiQSoQIiISpAIhIiJB\nKhAiIhKkAiEiIkHacjRHvvSlL3Lo0MGqvFfxfT7zmeuq8n6zZ89h7dovVuW9RPKi5guEPhRLDh06\nyIGD+6lvjv+ffaB+MHrP7sPxv1d3X+zvIZJHNV8gDh06yIEDB6ib1hz7ew0WeuwOtnfF/17Huyf1\nuvrmqcz+0zdVOJtkHfrB3qRTEKlJsRYIMzsL+Ft3X25mpwJ3AoPAs8C17j5gZh8Hrgb6gA3u/pCZ\nNQN3AwuADmC1u7dNNo+6ac1MP/VDJ/nTpMvRF76fdAoiUuNiG6Q2s+uBrwNNhUObgHXuvgyoA1aZ\n2ULgOmApcAFws5k1AmuAXYXnbgHWxZWniIiExdmC+C3
wEeCbha/PBJ4oxI8AK4B+YLu79wK9ZvYC\ncBpwLrBx2HNvLOcNZ89uYerUKSOOTZlSuxO1pkypZ/78GRN6fq2a6Lm4/vrrOXDgQIwZlRTHpj73\nuU9X5f3mzp3Lxo0bT/zEBBR/Byfyb1WrsnAuYisQ7n6fmb1l2KE6dx8sxB3ALGAmcGTYc0LHi8dO\n6NCh3+/77+8fmFDeWdLfP0BbW8eEnl+rJnou9u1r4+CB/Uyvj79oThmIznv3/v2xv9fRgYEJn4tq\nKv4OpjW/akrTuRirSFVzkHr4p9MM4DDQXojHO148JlJR0+vr+disOUmnUVF3H5n4jD3N9JOxVLNA\n7DSz5e7+OLASeAx4GrjJzJqARmAx0QD2duDCwuMrgSermKdIrhRn+jVOa4n9veqIuoCPtk9uFt5E\n9B6PfzZhratmgfhrYLOZNQDPAfe6e7+Z3UpUAOqBG9y9x8xuB+4ys6eAY8BHq5inSO40TmvhXYsv\nSjqNivrVc/clnULmxVog3P1F4OxC/DxwXuA5m4HNo451ARfHmZuIiIyv5m+UExEpl8ZjRlKBEBEp\nOHToIAcP7mdGa0Ps71WckX+8tz329+roPDap16lAiIgMM6O1gasve0fSaVTUHd/aNanX1e6dUyIi\nclJUIEREJEgFQkREglQgREQkSIPUOdLZ2clAb1/N7Z8w0N1H50Bn0mmI1By1IEREJKjmWxCdnZ0M\nHu+puQ12Bo9309k5eOInDtPa2sqx+uM1uaNca3PrhF7T2dlJ78DApBa3S7OjAwM0dqo1JZWhFoSI\niATVfAuitbWV3v66mtxytLU1/tU3a1VrayvTjvXW5HLfDa2TaE0d76m5xe16j3dR11m7e6BUg1oQ\nIiISVPMtCBEZX2trK4P99TW53Hdra3PSaWSaCoSISEFnZye9vccmvXZRWnV0HqOxb+KTF9TFJCIi\nQWpBiIgUtLa20jC1vyZXc53WOLHJC6AWhIiIjEEtiJwZ6K7OUhsDx/oBqG+YEv97dfeBxiJFKk4F\nIkdmz67enP9DPdEdyrObXxf/mzVP7mc7WqU7qXsGorn4TfXxN9iPDgxQW3d2SJJyUSAGj3dXZamN\nwf5oW7+6KfFvVzh4vBuY2I1yE92P9mQU99m95ZZbq/aeE1HNYtlZ2Hu4oQrvOYfq/mxS22q+QFT1\nqvlQT/SeM6txh3OLPghOgorlSL3Hu6pyJ3Vf4SJqahUuonqPdzFdfY8npeYLhD4IRMZX3YuobgCm\nz4z/g3s6zbqIOkk1XyBEZHy6iBqpo7M6N8r19PYB0NQY/8dwR+cx5jRO/HUqECIiBdVscRztisam\npjXOjP295jRO7mdTgRARKVBraiTdKCciIkEqECIiEqQCISIiQSoQIiISpAIhIiJBqZ3FZGb1wNeA\n04Fe4C/d/YVksxIRyY80tyD+PdDk7n8MfA74u4TzERHJlbrBwcGkcwgys03A0+7+ncLXL7v7H4z3\nmra2jor9MPfc8y127PjFhF5zqLAo22RuSHn3u8/ikksum/DrqkHnokTnoqSa5yLN5wGyfy7mz59R\nFzqe2i4mYCZwZNjX/WY21d37xnrB7NktTJ1amf0HmpsbmDJlYg2spqYmgAm/rvh+8+fPmPDrqkHn\nokTnoqSa5yLN5wFq91ykvQXxc3e/p/D1/3P3N473mkq2IERE8mKsFkSaxyC2AxcCmNnZQPyrZ4mI\nyJA0dzHdD5xvZj8F6oArE85HRCRXUtvFNBnqYhIRmbgsdjGJiEiCVCBERCRIBUJERIJUIEREJEgF\nQkREgmpqFpOIiFSOWhAiIhKkAiEiIkEqECIiEqQCISIiQSoQIiISpAIhIiJBKhAiIhKU5uW+M8nM\nzgL+1t2XJ51LUsxsGvBPwFuARmCDu38/0aQSYmZTgM2AAYPANe7+bLJZJcfMFgDPAOe7+56k80mS\nmf0KaC98+S/unro
tDVQgKsjMrgcuBzqTziVhHwMOuPvlZjYH+DWQywIBfBDA3Zea2XLgJmBVohkl\npHDhcAfQnXQuSTOzJqAu7ReS6mKqrN8CH0k6iRT4LnBjIa4DxtxHvNa5+wPAJwpfvhk4nGA6SfsK\n8A/AK0knkgKnAy1mts3MflzYNTN1VCAqyN3vA44nnUfS3P2ou3eY2QzgXmBd0jklyd37zOwu4Dbg\nW0nnkwQz+wugzd0fTTqXlOgiKpgXANcA3zKz1PXoqEBILMzsFOAx4Jvu/u2k80mau68G3gZsNrPW\npPNJwFVEWwg/DrwT2GJmC5NNKVHPA3e7+6C7Pw8cAP5Vwjn9ntRVLMk+M3s9sA34pLv/KOl8kmRm\nlwNvdPebia4aBwp/csXd31uMC0XiGnd/NbmMEncV8A7gP5rZG4CZwO+STen3qUBIHNYCs4Ebzaw4\nFrHS3fM4OPk94Btm9hNgGvDpnJ4HGekfgTvN7Cmi2W1XuXvqxuq03LeIiARpDEJERIJUIEREJEgF\nQkREglQgREQkSAVCRESCVCBEKsjM1pvZsqTzEKkEFQiRyjoPmJJ0EiKVoPsgRCbJzN5ItLZSK9Hd\n0Q8B1wOvAh8G5hCt3tpCdOPg9e7+XTO7E5gLnFp4/nnA+UA/8KC7r6/uTyISphaEyOT9B+Ahd/8j\nog/6LuCXwF+6+y7gPxXidxWe+4Vhrz3g7ouBfya6y/x04BzgrYWloEUSp6U2RCbvh8D3zOwM4H8C\nfw98YNjjHwM+YGYXA2cD04c99ovC3y8D3Wa2nagFss7de2LPXKQMakGITJK7bweWAI8Cfw78j1FP\neRJ4D9EOajcR7Y1R1F34Hn3AWUT7Z8wFfmZmb4s3c5HyqECITJKZbQQud/e7gE8C7yLaHGlqYSe9\ntwFfcPeHgRUEBq8LrY8ngJ+4+38GdhNtTyqSOBUIkcm7DbjIzH4N3A+sAX5AtGvaIuDrwG/MbCew\ngGgHsRF7Qbj7TuBnwLOFPYpfBB6p2k8gMg7NYhIRkSC1IEREJEgFQkREglQgREQkSAVCRESCVCBE\nRCRIBUJERIJUIEREJOj/A6HJMPmhDEeDAAAAAElFTkSuQmCC\n", 420 | "text/plain": [ 421 | "" 422 | ] 423 | }, 424 | "metadata": {}, 425 | "output_type": "display_data" 426 | } 427 | ], 428 | "source": [ 429 | "sns.boxplot(x='stars',y='text length',data=yelp)" 430 | ] 431 | }, 432 | { 433 | "cell_type": "markdown", 434 | "metadata": {}, 435 | "source": [ 436 | "**Creating a countplot of the number of occurrences for each type of star rating.**" 437 | ] 438 | }, 439 | { 440 | "cell_type": "code", 441 | "execution_count": 6, 442 | "metadata": { 443 | "collapsed": false 444 | }, 445 | "outputs": [ 446 | { 447 | "data": { 448 | "text/plain": [ 449 | "" 450 | ] 451 | }, 452 | "execution_count": 6, 453 | "metadata": {}, 454 | "output_type": "execute_result" 455 | }, 456 | { 457 | "data": { 458 | "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAYgAAAEFCAYAAAD5bXAgAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAE/ZJREFUeJzt3XuQXnV9x/F3kkVCdJMuTpBaHRmrfJs6hWoooEBJq2jD\naFErlWFMU6ncqlymVqwkqDhhHFFoDR1jZ1GJojOWCF7ShktVLklRFGUklX4hjoxWy8yKuaysCYRs\n/zhny0P4ZfNk3bPnSfb9mtnJOb/zO89+n/NHPvs7v3OZMTo6iiRJu5vZdgGSpN5kQEiSigwISVKR\nASFJKjIgJElFfW0XMJmGhoa9JEuS9tH8+f0zSu2OICRJRQaEJKnIgJAkFRkQkqQiA0KSVGRASJKK\nDAhJUpEBIUkqMiAkSUWN3UkdEbOAQSCAUeA84CBgLfBQ3W1VZn4xIs4GzgV2Aisyc21EHAJcDxwG\nDANLM3OoqXolSU/X5KM23gCQmSdExCLgCuBrwNWZedVYp4g4HLgQOAaYDayPiNuA84H7M/ODEXEG\nsBy4qMF6pWlr8J9ubruERpx98Z+1XcJ+rbGAyMwvR8TaevVFwBZgIRARcRrVKOJi4FhgQ2buAHZE\nxCbgKOBE4Mp6/3XAZXv7nQMDc+jrmzW5X0TSfmv+/P62S9ivNfqwvszcGRGrgTcBbwF+B7g2M++N\niGXAB4D7gK0duw0D84C5He1jbePavHlkEquXtL8bGhpuu4T9wp6CtPFJ6sxcChxJNR9xa2beW2+6\nCXg5sA3orK6farTR2T7WJkmaIo0FREQsiYj31asjwC7gxog4tm57NXAvcA9wUkTMjoh5wAJgI7AB\nOLXuuxi4q6laJUnP1OQpphuBz0TEnVRXL10M/BS4JiKeAB4BzsnMbRGxkioAZgLLMnN7RKwCVkfE\neuBx4MwGa5Uk7WbG6OiB844dXxgkTYxXMU1vvjBIkrRPDAhJUpEBIUkqMiAkSUWN3ignSfubB759\n1d477YcWHPfufd7HEYQkqciAkCQVGRCSpCIDQpJUZEBIkooMCElSkQEhSSoyICRJRQaEJKnIgJAk\nFRkQkqQiA0KSVGRASJKKDAhJUpEBIUkqMiAkSUWNvTAoImYBg0AAo8B5wHbgunp9I/DOzNwVEWcD\n5wI7gRWZuTYiDgGuBw4DhoGlmTnUVL2SpKdrcgTxBoDMPAFYDlwBXA0sz8yTgBnAaRFxOHAhcALw\nOuDDEXEwcD5wf933s/VnSJKmSGMBkZlfBs6pV18EbAEWAnfUbeuA1wDHAhsyc0dmbgU2AUcBJwI3\n79ZXkjRFGn0ndWbujIjVwJuAtwCnZOZovXkYmAfMBbZ27FZqH2sb18DAHPr6Zk1S9ZL2d/Pn9+/z\nPg80UEcvmMixaDQgADJzaUS8F/g2cEjHpn6qUcW2enm89rG2cW3ePDIZJUs6QAwNDbddQs8Y71js\nKTwaO8UUEUsi4n316giwC/huRCyq2xYDdwH3ACdFxOyImAcsoJrA3gCcultfSdIUaXIEcSPwmYi4\nEzgIuJhq9DYYEc+ql9dk5pMRsZIqAGYCyzJze0SsAlZHxHrgceDMBmuVJO2msYDIzMeAvyxsOrnQ\nd5DqktjOthHg9GaqkyTtjTfKSZKKDAhJUpEBIUkqMiAkSUUGhCSpyICQJBUZEJKkIgNCklRkQEiS\nigwISVKRASFJKjIgJElFBoQkqciAkCQVGRCSpCIDQpJUZEBIkooMCElSkQEhSSoyICRJRQaEJKmo\nr4kPjYiDgE8DRwAHAyuAnwJrgYfqbqsy84sRcTZwLrATWJGZayPiEOB64DBgGFiamUNN1CpJKmsk\nIIC3AY9m5pKIOBS4D/gQcHVmXjXWKSIOBy4EjgFmA+sj4jbgfOD+zPxgRJwBLAcuaqhWSVJBUwFx\nA7CmXp5BNTpYCEREnEY1irgYOBbYkJk7gB0RsQk4CjgRuLLef
x1wWUN1SpL2oJGAyMxfAUREP1VQ\nLKc61XRtZt4bEcuAD1CNLLZ27DoMzAPmdrSPte3VwMAc+vpmTcp3kLT/mz+/f5/3eaCBOnrBRI5F\nUyMIIuKFwE3AJzLzCxHxW5m5pd58E3ANcCfQWXU/sAXY1tE+1rZXmzePTEbpkg4QQ0PDbZfQM8Y7\nFnsKj0auYoqI5wG3Au/NzE/XzbdExLH18quBe4F7gJMiYnZEzAMWABuBDcCpdd/FwF1N1ClJ2rOm\nRhCXAgPAZRExNn/wd8A/RsQTwCPAOZm5LSJWUgXATGBZZm6PiFXA6ohYDzwOnNlQnZKkPWhqDuIi\nylcdnVDoOwgM7tY2ApzeRG2SpO54o5wkqciAkCQVGRCSpCIDQpJUZEBIkooMCElSkQEhSSoyICRJ\nRQaEJKnIgJAkFRkQkqQiA0KSVGRASJKKDAhJUpEBIUkqMiAkSUUGhCSpyICQJBUZEJKkIgNCklRk\nQEiSigwISVJRXzedIuKazLxgt7bVmbl0D/0PAj4NHAEcDKwAfghcB4wCG4F3ZuauiDgbOBfYCazI\nzLURcQhwPXAYMAwszcyhff96kqSJGjcgIuJa4MXAMRHxso5NBwHzxtn1bcCjmbkkIg4F7qt/lmfm\n7RHxSeC0iLgbuBA4BpgNrI+I24Dzgfsz84MRcQawHLhoYl9RkjQRextBrKAaBXwcuLyjfSfwwDj7\n3QCsqZdn1P0XAnfUbeuA1wJPAhsycwewIyI2AUcBJwJXdvS9rIvvwsDAHPr6ZnXTVdI0MH9+/z7v\nM95/bPuziRyLcQMiMx8GHgaOjoi5VKOGGfXm5wC/3MN+vwKIiH6qoFgOfCwzR+suw/VnzQW2duxa\nah9r26vNm0e66SZpmhgaGm67hJ4x3rHYU3h0NUkdEe8D/ge4k2oUcAdw+172eSHwTeBzmfkFYFfH\n5n5gC7CtXh6vfaxNkjSFupqkBt4B/G63E8UR8TzgVuBdmfn1uvn7EbEoM28HFlOFxz3AFRExm2oy\newHVBPYG4NR6+2Lgri7rlCRNkm4D4ifs4XTSHlwKDACXRcTY/MFFwMqIeBbVab41mflkRKykCoCZ\nwLLM3B4Rq4DVEbEeeBw4cx9+tyRpEnQbEA9RXWH0TWD7WGNmfqjUOTMvonzV0cmFvoPA4G5tI8Dp\nXdYmSWpAtwHxs/oHnpqkliQdwLoKiMy8fO+9JEkHkm7vpN5FdQd0p59n5gsnvyRJUi/odgTx/5fD\n1o/ReCPwyqaKkiS1b58f1peZT2TmDcCfNlCPJKlHdHuK6a86VmcAL6O6/FSSdIDq9iqmP+lYHgV+\nAbx18suRJPWKbucg3l7PPUS9z8bM3NloZZKkVnX7LKaFVDfLrQY+A/wkIo5rsjBJUru6PcW0Enhr\nZn4bICKOB64Bjm2qMElSu7q9iuk5Y+EAkJnfonrBjyTpANVtQPwyIk4bW4mINwKPNlOSJKkXdHuK\n6RxgbUR8iuoy11HgVY1VJUlqXbcjiMXACPAiqkteh4BFDdUkSeoB3QbEOcAJmflYZv6A6v3SFzRX\nliSpbd0GxEE8/c7px3nmw/skSQeQbucgvgx8IyL+tV5/M/CVZkqSJPWCrkYQmfleqnshAngxsDIz\nLxt/L0nS/qzbEQSZuQZY02AtkqQess+P+5YkTQ8GhCSpqOtTTBNRP9DvI5m5KCJeDqyleugfwKrM\n/GJEnA2cC+wEVmTm2og4BLgeOAwYBpZm5lCTtUqSnq6xgIiIS4AlwGN100Lg6sy8qqPP4cCFwDFU\nz3ZaHxG3AecD92fmByPiDGA5cFFTtUqSnqnJEcSPqC6H/Vy9vhCI+plODwEXUz0NdkNm7gB2RMQm\n4CjgRODKer91gFdMSdIUaywgMvNLEXFER9M9wLWZeW9ELAM+ANwHbO3oMwzMA+Z2tI+17dXAwBz6\n+mb9pqVLOkDMn9+/z/s80
EAdvWAix6LROYjd3JSZW8aWqd4ncSfQWXU/sAXY1tE+1rZXmzePTE6l\nkg4IQ0PDbZfQM8Y7FnsKj6kMiFsi4oLMvAd4NXAv1ajiioiYDRwMLAA2AhuAU+vti4G7prBOTRPf\nefeFbZfQiD+6amXbJegAMZUBcT5wTUQ8ATwCnJOZ2yJiJVUAzASWZeb2iFgFrI6I9VTPfTpzCuuU\nJNFwQGTmw8Dx9fL3gBMKfQaBwd3aRoDTm6xNkjQ+b5STJBUZEJKkIgNCklRkQEiSigwISVKRASFJ\nKjIgJElFBoQkqciAkCQVGRCSpCIDQpJUZEBIkooMCElSkQEhSSoyICRJRQaEJKnIgJAkFRkQkqQi\nA0KSVGRASJKKDAhJUlFfkx8eEccBH8nMRRHxEuA6YBTYCLwzM3dFxNnAucBOYEVmro2IQ4DrgcOA\nYWBpZg41Wask6ekaG0FExCXAtcDsuulqYHlmngTMAE6LiMOBC4ETgNcBH46Ig4Hzgfvrvp8FljdV\npySprMlTTD8C3tyxvhC4o15eB7wGOBbYkJk7MnMrsAk4CjgRuHm3vpKkKdTYKabM/FJEHNHRNCMz\nR+vlYWAeMBfY2tGn1D7WtlcDA3Po65v1m5Qt7ffmz+9vu4SeMZFj8UADdfSCiRyLRucgdrOrY7kf\n2AJsq5fHax9r26vNm0d+8yql/dzQ0HDbJfQMj8VTxjsWewqPqbyK6fsRsaheXgzcBdwDnBQRsyNi\nHrCAagJ7A3Dqbn0lSVNoKgPi3cDlEXE38CxgTWY+AqykCoBvAMsyczuwCnhZRKwHzgEun8I6JUk0\nfIopMx8Gjq+XHwROLvQZBAZ3axsBTm+ytunqPWsPzAvCPvr6FW2XIB1wvFFOklRkQEiSigwISVKR\nASFJKjIgJElFBoQkqciAkCQVGRCSpCIDQpJUZEBIkoqm8mmurbnoo19tu4RGfPw9f952CZIOYI4g\nJElFBoQkqciAkCQVGRCSpCIDQpJUZEBIkooMCElSkQEhSSoyICRJRQaEJKnIgJAkFU35s5gi4nvA\ntnr1x8AVwHXAKLAReGdm7oqIs4FzgZ3AisxcO9W1StJ0NqUBERGzgRmZuaij7avA8sy8PSI+CZwW\nEXcDFwLHALOB9RFxW2bumMp6JWk6m+oRxNHAnIi4tf7dlwILgTvq7euA1wJPAhvqQNgREZuAo4Dv\njPfhAwNz6Oub1VTtPWf+/P62S+gZHouneCyeMpFj8UADdfSCiRyLqQ6IEeBjwLXAS6kCYUZmjtbb\nh4F5wFxga8d+Y+3j2rx5ZFKL7XVDQ8Ntl9AzPBZP8Vg8xWPxlPGOxZ7CY6oD4kFgUx0ID0bEo1Qj\niDH9wBaqOYr+QrskaYpM9VVMZwFXAUTE86lGCrdGxKJ6+2LgLuAe4KSImB0R84AFVBPYkqQpMtUj\niE8B10XEeqqrls4CfgEMRsSzqE7/rcnMJyNiJVVYzASWZeb2Ka5Vkqa1KQ2IzHwcOLOw6eRC30Fg\nsPGiJElF3ignSSoyICRJRQaEJKnIgJAkFRkQkqQiA0KSVGRASJKKDAhJUpEBIUkqMiAkSUUGhCSp\nyICQJBUZEJKkIgNCklRkQEiSigwISVKRASFJKjIgJElFBoQkqciAkCQVGRCSpKK+tgvYk4iYCXwC\nOBrYAbwjMze1W5UkTR+9PIJ4IzA7M18J/ANwVcv1SNK00ssBcSJwM0Bmfgs4pt1yJGl6mTE6Otp2\nDUURcS3wpcxcV6//BHhxZu5stzJJmh56eQSxDejvWJ9pOEjS1OnlgNgAnAoQEccD97dbjiRNLz17\nFRNwE3BKRPwnMAN4e8v1SNK00rNzEJKkdvXyKSZJUosMCElSkQEhSSrq5Unq/U5EHAd8JDMXtV1L\nmyLiIODTwBHAwcCKzPxqq0W1JCJmAYNAAKPAeZm5sd2q2hMRhwH3Aqdk5n+3XU+bIuJ7VJf
zA/w4\nM3vuQhwDYpJExCXAEuCxtmvpAW8DHs3MJRFxKHAfMC0DAngDQGaeEBGLgCuA01qtqCX1Hw7/Avy6\n7VraFhGzgRm9/sekp5gmz4+AN7ddRI+4AbisXp4BTNsbHDPzy8A59eqLgC0tltO2jwGfBH7ediE9\n4GhgTkTcGhHfqO/16jkGxCTJzC8BT7RdRy/IzF9l5nBE9ANrgOVt19SmzNwZEauBa4DPt11PGyLi\nr4GhzLyl7Vp6xAhVYL4OOA/4fET03BkdA0KNiIgXAt8EPpeZX2i7nrZl5lLgSGAwIp7ddj0tOIvq\nxtfbgT8EPhsRh7dbUqseBK7PzNHMfBB4FPjtlmt6hp5LLO3/IuJ5wK3AuzLz623X06aIWAK8IDM/\nTPVX4676Z1rJzD8eW65D4rzMfKS9ilp3FvAHwN9GxPOBucD/tlvSMxkQasKlwABwWUSMzUUszszp\nODl5I/CZiLgTOAi4eJoeBz3dp4DrImI91dVtZ/Xiw0h91IYkqcg5CElSkQEhSSoyICRJRQaEJKnI\ngJAkFRkQ0iSJiMsj4qS265AmiwEhTZ6TgVltFyFNFu+DkCYgIl5A9VylZ1PdGb0WuAR4BHgTcCjV\nk1vnUN00eElm3hAR1wHPBV5S9z8ZOAV4EvhKZl4+td9E2jNHENLE/A2wNjOPofqPfgT4LvCOzLwf\nuKBefkXd9/0d+z6amQuAH1DdYX408CrgpfVjoKWe4KM2pIn5D+DGiHg58G/APwOv79j+NuD1EXE6\ncDzwnI5t367//Rnw64jYQDUCWZ6Z2xuvXOqSIwhpAjJzA/D7wC3AW4Gv7dblLuBYqrenXUH1Xowx\nv64/YydwHNW7M54L3B0RRzZbudQ9A0KagIi4EliSmauBdwGvoHoxUl/9Fr0jgfdn5r8Dr6UweV2P\nPu4A7szMvwd+SPVqUqknGBDSxFwD/EVE3AfcBJwP3Ez1xrTfA64F/isivg8cRvX2sKe9ByIzvw/c\nDWys30/8MLBuyr6BtBdexSRJKnIEIUkqMiAkSUUGhCSpyICQJBUZEJKkIgNCklRkQEiSiv4PJA1D\nhf8Mvt0AAAAASUVORK5CYII=\n", 459 | "text/plain": [ 460 | "" 461 | ] 462 | }, 463 | "metadata": {}, 464 | "output_type": "display_data" 465 | } 466 | ], 467 | "source": [ 468 | "sns.countplot(x='stars',data=yelp)" 469 | ] 470 | }, 471 | { 472 | "cell_type": "markdown", 473 | "metadata": {}, 474 | "source": [ 475 | "** Using groupby to get the mean values of the numerical columns**" 476 | ] 477 | }, 478 | { 479 | "cell_type": "code", 480 | "execution_count": 7, 481 | "metadata": { 482 | "collapsed": false 483 | }, 484 | "outputs": [ 485 | { 486 | "data": { 487 | "text/html": [ 488 | "
\n", 489 | "\n", 490 | " \n", 491 | " \n", 492 | " \n", 493 | " \n", 494 | " \n", 495 | " \n", 496 | " \n", 497 | " \n", 498 | " \n", 499 | " \n", 500 | " \n", 501 | " \n", 502 | " \n", 503 | " \n", 504 | " \n", 505 | " \n", 506 | " \n", 507 | " \n", 508 | " \n", 509 | " \n", 510 | " \n", 511 | " \n", 512 | " \n", 513 | " \n", 514 | " \n", 515 | " \n", 516 | " \n", 517 | " \n", 518 | " \n", 519 | " \n", 520 | " \n", 521 | " \n", 522 | " \n", 523 | " \n", 524 | " \n", 525 | " \n", 526 | " \n", 527 | " \n", 528 | " \n", 529 | " \n", 530 | " \n", 531 | " \n", 532 | " \n", 533 | " \n", 534 | " \n", 535 | " \n", 536 | " \n", 537 | " \n", 538 | " \n", 539 | " \n", 540 | " \n", 541 | " \n", 542 | " \n", 543 | "
coolusefulfunnytext length
stars
10.5767691.6048061.056075826.524700
20.7195251.5631070.875944842.265372
30.7885011.3066390.694730758.505133
40.9546231.3959160.670448712.944129
50.9442611.3817800.608631625.015583
\n", 544 | "
" 545 | ], 546 | "text/plain": [ 547 | " cool useful funny text length\n", 548 | "stars \n", 549 | "1 0.576769 1.604806 1.056075 826.524700\n", 550 | "2 0.719525 1.563107 0.875944 842.265372\n", 551 | "3 0.788501 1.306639 0.694730 758.505133\n", 552 | "4 0.954623 1.395916 0.670448 712.944129\n", 553 | "5 0.944261 1.381780 0.608631 625.015583" 554 | ] 555 | }, 556 | "execution_count": 7, 557 | "metadata": {}, 558 | "output_type": "execute_result" 559 | } 560 | ], 561 | "source": [ 562 | "stars=yelp.groupby('stars').mean()\n", 563 | "stars\n" 564 | ] 565 | }, 566 | { 567 | "cell_type": "markdown", 568 | "metadata": {}, 569 | "source": [ 570 | "**Using the corr() method on that groupby dataframe to produce this dataframe:**" 571 | ] 572 | }, 573 | { 574 | "cell_type": "code", 575 | "execution_count": 11, 576 | "metadata": { 577 | "collapsed": false, 578 | "scrolled": true 579 | }, 580 | "outputs": [ 581 | { 582 | "data": { 583 | "text/html": [ 584 | "
\n", 585 | "\n", 586 | " \n", 587 | " \n", 588 | " \n", 589 | " \n", 590 | " \n", 591 | " \n", 592 | " \n", 593 | " \n", 594 | " \n", 595 | " \n", 596 | " \n", 597 | " \n", 598 | " \n", 599 | " \n", 600 | " \n", 601 | " \n", 602 | " \n", 603 | " \n", 604 | " \n", 605 | " \n", 606 | " \n", 607 | " \n", 608 | " \n", 609 | " \n", 610 | " \n", 611 | " \n", 612 | " \n", 613 | " \n", 614 | " \n", 615 | " \n", 616 | " \n", 617 | " \n", 618 | " \n", 619 | " \n", 620 | " \n", 621 | " \n", 622 | " \n", 623 | " \n", 624 | " \n", 625 | "
coolusefulfunnytext length
cool1.000000-0.743329-0.944939-0.857651
useful-0.7433291.0000000.8945060.699895
funny-0.9449390.8945061.0000000.843463
text length-0.8576510.6998950.8434631.000000
\n", 626 | "
" 627 | ], 628 | "text/plain": [ 629 | " cool useful funny text length\n", 630 | "cool 1.000000 -0.743329 -0.944939 -0.857651\n", 631 | "useful -0.743329 1.000000 0.894506 0.699895\n", 632 | "funny -0.944939 0.894506 1.000000 0.843463\n", 633 | "text length -0.857651 0.699895 0.843463 1.000000" 634 | ] 635 | }, 636 | "execution_count": 11, 637 | "metadata": {}, 638 | "output_type": "execute_result" 639 | } 640 | ], 641 | "source": [ 642 | "corrmat=stars.corr()\n", 643 | "corrmat" 644 | ] 645 | }, 646 | { 647 | "cell_type": "code", 648 | "execution_count": 13, 649 | "metadata": { 650 | "collapsed": false 651 | }, 652 | "outputs": [ 653 | { 654 | "data": { 655 | "text/plain": [ 656 | "" 657 | ] 658 | }, 659 | "execution_count": 13, 660 | "metadata": {}, 661 | "output_type": "execute_result" 662 | }, 663 | { 664 | "data": { 665 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAWAAAAD3CAYAAAAjdY4DAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAIABJREFUeJzt3Xd8VFXawPHfTYcUQuhY6DwgUhSkGFFEqcouuiKI+ioW\nhBVRl1UQG6yv9UVd3VXAVayrWBYELIgKShMURECEQy8rHQykl5l5/7hDSELJJJmZOzM+38/nfpJ7\nz5l7n5tMnjk599xzLY/Hg1JKqeCLcjoApZT6vdIErJRSDtEErJRSDtEErJRSDtEErJRSDokJ5M5H\nWI11iIXXonv/4XQIIWPXj4udDiFkJNVr7HQIIWP3+yOtqu6jIjlnimd7lY9XVdoCVkophwS0BayU\nUsEU7XibtmI0ASulIkZcVHhlYE3ASqmIEW1pAlZKKUdoF4RSSjlEW8BKKeUQbQErpZRDtAWslFIO\nidUErJRSztAuCKWUcoh2QSillEO0BayUUg7RFrBSSjlEb0VWSimHaBeEUko5RBOwUko5RPuAlVLK\nIdoCVkoph/irBSwiUcDLQHsgH7jNGLO5RPn1wBjABUwzxkyuzHH0kURKqYgRF2X5vJRjIJBgjOkG\njAOeLVM+CbgcSAfGiEjNysSrCVgpFTGiLd+XclwEzAUwxiwDOpUpXwPUABIAC6jUA4g1ASulIka0\nZfm8lCMFOFJi3SUiJbtsfwZWAuuAT4wxGZWJVxOwUipiRFmWz0s5jgLJJXdtjCkCEJF2wBVAE6Ax\nUFdEBlUm3oi/CNe4cweufnocz106xOlQAq7HOfUY2bslRW43M7/fxUfLdpYqHzewDdIwBYDayQlk\n5hUy9IXFxeUTBrXjSE4Bz3+6IahxB0Lfbq0Zd9NlFLncvP3ZCt789PtS5Y3q12TKA9diWRa79v3G\n6EkzyM0vLC5/YczV/JaZw4RX5gY7dL/rdX4j7r2mE0UuN9MXbODd+etLlZ9RK4kXR12GZUFGVj53\nvvgVuQVFtG9Whwk3pmNZsD8jh7v++TX5hS6HzsI3lv+GQSwBBgAfiEhXYG2JsiNALpBrjHGJyH5A\n+4DL6n3fHdz46lPEJMQ7HUrAxURZjB3YhtunLuPml5YyqGsjaiXFlar
z1MfrGPbyd9w+ZRlZeYU8\n+sHq4rJB3RrRokFy2d2GpZjoKJ4adSUD//oa/e6eyrABnalTM6lUncdGXsG02cvpO3oKi37ayqhr\nuxeXDRvQhTZN6wc77ICIiY5iwk3pXPf4HP40YRY3XH4OtWtUK1Xn9ivaMfu7zVw9YRZm12Gu69kK\ngEnDe3Dv5PkMfPRjvlm9izNrh/77Izou2uelHDOBPBFZCjwP3CsiQ0VkuDFmBzAVWCwii4FU4I3K\nxBvRLeADW3Yw9eoR3Pz2806HEnBN6yWx82A2R3PtVtyP2w7TsVkt5q3ec0Ld67s3Yak5wKY9mQB0\naFyTdmen8uF3O2hSN+mE+uFGGtVl66+HyMjKBeC7tdtJb9eEj7893ohp1aguo5cbAJat3cFTo64E\noHObRnRqfRbT5iyn5dl1gh+8n7U4I5Xte49wJLsAgO837KFr6wZ8smxrcZ112w/RoFYiAMnV49h9\nKItmDVI5nJnH8CvaI2el8fWqHWzZU6luzqDyVwvYGOMGRpTZvKFE+RRgSlWPc9oELCLfceLVPQvw\nGGMurOrBA23VjLnUanSm02EERVJCLJm5x/+Fzs4vIjkh9oR6sdEWg7o1YsjfFwFQOzmeP/duyejX\nV9C3Q4OgxRtIyYkJHMnKK17PysknJSmhVJ21m3fTP701737xI/3TW1M9IY56ack8cNPlDH34La66\ntF2www6I5GpxZOYUFK9n5xaSUr30f4R7DmcxfmgXrkpvQVxsNM9++APNz6hJJ6nPg68vYvveo7w1\nth+rtxxgybpfg30KFRIVZndilNcCjvyO0zA3up9wXpM0pGEKa3Ycb6EkxseUSsjHdG1Zh5VbD5GV\nVwRAnw4NSU2MY/LtnamdnEC1uGi27c/i4x/+G7Rz8JeHb+1N17aNObdpA1as31W8Pal6fKmEDDD+\n5U+ZdPcfub5fJ+YtMxw6ks1VPdqSVqM6Hz09jHppyVSLj2PjzgO8O3dlsE+lyu4f3JnOUp/WjWqx\natP+4u2J1WI5kp1fqu7DN3TjnskL+Hb1Li4772xeuPMy/vb2UrbvPcLmX+331IKfdtG+WZ2QT8BW\nVHj1qp42AXv7OhCRM7H7Qc4BNgL3Bj405YsXP7f/jY6Jspg9tgc1qseSk19Ex6ZpvP7NlhPqd2tR\nm0Xrj/9B/nvRNv69aBsAAy84kyZ1k8Iy+QI89to8wO73/OHNMdRMrkZWbgEXtmvCi+8vLFW3Z6cW\nTHx1Lpt3HWTUtd1ZsGITr85axpQZSwEY2rcjLc+uE5bJF+CZ9+2LjjHRUXzz7GBSE+PJziuka+uG\nTJmzulTdjKz84lbyvt9ySE2MZ8e+oyQmxNK4Xgrb9x2lS+sGvFfm4l0oirQW8DH/AiYDC4EewGvA\nZQGKSVVCkdvDM7N+4ZXhXbEsmPn9LvYfyaNG9VgmXtuee95YAUDjuknMXhGeCdZXRS43D7z0CTP/\n71Ysy+Kdz1ew5+BRaiZX4x/3XcMNj7zNpl0HePWh6ygoKGL99n2M+fvHTocdEEUuNxPfWsq7D15J\nlGUxfcF69v6WTWpiPJNG9OC2Z7/godcX8/gt3YmOsrAsGD9tEYUuN2OmfMNLoy/HsixWmL18vWpn\n+Qd0mB9HQQSF5fGUfwOHiCwwxlxaYn2hMebi8l43wmpcqbtDItGie//hdAghY9ePi8uv9DuRVK+x\n0yGEjN3vj6xy9lzY5UKfc87Fy5c6nq197TCJEZG2AN6vmliVUiHHirJ8XkKBr10Qo4FpItIA2A0M\nD1xISilVOVHREXQR7hhjzCoR6Qs0A7YaYw4GNiyllKq4cOsD9unjQkSuBZYCDwDLROSGgEallFKV\nYEVbPi+hwNcuiHuBjsaYLBFJBuYD7wQuLKWUqrhw64LwNVq3MSYLwBiTCeSVU18ppYIuOjbK5yUU\n+NoC3ioiz2KPA+4OnDjCXymlHGa
FWQvY1wQ8FbgE6AVcB/QJWERKKVVJ4XYnnK8fF88D040xo4AL\ngOcCF5JSSlVOuF2E8zUBFxpjtgAYY7YC7sCFpJRSlWNFR/m8hAJfuyB2iMgTwHdAZyC0p0RSSv0u\nhcrFNV/5Gu0wYD/QHzgA3BKwiJRSqpKioqN8XkKBr3fC5QF/D3AsSilVJaHSt+uriH4kkVLq9yVU\n+nZ9pQlYKRUxIuqJGEopFU5CpW/XV5qAlVIRIyouvFJaeEWrlFKnoV0QSinlECs62ukQKkQTsFIq\nYugoCKWUckiUdkEopZQztAWslFIOiYoNr5QW0GgX3fuPQO4+rHR//i6nQwgZLZPinA4hZHRpX8/p\nEELIyCrvQVvASinlEE3ASinlEL0TTimlHKI3YiillEP0VmSllHKItoCVUsohUXorslJKOUNHQSil\nlEP8lYBFJAp4GWgP5AO3GWM2n6TeK8BhY8y4yhwnvD4ulFLqNKyoKJ+XcgwEEowx3YBxwLNlK4jI\nHUDbqsSrCVgpFTGi4mJ8XspxETAXwBizDOhUslBELgS6AFOrFG9VXqyUUqHEjy3gFOBIiXWXiMQA\niEgD4FFgVFXj1T5gpVTEsKL8NgriKJBcYj3KGFPk/X4QUBv4DKgPVBeRDcaYNyp6EE3ASqnI4b8E\nvAQYAHwgIl2BtccKjDEvAi8CiMjNQKvKJF/QBKyUiiT+uxFjJtBLRJYCFjBMRIYCScaYV/x1EE3A\nSqmI4a9nwhlj3MCIMps3nKTeG1U5jiZgpVTkiAmvuaY1ASulIobOBaGUUk7x30W4oNAErJSKHJqA\nlVLKGdoFoZRSTtGLcEop5Qx/DUMLlrBPwD3OqcfI3i0pcruZ+f0uPlq2s1T5uIFtkIYpANROTiAz\nr5ChLywuLp8wqB1Hcgp4/tMThvhFnMadO3D10+N47tIhTocSWJbFZc9PoE7bVrjyC/hy1INkbD3+\nvmh17QA63nULHpebn9/+iDWvvUd0XCx9Jj9FjSZnkX80i/ljJpKxZYeDJxEAlkWzMX+hevPmeAoL\n2fzU0+T9+isAsWlpyMQJxVUTmzdnx5Sp7J01y6FgK0m7IIInJspi7MA2DH5+EbkFRbxz10Us+Hkv\nh7IKius89fG64rpv35XOox+sLi4b1K0RLRoks2LLoaDHHmy977uDLjdeRX52rtOhBFzzAb2ISYhn\n+mWDaXBBey5+Yhyzh/y5uPzix8fyVucrKMjK4eYfPsP851NaX/sHCrJzeK/ntdRs0YSekx5hxlW3\nOngW/pfWvTtWXDxrR4wkqc05NB51JxseGA9A4eHD/HzXaACS27Th7OG3s3fOHCfDrZwwuwgXXh8X\nZTStl8TOg9kczS2k0OXhx22H6dis1knrXt+9CUvNATbtyQSgQ+OatDs7lQ+/i7BWzikc2LKDqVeX\nvbEnMp3RrSPbv1wEwJ4fVlP/vNJTth782RCXkkxMQhxYFng8pLVqxvYvFwLw26ZtpEmzoMcdaCnt\n2pGxfDkAWet+IalVq5PWa3rvPWyd9Cy43cEMzy+sqGifl1Bw2hawiPQ+VZkxZp7/w6mYpIRYMnML\ni9ez84tITog9oV5stMWgbo0Y8nf7j7J2cjx/7t2S0a+voG+HBkGL10mrZsylVqMznQ4jKOKSk8g/\nmlm87na5sKKj8bhcABxcv4kbFs6gMCeXTbPnkX8kkwNrN9C0bw82z/mSBhe0J6lhPayoKDxhmIRO\nJSYxkaLsrOMb3G6IjgbvzwUgLT2dnG3byN21y4EI/SDCuiCuO8V2D+BYAh7dTzivSRrSMIU1OzKK\ntyfGx5RKyMd0bVmHlVsPkZVnzybXp0NDUhPjmHx7Z2onJ1AtLppt+7P4+If/Bu0cVOAUZGYRl5RY\nvG5FRRUn39pthKZ9evBq254UZuXQ79VJtBjYl5/f+oi0lk0ZPO89di9byf5V6yIq+QIUZWcTXb36\
n8Q2WVSr5AtTp05vdH34U5Mj8x4qkURDGmGHBCqQiXvzcAHa/7uyxPahRPZac/CI6Nk3j9W+2nFC/\nW4vaLFq/v3j934u28e9F2wAYeMGZNKmbpMk3guz+biVN+/dk48zPaXBBew6u21hcln80k6LcPIpy\n8/G43eQcOERCzRrU79iWnd9+x7cPPEm9884l+awzHDyDwMhcu5aa6ekcmr+ApDbnkLN16wl1klq1\nInPt2pO8OkxEWAsYABHZg93qtYA0YKsxpnUgA/NFkdvDM7N+4ZXhXbEsmPn9LvYfyaNG9VgmXtue\ne95YAUDjuknMXqEJ9vdi05wvObtnOkO+mg6WxRcjH6DVoCuJTUpk7evvs2badIbMew9XYSEZ23ay\n7p0ZxKUkccVD99DlvpHkZ2Qy787xTp+G3x1auJDUCzrRdvLLYFlsfuJJave6nOhq1dg3ew4xqakU\nZWc7HWaVhNswNMvj8VToBSLSCJjgS+u4zV/mVGznEaz783c5HULIaJkUXv8mBlKX9vWcDiFkpC9e\nZFV1H65fvvE550Sf06PKx6uqCrfXjTE7gJNfPlVKKSdFRfu+hABfuyDew+6CAGgA7AtYREopVUlW\nzImjoEJZecPQLjbGLATeBI6N4M8DVgQ6MKWUqjArsi7CvSgi6cA4oBf2RTiAaMB1ylcppZQTIiwB\nfwGsARoChuMJ2AM0DWBcSilVYZ5ISsDGmLHAWBF52BjzWJBiUkqpyomkBFzC6yLyDlAX+BBYY4xZ\nHriwlFKqEizHR5ZViK8fF1OBaUAssBB4IWARKaVUJXmiY3xeQoGvCbiaMWY+4DHGGOyREEopFVqs\nKN+XEODrx0CeiPQBokWkK5qAlVKhKEQSq698TcDDgUlAbeCvwMiARaSUUpUViQnYGPNfEbkeexha\nN+DXgEallFKVEFHD0I4Rkb8D64FGwPnYtyLfFMC4lFKq4sIsAfsa7QXGmKlAN2NMX+D38WgFpVR4\nicTJeLAvvnUEtotIHJAcwJiUUqpSIrILAnsynpeBYcDTwJSARaSUUpUViU/EAO73fv0E+0Lcpdg3\nZiilVOiI0BbwsQnYLaAjcE1gwlFKqSqIxARsjMkvsbpERJ4MUDxKKVVpnqjQuMXYV74OQ3uS0k/E\niKzndSulIkMktoCBDSW+Xw3MDUAsSilVNX6aDU1EorAHHrQH8oHbjDGbS5QPAB4BioBpxph/VeY4\nvnZBvFmZnSulVFD5rwU8EEgwxnTzzn/zLPBHABGJBZ4HLgCysbtlZxtjKvyszPBqryul1Gl4rCif\nl3JchPc/fWPMMqBTibLWwGZjzG/GmAJgMXBxZeLVBKyUihz+m44yBThSYt0lIjGnKMsEalQm3IBe\nMtz14+JA7j6stEyKczqEkLExq8DpEEJGnQ0HnQ4hZKT7YR9u/PZEjKOUvuM3yhhTdIqyZCCjMgcJ\nrzEbSil1Gm6Pp/xKvlkCDAA+8PYBry1Rth5oISJpQBZ298OkyhxEE7BSKmL4Lf3CTKCXiCzFvgFt\nmIgMBZKMMa+IyF+wnxofhT0KolJT9GoCVkpFDLefMrAxxg2MKLN5Q4nyOcCcqh5HE7BSKmJ4/NcF\nERSagJVSEcNfLeBg0QSslIoYLk3ASinlDO2CUEoph4TbLGGagJVSESPMGsCagJVSkUMvwimllENc\nYdYE1gSslIoYYZZ/NQErpSKHH+eCCApNwEqpiBFe6VcTsFIqguhFOKWUckiY9UBoAlZKRQ4dBaGU\nUg7RLgillHJImDWANQErpSKHO8zGQWgCVkpFDG0BK6WUQ8LtRowopwOoqr7dWvPNlFF89dKfuemK\nzieUN6pfk89fuIO5L47gXw8Oplp8bKnyF8ZczYThfYMVbuBYFpf9fSJDvn6fQZ+9TWrTs0sVt7p2\nANcvmsnQb/5Du1uvAyA6Lpb+rz3LdfM/4OqPp5HarJETkTuic
ecO/GXBdKfDCDzLosukR+n7+Xv0\nmvUmyU1Kvy+aXHMl/ef/h35ffkDLYUNKlSXUTuPq1fNJad4kmBFXSaHL4/MSCsI6AcdER/HUqCsZ\n+NfX6Hf3VIYN6Eydmkml6jw28gqmzV5O39FTWPTTVkZd2724bNiALrRpWj/YYQdE8wG9iEmIZ/pl\ng1n86CQufmJcqfKLHx/Lf/5wM9N7DaHTXbcQn5pC25sHU5Cdw3s9r2XBfY/Rc9IjDkUfXL3vu4Mb\nX32KmIR4p0MJuLP6X050fDxz+13Hqr89R8e/3V+q/PyJ9/PV1bfwRf/raT3yZuJqpABgxcTQ5dmJ\nuPLynQi70lwej89LKCg3AYvIX0WkTjCCqShpVJetvx4iIyuXwiIX363dTnq70p/WrRrV5cvlBoBl\na3fQrW1jADq3aUSn1mcxbc7yYIcdEGd068j2LxcBsOeH1dQ/r22p8oM/G+JSkolJiAPLAo+HtFbN\n2P7lQgB+27SNNGkW9LidcGDLDqZeXfaBt5Gpbtfz2T1/MQAHV66mVodzS5VnrDPEpSQRHR+HZVnF\nT5ToOPE+Nr4xnZy9+4Mec1W4PR6fl1DgSws4C5gpIh+JSD8RsQIdlK+SExM4kpVXvJ6Vk09KUkKp\nOms376Z/emsA+qe3pnpCHPXSknngpsv56wuzghpvIMUlJ5F/NLN43e1yYUVHF68fXL+JGxbO4Kbv\nP2Pr3AXkH8nkwNoNNO3bA4AGF7QnqWE9rKiw/qfIJ6tmzMVVWOR0GEERm5xEQYn3hafM+yJjwyb6\nf/0RA5bM4b/zvqHwaCZNhwwk/9Bv7FmwxImQq8Tl9n0JBeVehDPGTAGmiEgb4EFgqohMA14wxvwW\n6ABP5uFbe9O1bWPObdqAFet3FW9Pqh5fKiEDjH/5Uybd/Ueu79eJecsMh45kc1WPtqTVqM5HTw+j\nXloy1eLj2LjzAO/OXRnsU/Gbgsws4pISi9etqCg8LhcAtdsITfv04NW2PSnMyqHfq5NoMbAvP7/1\nEWktmzJ43nvsXraS/avW4XGHyDtT+UVhZhaxJd4XlHhfpJ7TkjN6XcLM83tRlJ1D+pRnOPsPfWh+\n/Z/A46H+Jd1IO7cV6S8/xYIb7iRv/0GHzsJ3odKy9VW5CVhEUoEhwP8AGcDdQDTwCZAe0OhO4bHX\n5gF2H/APb46hZnI1snILuLBdE158f2Gpuj07tWDiq3PZvOsgo67tzoIVm3h11jKmzFgKwNC+HWl5\ndp2wTr4Au79bSdP+Pdk483MaXNCeg+s2FpflH82kKDePotx8PG43OQcOkVCzBvU7tmXnt9/x7QNP\nUu+8c0k+6wwHz0AFwoHlP3Jmn0vZMWsutTu2J+OX4++LwqNZuHLzcOXZ74u8A4eIT01h3oAbi+v0\nmvUmy8dMCIvkC1AYZrfC+TIM7QfgHWCIMWbnsY0icl7AovJRkcvNAy99wsz/uxXLsnjn8xXsOXiU\nmsnV+Md913DDI2+zadcBXn3oOgoKili/fR9j/v6x02EHxKY5X3J2z3SGfDUdLIsvRj5Aq0FXEpuU\nyNrX32fNtOkMmfcersJCMrbtZN07M4hLSeKKh+6hy30jyc/IZN6d450+DeVnOz/9igY9LqTPZ+9i\nWRZL7xpP4z9dQWxidTa99SEb3/qAPp++g7ugkMztu9jyXnj/fbjCLAFb5T3GWUQsY0ylziqlx9jw\n+mkE0ISVM50OIWRszCpwOoSQkV6rmtMhhIwbD66v8vWlzzfs8znn9GtVz/HrWb60gMeJyFggB7AA\njzGmYWDDUkqpiguR4b0+8yUBDwEaGmNyAh2MUkpVRcRdhAO2AbmBDkQppaoq3PqAfUnAccBaEVnr\nXfcYY4YGMCallKqUSBwF8XTAo1BKKT+IxC6IH4F+QEJ5FZVSyknuCGwBzwJ2A8duOQuvM1RK/W5E\n4iiIKGPMDQGPRCmlqigSu
yDWiEgX4Ce8rV9jjI6kV0qFnMJQmWXHR74k4EuAASXWPUDTwISjlFKV\nF8guCBGphj0tQ10gE7jJGHPgJPWigE+BWd7JzE7Jl9nQ2lcuXKWUCq4Ad0GMBNYaYyaIyBDgIezJ\nycr6X6CmLzv0ZTa0BZS58GaM6enLzpVSKpgC/KSLi4BnvN9/DjxctoKIXAO4gbm+7NCXLohjjw6w\ngI5AB192rJRSweavO+FE5Fbg3jKb9wFHvN9nAjXKvOZcYChwDeDT87186YIwJVY3eANTSqmQ468E\nbIx5DXit5DYRmQEke1eTsedHL+l/gDOA+UBjoEBEthtjTtkaPmUCFpEaxpgjIjK8xOaGQNKpXqOU\nUk4qKAroKIglQH/ge+yb0xaVLDTGFD/xVEQmAHtPl3zh9C3gT7H7PM7HvhED7CkpB1U0aqWUCoYA\nT8YzGXhTRBYDBdjdDYjIX4DNxpjZFd3h6RJwoYj8ALQA1pfYPhC4sKIHUkqpQAtkAvZOyXtCA9QY\n89xJtk3wZZ+nS8CXY/dnTAb+7FuISinlnIiZjtIY4wJ2AlcELxyllKq8iEnASikVbjQBK6WUQ/ID\nOwrC7zQBK6UihraAlVLKIZqAS0iq1ziQuw8rXdrXczqEkFFnw0GnQwgZSw7p826PudEP+wjwXBB+\npy1gpVTE0BawUko5JMC3IvudJmClVMRwuTUBK6WUI7QLQimlHKIJWCmlHFKkCVgppZyhLWCllHKI\njoJQSimHaAtYKaUcoglYKaUc4tEErJRSznBrAlZKKWd4dDIepZRyhktHQSillDM84ZV/NQErpSKH\ndkEopZRD9CKcUko5RIehKaWUQ1yu8OoE1gSslIoY2gJWSimHaAJWSimH6EW4IOt1fiPuvaYTRS43\n0xds4N3560uVn1EriRdHXYZlQUZWPne++BW5BUW0b1aHCTemY1mwPyOHu/75NfmFLofOws8si2Zj\n/kL15s3xFBay+amnyfv1VwBi09KQiROKqyY2b86OKVPZO2uWQ8EGgGXR5f8eoWabVrgKClh2z8Nk\nbttZXNzkmitp/edheFwutrw7g42vTy8uS6idRv+vP+KrP93K0c3bnIg+6Bp37sDVT4/juUuHOB1K\nlekwtCCKiY5iwk3p9B//ETl5Rcx67CrmrdzOwSO5xXVuv6Ids7/bzJvz1jF2cGeu69mKaXN/ZtLw\nHtz+3Bds33eUoT1bc2btZLbsyXDwbPwnrXt3rLh41o4YSVKbc2g86k42PDAegMLDh/n5rtEAJLdp\nw9nDb2fvnDlOhut3Z/W/nOj4eOb2u47aHdvT8W/3882No4rLz594P3PSB1CUncOAJXPYPuMzCo4c\nxYqJocuzE3Hl5TsYfXD1vu8Outx4FfnZueVXDgPhdiNGVHkVROQKEflUROYfW4IRmC9anJHK9r1H\nOJJdQKHLzfcb9tC1dYNSddZtP0SNxHgAkqvHUVjkplmDVA5n5jH8ivb859E/kpoUHzHJFyClXTsy\nli8HIGvdLyS1anXSek3vvYetk56FMHuSbHnqdj2f3fMXA3Bw5WpqdTi3VHnGOkNcShLR8XFYllXc\nauo48T42vjGdnL37gx6zUw5s2cHUq0c4HYbfuIrcPi+hwJcW8GPAvcDeAMdSYcnV4sjMKShez84t\nJKV6fKk6ew5nMX5oF65Kb0FcbDTPfvgDzc+oSSepz4OvL2L73qO8NbYfq7ccYMm6X4N9CgERk5hI\nUXbW8Q1uN0RHg+t4F0taejo527aRu2uXAxEGVmxyEgVHM4vXPS4XVnQ0Hu/5Z2zYRP+vP6IoJ5ed\nn3xJ4dFMmg4ZSP6h39izYAnn3jPcqdCDbtWMudRqdKbTYfhNJF6EO2yM+TbgkVTA/YM701nq07pR\nLVZtOt5aSawWy5Hs0v8+PnxDN+6ZvIBvV+/isvPO5oU7L+Nvby9l+94jbP7VbvUu+GkX7Zv
ViZgE\nXJSdTXT16sc3WFap5AtQp09vdn/4UZAjC47CzCxikxKPb4iKKk6+qee05IxelzDz/F4UZeeQPuUZ\nzv5DH5pf/yfweKh/STfSzm1F+stPseCGO8nbf9Chs1CV4Y6UPmAROdYMKBCRV4CVgAfAGPNKEGI7\npWfe/x43cqE7AAAKN0lEQVSw+4C/eXYwqYnxZOcV0rV1Q6bMWV2qbkZWfnEred9vOaQmxrNj31ES\nE2JpXC+F7fuO0qV1A94rc/EunGWuXUvN9HQOzV9AUptzyNm69YQ6Sa1akbl2rQPRBd6B5T9yZp9L\n2TFrLrU7tifjl43FZYVHs3Dl5uHKy8fjdpN34BDxqSnMG3BjcZ1es95k+ZgJmnzDUCS1gI91pi73\nfq3v/RoyZ1jkcjPxraW8++CVRFkW0xesZ+9v2aQmxjNpRA9ue/YLHnp9MY/f0p3oKAvLgvHTFlHo\ncjNmyje8NPpyLMtihdnL16t2ln/AMHFo4UJSL+hE28kvg2Wx+Yknqd3rcqKrVWPf7DnEpKZSlJ3t\ndJgBs/PTr2jQ40L6fPYulmWx9K7xNP7TFcQmVmfTWx+y8a0P6PPpO7gLCsncvost733sdMjKTwKZ\ngEWkGvAOUBfIBG4yxhwoU2cMMBRwA08YY2aebp9WecM2ROQhY8z/llh/0hjzgC8BNxw8OWSStdM+\n/PVdp0MIGVs3aMvymCWHImP0gT9M8Wy3qrqPFnfO9DnnbHrpqgodT0T+AqQYYyaIyBCgmzHm7hLl\nqcAaoDmQCPxkjGl0un2ergviVuA2oLWI9PdujgZiAZ8SsFJKBZM7sHNBXAQ84/3+c+DhMuXZwA7s\n5JuI3Qo+rdN1QbwDfA2MBx73bnMDv58xOkqpsOKvO+G8DdB7y2zeBxzxfp8J1DjJS3cBv2A3Vp8s\n7zinTMDGmHxgu4gsAS4pUVQoIruMMYvL27lSSgWTx+2fu1mNMa8Br5XcJiIzgGTvajJQ9uaBftjX\nzpp4178QkSXGmO9PdRxfhqENxm5OLwU6AwmAS0RWGmPKfkIopZRj/JWAT2EJ0B/4HjvZLipT/huQ\nC+QbYzwikgGknm6HviTgWOBSY4xbRKKAz4wxfUVkaYXDV0qpAApwAp4MvCkii4EC7NEOxy7ObTbG\nzBaRy4FlIuIGFgNfnm6HviTgWthJON/7Nc27Pf6Ur1BKKQe4CwvKr1RJxpgcYNBJtj9X4vtHgUd9\n3acvCfglYI2IrANaAc+IyHhgrq8HUUqpYAhwC9jvyk3AxpjXRORj7LFtm40xh0Qk2hgTXmeqlIp4\nEZeARaQDMBz74hsigjHmlkAHppRSFRVxCRh4A/gn9vg2pZQKWZGYgPcaY14NeCRKKVVF7ghMwNtF\nZBywiuOzoc0LaFRKKVUJ7qLAjYIIBF8ScDwg3gXsJKwJWCkVcjyuCGsBG2OGiUhL7FEQa4DdAY9K\nKaUqIeL6gEVkFHAV9g0YbwAtgFGne41SSjkh3BJwuQ/lBIYAvYAMY8wLQJfAhqSUUpXjcbt8XkKB\nL33AUdj9vsfmefv9PLNbKRVWPGH2hG9fEvC7wEKgkYh8BujzW5RSISniRkEYY/4pIl8D59qrZk3g\nw1JKqYqLmHHAIvIkJz6A8zwRGWKMGR/YsJRSquIiaRjahqBFoZRSfhAqF9d8dbpHEr0ZzECUUqqq\nIiYBK6VUuAm3i3CWx3P6p4iKSIwxpqjEeqoxpuzD6JRSSlXQ6S7C1QdSgLdE5EbAwh4T/Bb2wzmV\nUkpVwem6ILoCd2NPwjMVOwG7gS+CEJdSSkU8X7og/mCMmV1iPdkYkxnwyJRSKsL5MhfEGBFpACAi\nXYDvAhuSUkr9PvgyCmIi8JmIfAt0Aq4JbEhKKfX74EsLeB2wH3tGtO+BLQGNSCmlfid8ScCLgJeN\nMW2wJ2PXLgillPIDXxJwT2PMLABjzCTgjsCGFDwi0lh
Eljkdh1NE5GkRWSMiPU5R/oaI9A1yWJUi\nIjEiskBElopITafj8TcRSRCR2yrxuqtEpGGZbTeLyFP+jktEJojICH/s9/fClz7gGiLyHlATeAf4\nObAhqSAaBLSPkFEtDYEUY0xHpwMJkPrAbUBFn1B+NzCCwD1KrLJxKXxLwC8Cw4B/Aa8BnwOfBDKo\nihCRasDrQCMgDrgHu5XeFIgGnjPGvC8i5wH/AFxAHnC7MxH7j4jcDLQyxowTkQTsCZSeAW7CHrP9\ngzFmtIicBbwCVANygeHYv9OGwKfeme9uMsYM8e53rzGmftBPqGqmAC1EZCqwyhgzRURaAVOMMT1E\nZA3wLdAOe5a/PwLnAWOBAuz3y3TgSWAj0NkYc1hERgLJxphngn9KpTwInCMijwAvYP8t1vKWjQYy\ngPnAxUBr7Ivnk4AO2DdTXWSMOeE+XRG5CxiK/TOZbox5UUTewH7wQmOgAXCzMeZHEbkV+3Fkh7F/\nZu8D6SXiAvijiAzyxvawMWaOX38KEcaXLgiMMZsBjzHmABBqraURwHZjTDfsxyddAhwwxlwIXA78\nr4jUxv4AGWWMuQR4GXjOqYADbBj2eXYD1otIDPYf4ovGmB7e758yxvwN2Av0xk7K4e7PwC/AnlOU\npwDveX//vwL9vNsbAX/CvvHofmOMG/g39nsJ4AYgFCamehz4xft7Gw98bYy5FPvDdLIxZhdwP3as\nzwPXebsOfwL+5xTJ9xxgMHAR0B0YKCLHnn6+wxjTB7vRMtz7NzQWO+H2BhJPEhfAr8aYy7AbQiP9\n+hOIQL4k4MMicgeQKCJDsD9pQ4ngvTBojNmE/Ym90Lueif1H2QxoaIz5yfuahUCb4IcaUJb36zDg\nTu+wwUbe7W2B8SLyDfAIUM/HfYW7suexyvt1F5Dg/X6tMabIGJPN8Q+iacCNInIusM8Ysy/woVZI\nW+AW7+/zX9gPzAX7aTVnAt8aY/7rw37OxX6PfO1damE/dBdO/Fk1x060OcYYF7D0FPtc6f26F6ju\n6wn9XvmSgG8FmgAHsccB3xLQiCpuPXABgIg0Ba7D/jRHRJKx36zbgN0i0s77mkuw/80Md3nYHzgA\n53u/3g6M8Lb0zgMuxO6aGOttAd8BfHiq/YhII47/QYejk/1MjjnZbZ8nbDPG7MBuaDyI/a9+KHBz\n/O91A/C89/d5Lfa1GYAxwDygk4h0PcnryjLYw0wv9e7rDeDYE2/K/lw2A61EpJqIRHF8Ppiy+z/9\nrbWqFF/6gEcbY8YdW/H2Fz4QuJAqbCowzdviiwb6YrcAF2P3eU40xuwXkduBf4qIBRRhf7CEu7nA\nSO+5rgSOAmuBRSKSif2v9nLgr8Bkbz9xNewLMyWtADJEZDn2B9q2IMUfCO8DH4jIJRxvjVXGv7Cv\nf9zgl6iqbj8QJyJPY//b/5qIDMfuWpkgIp2w+3K7Yfdn/0dEumG3VN8Skd7GmMMld2iMWe193Nhi\nEYnHHuf/68kObow56D32Iuw+4GpAYZm4IqErK6hOOReEt8P9NuwO/V+8m6OAOGNM2ZaFUhHFeyGp\nrTHmkXIr/w54ryWMNcY87m3ELAQeNMYsdDi0sHa6FvA72P1C47E/ccH+d2N/oINSykki8gRwKXCl\n07GECmNMkYgkisiP2CMglmO3hlUVlDsbmlJKqcDwaRiaUkop/9MErJRSDtEErJRSDtEErJRSDtEE\nrJRSDvl/vvJBHZG0bngAAAAASUVORK5CYII=\n", 666 | "text/plain": [ 667 | "" 668 | ] 669 | }, 670 | "metadata": {}, 671 | "output_type": "display_data" 672 | } 673 | ], 674 | "source": [ 675 | "sns.heatmap(corrmat,annot=True)" 676 | ] 677 | }, 678 | { 
679 | "cell_type": "markdown", 680 | "metadata": {}, 681 | "source": [ 682 | "## NLP Classification Task" 683 | ] 684 | }, 685 | { 686 | "cell_type": "code", 687 | "execution_count": 33, 688 | "metadata": { 689 | "collapsed": false 690 | }, 691 | "outputs": [ 692 | { 693 | "data": { 694 | "text/html": [ 695 | "
\n", 696 | "\n", 697 | " \n", 698 | " \n", 699 | " \n", 700 | " \n", 701 | " \n", 702 | " \n", 703 | " \n", 704 | " \n", 705 | " \n", 706 | " \n", 707 | " \n", 708 | " \n", 709 | " \n", 710 | " \n", 711 | " \n", 712 | " \n", 713 | " \n", 714 | " \n", 715 | " \n", 716 | " \n", 717 | " \n", 718 | " \n", 719 | " \n", 720 | " \n", 721 | " \n", 722 | " \n", 723 | " \n", 724 | " \n", 725 | " \n", 726 | " \n", 727 | " \n", 728 | " \n", 729 | " \n", 730 | " \n", 731 | " \n", 732 | " \n", 733 | " \n", 734 | " \n", 735 | " \n", 736 | " \n", 737 | " \n", 738 | " \n", 739 | " \n", 740 | " \n", 741 | " \n", 742 | " \n", 743 | " \n", 744 | " \n", 745 | " \n", 746 | " \n", 747 | " \n", 748 | " \n", 749 | " \n", 750 | " \n", 751 | " \n", 752 | " \n", 753 | " \n", 754 | " \n", 755 | " \n", 756 | " \n", 757 | " \n", 758 | " \n", 759 | " \n", 760 | " \n", 761 | " \n", 762 | " \n", 763 | " \n", 764 | " \n", 765 | " \n", 766 | " \n", 767 | " \n", 768 | " \n", 769 | " \n", 770 | " \n", 771 | " \n", 772 | " \n", 773 | " \n", 774 | " \n", 775 | " \n", 776 | " \n", 777 | " \n", 778 | " \n", 779 | " \n", 780 | " \n", 781 | " \n", 782 | " \n", 783 | " \n", 784 | " \n", 785 | "
business_iddatereview_idstarstexttypeuser_idcoolusefulfunnytext length
09yKzy9PApeiPPOUJEtnvkg2011-01-26fWKvX83p0-ka4JS3dc6E5A5My wife took me here on my birthday for breakf...reviewrLtl8ZkDX5vH5nAx9C3q5Q250889
1ZRJwVLyzEJq1VAihDhYiow2011-07-27IjZ33sJrzXqU-0X6U8NwyA5I have no idea why some people give bad review...review0a2KyEL0d3Yb1V6aivbIuQ0001345
3_1QQZuf4zZOyFCvXc0o6Vg2010-05-27G-WvGaISbqqaMHlNnByodA5Rosie, Dakota, and I LOVE Chaparral Dog Park!!...reviewuZetl9T0NcROGOyFfughhg120419
46ozycU1RpktNG2-1BroVtw2012-01-051uJFq2r5QfJG_6ExMRCaGw5General Manager Scott Petello is a good egg!!!...reviewvYmM4KTsC8ZfQBg-j5MWkw000469
6zp713qNhx8d9KCJJnrw1xA2010-02-12riFQ3vxNpP4rWLk_CSri2A5Drop what you're doing and drive here. After I...reviewwFweIWhv2fREZV_dYkz_1g7741565
\n", 786 | "
" 787 | ], 788 | "text/plain": [ 789 | " business_id date review_id stars \\\n", 790 | "0 9yKzy9PApeiPPOUJEtnvkg 2011-01-26 fWKvX83p0-ka4JS3dc6E5A 5 \n", 791 | "1 ZRJwVLyzEJq1VAihDhYiow 2011-07-27 IjZ33sJrzXqU-0X6U8NwyA 5 \n", 792 | "3 _1QQZuf4zZOyFCvXc0o6Vg 2010-05-27 G-WvGaISbqqaMHlNnByodA 5 \n", 793 | "4 6ozycU1RpktNG2-1BroVtw 2012-01-05 1uJFq2r5QfJG_6ExMRCaGw 5 \n", 794 | "6 zp713qNhx8d9KCJJnrw1xA 2010-02-12 riFQ3vxNpP4rWLk_CSri2A 5 \n", 795 | "\n", 796 | " text type \\\n", 797 | "0 My wife took me here on my birthday for breakf... review \n", 798 | "1 I have no idea why some people give bad review... review \n", 799 | "3 Rosie, Dakota, and I LOVE Chaparral Dog Park!!... review \n", 800 | "4 General Manager Scott Petello is a good egg!!!... review \n", 801 | "6 Drop what you're doing and drive here. After I... review \n", 802 | "\n", 803 | " user_id cool useful funny text length \n", 804 | "0 rLtl8ZkDX5vH5nAx9C3q5Q 2 5 0 889 \n", 805 | "1 0a2KyEL0d3Yb1V6aivbIuQ 0 0 0 1345 \n", 806 | "3 uZetl9T0NcROGOyFfughhg 1 2 0 419 \n", 807 | "4 vYmM4KTsC8ZfQBg-j5MWkw 0 0 0 469 \n", 808 | "6 wFweIWhv2fREZV_dYkz_1g 7 7 4 1565 " 809 | ] 810 | }, 811 | "execution_count": 33, 812 | "metadata": {}, 813 | "output_type": "execute_result" 814 | } 815 | ], 816 | "source": [ 817 | "yelp_class=yelp[(yelp['stars']==1) | (yelp['stars']==5)]\n", 818 | "yelp_class.head()" 819 | ] 820 | }, 821 | { 822 | "cell_type": "code", 823 | "execution_count": 34, 824 | "metadata": { 825 | "collapsed": true 826 | }, 827 | "outputs": [], 828 | "source": [ 829 | "#creating feature vector X\n", 830 | "X=yelp_class['text']\n", 831 | "#creating target label\n", 832 | "y=yelp_class['stars']\n" 833 | ] 834 | }, 835 | { 836 | "cell_type": "markdown", 837 | "metadata": {}, 838 | "source": [ 839 | "**Importing CountVectorizer and creating a CountVectorizer object.**" 840 | ] 841 | }, 842 | { 843 | "cell_type": "code", 844 | "execution_count": 36, 845 | "metadata": { 846 | "collapsed": false 847 | }, 848 | 
"outputs": [], 849 | "source": [ 850 | "from sklearn.feature_extraction.text import CountVectorizer\n", 851 | "cv = CountVectorizer()" 852 | ] 853 | }, 854 | { 855 | "cell_type": "code", 856 | "execution_count": 37, 857 | "metadata": { 858 | "collapsed": false 859 | }, 860 | "outputs": [], 861 | "source": [ 862 | "X=cv.fit_transform( X )" 863 | ] 864 | }, 865 | { 866 | "cell_type": "markdown", 867 | "metadata": {}, 868 | "source": [ 869 | "## Train Test Split" 870 | ] 871 | }, 872 | { 873 | "cell_type": "code", 874 | "execution_count": 39, 875 | "metadata": { 876 | "collapsed": false 877 | }, 878 | "outputs": [], 879 | "source": [ 880 | "from sklearn.model_selection import train_test_split" 881 | ] 882 | }, 883 | { 884 | "cell_type": "code", 885 | "execution_count": 40, 886 | "metadata": { 887 | "collapsed": true 888 | }, 889 | "outputs": [], 890 | "source": [ 891 | "X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=0.3,random_state=101)" 892 | ] 893 | }, 894 | { 895 | "cell_type": "markdown", 896 | "metadata": {}, 897 | "source": [ 898 | "## Training a Model" 899 | ] 900 | }, 901 | { 902 | "cell_type": "code", 903 | "execution_count": 41, 904 | "metadata": { 905 | "collapsed": true 906 | }, 907 | "outputs": [], 908 | "source": [ 909 | "from sklearn.naive_bayes import MultinomialNB\n", 910 | "nb = MultinomialNB()" 911 | ] 912 | }, 913 | { 914 | "cell_type": "code", 915 | "execution_count": 42, 916 | "metadata": { 917 | "collapsed": false 918 | }, 919 | "outputs": [ 920 | { 921 | "data": { 922 | "text/plain": [ 923 | "MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)" 924 | ] 925 | }, 926 | "execution_count": 42, 927 | "metadata": {}, 928 | "output_type": "execute_result" 929 | } 930 | ], 931 | "source": [ 932 | "nb.fit(X_train,y_train)" 933 | ] 934 | }, 935 | { 936 | "cell_type": "markdown", 937 | "metadata": {}, 938 | "source": [ 939 | "## Predictions and Evaluations" 940 | ] 941 | }, 942 | { 943 | "cell_type": "code", 944 | 
"execution_count": 43, 945 | "metadata": { 946 | "collapsed": true 947 | }, 948 | "outputs": [], 949 | "source": [ 950 | "predictions = nb.predict(X_test)" 951 | ] 952 | }, 953 | { 954 | "cell_type": "markdown", 955 | "metadata": {}, 956 | "source": [ 957 | "**Creating a confusion matrix and classification report using these predictions and y_test**" 958 | ] 959 | }, 960 | { 961 | "cell_type": "code", 962 | "execution_count": 44, 963 | "metadata": { 964 | "collapsed": true 965 | }, 966 | "outputs": [], 967 | "source": [ 968 | "from sklearn.metrics import confusion_matrix,classification_report" 969 | ] 970 | }, 971 | { 972 | "cell_type": "code", 973 | "execution_count": 45, 974 | "metadata": { 975 | "collapsed": false 976 | }, 977 | "outputs": [ 978 | { 979 | "name": "stdout", 980 | "output_type": "stream", 981 | "text": [ 982 | "[[159 69]\n", 983 | " [ 22 976]]\n", 984 | "\n", 985 | "\n", 986 | " precision recall f1-score support\n", 987 | "\n", 988 | " 1 0.88 0.70 0.78 228\n", 989 | " 5 0.93 0.98 0.96 998\n", 990 | "\n", 991 | "avg / total 0.92 0.93 0.92 1226\n", 992 | "\n" 993 | ] 994 | } 995 | ], 996 | "source": [ 997 | "print(confusion_matrix(y_test,predictions))\n", 998 | "print('\\n')\n", 999 | "print(classification_report(y_test,predictions))" 1000 | ] 1001 | }, 1002 | { 1003 | "cell_type": "markdown", 1004 | "metadata": {}, 1005 | "source": [ 1006 | "**Let's see what happens if we try to include TF-IDF to this process using a pipeline.**" 1007 | ] 1008 | }, 1009 | { 1010 | "cell_type": "markdown", 1011 | "metadata": {}, 1012 | "source": [ 1013 | "# Using Text Processing" 1014 | ] 1015 | }, 1016 | { 1017 | "cell_type": "code", 1018 | "execution_count": 46, 1019 | "metadata": { 1020 | "collapsed": true 1021 | }, 1022 | "outputs": [], 1023 | "source": [ 1024 | "from sklearn.feature_extraction.text import TfidfTransformer" 1025 | ] 1026 | }, 1027 | { 1028 | "cell_type": "code", 1029 | "execution_count": 47, 1030 | "metadata": { 1031 | "collapsed": true 1032 | 
}, 1033 | "outputs": [], 1034 | "source": [ 1035 | "from sklearn.pipeline import Pipeline" 1036 | ] 1037 | }, 1038 | { 1039 | "cell_type": "markdown", 1040 | "metadata": {}, 1041 | "source": [ 1042 | "**Creating a pipeline with the following steps: CountVectorizer(), TfidfTransformer(), MultinomialNB()**" 1043 | ] 1044 | }, 1045 | { 1046 | "cell_type": "code", 1047 | "execution_count": 49, 1048 | "metadata": { 1049 | "collapsed": false 1050 | }, 1051 | "outputs": [], 1052 | "source": [ 1053 | "pipeline = Pipeline([\n", 1054 | " ('bow', CountVectorizer()), # strings to token integer counts\n", 1055 | " ('tfidf', TfidfTransformer()), # integer counts to weighted TF-IDF scores\n", 1056 | " ('classifier', MultinomialNB()), # train on TF-IDF vectors w/ Naive Bayes classifier\n", 1057 | "])" 1058 | ] 1059 | }, 1060 | { 1061 | "cell_type": "markdown", 1062 | "metadata": {}, 1063 | "source": [ 1064 | "## Using the Pipeline" 1065 | ] 1066 | }, 1067 | { 1068 | "cell_type": "markdown", 1069 | "metadata": {}, 1070 | "source": [ 1071 | "### Train Test Split" 1072 | ] 1073 | }, 1074 | { 1075 | "cell_type": "code", 1076 | "execution_count": 50, 1077 | "metadata": { 1078 | "collapsed": true 1079 | }, 1080 | "outputs": [], 1081 | "source": [ 1082 | "X = yelp_class['text']\n", 1083 | "y = yelp_class['stars']\n", 1084 | "X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=0.3,random_state=101)" 1085 | ] 1086 | }, 1087 | { 1088 | "cell_type": "markdown", 1089 | "metadata": {}, 1090 | "source": [ 1091 | "**fitting the pipeline to the training data**" 1092 | ] 1093 | }, 1094 | { 1095 | "cell_type": "code", 1096 | "execution_count": 51, 1097 | "metadata": { 1098 | "collapsed": false 1099 | }, 1100 | "outputs": [ 1101 | { 1102 | "data": { 1103 | "text/plain": [ 1104 | "Pipeline(steps=[('bow', CountVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',\n", 1105 | " dtype=, encoding=u'utf-8', input=u'content',\n", 1106 | " lowercase=True, max_df=1.0, 
max_features=None, min_df=1,\n", 1107 | " ngram_range=(1, 1), preprocessor=None, stop_words=None,\n", 1108 | " str... use_idf=True)), ('classifier', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])" 1109 | ] 1110 | }, 1111 | "execution_count": 51, 1112 | "metadata": {}, 1113 | "output_type": "execute_result" 1114 | } 1115 | ], 1116 | "source": [ 1117 | "pipeline.fit(X_train,y_train)" 1118 | ] 1119 | }, 1120 | { 1121 | "cell_type": "markdown", 1122 | "metadata": {}, 1123 | "source": [ 1124 | "### Predictions and Evaluation" 1125 | ] 1126 | }, 1127 | { 1128 | "cell_type": "code", 1129 | "execution_count": 52, 1130 | "metadata": { 1131 | "collapsed": false 1132 | }, 1133 | "outputs": [], 1134 | "source": [ 1135 | "predictions = pipeline.predict(X_test)" 1136 | ] 1137 | }, 1138 | { 1139 | "cell_type": "code", 1140 | "execution_count": 53, 1141 | "metadata": { 1142 | "collapsed": false 1143 | }, 1144 | "outputs": [ 1145 | { 1146 | "name": "stdout", 1147 | "output_type": "stream", 1148 | "text": [ 1149 | "[[ 0 228]\n", 1150 | " [ 0 998]]\n", 1151 | " precision recall f1-score support\n", 1152 | "\n", 1153 | " 1 0.00 0.00 0.00 228\n", 1154 | " 5 0.81 1.00 0.90 998\n", 1155 | "\n", 1156 | "avg / total 0.66 0.81 0.73 1226\n", 1157 | "\n" 1158 | ] 1159 | }, 1160 | { 1161 | "name": "stderr", 1162 | "output_type": "stream", 1163 | "text": [ 1164 | "C:\\Users\\wafaa\\Anaconda2\\lib\\site-packages\\sklearn\\metrics\\classification.py:1113: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples.\n", 1165 | " 'precision', 'predicted', average, warn_for)\n" 1166 | ] 1167 | } 1168 | ], 1169 | "source": [ 1170 | "print(confusion_matrix(y_test,predictions))\n", 1171 | "print(classification_report(y_test,predictions))" 1172 | ] 1173 | } 1174 | ], 1175 | "metadata": { 1176 | "kernelspec": { 1177 | "display_name": "Python 2", 1178 | "language": "python", 1179 | "name": "python2" 1180 | }, 1181 | "language_info": 
{ 1182 | "codemirror_mode": { 1183 | "name": "ipython", 1184 | "version": 2 1185 | }, 1186 | "file_extension": ".py", 1187 | "mimetype": "text/x-python", 1188 | "name": "python", 1189 | "nbconvert_exporter": "python", 1190 | "pygments_lexer": "ipython2", 1191 | "version": "2.7.13" 1192 | } 1193 | }, 1194 | "nbformat": 4, 1195 | "nbformat_minor": 0 1196 | } 1197 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Machine-Learning-Projects 2 | Various projects in Linear Regression, Logistic Regression, k Nearest Neighbors, Decision Trees, Random Forests, SVM, Deep Learning 3 | 4 | ![pairplot](https://cloud.githubusercontent.com/assets/6215149/25064445/990f6d0c-21bf-11e7-9b61-1664a2b20210.JPG) 5 | 6 | ![fico](https://cloud.githubusercontent.com/assets/6215149/25140078/b91513c4-2424-11e7-91cc-153a429c5e67.JPG) 7 | -------------------------------------------------------------------------------- /SVM/DS_Meetup_6_22_17.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amydaali/Machine-Learning-Projects/7fa8bfecd6b778ba17db4b1b717f626c71701ced/SVM/DS_Meetup_6_22_17.pptx -------------------------------------------------------------------------------- /SVM/SVM_Breast_Cancer.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 11, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import pandas as pd\n", 12 | "import numpy as np\n", 13 | "import matplotlib.pyplot as plt\n", 14 | "import seaborn as sns\n", 15 | "%matplotlib inline" 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": 12, 21 | "metadata": { 22 | "collapsed": true 23 | }, 24 | "outputs": [], 25 | "source": [ 26 | "from sklearn.datasets import 
load_breast_cancer" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": 16, 32 | "metadata": { 33 | "collapsed": true 34 | }, 35 | "outputs": [], 36 | "source": [ 37 | "cancer = load_breast_cancer()" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": 17, 43 | "metadata": { 44 | "collapsed": false 45 | }, 46 | "outputs": [ 47 | { 48 | "data": { 49 | "text/plain": [ 50 | "['target_names', 'data', 'target', 'DESCR', 'feature_names']" 51 | ] 52 | }, 53 | "execution_count": 17, 54 | "metadata": {}, 55 | "output_type": "execute_result" 56 | } 57 | ], 58 | "source": [ 59 | "cancer.keys()" 60 | ] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "execution_count": 22, 65 | "metadata": { 66 | "collapsed": false 67 | }, 68 | "outputs": [ 69 | { 70 | "name": "stdout", 71 | "output_type": "stream", 72 | "text": [ 73 | "Breast Cancer Wisconsin (Diagnostic) Database\n", 74 | "\n", 75 | "Notes\n", 76 | "-----\n", 77 | "Data Set Characteristics:\n", 78 | " :Number of Instances: 569\n", 79 | "\n", 80 | " :Number of Attributes: 30 numeric, predictive attributes and the class\n", 81 | "\n", 82 | " :Attribute Information:\n", 83 | " - radius (mean of distances from center to points on the perimeter)\n", 84 | " - texture (standard deviation of gray-scale values)\n", 85 | " - perimeter\n", 86 | " - area\n", 87 | " - smoothness (local variation in radius lengths)\n", 88 | " - compactness (perimeter^2 / area - 1.0)\n", 89 | " - concavity (severity of concave portions of the contour)\n", 90 | " - concave points (number of concave portions of the contour)\n", 91 | " - symmetry \n", 92 | " - fractal dimension (\"coastline approximation\" - 1)\n", 93 | " \n", 94 | " The mean, standard error, and \"worst\" or largest (mean of the three\n", 95 | " largest values) of these features were computed for each image,\n", 96 | " resulting in 30 features. 
For instance, field 3 is Mean Radius, field\n", 97 | " 13 is Radius SE, field 23 is Worst Radius.\n", 98 | " \n", 99 | " - class:\n", 100 | " - WDBC-Malignant\n", 101 | " - WDBC-Benign\n", 102 | "\n", 103 | " :Summary Statistics:\n", 104 | "\n", 105 | " ===================================== ======= ========\n", 106 | " Min Max\n", 107 | " ===================================== ======= ========\n", 108 | " radius (mean): 6.981 28.11\n", 109 | " texture (mean): 9.71 39.28\n", 110 | " perimeter (mean): 43.79 188.5\n", 111 | " area (mean): 143.5 2501.0\n", 112 | " smoothness (mean): 0.053 0.163\n", 113 | " compactness (mean): 0.019 0.345\n", 114 | " concavity (mean): 0.0 0.427\n", 115 | " concave points (mean): 0.0 0.201\n", 116 | " symmetry (mean): 0.106 0.304\n", 117 | " fractal dimension (mean): 0.05 0.097\n", 118 | " radius (standard error): 0.112 2.873\n", 119 | " texture (standard error): 0.36 4.885\n", 120 | " perimeter (standard error): 0.757 21.98\n", 121 | " area (standard error): 6.802 542.2\n", 122 | " smoothness (standard error): 0.002 0.031\n", 123 | " compactness (standard error): 0.002 0.135\n", 124 | " concavity (standard error): 0.0 0.396\n", 125 | " concave points (standard error): 0.0 0.053\n", 126 | " symmetry (standard error): 0.008 0.079\n", 127 | " fractal dimension (standard error): 0.001 0.03\n", 128 | " radius (worst): 7.93 36.04\n", 129 | " texture (worst): 12.02 49.54\n", 130 | " perimeter (worst): 50.41 251.2\n", 131 | " area (worst): 185.2 4254.0\n", 132 | " smoothness (worst): 0.071 0.223\n", 133 | " compactness (worst): 0.027 1.058\n", 134 | " concavity (worst): 0.0 1.252\n", 135 | " concave points (worst): 0.0 0.291\n", 136 | " symmetry (worst): 0.156 0.664\n", 137 | " fractal dimension (worst): 0.055 0.208\n", 138 | " ===================================== ======= ========\n", 139 | "\n", 140 | " :Missing Attribute Values: None\n", 141 | "\n", 142 | " :Class Distribution: 212 - Malignant, 357 - Benign\n", 143 | "\n", 144 | " :Creator: 
Dr. William H. Wolberg, W. Nick Street, Olvi L. Mangasarian\n", 145 | "\n", 146 | " :Donor: Nick Street\n", 147 | "\n", 148 | " :Date: November, 1995\n", 149 | "\n", 150 | "This is a copy of UCI ML Breast Cancer Wisconsin (Diagnostic) datasets.\n", 151 | "https://goo.gl/U2Uwz2\n", 152 | "\n", 153 | "Features are computed from a digitized image of a fine needle\n", 154 | "aspirate (FNA) of a breast mass. They describe\n", 155 | "characteristics of the cell nuclei present in the image.\n", 156 | "A few of the images can be found at\n", 157 | "http://www.cs.wisc.edu/~street/images/\n", 158 | "\n", 159 | "Separating plane described above was obtained using\n", 160 | "Multisurface Method-Tree (MSM-T) [K. P. Bennett, \"Decision Tree\n", 161 | "Construction Via Linear Programming.\" Proceedings of the 4th\n", 162 | "Midwest Artificial Intelligence and Cognitive Science Society,\n", 163 | "pp. 97-101, 1992], a classification method which uses linear\n", 164 | "programming to construct a decision tree. Relevant features\n", 165 | "were selected using an exhaustive search in the space of 1-4\n", 166 | "features and 1-3 separating planes.\n", 167 | "\n", 168 | "The actual linear program used to obtain the separating plane\n", 169 | "in the 3-dimensional space is that described in:\n", 170 | "[K. P. Bennett and O. L. Mangasarian: \"Robust Linear\n", 171 | "Programming Discrimination of Two Linearly Inseparable Sets\",\n", 172 | "Optimization Methods and Software 1, 1992, 23-34].\n", 173 | "\n", 174 | "This database is also available through the UW CS ftp server:\n", 175 | "\n", 176 | "ftp ftp.cs.wisc.edu\n", 177 | "cd math-prog/cpo-dataset/machine-learn/WDBC/\n", 178 | "\n", 179 | "References\n", 180 | "----------\n", 181 | " - W.N. Street, W.H. Wolberg and O.L. Mangasarian. Nuclear feature extraction \n", 182 | " for breast tumor diagnosis. 
IS&T/SPIE 1993 International Symposium on \n", 183 | " Electronic Imaging: Science and Technology, volume 1905, pages 861-870, \n", 184 | " San Jose, CA, 1993. \n", 185 | " - O.L. Mangasarian, W.N. Street and W.H. Wolberg. Breast cancer diagnosis and \n", 186 | " prognosis via linear programming. Operations Research, 43(4), pages 570-577, \n", 187 | " July-August 1995.\n", 188 | " - W.H. Wolberg, W.N. Street, and O.L. Mangasarian. Machine learning techniques\n", 189 | " to diagnose breast cancer from fine-needle aspirates. Cancer Letters 77 (1994) \n", 190 | " 163-171.\n", 191 | "\n" 192 | ] 193 | } 194 | ], 195 | "source": [ 196 | "print(cancer['DESCR'])" 197 | ] 198 | }, 199 | { 200 | "cell_type": "code", 201 | "execution_count": 56, 202 | "metadata": { 203 | "collapsed": false 204 | }, 205 | "outputs": [ 206 | { 207 | "data": { 208 | "text/plain": [ 209 | "array(['mean radius', 'mean texture', 'mean perimeter', 'mean area',\n", 210 | " 'mean smoothness', 'mean compactness', 'mean concavity',\n", 211 | " 'mean concave points', 'mean symmetry', 'mean fractal dimension',\n", 212 | " 'radius error', 'texture error', 'perimeter error', 'area error',\n", 213 | " 'smoothness error', 'compactness error', 'concavity error',\n", 214 | " 'concave points error', 'symmetry error', 'fractal dimension error',\n", 215 | " 'worst radius', 'worst texture', 'worst perimeter', 'worst area',\n", 216 | " 'worst smoothness', 'worst compactness', 'worst concavity',\n", 217 | " 'worst concave points', 'worst symmetry', 'worst fractal dimension'], \n", 218 | " dtype='