├── CategoryClassifer.ipynb ├── README.md ├── Tutorial.ipynb ├── data ├── category │ ├── Books_small.json │ ├── Clothing_small.json │ ├── Electronics_small.json │ ├── Grocery_small.json │ └── Patio_small.json └── sentiment │ ├── Books_small.json │ └── Books_small_10000.json ├── data_process.py └── models ├── category_classifier.pkl ├── category_vectorizer.pkl └── sentiment_classifier.pkl /CategoryClassifer.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 6, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import numpy as np\n", 10 | "import random\n", 11 | "from sklearn.model_selection import train_test_split\n", 12 | "from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer\n", 13 | "from sklearn.metrics import f1_score\n", 14 | "\n", 15 | "import json" 16 | ] 17 | }, 18 | { 19 | "cell_type": "markdown", 20 | "metadata": {}, 21 | "source": [ 22 | "### Load In Data" 23 | ] 24 | }, 25 | { 26 | "cell_type": "markdown", 27 | "metadata": {}, 28 | "source": [ 29 | "#### Data Class" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": 7, 35 | "metadata": {}, 36 | "outputs": [], 37 | "source": [ 38 | "class Category:\n", 39 | " ELECTRONICS = \"ELECTRONICS\"\n", 40 | " BOOKS = \"BOOKS\"\n", 41 | " CLOTHING = \"CLOTHING\"\n", 42 | " GROCERY = \"GROCERY\"\n", 43 | " PATIO = \"PATIO\"\n", 44 | " \n", 45 | "class Sentiment:\n", 46 | " POSITIVE = \"POSITIVE\"\n", 47 | " NEGATIVE = \"NEGATIVE\"\n", 48 | " NEUTRAL = \"NEUTRAL\"\n", 49 | "\n", 50 | "class Review:\n", 51 | " def __init__(self, category, text, score):\n", 52 | " self.category = category\n", 53 | " self.text = text\n", 54 | " self.score = score\n", 55 | " self.sentiment = self.get_sentiment()\n", 56 | " \n", 57 | " def get_sentiment(self):\n", 58 | " if self.score <= 2:\n", 59 | " return Sentiment.NEGATIVE\n", 60 | " elif self.score == 3:\n", 61 | " return 
Sentiment.NEUTRAL\n", 62 | " else: # Amazon review is a 4 or 5\n", 63 | " return Sentiment.POSITIVE\n", 64 | " \n", 65 | "class ReviewContainer:\n", 66 | " def __init__(self, reviews):\n", 67 | " self.reviews = reviews\n", 68 | " \n", 69 | " def get_text(self):\n", 70 | " return [x.text for x in self.reviews]\n", 71 | " \n", 72 | " def get_x(self, vectorizer):\n", 73 | " return vectorizer.transform(self.get_text())\n", 74 | " \n", 75 | " def get_y(self):\n", 76 | " return [x.sentiment for x in self.reviews]\n", 77 | " \n", 78 | " def get_category(self):\n", 79 | " return [x.category for x in self.reviews]\n", 80 | " \n", 81 | " def evenly_distribute(self):\n", 82 | " negative = list(filter(lambda x: x.sentiment == Sentiment.NEGATIVE, self.reviews))\n", 83 | " positive = list(filter(lambda x: x.sentiment == Sentiment.POSITIVE, self.reviews))\n", 84 | " positive_shrunk = positive[:len(negative)]\n", 85 | " print(len(positive_shrunk))\n", 86 | " self.reviews = negative + positive_shrunk\n", 87 | " random.shuffle(self.reviews)\n", 88 | " print(self.reviews[0])\n", 89 | " " 90 | ] 91 | }, 92 | { 93 | "cell_type": "markdown", 94 | "metadata": {}, 95 | "source": [ 96 | "#### Load in Data" 97 | ] 98 | }, 99 | { 100 | "cell_type": "code", 101 | "execution_count": 9, 102 | "metadata": {}, 103 | "outputs": [], 104 | "source": [ 105 | "file_names = ['./data/category/Electronics_small.json', './data/category/Books_small.json', './data/category/Clothing_small.json', './data/category/Grocery_small.json', './data/category/Patio_small.json']\n", 106 | "file_categories = [Category.ELECTRONICS, Category.BOOKS, Category.CLOTHING, Category.GROCERY, Category.PATIO]\n", 107 | "\n", 108 | "reviews = []\n", 109 | "for i in range(len(file_names)):\n", 110 | " file_name = file_names[i]\n", 111 | " category = file_categories[i]\n", 112 | " with open(file_name) as f:\n", 113 | " for line in f:\n", 114 | " review_json = json.loads(line)\n", 115 | " review = Review(category, 
review_json['reviewText'], review_json['overall'])\n", 116 | " reviews.append(review)" 117 | ] 118 | }, 119 | { 120 | "cell_type": "markdown", 121 | "metadata": {}, 122 | "source": [ 123 | "### Data Prep" 124 | ] 125 | }, 126 | { 127 | "cell_type": "code", 128 | "execution_count": 11, 129 | "metadata": {}, 130 | "outputs": [], 131 | "source": [ 132 | "train, test = train_test_split(reviews, test_size = 0.33, random_state=42)\n", 133 | "\n", 134 | "train_container = ReviewContainer(train)\n", 135 | "#train_container.evenly_distribute()\n", 136 | "test_container = ReviewContainer(test)\n", 137 | "#test_container.evenly_distribute()\n", 138 | "\n", 139 | "corpus = train_container.get_text()\n", 140 | "# vectorizer = CountVectorizer(binary=True)\n", 141 | "# vectorizer.fit(corpus)\n", 142 | "vectorizer = TfidfVectorizer()\n", 143 | "vectorizer.fit(corpus)\n", 144 | "\n", 145 | "train_x = train_container.get_x(vectorizer)\n", 146 | "train_y = train_container.get_category()\n", 147 | "\n", 148 | "test_x = test_container.get_x(vectorizer)\n", 149 | "test_y = test_container.get_category()" 150 | ] 151 | }, 152 | { 153 | "cell_type": "markdown", 154 | "metadata": {}, 155 | "source": [ 156 | "### Classification" 157 | ] 158 | }, 159 | { 160 | "cell_type": "code", 161 | "execution_count": 12, 162 | "metadata": {}, 163 | "outputs": [ 164 | { 165 | "data": { 166 | "text/plain": [ 167 | "SVC(C=16, cache_size=200, class_weight=None, coef0=0.0,\n", 168 | " decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',\n", 169 | " max_iter=-1, probability=False, random_state=None, shrinking=True,\n", 170 | " tol=0.001, verbose=False)" 171 | ] 172 | }, 173 | "execution_count": 12, 174 | "metadata": {}, 175 | "output_type": "execute_result" 176 | } 177 | ], 178 | "source": [ 179 | "from sklearn import svm\n", 180 | "\n", 181 | "clf = svm.SVC(C=16, kernel='linear', gamma='auto')\n", 182 | "clf.fit(train_x, train_y)\n", 183 | "\n" 184 | ] 185 | }, 186 | { 187 | "cell_type": 
"code", 188 | "execution_count": 13, 189 | "metadata": {}, 190 | "outputs": [ 191 | { 192 | "data": { 193 | "text/plain": [ 194 | "array(['CLOTHING', 'PATIO', 'ELECTRONICS'], dtype='" 449 | ] 450 | }, 451 | "execution_count": 29, 452 | "metadata": {}, 453 | "output_type": "execute_result" 454 | }, 455 | { 456 | "data": { 457 | "image/png": "\n", 458 | "text/plain": [ 459 | "
" 460 | ] 461 | }, 462 | "metadata": { 463 | "needs_background": "light" 464 | }, 465 | "output_type": "display_data" 466 | } 467 | ], 468 | "source": [ 469 | "from sklearn.metrics import confusion_matrix\n", 470 | "import seaborn as sn\n", 471 | "import pandas as pd\n", 472 | "import matplotlib as plt\n", 473 | "\n", 474 | "y_pred = clf.predict(test_x)\n", 475 | "\n", 476 | "labels = [Category.ELECTRONICS, Category.BOOKS, Category.CLOTHING, Category.GROCERY, Category.PATIO]\n", 477 | "\n", 478 | "cm = confusion_matrix(test_y, y_pred, labels=labels)\n", 479 | "df_cm = pd.DataFrame(cm, index=labels, columns=labels)\n", 480 | "\n", 481 | "sn.heatmap(df_cm, annot=True, fmt='d')\n" 482 | ] 483 | }, 484 | { 485 | "cell_type": "code", 486 | "execution_count": null, 487 | "metadata": {}, 488 | "outputs": [], 489 | "source": [] 490 | } 491 | ], 492 | "metadata": { 493 | "kernelspec": { 494 | "display_name": "Python 3", 495 | "language": "python", 496 | "name": "python3" 497 | }, 498 | "language_info": { 499 | "codemirror_mode": { 500 | "name": "ipython", 501 | "version": 3 502 | }, 503 | "file_extension": ".py", 504 | "mimetype": "text/x-python", 505 | "name": "python", 506 | "nbconvert_exporter": "python", 507 | "pygments_lexer": "ipython3", 508 | "version": "3.7.3" 509 | } 510 | }, 511 | "nbformat": 4, 512 | "nbformat_minor": 2 513 | } 514 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # sklearn 2 | Data & code associated with my tutorial on the scikit-learn machine learning library in Python 3 | 4 | Video link: https://youtu.be/M9Itm95JzL0 5 | 6 | Tutorial.ipynb is the file that I worked on during the video. 7 | 8 | The data directory contains several files of 1000+ Amazon reviews across different departments. 
If you want the raw data that I created these files from, check out here: http://jmcauley.ucsd.edu/data/amazon/ 9 | -------------------------------------------------------------------------------- /Tutorial.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "### Data Class" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "import random\n", 17 | "\n", 18 | "class Sentiment:\n", 19 | " NEGATIVE = \"NEGATIVE\"\n", 20 | " NEUTRAL = \"NEUTRAL\"\n", 21 | " POSITIVE = \"POSITIVE\"\n", 22 | "\n", 23 | "class Review:\n", 24 | " def __init__(self, text, score):\n", 25 | " self.text = text\n", 26 | " self.score = score\n", 27 | " self.sentiment = self.get_sentiment()\n", 28 | " \n", 29 | " def get_sentiment(self):\n", 30 | " if self.score <= 2:\n", 31 | " return Sentiment.NEGATIVE\n", 32 | " elif self.score == 3:\n", 33 | " return Sentiment.NEUTRAL\n", 34 | " else: #Score of 4 or 5\n", 35 | " return Sentiment.POSITIVE\n", 36 | "\n", 37 | "class ReviewContainer:\n", 38 | " def __init__(self, reviews):\n", 39 | " self.reviews = reviews\n", 40 | " \n", 41 | " def get_text(self):\n", 42 | " return [x.text for x in self.reviews]\n", 43 | " \n", 44 | " def get_sentiment(self):\n", 45 | " return [x.sentiment for x in self.reviews]\n", 46 | " \n", 47 | " def evenly_distribute(self):\n", 48 | " negative = list(filter(lambda x: x.sentiment == Sentiment.NEGATIVE, self.reviews))\n", 49 | " positive = list(filter(lambda x: x.sentiment == Sentiment.POSITIVE, self.reviews))\n", 50 | " positive_shrunk = positive[:len(negative)]\n", 51 | " self.reviews = negative + positive_shrunk\n", 52 | " random.shuffle(self.reviews)\n", 53 | " \n", 54 | " " 55 | ] 56 | }, 57 | { 58 | "cell_type": "markdown", 59 | "metadata": {}, 60 | "source": [ 61 | "### Load Data" 62 | ] 63 | }, 64 | { 
65 | "cell_type": "code", 66 | "execution_count": 2, 67 | "metadata": {}, 68 | "outputs": [ 69 | { 70 | "data": { 71 | "text/plain": [ 72 | "'I hoped for Mia to have some peace in this book, but her story is so real and raw. Broken World was so touching and emotional because you go from Mia\\'s trauma to her trying to cope. I love the way the story displays how there is no \"just bouncing back\" from being sexually assaulted. Mia showed us how those demons come for you every day and how sometimes they best you. I was so in the moment with Broken World and hurt with Mia because she was surrounded by people but so alone and I understood her feelings. I found myself wishing I could give her some of my courage and strength or even just to be there for her. Thank you Lizzy for putting a great character\\'s voice on a strong subject and making it so that other peoples story may be heard through Mia\\'s.'" 73 | ] 74 | }, 75 | "execution_count": 2, 76 | "metadata": {}, 77 | "output_type": "execute_result" 78 | } 79 | ], 80 | "source": [ 81 | "import json\n", 82 | "\n", 83 | "file_name = './data/sentiment/books_small_10000.json'\n", 84 | "\n", 85 | "reviews = []\n", 86 | "with open(file_name) as f:\n", 87 | " for line in f:\n", 88 | " review = json.loads(line)\n", 89 | " reviews.append(Review(review['reviewText'], review['overall']))\n", 90 | " \n", 91 | "reviews[5].text\n", 92 | " " 93 | ] 94 | }, 95 | { 96 | "cell_type": "markdown", 97 | "metadata": {}, 98 | "source": [ 99 | "### Prep Data" 100 | ] 101 | }, 102 | { 103 | "cell_type": "code", 104 | "execution_count": 39, 105 | "metadata": {}, 106 | "outputs": [], 107 | "source": [ 108 | "from sklearn.model_selection import train_test_split\n", 109 | "\n", 110 | "training, test = train_test_split(reviews, test_size=0.33, random_state=42)\n", 111 | "\n", 112 | "train_container = ReviewContainer(training)\n", 113 | "\n", 114 | "test_container = ReviewContainer(test)" 115 | ] 116 | }, 117 | { 118 | "cell_type": "code", 119 | 
"execution_count": 40, 120 | "metadata": {}, 121 | "outputs": [ 122 | { 123 | "name": "stdout", 124 | "output_type": "stream", 125 | "text": [ 126 | "436\n", 127 | "436\n" 128 | ] 129 | } 130 | ], 131 | "source": [ 132 | "train_container.evenly_distribute()\n", 133 | "train_x = train_container.get_text()\n", 134 | "train_y = train_container.get_sentiment()\n", 135 | "\n", 136 | "test_container.evenly_distribute()\n", 137 | "test_x = test_container.get_text()\n", 138 | "test_y = test_container.get_sentiment()\n", 139 | "\n", 140 | "print(train_y.count(Sentiment.POSITIVE))\n", 141 | "print(train_y.count(Sentiment.NEGATIVE))" 142 | ] 143 | }, 144 | { 145 | "cell_type": "markdown", 146 | "metadata": {}, 147 | "source": [ 148 | "#### Bag of words vectorization" 149 | ] 150 | }, 151 | { 152 | "cell_type": "code", 153 | "execution_count": 49, 154 | "metadata": {}, 155 | "outputs": [ 156 | { 157 | "name": "stdout", 158 | "output_type": "stream", 159 | "text": [ 160 | "I read this book over a year ago & enjoyed the various stories, the author takes you on a journey of life as it pretty much is in today's world & society, as you end one story you look forward to starting the next, relaxed reading I highly recommend it for peps who enjoy stories from back in their grand-ma & grand-dad days in the South. I will peruse more books by this author for future purchase.\n", 161 | "[[0. 0. 0. ... 0. 0. 
0.]]\n" 162 | ] 163 | } 164 | ], 165 | "source": [ 166 | "from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer\n", 167 | "\n", 168 | "# This book is great !\n", 169 | "# This book was so bad\n", 170 | "\n", 171 | "vectorizer = TfidfVectorizer()\n", 172 | "train_x_vectors = vectorizer.fit_transform(train_x)\n", 173 | "\n", 174 | "test_x_vectors = vectorizer.transform(test_x)\n", 175 | "\n", 176 | "print(train_x[0])\n", 177 | "print(train_x_vectors[0].toarray())\n", 178 | "\n", 179 | "\n", 180 | "\n" 181 | ] 182 | }, 183 | { 184 | "cell_type": "markdown", 185 | "metadata": {}, 186 | "source": [ 187 | "## Classification" 188 | ] 189 | }, 190 | { 191 | "cell_type": "markdown", 192 | "metadata": {}, 193 | "source": [ 194 | "#### Linear SVM\n", 195 | "\n", 196 | "\n", 197 | "\n", 198 | "\n", 199 | "\n", 200 | "\n", 201 | "\n", 202 | "\n", 203 | "\n", 204 | "\n", 205 | "\n", 206 | "\n", 207 | "\n", 208 | "\n", 209 | "\n", 210 | "\n", 211 | "\n" 212 | ] 213 | }, 214 | { 215 | "cell_type": "code", 216 | "execution_count": 50, 217 | "metadata": {}, 218 | "outputs": [ 219 | { 220 | "data": { 221 | "text/plain": [ 222 | "array(['POSITIVE'], dtype='