├── .streamlit └── config.toml ├── 1.Inital_Data_Exploration (Notebook One).ipynb ├── 2.Text_EDA_&_PreProcessing (Notebook Two).ipynb ├── 3.Model Implementation and Evaluation (Notebook Three).ipynb ├── 4.Deployment.py ├── README.md ├── data and pickle files ├── amazon_reviews_2019.csv ├── best_model.pkl ├── cleaned_data.csv ├── count_vectorizer.pkl └── updated_data.csv └── requirements.txt /.streamlit/config.toml: -------------------------------------------------------------------------------- 1 | [theme] 2 | primaryColor = "#FFC657" 3 | backgroundColor = "#FFFFFF" 4 | secondaryBackgroundColor = "#f0f2f6" 5 | textColor = "#262730" 6 | font = "serif" 7 | -------------------------------------------------------------------------------- /3.Model Implementation and Evaluation (Notebook Three).ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# MODEL IMPLEMENTATION AND EVALUATION\n", 8 | "\n", 9 | "This is the stage where the three models are built, optimized and evaluated.\n", 10 | "\n", 11 | "Models used: ``Multinomial Naive Bayes``, ``Support Vector Machine``, ``Logistic Regression``\n", 12 | "\n", 13 | "Evaluation methods used: ``accuracy, precision, recall, f1_score`` and ``confusion matrix``\n", 14 | "\n", 15 | "## Summary\n", 16 | "\n", 17 | "After appropriate evaluation, LR with count vectorizer has been deemed the best. The rest of the models all achieved above 80% accuracy, with the other metrics coming out above 79%. 
" 18 | ] 19 | }, 20 | { 21 | "cell_type": "code", 22 | "execution_count": 139, 23 | "metadata": {}, 24 | "outputs": [], 25 | "source": [ 26 | "#LIBRARIES \n", 27 | "import pandas as pd\n", 28 | "import nltk\n", 29 | "from nltk.corpus import stopwords\n", 30 | "import sklearn\n", 31 | "from sklearn.model_selection import train_test_split\n", 32 | "from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer\n", 33 | "from sklearn.naive_bayes import MultinomialNB\n", 34 | "from sklearn.svm import SVC \n", 35 | "from sklearn.linear_model import LogisticRegression\n", 36 | "from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score\n", 37 | "from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, plot_confusion_matrix\n", 38 | "import pickle\n", 39 | "import warnings\n", 40 | "warnings.simplefilter(\"ignore\")" 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": 140, 46 | "metadata": {}, 47 | "outputs": [], 48 | "source": [ 49 | "#lOADING DATASETS \n", 50 | "df = pd.read_csv(\"data and pickle files/cleaned_data.csv\",encoding=\"latin1\") #due to special charas should be encoded as latin 1\n", 51 | "\n", 52 | "toCheck = pd.read_csv(\"data and pickle files/updated_data.csv\",encoding=\"latin1\")\n", 53 | "#REMOVE MAX\n", 54 | "pd.set_option('display.max_columns', None)\n", 55 | "pd.set_option('display.max_rows', None)\n", 56 | "\n", 57 | "#DROP EXTRA COLUMNS\n", 58 | "df.drop(['Unnamed: 0'], axis=1, inplace=True)\n", 59 | "toCheck.drop(['Unnamed: 0'], axis=1, inplace=True)" 60 | ] 61 | }, 62 | { 63 | "cell_type": "markdown", 64 | "metadata": {}, 65 | "source": [ 66 | "# DOUBLE-CHECKING...\n", 67 | "\n", 68 | "Double checking if there are any NULL values within the dataset. This would cause issues later on if there are as such." 69 | ] 70 | }, 71 | { 72 | "cell_type": "code", 73 | "execution_count": 141, 74 | "metadata": {}, 75 | "outputs": [ 76 | { 77 | "data": { 78 | "text/html": [ 79 | "
\n", 80 | "\n", 93 | "\n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | "
review_textverified_purchase
191NaNTrue
523NaNTrue
1072NaNTrue
1111NaNTrue
1230NaNTrue
1316NaNTrue
\n", 134 | "
" 135 | ], 136 | "text/plain": [ 137 | " review_text verified_purchase\n", 138 | "191 NaN True\n", 139 | "523 NaN True\n", 140 | "1072 NaN True\n", 141 | "1111 NaN True\n", 142 | "1230 NaN True\n", 143 | "1316 NaN True" 144 | ] 145 | }, 146 | "execution_count": 141, 147 | "metadata": {}, 148 | "output_type": "execute_result" 149 | } 150 | ], 151 | "source": [ 152 | "#CHECKING WHICH ROW IS NULL FROM PRE-PROCESSING\n", 153 | "checkNULL = df.isnull()\n", 154 | "checkNULL = checkNULL.any(axis=1)\n", 155 | "df[checkNULL]" 156 | ] 157 | }, 158 | { 159 | "cell_type": "code", 160 | "execution_count": 142, 161 | "metadata": {}, 162 | "outputs": [], 163 | "source": [ 164 | "toCheck = toCheck.drop_duplicates().reset_index(drop=True)" 165 | ] 166 | }, 167 | { 168 | "cell_type": "code", 169 | "execution_count": 143, 170 | "metadata": {}, 171 | "outputs": [ 172 | { 173 | "data": { 174 | "text/html": [ 175 | "
\n", 176 | "\n", 189 | "\n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | "
review_textverified_purchase
191A+True
5235*True
1072veryTrue
1111Does what it shouldTrue
1230A+True
1316A*****True
\n", 230 | "
" 231 | ], 232 | "text/plain": [ 233 | " review_text verified_purchase\n", 234 | "191 A+ True\n", 235 | "523 5* True\n", 236 | "1072 very True\n", 237 | "1111 Does what it should True\n", 238 | "1230 A+ True\n", 239 | "1316 A***** True" 240 | ] 241 | }, 242 | "execution_count": 143, 243 | "metadata": {}, 244 | "output_type": "execute_result" 245 | } 246 | ], 247 | "source": [ 248 | "toCheck.iloc[[191,523,1072,1111,1230,1316],[3,4]]" 249 | ] 250 | }, 251 | { 252 | "cell_type": "markdown", 253 | "metadata": {}, 254 | "source": [ 255 | "Checking the old csv file, it can be seen why the six rows were totally cleaned out within their review_text. That was because within the text processing stage previously, only words which held meaning were kept, and if we refer to the second table we can see that most of them were either stopwords or had symbols and numbers. Since they don't hold meaning either way, these will be dropped subsequently. " 256 | ] 257 | }, 258 | { 259 | "cell_type": "code", 260 | "execution_count": 144, 261 | "metadata": {}, 262 | "outputs": [], 263 | "source": [ 264 | "#DROP THE NULL ROWS\n", 265 | "df = df.dropna(how='any',axis=0) " 266 | ] 267 | }, 268 | { 269 | "cell_type": "code", 270 | "execution_count": 145, 271 | "metadata": {}, 272 | "outputs": [ 273 | { 274 | "data": { 275 | "text/plain": [ 276 | "False 0.525701\n", 277 | "True 0.474299\n", 278 | "Name: verified_purchase, dtype: float64" 279 | ] 280 | }, 281 | "execution_count": 145, 282 | "metadata": {}, 283 | "output_type": "execute_result" 284 | } 285 | ], 286 | "source": [ 287 | "#UPDATED VP VALUES \n", 288 | "df[\"verified_purchase\"].value_counts(normalize=True)" 289 | ] 290 | }, 291 | { 292 | "cell_type": "markdown", 293 | "metadata": {}, 294 | "source": [ 295 | "The change barely had any effect on the T/F values, and thus we are ready to proceed." 
296 | ] 297 | }, 298 | { 299 | "cell_type": "markdown", 300 | "metadata": {}, 301 | "source": [ 302 | "# MODELING\n", 303 | "\n", 304 | "Within the dataset, there are currently only two columns. Out of the two, review_text is going to be assigned as the input variable, and verified_purchase as the target variable. The data is then going to be split accordingly." 305 | ] 306 | }, 307 | { 308 | "cell_type": "code", 309 | "execution_count": 146, 310 | "metadata": {}, 311 | "outputs": [], 312 | "source": [ 313 | "#ASSIGN THE VARIABLES\n", 314 | "X = df['review_text'] #input var\n", 315 | "y = df['verified_purchase'] #target var" 316 | ] 317 | }, 318 | { 319 | "cell_type": "code", 320 | "execution_count": 147, 321 | "metadata": {}, 322 | "outputs": [ 323 | { 324 | "name": "stdout", 325 | "output_type": "stream", 326 | "text": [ 327 | "Number of rows:\n", 328 | "Entire dataset: 1712\n", 329 | "Train dataset: 1027\n", 330 | "Test dataset: 685\n" 331 | ] 332 | } 333 | ], 334 | "source": [ 335 | "#SPLIT DATA\n", 336 | "X_train, X_test, y_train, y_test = train_test_split(\n", 337 | " df['review_text'], df['verified_purchase'],test_size=0.4, random_state=42) #40% gives best results, 42 is no of life...\n", 338 | "\n", 339 | "entiredf = format(df.shape[0])\n", 340 | "traindf = format(X_train.shape[0])\n", 341 | "testdf = format(X_test.shape[0])\n", 342 | "\n", 343 | "print('Number of rows:')\n", 344 | "print('Entire dataset:', entiredf)\n", 345 | "print('Train dataset:', traindf)\n", 346 | "print('Test dataset:',testdf)" 347 | ] 348 | }, 349 | { 350 | "cell_type": "markdown", 351 | "metadata": {}, 352 | "source": [ 353 | "The data is split 60 - 40, a ratio that has been determined by trial and error. This split produces the highest accuracy for the models, and thus we are going with that. 
" 354 | ] 355 | }, 356 | { 357 | "cell_type": "markdown", 358 | "metadata": {}, 359 | "source": [ 360 | "## COUNT VECTORIZER AND MODELING\n", 361 | "\n", 362 | "Word vectorization maps words or phrases from a lexicon to a matching vector of real numbers, which may then be used to determine word predictions and semantics. This is done because models only understand numerical data.\n", 363 | "\n", 364 | "We are going to be utilizing two of the vectorization methods, the first one being count vectorizer. With CountVectorizer, we simply count the number of times a word appears in the document, which results in a bias in favor of the most common terms." 365 | ] 366 | }, 367 | { 368 | "cell_type": "code", 369 | "execution_count": 148, 370 | "metadata": {}, 371 | "outputs": [ 372 | { 373 | "name": "stdout", 374 | "output_type": "stream", 375 | "text": [ 376 | "\n", 377 | "Vocabulary: \n", 378 | " {'current': 413, 'sell': 1498, 'price': 1310, 'compar': 341, 'supermarket': 1705, 'good': 760, 'smell': 1563, 'pleasant': 1270, 'need': 1134, 'add': 14, 'small': 1558, 'cloth': 319, 'fresh': 700, 'great': 776, 'moisturis': 1100, 'sensit': 1502, 'love': 1009, 'pour': 1293, 'smaller': 1559, 'bottl': 193, 'make': 1032, 'manag': 1036, 'beat': 136, 'decent': 432, 'fabric': 614, 'soften': 1580, 'nice': 1142, 'fragranc': 695, 'purchas': 1334, 'deliveri': 456, 'cream': 397, 'handwash': 801, 'cheaper': 281, 'hand': 798, 'better': 154, 'liquid': 988, 'oh': 1174, 'wing': 1917, 'dove': 514, 'kitchen': 940, 'bathroom': 131, 'shower': 1527, 'room': 1436, 'recommend': 1378, 'highli': 830, 'star': 1636, 'simpl': 1537, 'gel': 727, 'like': 977, 'glue': 753, 'hard': 805, 'rub': 1444, 'slip': 1555, 'bath': 130, 'goe': 756, 'smoothli': 1569, 'easili': 539, 'wast': 1885, 'leav': 964, 'feel': 643, 'silki': 1535, 'soft': 1579, 'scenti': 1477, 'review': 1419, 'collect': 326, 'promot': 1322, 'excel': 589, 'everi': 580, 'day': 426, 'facial': 617, 'wash': 1882, 'excess': 591, 'face': 616, 
'strip': 1669, 'natur': 1127, 'oil': 1175, 'care': 256, 'routin': 1442, 'morn': 1108, 'night': 1144, 'clean': 308, 'brilliant': 213, 'offer': 1172, 'gorgeou': 762, 'amaz': 52, 'valu': 1848, 'girli': 739, 'hair': 794, 'buy': 235, 'chang': 276, 'preserv': 1304, 'come': 331, 'bad': 109, 'sore': 1599, 'rash': 1354, 'eye': 608, 'burn': 230, 'lip': 987, 'tingl': 1775, 'phone': 1255, 'told': 1785, 'stuff': 1677, 'ask': 92, 'said': 1454, 'know': 943, 'want': 1878, 'reason': 1368, 'mayb': 1056, 'save': 1468, 'money': 1103, 'ingredi': 897, 'nearli': 1130, 'year': 1950, 'sinc': 1541, 'nivea': 1147, 'sold': 1583, 'compani': 339, 'german': 733, 'im': 872, 'realli': 1366, 'angri': 56, 'suppos': 1708, 'ok': 1177, 'rubbish': 1446, 'burnt': 231, 'pleas': 1269, 'usual': 1846, 'stock': 1651, 'fulli': 713, 'asda': 90, 'gave': 726, 'refund': 1386, 'gift': 735, 'card': 253, 'receipt': 1371, 'pocket': 1277, 'condition': 355, 'normal': 1151, 'oili': 1176, 'week': 1896, 'saw': 1470, 'differ': 480, 'felt': 646, 'cleans': 310, 'clearer': 313, 'notic': 1156, 'straightaway': 1659, 'red': 1381, 'blemish': 169, 'previou': 1308, 'kid': 936, 'doesnt': 505, 'irrit': 914, 'scent': 1476, 'littl': 993, 'bit': 162, 'long': 999, 'way': 1892, 'similar': 1536, 'perfect': 1236, 'got': 764, 'coupl': 390, 'ago': 32, 'refresh': 1385, 'bodi': 181, 'smooth': 1567, 'cucumb': 408, 'relax': 1393, 'best': 153, 'came': 243, 'separ': 1504, 'packet': 1208, 'sealabl': 1491, 'affect': 25, 'rough': 1440, 'dri': 525, 'otherwis': 1193, 'fantast': 631, 'lot': 1007, 'effort': 549, 'reduc': 1382, 'plastic': 1268, 'concentr': 350, 'buyer': 236, 'say': 1471, 'larger': 949, 'probabl': 1316, 'explain': 601, 'label': 945, 'fuchsia': 711, 'perfum': 1239, 'version': 1864, 'overpow': 1203, 'big': 155, 'plu': 1274, 'difficulti': 482, 'intens': 907, 'past': 1226, 'note': 1153, 'outer': 1194, 'sleev': 1550, 'recycl': 1380, 'dispos': 498, 'charg': 279, 'kind': 937, 'overbear': 1199, 'anyon': 66, 'glow': 751, 'afford': 26, 'comfort': 332, 
'creation': 400, 'round': 1441, 'market': 1044, 'close': 318, 'match': 1053, 'honeysuckl': 846, 'sandalwood': 1463, 'person': 1246, 'favourit': 638, 'howev': 855, 'descript': 465, 'ad': 13, 'recent': 1373, 'amazon': 55, 'pantri': 1215, 'order': 1189, 'lamin': 946, 'tile': 1771, 'floor': 670, 'subtl': 1685, 'streak': 1662, 'free': 698, 'shine': 1523, 'moistur': 1099, 'essenti': 577, 'tri': 1808, 'time': 1773, 'today': 1781, 'packag': 1207, 'easi': 537, 'open': 1182, 'squeez': 1627, 'releas': 1394, 'puff': 1330, 'froth': 707, 'pure': 1335, 'smear': 1562, 'white': 1908, 'absorb': 4, 'non': 1148, 'greasi': 775, 'appli': 73, 'think': 1759, 'expens': 598, 'cheap': 280, 'qualiti': 1340, 'dairi': 419, 'aroma': 84, 'dont': 508, 'drench': 523, 'lotion': 1008, 'pretti': 1306, 'worth': 1937, 'thank': 1752, 'item': 919, 'describ': 464, 'rapid': 1352, 'stop': 1653, 'static': 1641, 'chocol': 294, 'flavour': 669, 'creami': 398, 'soap': 1576, 'anyth': 67, 'els': 553, 'lather': 954, 'noth': 1154, 'fail': 621, 'protect': 1326, 'gentl': 730, 'basic': 126, 'harm': 806, 'nourish': 1157, 'tasti': 1733, 'beef': 143, 'tomato': 1786, 'grate': 772, 'cheddar': 284, 'chees': 286, 'mix': 1091, 'nd': 1128, 'let': 969, 'sit': 1544, 'minut': 1083, 'regret': 1388, 'set': 1509, 'took': 1791, 'expect': 597, 'size': 1546, 'persil': 1244, 'quit': 1348, 'pictur': 1258, 'mislead': 1087, 'apart': 71, 'cheer': 285, 'makeup': 1034, 'eas': 536, 'micellar': 1074, 'water': 1888, 'left': 965, 'pack': 1206, 'extra': 602, 'impresss': 881, 'content': 368, 'aw': 103, 'cut': 416, 'right': 1427, 'fingernail': 658, 'turn': 1818, 'upsid': 1844, 'approach': 78, 'empti': 558, 'ridicul': 1426, 'design': 466, 'bought': 194, 'lumber': 1016, 'home': 841, 'larg': 948, 'decant': 431, 'exist': 594, 'contain': 367, 'tell': 1740, 'straight': 1658, 'away': 105, 'despit': 469, 'compart': 343, 'machin': 1025, 'towel': 1796, 'unlik': 1835, 'manufactur': 1040, 'healthi': 817, 'definit': 447, 'forese': 684, 'futur': 720, 'gotten': 766, 
'result': 1415, 'brillant': 212, 'brand': 201, 'absolut': 3, 'fan': 629, 'wow': 1939, 'factor': 619, 'base': 123, 'commerci': 335, 'help': 825, 'disappoint': 489, 'especi': 576, 'point': 1279, 'ice': 866, 'chunk': 301, 'difficult': 481, 'eat': 540, 'entir': 569, 'tub': 1814, 'nasti': 1126, 'chemic': 287, 'super': 1701, 'famili': 628, 'young': 1956, 'old': 1179, 'friendli': 705, 'mean': 1059, 'colour': 329, 'honestli': 844, 'problemat': 1318, 'prone': 1323, 'dermat': 462, 'tast': 1732, 'weed': 1895, 'vinegar': 1867, 'work': 1932, 'remov': 1403, 'havent': 811, 'hamper': 797, 'christma': 298, 'sure': 1709, 'magnum': 1029, 'incas': 885, 'amazebal': 53, 'togeth': 1782, 'delici': 453, 'superb': 1702, 'kit': 939, 'bio': 159, 'stubborn': 1674, 'stain': 1631, 'word': 1931, 'warn': 1881, 'handl': 800, 'tend': 1743, 'ensur': 566, 'thoroughli': 1763, 'rins': 1428, 'repair': 1405, 'guy': 790, 'fals': 627, 'economi': 543, 'build': 226, 'caus': 261, 'issu': 916, 'huggabl': 858, 'strong': 1671, 'alcohol': 36, 'pot': 1291, 'noodl': 1149, 'student': 1676, 'yuck': 1959, 'addit': 15, 'sauc': 1466, 'pasta': 1227, 'carboard': 250, 'situat': 1545, 'access': 6, 'kettl': 933, 'chose': 297, 'mugshot': 1116, 'mistak': 1089, 'scrumptiou': 1488, 'authent': 99, 'linger': 984, 'bonu': 188, 'percent': 1235, 'fact': 618, 'husband': 862, 'classic': 307, 'lynx': 1024, 'deodor': 459, 'regularli': 1390, 'number': 1160, 'skincar': 1547, 'avoid': 102, 'toner': 1788, 'experi': 599, 'anoth': 62, 'major': 1031, 'extrem': 605, 'flaki': 665, 'sting': 1649, 'dewi': 475, 'look': 1003, 'deffin': 440, 'opinion': 1183, 'reccomend': 1370, 'deal': 429, 'slow': 1556, 'cook': 375, 'beauti': 138, 'tip': 1777, 'mark': 1043, 'alway': 51, 'fresher': 701, 'regular': 1389, 'late': 951, 'overal': 1198, 'improv': 882, 'claim': 304, 'perform': 1238, 'miracl': 1084, 'lol': 997, 'reaction': 1360, 'happen': 802, 'winter': 1919, 'weather': 1894, 'central': 265, 'heat': 819, 'vaselin': 1853, 'dealt': 430, 'includ': 887, 'elbow': 
551, 'knee': 941, 'place': 1263, 'glide': 746, 'dream': 522, 'summer': 1695, 'spent': 1614, 'lay': 958, 'beach': 133, 'eyemak': 611, 'men': 1067, 'boot': 190, 'tame': 1726, 'divin': 500, 'iron': 913, 'tear': 1737, 'penni': 1233, 'scale': 1472, 'deliv': 455, 'quickli': 1346, 'shampoo': 1516, 'leak': 963, 'tan': 1727, 'sunb': 1697, 'spray': 1624, 'thought': 1764, 'clear': 312, 'friend': 704, 'huge': 857, 'indoor': 892, 'didnt': 477, 'exxtra': 607, 'fast': 633, 'thankyou': 1753, 'half': 796, 'shop': 1526, 'line': 982, 'fake': 625, 'africa': 28, 'poorer': 1282, 'new': 1138, 'softer': 1581, 'particularli': 1221, 'wrap': 1940, 'storag': 1655, 'measur': 1061, 'touch': 1794, 'brighter': 209, 'cleaner': 309, 'afterward': 29, 'everyon': 583, 'impress': 880, 'neutral': 1137, 'defiantli': 442, 'fab': 613, 'rate': 1356, 'mani': 1038, 'micel': 1073, 'wipe': 1920, 'total': 1793, 'alreadi': 46, 'receiv': 1372, 'feed': 642, 'event': 578, 'absoulut': 5, 'stick': 1646, 'job': 926, 'live': 994, 'hype': 865, 'clearli': 314, 'lead': 961, 'leader': 962, 'inspir': 901, 'confid': 357, 'certainli': 269, 'harsh': 808, 'favorit': 636, 'imposs': 879, 'sourc': 1603, 'check': 283, 'target': 1731, 'piec': 1260, 'crack': 395, 'bare': 119, 'far': 632, 'concern': 352, 'waterproof': 1890, 'mascara': 1048, 'powder': 1295, 'return': 1417, 'paid': 1210, 'daili': 418, 'basi': 125, 'age': 30, 'multipl': 1117, 'bulk': 227, 'door': 509, 'funnel': 717, 'easier': 538, 'section': 1496, 'washer': 1883, 'moreov': 1107, 'thing': 1758, 'exactli': 588, 'someth': 1588, 'odor': 1169, 'blend': 170, 'energis': 563, 'complain': 345, 'lid': 971, 'bin': 158, 'spill': 1618, 'everywher': 586, 'box': 197, 'ml': 1093, 'hope': 847, 'misl': 1086, 'month': 1104, 'start': 1637, 'enjoy': 565, 'mr': 1114, 'hinch': 832, 'glad': 742, 'scratch': 1481, 'scour': 1480, 'pad': 1209, 'minki': 1082, 'spong': 1621, 'oven': 1197, 'hob': 837, 'tap': 1730, 'sink': 1543, 'screen': 1483, 'gleam': 745, 'compact': 338, 'rang': 1351, 'actual': 12, 
'complet': 347, 'ha': 792, 'excema': 590, 'hit': 834, 'everyday': 582, 'massiv': 1052, 'cap': 245, 'lenor': 967, 'unstopp': 1840, 'convert': 373, 'mild': 1077, 'messi': 1071, 'step': 1645, 'dad': 417, 'awhil': 107, 'suitabl': 1694, 'obviou': 1162, 'choic': 295, 'believ': 146, 'continu': 369, 'term': 1746, 'benefit': 150, 'happi': 803, 'alround': 48, 'consist': 363, 'dilut': 483, 'guess': 789, 'blue': 177, 'pigment': 1261, 'act': 11, 'reli': 1395, 'hydrat': 864, 'mixtur': 1092, 'varieti': 1851, 'reach': 1358, 'matter': 1054, 'neg': 1135, 'pump': 1332, 'end': 560, 'foam': 677, 'met': 1072, 'standard': 1634, 'bar': 118, 'quarter': 1342, 'truli': 1813, 'load': 995, 'sometim': 1589, 'stapl': 1635, 'heal': 816, 'abras': 2, 'boy': 198, 'store': 1656, 'cupboard': 410, 'dead': 428, 'therapi': 1754, 'origin': 1192, 'mini': 1080, 'arriv': 85, 'real': 1363, 'delight': 454, 'quick': 1345, 'subscrib': 1682, 'light': 975, 'bubbl': 223, 'floral': 671, 'masculin': 1049, 'bigger': 156, 'volum': 1870, 'garag': 723, 'known': 944, 'sweet': 1719, 'textur': 1751, 'artifici': 89, 'justv': 929, 'hour': 852, 'pale': 1212, 'disastr': 490, 'success': 1686, 'creat': 399, 'spot': 1623, 'abl': 1, 'deepli': 438, 'exfoli': 593, 'wont': 1928, 'suffer': 1689, 'babi': 108, 'style': 1679, 'genuin': 732, 'neatli': 1131, 'everyth': 584, 'shall': 1514, 'tresmemm': 1807, 'fussi': 719, 'suit': 1693, 'therefor': 1755, 'prefer': 1301, 'burst': 232, 'transit': 1802, 'snack': 1572, 'went': 1900, 'weekend': 1897, 'decid': 433, 'wet': 1901, 'cashmer': 259, 'sweater': 1718, 'sainsburi': 1455, 'scrummi': 1487, 'delic': 452, 'requir': 1410, 'bring': 214, 'runni': 1449, 'glam': 743, 'somewher': 1591, 'follow': 680, 'inexpens': 894, 'plain': 1264, 'mother': 1112, 'law': 957, 'wat': 1886, 'discount': 492, 'tin': 1774, 'sooth': 1597, 'residu': 1411, 'cool': 378, 'readi': 1362, 'magic': 1027, 'latest': 953, 'variat': 1850, 'address': 16, 'environment': 571, 'transport': 1803, 'cost': 384, 'commend': 333, 'sens': 1500, 
'detect': 470, 'breath': 205, 'form': 687, 'slightli': 1552, 'zesti': 1961, 'user': 1845, 'repres': 1408, 'lessen': 968, 'impact': 877, 'environ': 570, 'albeit': 35, 'smallest': 1560, 'ideal': 869, 'son': 1592, 'adult': 20, 'children': 292, 'belov': 147, 'pet': 1250, 'incred': 890, 'lolli': 998, 'bargain': 120, 'introduc': 909, 'partner': 1223, 'sadli': 1452, 'share': 1517, 'hot': 851, 'melt': 1065, 'drip': 528, 'finish': 659, 'favour': 637, 'properli': 1324, 'inferior': 895, 'tesco': 1748, 'microwav': 1076, 'sever': 1510, 'badli': 111, 'lumpi': 1017, 'mostli': 1109, 'simplic': 1539, 'gone': 758, 'effect': 547, 'trace': 1799, 'fanci': 630, 'maskara': 1050, 'remain': 1399, 'sticki': 1647, 'defin': 444, 'heavi': 820, 'frequent': 699, 'perhap': 1240, 'capsul': 247, 'household': 854, 'terribl': 1747, 'allerg': 37, 'constant': 364, 'hay': 812, 'fever': 648, 'wrong': 1945, 'sign': 1529, 'gluten': 754, 'casserol': 260, 'sat': 1464, 'toilet': 1783, 'barley': 122, 'coeliac': 323, 'peopl': 1234, 'unsuit': 1841, 'pick': 1257, 'signific': 1531, 'rel': 1392, 'sturdier': 1678, 'travel': 1804, 'hous': 853, 'tini': 1776, 'surfac': 1711, 'pic': 1256, 'wonder': 1926, 'deni': 457, 'fix': 663, 'instead': 904, 'whenev': 1903, 'react': 1359, 'true': 1812, 'luxuri': 1023, 'prime': 1312, 'conveni': 372, 'incorpor': 888, 'cleanser': 311, 'eventu': 579, 'margin': 1042, 'obvious': 1163, 'comment': 334, 'appreci': 76, 'break': 203, 'vegan': 1856, 'daughter': 425, 'persuad': 1248, 'hesit': 826, 'pay': 1230, 'forward': 692, 'soup': 1602, 'risotto': 1432, 'younger': 1957, 'menopaus': 1068, 'calm': 240, 'wrinkl': 1942, 'crepey': 401, 'layer': 959, 'conjunct': 359, 'hyaluron': 863, 'companion': 340, 'loss': 1005, 'firm': 660, 'oz': 1205, 'csmart': 406, 'screw': 1484, 'determin': 472, 'loos': 1004, 'knife': 942, 'wise': 1921, 'advoid': 24, 'smart': 1561, 'local': 996, 'okay': 1178, 'meant': 1060, 'equal': 572, 'artif': 88, 'upset': 1843, 'simpli': 1538, 'english': 564, 'mustard': 1122, 'colman': 
327, 'sunday': 1698, 'roast': 1433, 'tube': 1815, 'invari': 910, 'serv': 1507, 'nozzl': 1159, 'block': 172, 'someon': 1587, 'scrub': 1486, 'extremli': 606, 'frangranc': 697, 'fairli': 624, 'spread': 1625, 'soak': 1575, 'smelt': 1566, 'wilkinson': 1914, 'area': 80, 'soon': 1593, 'becom': 139, 'scalp': 1474, 'satisfi': 1465, 'sorri': 1600, 'voucher': 1871, 'post': 1289, 'slimi': 1554, 'problem': 1317, 'unhappi': 1832, 'respons': 1413, 'treat': 1805, 'portug': 1287, 'miss': 1088, 'tea': 1736, 'straighten': 1660, 'curl': 412, 'damag': 420, 'seller': 1499, 'pod': 1278, 'bag': 113, 'filler': 653, 'rich': 1423, 'special': 1609, 'elsewher': 554, 'laundri': 956, 'hate': 809, 'arrog': 86, 'unilev': 1833, 'hold': 838, 'palm': 1213, 'fine': 656, 'paragon': 1217, 'champ': 274, 'tight': 1769, 'win': 1916, 'doubl': 512, 'struggl': 1673, 'tough': 1795, 'unbeliev': 1825, 'compliment': 348, 'august': 98, 'itchi': 918, 'stream': 1664, 'grandchildren': 770, 'unfortun': 1831, 'green': 778, 'lime': 979, 'individu': 891, 'dandruff': 423, 'anti': 63, 'prior': 1314, 'particular': 1220, 'busi': 233, 'mum': 1118, 'admit': 18, 'neglect': 1136, 'empathis': 557, 'trial': 1809, 'opportun': 1184, 'reconnect': 1379, 'cme': 320, 'smother': 1570, 'offens': 1171, 'alo': 41, 'applic': 74, 'bedtim': 141, 'verdict': 1860, 'thumb': 1766, 'ye': 1949, 'second': 1493, 'lightweight': 976, 'tendenc': 1744, 'pull': 1331, 'liber': 970, 'discreet': 494, 'overwhelm': 1204, 'protector': 1327, 'retain': 1416, 'ezyema': 612, 'switch': 1721, 'radiant': 1349, 'regim': 1387, 'thinner': 1760, 'heavili': 821, 'cancel': 244, 'subscript': 1683, 'wait': 1874, 'fyi': 721, 'attract': 97, 'floweri': 673, 'badeda': 110, 'holiday': 840, 'sun': 1696, 'till': 1772, 'stope': 1654, 'blow': 176, 'prevent': 1307, 'transform': 1801, 'nose': 1152, 'dehydr': 451, 'sort': 1601, 'rest': 1414, 'sleep': 1549, 'caramel': 248, 'marshmallow': 1046, 'liter': 991, 'calori': 242, 'bomb': 185, 'gross': 782, 'gonna': 759, 'throw': 1765, 'coca': 322, 
'butter': 234, 'marmit': 1045, 'lover': 1011, 'glass': 744, 'jar': 923, 'oddli': 1168, 'given': 740, 'persev': 1243, 'shini': 1524, 'root': 1437, 'drier': 527, 'grown': 785, 'horribl': 849, 'agre': 33, 'tighter': 1770, 'import': 878, 'confus': 358, 'milk': 1079, 'maker': 1033, 'risk': 1431, 'underwear': 1830, 'ultra': 1824, 'twice': 1819, 'combin': 330, 'harmoni': 807, 'chicken': 290, 'mushroom': 1121, 'pie': 1259, 'salt': 1458, 'ruin': 1447, 'chuck': 300, 'tongu': 1790, 'season': 1492, 'accident': 7, 'swallow': 1715, 'sea': 1489, 'xd': 1946, 'older': 1180, 'wide': 1911, 'awak': 104, 'flakey': 664, 'smelli': 1564, 'pit': 1262, 'plenti': 1273, 'longer': 1000, 'comparison': 342, 'yo': 1954, 'born': 191, 'defient': 443, 'odourless': 1170, 'suppl': 1706, 'nightli': 1145, 'bed': 140, 'clog': 317, 'pore': 1285, 'replenish': 1407, 'daytim': 427, 'reappli': 1367, 'overnight': 1202, 'unscent': 1837, 'patch': 1228, 'stone': 1652, 'munchi': 1119, 'food': 682, 'depend': 461, 'wish': 1922, 'talk': 1724, 'visit': 1869, 'certifi': 270, 'british': 215, 'foundat': 693, 'freshli': 702, 'launder': 955, 'reliabl': 1396, 'cooler': 379, 'wahs': 1873, 'soapi': 1577, 'type': 1820, 'downsid': 515, 'hole': 839, 'tresemm': 1806, 'stand': 1633, 'yummi': 1960, 'sugari': 1691, 'eco': 541, 'parcel': 1218, 'brain': 199, 'op': 1181, 'wife': 1912, 'safe': 1453, 'recomend': 1377, 'aswel': 94, 'moist': 1097, 'surf': 1710, 'stay': 1642, 'articl': 87, 'acn': 10, 'scar': 1475, 'opposit': 1185, 'life': 973, 'visibl': 1868, 'fade': 620, 'dermatologist': 463, 'game': 722, 'changer': 277, 'main': 1030, 'test': 1749, 'singl': 1542, 'mayo': 1057, 'bod': 180, 'bat': 127, 'quench': 1343, 'fell': 645, 'cooki': 377, 'dough': 513, 'superdrug': 1703, 'prize': 1315, 'buzz': 237, 'limit': 981, 'edit': 546, 'stayer': 1643, 'scratchi': 1482, 'rip': 1429, 'mistreat': 1090, 'dock': 501, 'reus': 1418, 'revitalis': 1420, 'gentli': 731, 'impur': 883, 'servic': 1508, 'fault': 634, 'rocemmend': 1434, 'almond': 40, 'mouth': 
1113, 'fave': 635, 'dark': 424, 'passion': 1225, 'narrow': 1125, 'accur': 8, 'slim': 1553, 'pourer': 1294, 'dribbl': 526, 'washload': 1884, 'flower': 672, 'hint': 833, 'fruit': 709, 'mango': 1037, 'brought': 219, 'cornet': 381, 'luvli': 1021, 'carri': 257, 'worri': 1934, 'man': 1035, 'choos': 296, 'sweat': 1717, 'mayonnais': 1058, 'substanc': 1684, 'squeezi': 1628, 'geniu': 729, 'idea': 868, 'tall': 1725, 'unstabl': 1839, 'ive': 922, 'fridg': 703, 'tumbl': 1816, 'brittl': 216, 'shatter': 1520, 'groundhog': 783, 'cri': 402, 'overhaul': 1200, 'curri': 414, 'besid': 152, 'sachet': 1451, 'edibl': 545, 'phenomen': 1253, 'function': 715, 'longest': 1001, 'rid': 1425, 'kept': 932, 'drawback': 519, 'suppli': 1707, 'wear': 1893, 'diabet': 476, 'carbohydr': 251, 'nutrit': 1161, 'inform': 896, 'whatsoev': 1902, 'wall': 1877, 'specif': 1610, 'portion': 1286, 'whitehead': 1909, 'bash': 124, 'inadvert': 884, 'deterg': 471, 'fairi': 623, 'joint': 927, 'biolog': 161, 'septic': 1505, 'tank': 1728, 'dispens': 497, 'insert': 899, 'suffici': 1690, 'condit': 354, 'competit': 344, 'hav': 810, 'btilliant': 221, 'wild': 1913, 'enthusiast': 567, 'err': 573, 'gener': 728, 'custom': 415, 'write': 1944, 'uncomfort': 1829, 'clash': 305, 'coat': 321, 'sickli': 1528, 'raspberri': 1355, 'cover': 392, 'core': 380, 'remind': 1402, 'cheapest': 282, 'imagin': 873, 'insipid': 900, 'rush': 1450, 'develop': 473, 'process': 1319, 'bitter': 164, 'rippl': 1430, 'consid': 361, 'lush': 1020, 'suggest': 1692, 'petrolatum': 1251, 'damp': 421, 'eczema': 544, 'rosacea': 1439, 'coz': 394, 'wors': 1935, 'apprehens': 77, 'broken': 218, 'fit': 662, 'purpos': 1338, 'sport': 1622, 'mad': 1026, 'teenag': 1738, 'promis': 1321, 'moisturisor': 1101, 'deoder': 458, 'discov': 493, 'shame': 1515, 'broke': 217, 'manli': 1039, 'foami': 678, 'wondr': 1927, 'variou': 1852, 'chamomil': 273, 'weight': 1899, 'thicker': 1757, 'space': 1604, 'nicest': 1143, 'mmmm': 1094, 'bo': 178, 'beater': 137, 'plainli': 1265, 'bright': 207, 
'bold': 184, 'grudg': 786, 'dirti': 487, 'tablet': 1722, 'necess': 1132, 'weekli': 1898, 'groceri': 781, 'budget': 225, 'cake': 239, 'exot': 595, 'veget': 1857, 'ariel': 81, 'domin': 507, 'global': 748, 'consum': 365, 'giant': 734, 'expert': 600, 'dodgi': 503, 'imit': 874, 'sub': 1680, 'guarante': 788, 'high': 829, 'rubberi': 1445, 'gossam': 763, 'case': 258, 'brightli': 210, 'funki': 716, 'purpl': 1337, 'grey': 780, 'swirl': 1720, 'gu': 787, 'outsid': 1195, 'grade': 767, 'disappear': 488, 'mysteri': 1123, 'sphinx': 1616, 'giza': 741, 'bermuda': 151, 'triangl': 1810, 'voynich': 1872, 'manuscript': 1041, 'xlarg': 1947, 'discontinu': 491, 'option': 1187, 'limescal': 980, 'unclog': 1828, 'wake': 1875, 'plump': 1276, 'ill': 871, 'pun': 1333, 'intend': 906, 'altern': 49, 'forth': 690, 'healthier': 818, 'advic': 22, 'minimum': 1081, 'annoy': 60, 'asid': 91, 'run': 1448, 'plug': 1275, 'potenti': 1292, 'increas': 889, 'member': 1066, 'endors': 561, 'practic': 1298, 'nappi': 1124, 'pyramid': 1339, 'warm': 1880, 'present': 1303, 'st': 1630, 'rememb': 1401, 'repurchas': 1409, 'figur': 652, 'medicin': 1062, 'cabinet': 238, 'salti': 1459, 'yeast': 1951, 'honest': 843, 'toast': 1780, 'fluctuat': 674, 'particulr': 1222, 'watch': 1887, 'smellllllll': 1565, 'sauna': 1467, 'outstand': 1196, 'turkey': 1817, 'neck': 1133, 'shake': 1512, 'graviti': 774, 'vastli': 1854, 'sharp': 1519, 'begin': 144, 'assum': 93, 'grow': 784, 'ocado': 1164, 'sampl': 1460, 'state': 1639, 'formula': 689, 'correct': 383, 'later': 952, 'boost': 189, 'sooo': 1595, 'complaint': 346, 'concept': 351, 'recal': 1369, 'tetra': 1750, 'public': 1329, 'innov': 898, 'refil': 1384, 'cardboard': 254, 'dose': 511, 'counter': 388, 'read': 1361, 'print': 1313, 'wrapper': 1941, 'magnif': 1028, 'anim': 57, 'whilst': 1906, 'provid': 1328, 'headquart': 815, 'offic': 1173, 'produc': 1320, 'countri': 389, 'sale': 1457, 'carbon': 252, 'footprint': 683, 'torn': 1792, 'extrat': 604, 'wherea': 1904, 'secondli': 1494, 'spare': 1607, 
'rib': 1422, 'persdper': 1242, 'reorder': 1404, 'somewhat': 1590, 'feminin': 647, 'perspir': 1247, 'danc': 422, 'bay': 132, 'waxi': 1891, 'shark': 1518, 'ben': 148, 'jerri': 925, 'fragrant': 696, 'bargin': 121, 'vanilla': 1849, 'iritatw': 912, 'tbh': 1735, 'loo': 1002, 'conceal': 349, 'moment': 1102, 'coverag': 393, 'brighten': 208, 'glowi': 752, 'perfectli': 1237, 'view': 1866, 'consciou': 360, 'paper': 1216, 'temperament': 1741, 'lazi': 960, 'girl': 737, 'solut': 1585, 'deep': 437, 'repeat': 1406, 'versatil': 1863, 'heel': 822, 'econom': 542, 'defo': 449, 'direct': 485, 'chapstick': 278, 'jot': 928, 'bob': 179, 'uncl': 1827, 'awesom': 106, 'doeant': 504, 'soggi': 1582, 'mess': 1070, 'dish': 496, 'lux': 1022, 'plan': 1266, 'kcal': 930, 'control': 371, 'diet': 479, 'biodegrad': 160, 'forev': 685, 'blackhead': 165, 'zone': 1962, 'fed': 641, 'tone': 1787, 'anymor': 65, 'bland': 166, 'wateri': 1889, 'gold': 757, 'spring': 1626, 'cold': 325, 'cardigan': 255, 'thirti': 1761, 'degre': 450, 'tie': 1768, 'boil': 182, 'bikini': 157, 'dedic': 436, 'ylang': 1953, 'town': 1797, 'aaaaamaz': 0, 'tattoooo': 1734, 'previous': 1309, 'admir': 17, 'blast': 167, 'whiff': 1905, 'planet': 1267, 'pollut': 1280, 'funni': 718, 'cone': 356, 'crispi': 403, 'cornetto': 382, 'flare': 666, 'itch': 917, 'crazi': 396, 'escap': 575, 'lift': 974, 'drop': 529, 'instantli': 903, 'relief': 1397, 'effortlessli': 550, 'alot': 44, 'effici': 548, 'linen': 983, 'pre': 1300, 'extract': 603, 'honey': 845, 'suckl': 1687, 'yesterday': 1952, 'perman': 1241, 'cherri': 288, 'blossom': 174, 'pea': 1231, 'sandal': 1462, 'wood': 1929, 'cup': 409, 'lunch': 1018, 'king': 938, 'gram': 769, 'antiperspir': 64, 'unblock': 1826, 'gradual': 768, 'pleasantli': 1271, 'surpris': 1712, 'cours': 391, 'fcuk': 639, 'opt': 1186, 'starter': 1638, 'newborn': 1139, 'gym': 791, 'hunger': 859, 'challeng': 272, 'strength': 1666, 'endur': 562, 'helmann': 824, 'bone': 187, 'tendon': 1745, 'appeal': 72, 'advantag': 21, 'citru': 303, 
'muscl': 1120, 'bump': 228, 'ador': 19, 'handi': 799, 'persist': 1245, 'whitout': 1910, 'sin': 1540, 'spend': 1613, 'woken': 1924, 'reiment': 1391, 'smudg': 1571, 'novelti': 1158, 'fish': 661, 'quantiti': 1341, 'deodour': 460, 'swear': 1716, 'arkward': 82, 'kick': 935, 'sent': 1503, 'toiletri': 1784, 'linnen': 986, 'film': 654, 'teeth': 1739, 'ceram': 266, 'realis': 1364, 'cent': 264, 'gooey': 761, 'marshmallowey': 1047, 'phish': 1254, 'class': 306, 'spain': 1606, 'breakout': 204, 'intoler': 908, 'superior': 1704, 'alon': 43, 'thiught': 1762, 'cojld': 324, 'garnier': 724, 'apar': 70, 'stripey': 1670, 'feet': 644, 'orang': 1188, 'stink': 1650, 'moral': 1106, 'stori': 1657, 'format': 688, 'shave': 1521, 'dress': 524, 'fewer': 649, 'worn': 1933, 'meet': 1064, 'amazingli': 54, 'flavor': 668, 'powderi': 1296, 'exempt': 592, 'medium': 1063, 'annoyiji': 61, 'occas': 1165, 'immedi': 875, 'serum': 1506, 'contribut': 370, 'deco': 435, 'women': 1925, 'eldest': 552, 'forgotten': 686, 'chew': 289, 'rope': 1438, 'school': 1478, 'lie': 972, 'nother': 1155, 'automat': 100, 'gloopi': 749, 'drugstor': 530, 'brainer': 200, 'partnership': 1224, 'encourag': 559, 'spici': 1617, 'fear': 640, 'spilt': 1619, 'alright': 47, 'gotta': 765, 'cetearyl': 271, 'cif': 302, 'power': 1297, 'mirror': 1085, 'nightmar': 1146, 'fun': 714, 'ordinari': 1190, 'thicken': 1756, 'gravi': 773, 'pop': 1283, 'press': 1305, 'spell': 1612, 'scali': 1473, 'quid': 1347, 'tempt': 1742, 'everytim': 585, 'spf': 1615, 'critic': 405, 'uva': 1847, 'convinc': 374, 'refer': 1383, 'verifi': 1861, 'overli': 1201, 'allergi': 38, 'certain': 268, 'bondi': 186, 'sand': 1461, 'slight': 1551, 'fabul': 615, 'hike': 831, 'itsel': 920, 'frizz': 706, 'flat': 667, 'lank': 947, 'bounc': 195, 'stuck': 1675, 'unus': 1842, 'cooker': 376, 'invigor': 911, 'newer': 1140, 'rd': 1357, 'isnt': 915, 'cerav': 267, 'norm': 1150, 'begun': 145, 'calmer': 241, 'sciencey': 1479, 'id': 867, 'unless': 1834, 'anytim': 68, 'indulg': 893, 'bite': 163, 'dog': 
506, 'micro': 1075, 'granul': 771, 'wari': 1879, 'stainless': 1632, 'steel': 1644, 'liquidi': 989, 'velvet': 1858, 'hopingthat': 848, 'fair': 622, 'btw': 222, 'instruct': 905, 'typic': 1821, 'dy': 535, 'tong': 1789, 'question': 1344, 'clip': 316, 'sensat': 1501, 'definatli': 445, 'surprisingli': 1713, 'shock': 1525, 'litr': 992, 'imo': 876, 'salad': 1456, 'bake': 114, 'bean': 134, 'frozen': 708, 'pear': 1232, 'youth': 1958, 'anitipersperi': 58, 'accustom': 9, 'richer': 1424, 'bundl': 229, 'low': 1012, 'carb': 249, 'duti': 534, 'sock': 1578, 'fraction': 694, 'loveliest': 1010, 'wrist': 1943, 'drum': 531, 'modest': 1096, 'inch': 886, 'god': 755, 'somebodi': 1586, 'brazil': 202, 'expat': 596, 'bearabl': 135, 'vfm': 1865, 'head': 813, 'winner': 1918, 'iv': 921, 'batgain': 129, 'averag': 101, 'oreal': 1191, 'resist': 1412, 'girlfriend': 738, 'mention': 1069, 'benefici': 149, 'list': 990, 'chanc': 275, 'transfer': 1800, 'snif': 1574, 'commit': 336, 'stiff': 1648, 'kg': 934, 'alobg': 42, 'matur': 1055, 'hiya': 836, 'stress': 1667, 'pricey': 1311, 'reliev': 1398, 'final': 655, 'tropic': 1811, 'lili': 978, 'mud': 1115, 'spag': 1605, 'bol': 183, 'attent': 96, 'lingeri': 985, 'cotton': 386, 'silk': 1534, 'fiber': 650, 'gigant': 736, 'possibl': 1288, 'lash': 950, 'eyebrow': 609, 'hide': 828, 'sunshin': 1700, 'clingi': 315, 'greazi': 777, 'advis': 23, 'rare': 1353, 'desir': 467, 'blotch': 175, 'brown': 220, 'attach': 95, 'ignor': 870, 'fortun': 691, 'fulfil': 712, 'criteria': 404, 'wilt': 1915, 'yard': 1948, 'radiu': 1350, 'alarm': 34, 'whisk': 1907, 'drainag': 518, 'devin': 474, 'signatur': 1530, 'haha': 793, 'saver': 1469, 'pamper': 1214, 'heheh': 823, 'hunk': 860, 'shelf': 1522, 'unscrew': 1838, 'glove': 750, 'odd': 1167, 'horrif': 850, 'headach': 814, 'desper': 468, 'afraid': 27, 'disturb': 499, 'properti': 1325, 'fluff': 675, 'dryness': 532, 'unpleas': 1836, 'seal': 1490, 'drain': 517, 'yogurt': 1955, 'alpro': 45, 'blop': 173, 'postag': 1290, 'doctor': 502, 'soooo': 1596, 
'walk': 1876, 'pervas': 1249, 'sky': 1548, 'fruiti': 710, 'strike': 1668, 'balanc': 115, 'slurp': 1557, 'reckon': 1375, 'nearer': 1129, 'path': 1229, 'decidedli': 434, 'newsweek': 1141, 'fluffi': 676, 'smoother': 1568, 'hive': 835, 'aggrav': 31, 'wool': 1930, 'chsnge': 299, 'remark': 1400, 'specifi': 1611, 'conclus': 353, 'lure': 1019, 'entic': 568, 'squint': 1629, 'reviv': 1421, 'fallen': 626, 'deffinalti': 441, 'fond': 681, 'poor': 1281, 'consider': 362, 'dinner': 484, 'parti': 1219, 'everybodi': 581, 'jelli': 924, 'loyal': 1013, 'balm': 116, 'finger': 657, 'color': 328, 'petroleum': 1252, 'shadow': 1511, 'appropri': 79, 'tidi': 1767, 'eyelin': 610, 'error': 574, 'spars': 1608, 'bud': 224, 'homemad': 842, 'brill': 211, 'row': 1443, 'roll': 1435, 'shaken': 1513, 'disgust': 495, 'contact': 366, 'suddenli': 1688, 'alth': 50, 'insread': 902, 'toxic': 1798, 'dud': 533, 'split': 1620, 'cautiou': 262, 'lucki': 1015, 'monthli': 1105, 'beed': 142, 'scrib': 1485, 'hurt': 861, 'glitter': 747, 'ankl': 59, 'drawer': 520, 'bovril': 196, 'moan': 1095, 'arm': 83, 'significantli': 1532, 'strang': 1661, 'bleach': 168, 'occasion': 1166, 'recognis': 1376, 'cedarwood': 263, 'popular': 1284, 'bother': 192, 'dozen': 516, 'blindingli': 171, 'leg': 966, 'tanner': 1729, 'stronger': 1672, 'street': 1665, 'veg': 1855, 'cube': 407, 'batch': 128, 'recipebut': 1374, 'secret': 1495, 'mile': 1078, 'hr': 856, 'sedentari': 1497, 'worst': 1936, 'verri': 1862, 'hi': 827, 'uk': 1822, 'anywher': 69, 'massag': 1051, 'keen': 931, 'ban': 117, 'sneaki': 1573, 'woke': 1923, 'sunk': 1699, 'tacki': 1723, 'surviv': 1714, 'brexit': 206, 'defintley': 448, 'die': 478, 'em': 556, 'pregnant': 1302, 'moisteris': 1098, 'lost': 1006, 'count': 387, 'streaki': 1663, 'capful': 246, 'common': 337, 'loyalti': 1014, 'sooner': 1594, 'mostur': 1110, 'definetli': 446, 'mostureris': 1111, 'tissu': 1779, 'pain': 1211, 'pleasur': 1272, 'happili': 804, 'vera': 1859, 'directli': 486, 'allow': 39, 'subject': 1681, 'greenhous': 779, 
'gase': 725, 'ultim': 1823, 'fight': 651, 'tire': 1778, 'drawn': 521, 'child': 291, 'dosag': 510, 'statement': 1640, 'worthwhil': 1938, 'sorbet': 1598, 'focus': 679, 'exact': 587, 'costco': 385, 'curiou': 411, 'baffl': 112, 'elviv': 555, 'realiz': 1365, 'silicon': 1533, 'hairdress': 795, 'appoint': 75, 'defenc': 439, 'purifi': 1336, 'pralin': 1299, 'choc': 293, 'solero': 1584}\n" 379 | ] 380 | } 381 | ], 382 | "source": [ 383 | "count_vectorizer = CountVectorizer(stop_words='english')\n", 384 | "count_vectorizer.fit(X_train)\n", 385 | "print('\\nVocabulary: \\n', count_vectorizer.vocabulary_)\n", 386 | "\n", 387 | "train_c = count_vectorizer.fit_transform(X_train)\n", 388 | "test_c = count_vectorizer.transform(X_test)" 389 | ] 390 | }, 391 | { 392 | "cell_type": "markdown", 393 | "metadata": {}, 394 | "source": [ 395 | "### Multinomial Naive Bayes model" 396 | ] 397 | }, 398 | { 399 | "cell_type": "code", 400 | "execution_count": 149, 401 | "metadata": {}, 402 | "outputs": [], 403 | "source": [ 404 | "#IMPLEMENTING AND RUNNING MNB MODEL - COUNT\n", 405 | "mnb1 = MultinomialNB()\n", 406 | "mnb1.fit(train_c, y_train)\n", 407 | "prediction = mnb1.predict(test_c)" 408 | ] 409 | }, 410 | { 411 | "cell_type": "code", 412 | "execution_count": 150, 413 | "metadata": {}, 414 | "outputs": [], 415 | "source": [ 416 | "#EVALUATION\n", 417 | "mnb_a1 = accuracy_score(y_test, prediction)*100\n", 418 | "mnb_p1 = precision_score(y_test, prediction)* 100\n", 419 | "mnb_r1 = recall_score(y_test, prediction)*100\n", 420 | "mnb_f11 = f1_score(y_test, prediction)*100" 421 | ] 422 | }, 423 | { 424 | "cell_type": "code", 425 | "execution_count": 151, 426 | "metadata": {}, 427 | "outputs": [ 428 | { 429 | "data": { 430 | "text/plain": [ 431 | "" 432 | ] 433 | }, 434 | "execution_count": 151, 435 | "metadata": {}, 436 | "output_type": "execute_result" 437 | }, 438 | { 439 | "data": { 440 | "image/png": "\n", 441 | "text/plain": [ 442 | "
" 443 | ] 444 | }, 445 | "metadata": { 446 | "needs_background": "light" 447 | }, 448 | "output_type": "display_data" 449 | } 450 | ], 451 | "source": [ 452 | "#CONFUSION MATRIX\n", 453 | "cm = confusion_matrix(y_test, prediction, labels=mnb1.classes_)\n", 454 | "display = ConfusionMatrixDisplay(confusion_matrix=cm,\n", 455 | " display_labels=mnb1.classes_) \n", 456 | "display.plot() " 457 | ] 458 | }, 459 | { 460 | "cell_type": "markdown", 461 | "metadata": {}, 462 | "source": [ 463 | "### Support Vector Machine model" 464 | ] 465 | }, 466 | { 467 | "cell_type": "code", 468 | "execution_count": 152, 469 | "metadata": {}, 470 | "outputs": [], 471 | "source": [ 472 | "#IMPLEMENTING AND RUNNNING SVM MODEL - COUNT\n", 473 | "svm1 = SVC(kernel='linear')\n", 474 | "svm1.fit(train_c, y_train)\n", 475 | "prediction = svm1.predict(test_c)" 476 | ] 477 | }, 478 | { 479 | "cell_type": "code", 480 | "execution_count": 153, 481 | "metadata": {}, 482 | "outputs": [], 483 | "source": [ 484 | "#EVALUATION\n", 485 | "svm_a1 = accuracy_score(y_test, prediction)*100\n", 486 | "svm_p1 = precision_score(y_test, prediction)* 100\n", 487 | "svm_r1 = recall_score(y_test, prediction)*100\n", 488 | "svm_f11 = f1_score(y_test, prediction)*100" 489 | ] 490 | }, 491 | { 492 | "cell_type": "code", 493 | "execution_count": 154, 494 | "metadata": {}, 495 | "outputs": [ 496 | { 497 | "data": { 498 | "text/plain": [ 499 | "" 500 | ] 501 | }, 502 | "execution_count": 154, 503 | "metadata": {}, 504 | "output_type": "execute_result" 505 | }, 506 | { 507 | "data": { 508 | "image/png": "\n", 509 | "text/plain": [ 510 | "
" 511 | ] 512 | }, 513 | "metadata": { 514 | "needs_background": "light" 515 | }, 516 | "output_type": "display_data" 517 | } 518 | ], 519 | "source": [ 520 | "#CONFUSION MATRIX\n", 521 | "cm = confusion_matrix(y_test, prediction, labels=svm1.classes_)\n", 522 | "display = ConfusionMatrixDisplay(confusion_matrix=cm,\n", 523 | " display_labels=svm1.classes_) \n", 524 | "display.plot() " 525 | ] 526 | }, 527 | { 528 | "cell_type": "markdown", 529 | "metadata": {}, 530 | "source": [ 531 | "### Logistic Regression model" 532 | ] 533 | }, 534 | { 535 | "cell_type": "code", 536 | "execution_count": 155, 537 | "metadata": {}, 538 | "outputs": [], 539 | "source": [ 540 | "#IMPLEMENTING AND RUNNNING LR MODEL - COUNT\n", 541 | "lr1 = LogisticRegression()\n", 542 | "lr1.fit(train_c, y_train)\n", 543 | "prediction = lr1.predict(test_c)" 544 | ] 545 | }, 546 | { 547 | "cell_type": "code", 548 | "execution_count": 156, 549 | "metadata": {}, 550 | "outputs": [], 551 | "source": [ 552 | "#EVALUATION\n", 553 | "lr_a1 = accuracy_score(y_test, prediction)*100\n", 554 | "lr_p1 = precision_score(y_test, prediction)* 100\n", 555 | "lr_r1 = recall_score(y_test, prediction)*100\n", 556 | "lr_f11 = f1_score(y_test, prediction)*100" 557 | ] 558 | }, 559 | { 560 | "cell_type": "code", 561 | "execution_count": 157, 562 | "metadata": {}, 563 | "outputs": [ 564 | { 565 | "data": { 566 | "text/plain": [ 567 | "" 568 | ] 569 | }, 570 | "execution_count": 157, 571 | "metadata": {}, 572 | "output_type": "execute_result" 573 | }, 574 | { 575 | "data": { 576 | "image/png": "\n", 577 | "text/plain": [ 578 | "
" 579 | ] 580 | }, 581 | "metadata": { 582 | "needs_background": "light" 583 | }, 584 | "output_type": "display_data" 585 | } 586 | ], 587 | "source": [ 588 | "#CONFUSION MATRIX\n", 589 | "cm = confusion_matrix(y_test, prediction, labels=lr1.classes_)\n", 590 | "display = ConfusionMatrixDisplay(confusion_matrix=cm,\n", 591 | " display_labels=lr1.classes_) \n", 592 | "display.plot() " 593 | ] 594 | }, 595 | { 596 | "cell_type": "markdown", 597 | "metadata": {}, 598 | "source": [ 599 | "## TFIDF VECTORIZER AND MODELING\n", 600 | "\n", 601 | "We examine the total document weightage of a word in TfidfVectorizer. It assists us in coping with the most common terms. We may use it to penalize them. The word counts are weighted by a measure of how frequently they appear in the documents in TfidfVectorizer." 602 | ] 603 | }, 604 | { 605 | "cell_type": "code", 606 | "execution_count": 158, 607 | "metadata": {}, 608 | "outputs": [ 609 | { 610 | "name": "stdout", 611 | "output_type": "stream", 612 | "text": [ 613 | "\n", 614 | "Vocabulary: \n", 615 | " {'current': 413, 'sell': 1498, 'price': 1310, 'compar': 341, 'supermarket': 1705, 'good': 760, 'smell': 1563, 'pleasant': 1270, 'need': 1134, 'add': 14, 'small': 1558, 'cloth': 319, 'fresh': 700, 'great': 776, 'moisturis': 1100, 'sensit': 1502, 'love': 1009, 'pour': 1293, 'smaller': 1559, 'bottl': 193, 'make': 1032, 'manag': 1036, 'beat': 136, 'decent': 432, 'fabric': 614, 'soften': 1580, 'nice': 1142, 'fragranc': 695, 'purchas': 1334, 'deliveri': 456, 'cream': 397, 'handwash': 801, 'cheaper': 281, 'hand': 798, 'better': 154, 'liquid': 988, 'oh': 1174, 'wing': 1917, 'dove': 514, 'kitchen': 940, 'bathroom': 131, 'shower': 1527, 'room': 1436, 'recommend': 1378, 'highli': 830, 'star': 1636, 'simpl': 1537, 'gel': 727, 'like': 977, 'glue': 753, 'hard': 805, 'rub': 1444, 'slip': 1555, 'bath': 130, 'goe': 756, 'smoothli': 1569, 'easili': 539, 'wast': 1885, 'leav': 964, 'feel': 643, 'silki': 1535, 'soft': 1579, 'scenti': 1477, 'review': 
1419, 'collect': 326, 'promot': 1322, 'excel': 589, 'everi': 580, 'day': 426, 'facial': 617, 'wash': 1882, 'excess': 591, 'face': 616, 'strip': 1669, 'natur': 1127, 'oil': 1175, 'care': 256, 'routin': 1442, 'morn': 1108, 'night': 1144, 'clean': 308, 'brilliant': 213, 'offer': 1172, 'gorgeou': 762, 'amaz': 52, 'valu': 1848, 'girli': 739, 'hair': 794, 'buy': 235, 'chang': 276, 'preserv': 1304, 'come': 331, 'bad': 109, 'sore': 1599, 'rash': 1354, 'eye': 608, 'burn': 230, 'lip': 987, 'tingl': 1775, 'phone': 1255, 'told': 1785, 'stuff': 1677, 'ask': 92, 'said': 1454, 'know': 943, 'want': 1878, 'reason': 1368, 'mayb': 1056, 'save': 1468, 'money': 1103, 'ingredi': 897, 'nearli': 1130, 'year': 1950, 'sinc': 1541, 'nivea': 1147, 'sold': 1583, 'compani': 339, 'german': 733, 'im': 872, 'realli': 1366, 'angri': 56, 'suppos': 1708, 'ok': 1177, 'rubbish': 1446, 'burnt': 231, 'pleas': 1269, 'usual': 1846, 'stock': 1651, 'fulli': 713, 'asda': 90, 'gave': 726, 'refund': 1386, 'gift': 735, 'card': 253, 'receipt': 1371, 'pocket': 1277, 'condition': 355, 'normal': 1151, 'oili': 1176, 'week': 1896, 'saw': 1470, 'differ': 480, 'felt': 646, 'cleans': 310, 'clearer': 313, 'notic': 1156, 'straightaway': 1659, 'red': 1381, 'blemish': 169, 'previou': 1308, 'kid': 936, 'doesnt': 505, 'irrit': 914, 'scent': 1476, 'littl': 993, 'bit': 162, 'long': 999, 'way': 1892, 'similar': 1536, 'perfect': 1236, 'got': 764, 'coupl': 390, 'ago': 32, 'refresh': 1385, 'bodi': 181, 'smooth': 1567, 'cucumb': 408, 'relax': 1393, 'best': 153, 'came': 243, 'separ': 1504, 'packet': 1208, 'sealabl': 1491, 'affect': 25, 'rough': 1440, 'dri': 525, 'otherwis': 1193, 'fantast': 631, 'lot': 1007, 'effort': 549, 'reduc': 1382, 'plastic': 1268, 'concentr': 350, 'buyer': 236, 'say': 1471, 'larger': 949, 'probabl': 1316, 'explain': 601, 'label': 945, 'fuchsia': 711, 'perfum': 1239, 'version': 1864, 'overpow': 1203, 'big': 155, 'plu': 1274, 'difficulti': 482, 'intens': 907, 'past': 1226, 'note': 1153, 'outer': 1194, 'sleev': 
1550, 'recycl': 1380, 'dispos': 498, 'charg': 279, 'kind': 937, 'overbear': 1199, 'anyon': 66, 'glow': 751, 'afford': 26, 'comfort': 332, 'creation': 400, 'round': 1441, 'market': 1044, 'close': 318, 'match': 1053, 'honeysuckl': 846, 'sandalwood': 1463, 'person': 1246, 'favourit': 638, 'howev': 855, 'descript': 465, 'ad': 13, 'recent': 1373, 'amazon': 55, 'pantri': 1215, 'order': 1189, 'lamin': 946, 'tile': 1771, 'floor': 670, 'subtl': 1685, 'streak': 1662, 'free': 698, 'shine': 1523, 'moistur': 1099, 'essenti': 577, 'tri': 1808, 'time': 1773, 'today': 1781, 'packag': 1207, 'easi': 537, 'open': 1182, 'squeez': 1627, 'releas': 1394, 'puff': 1330, 'froth': 707, 'pure': 1335, 'smear': 1562, 'white': 1908, 'absorb': 4, 'non': 1148, 'greasi': 775, 'appli': 73, 'think': 1759, 'expens': 598, 'cheap': 280, 'qualiti': 1340, 'dairi': 419, 'aroma': 84, 'dont': 508, 'drench': 523, 'lotion': 1008, 'pretti': 1306, 'worth': 1937, 'thank': 1752, 'item': 919, 'describ': 464, 'rapid': 1352, 'stop': 1653, 'static': 1641, 'chocol': 294, 'flavour': 669, 'creami': 398, 'soap': 1576, 'anyth': 67, 'els': 553, 'lather': 954, 'noth': 1154, 'fail': 621, 'protect': 1326, 'gentl': 730, 'basic': 126, 'harm': 806, 'nourish': 1157, 'tasti': 1733, 'beef': 143, 'tomato': 1786, 'grate': 772, 'cheddar': 284, 'chees': 286, 'mix': 1091, 'nd': 1128, 'let': 969, 'sit': 1544, 'minut': 1083, 'regret': 1388, 'set': 1509, 'took': 1791, 'expect': 597, 'size': 1546, 'persil': 1244, 'quit': 1348, 'pictur': 1258, 'mislead': 1087, 'apart': 71, 'cheer': 285, 'makeup': 1034, 'eas': 536, 'micellar': 1074, 'water': 1888, 'left': 965, 'pack': 1206, 'extra': 602, 'impresss': 881, 'content': 368, 'aw': 103, 'cut': 416, 'right': 1427, 'fingernail': 658, 'turn': 1818, 'upsid': 1844, 'approach': 78, 'empti': 558, 'ridicul': 1426, 'design': 466, 'bought': 194, 'lumber': 1016, 'home': 841, 'larg': 948, 'decant': 431, 'exist': 594, 'contain': 367, 'tell': 1740, 'straight': 1658, 'away': 105, 'despit': 469, 'compart': 343, 
'machin': 1025, 'towel': 1796, 'unlik': 1835, 'manufactur': 1040, 'healthi': 817, 'definit': 447, 'forese': 684, 'futur': 720, 'gotten': 766, 'result': 1415, 'brillant': 212, 'brand': 201, 'absolut': 3, 'fan': 629, 'wow': 1939, 'factor': 619, 'base': 123, 'commerci': 335, 'help': 825, 'disappoint': 489, 'especi': 576, 'point': 1279, 'ice': 866, 'chunk': 301, 'difficult': 481, 'eat': 540, 'entir': 569, 'tub': 1814, 'nasti': 1126, 'chemic': 287, 'super': 1701, 'famili': 628, 'young': 1956, 'old': 1179, 'friendli': 705, 'mean': 1059, 'colour': 329, 'honestli': 844, 'problemat': 1318, 'prone': 1323, 'dermat': 462, 'tast': 1732, 'weed': 1895, 'vinegar': 1867, 'work': 1932, 'remov': 1403, 'havent': 811, 'hamper': 797, 'christma': 298, 'sure': 1709, 'magnum': 1029, 'incas': 885, 'amazebal': 53, 'togeth': 1782, 'delici': 453, 'superb': 1702, 'kit': 939, 'bio': 159, 'stubborn': 1674, 'stain': 1631, 'word': 1931, 'warn': 1881, 'handl': 800, 'tend': 1743, 'ensur': 566, 'thoroughli': 1763, 'rins': 1428, 'repair': 1405, 'guy': 790, 'fals': 627, 'economi': 543, 'build': 226, 'caus': 261, 'issu': 916, 'huggabl': 858, 'strong': 1671, 'alcohol': 36, 'pot': 1291, 'noodl': 1149, 'student': 1676, 'yuck': 1959, 'addit': 15, 'sauc': 1466, 'pasta': 1227, 'carboard': 250, 'situat': 1545, 'access': 6, 'kettl': 933, 'chose': 297, 'mugshot': 1116, 'mistak': 1089, 'scrumptiou': 1488, 'authent': 99, 'linger': 984, 'bonu': 188, 'percent': 1235, 'fact': 618, 'husband': 862, 'classic': 307, 'lynx': 1024, 'deodor': 459, 'regularli': 1390, 'number': 1160, 'skincar': 1547, 'avoid': 102, 'toner': 1788, 'experi': 599, 'anoth': 62, 'major': 1031, 'extrem': 605, 'flaki': 665, 'sting': 1649, 'dewi': 475, 'look': 1003, 'deffin': 440, 'opinion': 1183, 'reccomend': 1370, 'deal': 429, 'slow': 1556, 'cook': 375, 'beauti': 138, 'tip': 1777, 'mark': 1043, 'alway': 51, 'fresher': 701, 'regular': 1389, 'late': 951, 'overal': 1198, 'improv': 882, 'claim': 304, 'perform': 1238, 'miracl': 1084, 'lol': 997, 
'reaction': 1360, 'happen': 802, 'winter': 1919, 'weather': 1894, 'central': 265, 'heat': 819, 'vaselin': 1853, 'dealt': 430, 'includ': 887, 'elbow': 551, 'knee': 941, 'place': 1263, 'glide': 746, 'dream': 522, 'summer': 1695, 'spent': 1614, 'lay': 958, 'beach': 133, 'eyemak': 611, 'men': 1067, 'boot': 190, 'tame': 1726, 'divin': 500, 'iron': 913, 'tear': 1737, 'penni': 1233, 'scale': 1472, 'deliv': 455, 'quickli': 1346, 'shampoo': 1516, 'leak': 963, 'tan': 1727, 'sunb': 1697, 'spray': 1624, 'thought': 1764, 'clear': 312, 'friend': 704, 'huge': 857, 'indoor': 892, 'didnt': 477, 'exxtra': 607, 'fast': 633, 'thankyou': 1753, 'half': 796, 'shop': 1526, 'line': 982, 'fake': 625, 'africa': 28, 'poorer': 1282, 'new': 1138, 'softer': 1581, 'particularli': 1221, 'wrap': 1940, 'storag': 1655, 'measur': 1061, 'touch': 1794, 'brighter': 209, 'cleaner': 309, 'afterward': 29, 'everyon': 583, 'impress': 880, 'neutral': 1137, 'defiantli': 442, 'fab': 613, 'rate': 1356, 'mani': 1038, 'micel': 1073, 'wipe': 1920, 'total': 1793, 'alreadi': 46, 'receiv': 1372, 'feed': 642, 'event': 578, 'absoulut': 5, 'stick': 1646, 'job': 926, 'live': 994, 'hype': 865, 'clearli': 314, 'lead': 961, 'leader': 962, 'inspir': 901, 'confid': 357, 'certainli': 269, 'harsh': 808, 'favorit': 636, 'imposs': 879, 'sourc': 1603, 'check': 283, 'target': 1731, 'piec': 1260, 'crack': 395, 'bare': 119, 'far': 632, 'concern': 352, 'waterproof': 1890, 'mascara': 1048, 'powder': 1295, 'return': 1417, 'paid': 1210, 'daili': 418, 'basi': 125, 'age': 30, 'multipl': 1117, 'bulk': 227, 'door': 509, 'funnel': 717, 'easier': 538, 'section': 1496, 'washer': 1883, 'moreov': 1107, 'thing': 1758, 'exactli': 588, 'someth': 1588, 'odor': 1169, 'blend': 170, 'energis': 563, 'complain': 345, 'lid': 971, 'bin': 158, 'spill': 1618, 'everywher': 586, 'box': 197, 'ml': 1093, 'hope': 847, 'misl': 1086, 'month': 1104, 'start': 1637, 'enjoy': 565, 'mr': 1114, 'hinch': 832, 'glad': 742, 'scratch': 1481, 'scour': 1480, 'pad': 1209, 'minki': 
1082, 'spong': 1621, 'oven': 1197, 'hob': 837, 'tap': 1730, 'sink': 1543, 'screen': 1483, 'gleam': 745, 'compact': 338, 'rang': 1351, 'actual': 12, 'complet': 347, 'ha': 792, 'excema': 590, 'hit': 834, 'everyday': 582, 'massiv': 1052, 'cap': 245, 'lenor': 967, 'unstopp': 1840, 'convert': 373, 'mild': 1077, 'messi': 1071, 'step': 1645, 'dad': 417, 'awhil': 107, 'suitabl': 1694, 'obviou': 1162, 'choic': 295, 'believ': 146, 'continu': 369, 'term': 1746, 'benefit': 150, 'happi': 803, 'alround': 48, 'consist': 363, 'dilut': 483, 'guess': 789, 'blue': 177, 'pigment': 1261, 'act': 11, 'reli': 1395, 'hydrat': 864, 'mixtur': 1092, 'varieti': 1851, 'reach': 1358, 'matter': 1054, 'neg': 1135, 'pump': 1332, 'end': 560, 'foam': 677, 'met': 1072, 'standard': 1634, 'bar': 118, 'quarter': 1342, 'truli': 1813, 'load': 995, 'sometim': 1589, 'stapl': 1635, 'heal': 816, 'abras': 2, 'boy': 198, 'store': 1656, 'cupboard': 410, 'dead': 428, 'therapi': 1754, 'origin': 1192, 'mini': 1080, 'arriv': 85, 'real': 1363, 'delight': 454, 'quick': 1345, 'subscrib': 1682, 'light': 975, 'bubbl': 223, 'floral': 671, 'masculin': 1049, 'bigger': 156, 'volum': 1870, 'garag': 723, 'known': 944, 'sweet': 1719, 'textur': 1751, 'artifici': 89, 'justv': 929, 'hour': 852, 'pale': 1212, 'disastr': 490, 'success': 1686, 'creat': 399, 'spot': 1623, 'abl': 1, 'deepli': 438, 'exfoli': 593, 'wont': 1928, 'suffer': 1689, 'babi': 108, 'style': 1679, 'genuin': 732, 'neatli': 1131, 'everyth': 584, 'shall': 1514, 'tresmemm': 1807, 'fussi': 719, 'suit': 1693, 'therefor': 1755, 'prefer': 1301, 'burst': 232, 'transit': 1802, 'snack': 1572, 'went': 1900, 'weekend': 1897, 'decid': 433, 'wet': 1901, 'cashmer': 259, 'sweater': 1718, 'sainsburi': 1455, 'scrummi': 1487, 'delic': 452, 'requir': 1410, 'bring': 214, 'runni': 1449, 'glam': 743, 'somewher': 1591, 'follow': 680, 'inexpens': 894, 'plain': 1264, 'mother': 1112, 'law': 957, 'wat': 1886, 'discount': 492, 'tin': 1774, 'sooth': 1597, 'residu': 1411, 'cool': 378, 'readi': 
1362, 'magic': 1027, 'latest': 953, 'variat': 1850, 'address': 16, 'environment': 571, 'transport': 1803, 'cost': 384, 'commend': 333, 'sens': 1500, 'detect': 470, 'breath': 205, 'form': 687, 'slightli': 1552, 'zesti': 1961, 'user': 1845, 'repres': 1408, 'lessen': 968, 'impact': 877, 'environ': 570, 'albeit': 35, 'smallest': 1560, 'ideal': 869, 'son': 1592, 'adult': 20, 'children': 292, 'belov': 147, 'pet': 1250, 'incred': 890, 'lolli': 998, 'bargain': 120, 'introduc': 909, 'partner': 1223, 'sadli': 1452, 'share': 1517, 'hot': 851, 'melt': 1065, 'drip': 528, 'finish': 659, 'favour': 637, 'properli': 1324, 'inferior': 895, 'tesco': 1748, 'microwav': 1076, 'sever': 1510, 'badli': 111, 'lumpi': 1017, 'mostli': 1109, 'simplic': 1539, 'gone': 758, 'effect': 547, 'trace': 1799, 'fanci': 630, 'maskara': 1050, 'remain': 1399, 'sticki': 1647, 'defin': 444, 'heavi': 820, 'frequent': 699, 'perhap': 1240, 'capsul': 247, 'household': 854, 'terribl': 1747, 'allerg': 37, 'constant': 364, 'hay': 812, 'fever': 648, 'wrong': 1945, 'sign': 1529, 'gluten': 754, 'casserol': 260, 'sat': 1464, 'toilet': 1783, 'barley': 122, 'coeliac': 323, 'peopl': 1234, 'unsuit': 1841, 'pick': 1257, 'signific': 1531, 'rel': 1392, 'sturdier': 1678, 'travel': 1804, 'hous': 853, 'tini': 1776, 'surfac': 1711, 'pic': 1256, 'wonder': 1926, 'deni': 457, 'fix': 663, 'instead': 904, 'whenev': 1903, 'react': 1359, 'true': 1812, 'luxuri': 1023, 'prime': 1312, 'conveni': 372, 'incorpor': 888, 'cleanser': 311, 'eventu': 579, 'margin': 1042, 'obvious': 1163, 'comment': 334, 'appreci': 76, 'break': 203, 'vegan': 1856, 'daughter': 425, 'persuad': 1248, 'hesit': 826, 'pay': 1230, 'forward': 692, 'soup': 1602, 'risotto': 1432, 'younger': 1957, 'menopaus': 1068, 'calm': 240, 'wrinkl': 1942, 'crepey': 401, 'layer': 959, 'conjunct': 359, 'hyaluron': 863, 'companion': 340, 'loss': 1005, 'firm': 660, 'oz': 1205, 'csmart': 406, 'screw': 1484, 'determin': 472, 'loos': 1004, 'knife': 942, 'wise': 1921, 'advoid': 24, 'smart': 
1561, 'local': 996, 'okay': 1178, 'meant': 1060, 'equal': 572, 'artif': 88, 'upset': 1843, 'simpli': 1538, 'english': 564, 'mustard': 1122, 'colman': 327, 'sunday': 1698, 'roast': 1433, 'tube': 1815, 'invari': 910, 'serv': 1507, 'nozzl': 1159, 'block': 172, 'someon': 1587, 'scrub': 1486, 'extremli': 606, 'frangranc': 697, 'fairli': 624, 'spread': 1625, 'soak': 1575, 'smelt': 1566, 'wilkinson': 1914, 'area': 80, 'soon': 1593, 'becom': 139, 'scalp': 1474, 'satisfi': 1465, 'sorri': 1600, 'voucher': 1871, 'post': 1289, 'slimi': 1554, 'problem': 1317, 'unhappi': 1832, 'respons': 1413, 'treat': 1805, 'portug': 1287, 'miss': 1088, 'tea': 1736, 'straighten': 1660, 'curl': 412, 'damag': 420, 'seller': 1499, 'pod': 1278, 'bag': 113, 'filler': 653, 'rich': 1423, 'special': 1609, 'elsewher': 554, 'laundri': 956, 'hate': 809, 'arrog': 86, 'unilev': 1833, 'hold': 838, 'palm': 1213, 'fine': 656, 'paragon': 1217, 'champ': 274, 'tight': 1769, 'win': 1916, 'doubl': 512, 'struggl': 1673, 'tough': 1795, 'unbeliev': 1825, 'compliment': 348, 'august': 98, 'itchi': 918, 'stream': 1664, 'grandchildren': 770, 'unfortun': 1831, 'green': 778, 'lime': 979, 'individu': 891, 'dandruff': 423, 'anti': 63, 'prior': 1314, 'particular': 1220, 'busi': 233, 'mum': 1118, 'admit': 18, 'neglect': 1136, 'empathis': 557, 'trial': 1809, 'opportun': 1184, 'reconnect': 1379, 'cme': 320, 'smother': 1570, 'offens': 1171, 'alo': 41, 'applic': 74, 'bedtim': 141, 'verdict': 1860, 'thumb': 1766, 'ye': 1949, 'second': 1493, 'lightweight': 976, 'tendenc': 1744, 'pull': 1331, 'liber': 970, 'discreet': 494, 'overwhelm': 1204, 'protector': 1327, 'retain': 1416, 'ezyema': 612, 'switch': 1721, 'radiant': 1349, 'regim': 1387, 'thinner': 1760, 'heavili': 821, 'cancel': 244, 'subscript': 1683, 'wait': 1874, 'fyi': 721, 'attract': 97, 'floweri': 673, 'badeda': 110, 'holiday': 840, 'sun': 1696, 'till': 1772, 'stope': 1654, 'blow': 176, 'prevent': 1307, 'transform': 1801, 'nose': 1152, 'dehydr': 451, 'sort': 1601, 'rest': 1414, 
'sleep': 1549, 'caramel': 248, 'marshmallow': 1046, 'liter': 991, 'calori': 242, 'bomb': 185, 'gross': 782, 'gonna': 759, 'throw': 1765, 'coca': 322, 'butter': 234, 'marmit': 1045, 'lover': 1011, 'glass': 744, 'jar': 923, 'oddli': 1168, 'given': 740, 'persev': 1243, 'shini': 1524, 'root': 1437, 'drier': 527, 'grown': 785, 'horribl': 849, 'agre': 33, 'tighter': 1770, 'import': 878, 'confus': 358, 'milk': 1079, 'maker': 1033, 'risk': 1431, 'underwear': 1830, 'ultra': 1824, 'twice': 1819, 'combin': 330, 'harmoni': 807, 'chicken': 290, 'mushroom': 1121, 'pie': 1259, 'salt': 1458, 'ruin': 1447, 'chuck': 300, 'tongu': 1790, 'season': 1492, 'accident': 7, 'swallow': 1715, 'sea': 1489, 'xd': 1946, 'older': 1180, 'wide': 1911, 'awak': 104, 'flakey': 664, 'smelli': 1564, 'pit': 1262, 'plenti': 1273, 'longer': 1000, 'comparison': 342, 'yo': 1954, 'born': 191, 'defient': 443, 'odourless': 1170, 'suppl': 1706, 'nightli': 1145, 'bed': 140, 'clog': 317, 'pore': 1285, 'replenish': 1407, 'daytim': 427, 'reappli': 1367, 'overnight': 1202, 'unscent': 1837, 'patch': 1228, 'stone': 1652, 'munchi': 1119, 'food': 682, 'depend': 461, 'wish': 1922, 'talk': 1724, 'visit': 1869, 'certifi': 270, 'british': 215, 'foundat': 693, 'freshli': 702, 'launder': 955, 'reliabl': 1396, 'cooler': 379, 'wahs': 1873, 'soapi': 1577, 'type': 1820, 'downsid': 515, 'hole': 839, 'tresemm': 1806, 'stand': 1633, 'yummi': 1960, 'sugari': 1691, 'eco': 541, 'parcel': 1218, 'brain': 199, 'op': 1181, 'wife': 1912, 'safe': 1453, 'recomend': 1377, 'aswel': 94, 'moist': 1097, 'surf': 1710, 'stay': 1642, 'articl': 87, 'acn': 10, 'scar': 1475, 'opposit': 1185, 'life': 973, 'visibl': 1868, 'fade': 620, 'dermatologist': 463, 'game': 722, 'changer': 277, 'main': 1030, 'test': 1749, 'singl': 1542, 'mayo': 1057, 'bod': 180, 'bat': 127, 'quench': 1343, 'fell': 645, 'cooki': 377, 'dough': 513, 'superdrug': 1703, 'prize': 1315, 'buzz': 237, 'limit': 981, 'edit': 546, 'stayer': 1643, 'scratchi': 1482, 'rip': 1429, 'mistreat': 1090, 
'dock': 501, 'reus': 1418, 'revitalis': 1420, 'gentli': 731, 'impur': 883, 'servic': 1508, 'fault': 634, 'rocemmend': 1434, 'almond': 40, 'mouth': 1113, 'fave': 635, 'dark': 424, 'passion': 1225, 'narrow': 1125, 'accur': 8, 'slim': 1553, 'pourer': 1294, 'dribbl': 526, 'washload': 1884, 'flower': 672, 'hint': 833, 'fruit': 709, 'mango': 1037, 'brought': 219, 'cornet': 381, 'luvli': 1021, 'carri': 257, 'worri': 1934, 'man': 1035, 'choos': 296, 'sweat': 1717, 'mayonnais': 1058, 'substanc': 1684, 'squeezi': 1628, 'geniu': 729, 'idea': 868, 'tall': 1725, 'unstabl': 1839, 'ive': 922, 'fridg': 703, 'tumbl': 1816, 'brittl': 216, 'shatter': 1520, 'groundhog': 783, 'cri': 402, 'overhaul': 1200, 'curri': 414, 'besid': 152, 'sachet': 1451, 'edibl': 545, 'phenomen': 1253, 'function': 715, 'longest': 1001, 'rid': 1425, 'kept': 932, 'drawback': 519, 'suppli': 1707, 'wear': 1893, 'diabet': 476, 'carbohydr': 251, 'nutrit': 1161, 'inform': 896, 'whatsoev': 1902, 'wall': 1877, 'specif': 1610, 'portion': 1286, 'whitehead': 1909, 'bash': 124, 'inadvert': 884, 'deterg': 471, 'fairi': 623, 'joint': 927, 'biolog': 161, 'septic': 1505, 'tank': 1728, 'dispens': 497, 'insert': 899, 'suffici': 1690, 'condit': 354, 'competit': 344, 'hav': 810, 'btilliant': 221, 'wild': 1913, 'enthusiast': 567, 'err': 573, 'gener': 728, 'custom': 415, 'write': 1944, 'uncomfort': 1829, 'clash': 305, 'coat': 321, 'sickli': 1528, 'raspberri': 1355, 'cover': 392, 'core': 380, 'remind': 1402, 'cheapest': 282, 'imagin': 873, 'insipid': 900, 'rush': 1450, 'develop': 473, 'process': 1319, 'bitter': 164, 'rippl': 1430, 'consid': 361, 'lush': 1020, 'suggest': 1692, 'petrolatum': 1251, 'damp': 421, 'eczema': 544, 'rosacea': 1439, 'coz': 394, 'wors': 1935, 'apprehens': 77, 'broken': 218, 'fit': 662, 'purpos': 1338, 'sport': 1622, 'mad': 1026, 'teenag': 1738, 'promis': 1321, 'moisturisor': 1101, 'deoder': 458, 'discov': 493, 'shame': 1515, 'broke': 217, 'manli': 1039, 'foami': 678, 'wondr': 1927, 'variou': 1852, 'chamomil': 
273, 'weight': 1899, 'thicker': 1757, 'space': 1604, 'nicest': 1143, 'mmmm': 1094, 'bo': 178, 'beater': 137, 'plainli': 1265, 'bright': 207, 'bold': 184, 'grudg': 786, 'dirti': 487, 'tablet': 1722, 'necess': 1132, 'weekli': 1898, 'groceri': 781, 'budget': 225, 'cake': 239, 'exot': 595, 'veget': 1857, 'ariel': 81, 'domin': 507, 'global': 748, 'consum': 365, 'giant': 734, 'expert': 600, 'dodgi': 503, 'imit': 874, 'sub': 1680, 'guarante': 788, 'high': 829, 'rubberi': 1445, 'gossam': 763, 'case': 258, 'brightli': 210, 'funki': 716, 'purpl': 1337, 'grey': 780, 'swirl': 1720, 'gu': 787, 'outsid': 1195, 'grade': 767, 'disappear': 488, 'mysteri': 1123, 'sphinx': 1616, 'giza': 741, 'bermuda': 151, 'triangl': 1810, 'voynich': 1872, 'manuscript': 1041, 'xlarg': 1947, 'discontinu': 491, 'option': 1187, 'limescal': 980, 'unclog': 1828, 'wake': 1875, 'plump': 1276, 'ill': 871, 'pun': 1333, 'intend': 906, 'altern': 49, 'forth': 690, 'healthier': 818, 'advic': 22, 'minimum': 1081, 'annoy': 60, 'asid': 91, 'run': 1448, 'plug': 1275, 'potenti': 1292, 'increas': 889, 'member': 1066, 'endors': 561, 'practic': 1298, 'nappi': 1124, 'pyramid': 1339, 'warm': 1880, 'present': 1303, 'st': 1630, 'rememb': 1401, 'repurchas': 1409, 'figur': 652, 'medicin': 1062, 'cabinet': 238, 'salti': 1459, 'yeast': 1951, 'honest': 843, 'toast': 1780, 'fluctuat': 674, 'particulr': 1222, 'watch': 1887, 'smellllllll': 1565, 'sauna': 1467, 'outstand': 1196, 'turkey': 1817, 'neck': 1133, 'shake': 1512, 'graviti': 774, 'vastli': 1854, 'sharp': 1519, 'begin': 144, 'assum': 93, 'grow': 784, 'ocado': 1164, 'sampl': 1460, 'state': 1639, 'formula': 689, 'correct': 383, 'later': 952, 'boost': 189, 'sooo': 1595, 'complaint': 346, 'concept': 351, 'recal': 1369, 'tetra': 1750, 'public': 1329, 'innov': 898, 'refil': 1384, 'cardboard': 254, 'dose': 511, 'counter': 388, 'read': 1361, 'print': 1313, 'wrapper': 1941, 'magnif': 1028, 'anim': 57, 'whilst': 1906, 'provid': 1328, 'headquart': 815, 'offic': 1173, 'produc': 1320, 
'countri': 389, 'sale': 1457, 'carbon': 252, 'footprint': 683, 'torn': 1792, 'extrat': 604, 'wherea': 1904, 'secondli': 1494, 'spare': 1607, 'rib': 1422, 'persdper': 1242, 'reorder': 1404, 'somewhat': 1590, 'feminin': 647, 'perspir': 1247, 'danc': 422, 'bay': 132, 'waxi': 1891, 'shark': 1518, 'ben': 148, 'jerri': 925, 'fragrant': 696, 'bargin': 121, 'vanilla': 1849, 'iritatw': 912, 'tbh': 1735, 'loo': 1002, 'conceal': 349, 'moment': 1102, 'coverag': 393, 'brighten': 208, 'glowi': 752, 'perfectli': 1237, 'view': 1866, 'consciou': 360, 'paper': 1216, 'temperament': 1741, 'lazi': 960, 'girl': 737, 'solut': 1585, 'deep': 437, 'repeat': 1406, 'versatil': 1863, 'heel': 822, 'econom': 542, 'defo': 449, 'direct': 485, 'chapstick': 278, 'jot': 928, 'bob': 179, 'uncl': 1827, 'awesom': 106, 'doeant': 504, 'soggi': 1582, 'mess': 1070, 'dish': 496, 'lux': 1022, 'plan': 1266, 'kcal': 930, 'control': 371, 'diet': 479, 'biodegrad': 160, 'forev': 685, 'blackhead': 165, 'zone': 1962, 'fed': 641, 'tone': 1787, 'anymor': 65, 'bland': 166, 'wateri': 1889, 'gold': 757, 'spring': 1626, 'cold': 325, 'cardigan': 255, 'thirti': 1761, 'degre': 450, 'tie': 1768, 'boil': 182, 'bikini': 157, 'dedic': 436, 'ylang': 1953, 'town': 1797, 'aaaaamaz': 0, 'tattoooo': 1734, 'previous': 1309, 'admir': 17, 'blast': 167, 'whiff': 1905, 'planet': 1267, 'pollut': 1280, 'funni': 718, 'cone': 356, 'crispi': 403, 'cornetto': 382, 'flare': 666, 'itch': 917, 'crazi': 396, 'escap': 575, 'lift': 974, 'drop': 529, 'instantli': 903, 'relief': 1397, 'effortlessli': 550, 'alot': 44, 'effici': 548, 'linen': 983, 'pre': 1300, 'extract': 603, 'honey': 845, 'suckl': 1687, 'yesterday': 1952, 'perman': 1241, 'cherri': 288, 'blossom': 174, 'pea': 1231, 'sandal': 1462, 'wood': 1929, 'cup': 409, 'lunch': 1018, 'king': 938, 'gram': 769, 'antiperspir': 64, 'unblock': 1826, 'gradual': 768, 'pleasantli': 1271, 'surpris': 1712, 'cours': 391, 'fcuk': 639, 'opt': 1186, 'starter': 1638, 'newborn': 1139, 'gym': 791, 'hunger': 859, 
'challeng': 272, 'strength': 1666, 'endur': 562, 'helmann': 824, 'bone': 187, 'tendon': 1745, 'appeal': 72, 'advantag': 21, 'citru': 303, 'muscl': 1120, 'bump': 228, 'ador': 19, 'handi': 799, 'persist': 1245, 'whitout': 1910, 'sin': 1540, 'spend': 1613, 'woken': 1924, 'reiment': 1391, 'smudg': 1571, 'novelti': 1158, 'fish': 661, 'quantiti': 1341, 'deodour': 460, 'swear': 1716, 'arkward': 82, 'kick': 935, 'sent': 1503, 'toiletri': 1784, 'linnen': 986, 'film': 654, 'teeth': 1739, 'ceram': 266, 'realis': 1364, 'cent': 264, 'gooey': 761, 'marshmallowey': 1047, 'phish': 1254, 'class': 306, 'spain': 1606, 'breakout': 204, 'intoler': 908, 'superior': 1704, 'alon': 43, 'thiught': 1762, 'cojld': 324, 'garnier': 724, 'apar': 70, 'stripey': 1670, 'feet': 644, 'orang': 1188, 'stink': 1650, 'moral': 1106, 'stori': 1657, 'format': 688, 'shave': 1521, 'dress': 524, 'fewer': 649, 'worn': 1933, 'meet': 1064, 'amazingli': 54, 'flavor': 668, 'powderi': 1296, 'exempt': 592, 'medium': 1063, 'annoyiji': 61, 'occas': 1165, 'immedi': 875, 'serum': 1506, 'contribut': 370, 'deco': 435, 'women': 1925, 'eldest': 552, 'forgotten': 686, 'chew': 289, 'rope': 1438, 'school': 1478, 'lie': 972, 'nother': 1155, 'automat': 100, 'gloopi': 749, 'drugstor': 530, 'brainer': 200, 'partnership': 1224, 'encourag': 559, 'spici': 1617, 'fear': 640, 'spilt': 1619, 'alright': 47, 'gotta': 765, 'cetearyl': 271, 'cif': 302, 'power': 1297, 'mirror': 1085, 'nightmar': 1146, 'fun': 714, 'ordinari': 1190, 'thicken': 1756, 'gravi': 773, 'pop': 1283, 'press': 1305, 'spell': 1612, 'scali': 1473, 'quid': 1347, 'tempt': 1742, 'everytim': 585, 'spf': 1615, 'critic': 405, 'uva': 1847, 'convinc': 374, 'refer': 1383, 'verifi': 1861, 'overli': 1201, 'allergi': 38, 'certain': 268, 'bondi': 186, 'sand': 1461, 'slight': 1551, 'fabul': 615, 'hike': 831, 'itsel': 920, 'frizz': 706, 'flat': 667, 'lank': 947, 'bounc': 195, 'stuck': 1675, 'unus': 1842, 'cooker': 376, 'invigor': 911, 'newer': 1140, 'rd': 1357, 'isnt': 915, 'cerav': 
267, 'norm': 1150, 'begun': 145, 'calmer': 241, 'sciencey': 1479, 'id': 867, 'unless': 1834, 'anytim': 68, 'indulg': 893, 'bite': 163, 'dog': 506, 'micro': 1075, 'granul': 771, 'wari': 1879, 'stainless': 1632, 'steel': 1644, 'liquidi': 989, 'velvet': 1858, 'hopingthat': 848, 'fair': 622, 'btw': 222, 'instruct': 905, 'typic': 1821, 'dy': 535, 'tong': 1789, 'question': 1344, 'clip': 316, 'sensat': 1501, 'definatli': 445, 'surprisingli': 1713, 'shock': 1525, 'litr': 992, 'imo': 876, 'salad': 1456, 'bake': 114, 'bean': 134, 'frozen': 708, 'pear': 1232, 'youth': 1958, 'anitipersperi': 58, 'accustom': 9, 'richer': 1424, 'bundl': 229, 'low': 1012, 'carb': 249, 'duti': 534, 'sock': 1578, 'fraction': 694, 'loveliest': 1010, 'wrist': 1943, 'drum': 531, 'modest': 1096, 'inch': 886, 'god': 755, 'somebodi': 1586, 'brazil': 202, 'expat': 596, 'bearabl': 135, 'vfm': 1865, 'head': 813, 'winner': 1918, 'iv': 921, 'batgain': 129, 'averag': 101, 'oreal': 1191, 'resist': 1412, 'girlfriend': 738, 'mention': 1069, 'benefici': 149, 'list': 990, 'chanc': 275, 'transfer': 1800, 'snif': 1574, 'commit': 336, 'stiff': 1648, 'kg': 934, 'alobg': 42, 'matur': 1055, 'hiya': 836, 'stress': 1667, 'pricey': 1311, 'reliev': 1398, 'final': 655, 'tropic': 1811, 'lili': 978, 'mud': 1115, 'spag': 1605, 'bol': 183, 'attent': 96, 'lingeri': 985, 'cotton': 386, 'silk': 1534, 'fiber': 650, 'gigant': 736, 'possibl': 1288, 'lash': 950, 'eyebrow': 609, 'hide': 828, 'sunshin': 1700, 'clingi': 315, 'greazi': 777, 'advis': 23, 'rare': 1353, 'desir': 467, 'blotch': 175, 'brown': 220, 'attach': 95, 'ignor': 870, 'fortun': 691, 'fulfil': 712, 'criteria': 404, 'wilt': 1915, 'yard': 1948, 'radiu': 1350, 'alarm': 34, 'whisk': 1907, 'drainag': 518, 'devin': 474, 'signatur': 1530, 'haha': 793, 'saver': 1469, 'pamper': 1214, 'heheh': 823, 'hunk': 860, 'shelf': 1522, 'unscrew': 1838, 'glove': 750, 'odd': 1167, 'horrif': 850, 'headach': 814, 'desper': 468, 'afraid': 27, 'disturb': 499, 'properti': 1325, 'fluff': 675, 
'dryness': 532, 'unpleas': 1836, 'seal': 1490, 'drain': 517, 'yogurt': 1955, 'alpro': 45, 'blop': 173, 'postag': 1290, 'doctor': 502, 'soooo': 1596, 'walk': 1876, 'pervas': 1249, 'sky': 1548, 'fruiti': 710, 'strike': 1668, 'balanc': 115, 'slurp': 1557, 'reckon': 1375, 'nearer': 1129, 'path': 1229, 'decidedli': 434, 'newsweek': 1141, 'fluffi': 676, 'smoother': 1568, 'hive': 835, 'aggrav': 31, 'wool': 1930, 'chsnge': 299, 'remark': 1400, 'specifi': 1611, 'conclus': 353, 'lure': 1019, 'entic': 568, 'squint': 1629, 'reviv': 1421, 'fallen': 626, 'deffinalti': 441, 'fond': 681, 'poor': 1281, 'consider': 362, 'dinner': 484, 'parti': 1219, 'everybodi': 581, 'jelli': 924, 'loyal': 1013, 'balm': 116, 'finger': 657, 'color': 328, 'petroleum': 1252, 'shadow': 1511, 'appropri': 79, 'tidi': 1767, 'eyelin': 610, 'error': 574, 'spars': 1608, 'bud': 224, 'homemad': 842, 'brill': 211, 'row': 1443, 'roll': 1435, 'shaken': 1513, 'disgust': 495, 'contact': 366, 'suddenli': 1688, 'alth': 50, 'insread': 902, 'toxic': 1798, 'dud': 533, 'split': 1620, 'cautiou': 262, 'lucki': 1015, 'monthli': 1105, 'beed': 142, 'scrib': 1485, 'hurt': 861, 'glitter': 747, 'ankl': 59, 'drawer': 520, 'bovril': 196, 'moan': 1095, 'arm': 83, 'significantli': 1532, 'strang': 1661, 'bleach': 168, 'occasion': 1166, 'recognis': 1376, 'cedarwood': 263, 'popular': 1284, 'bother': 192, 'dozen': 516, 'blindingli': 171, 'leg': 966, 'tanner': 1729, 'stronger': 1672, 'street': 1665, 'veg': 1855, 'cube': 407, 'batch': 128, 'recipebut': 1374, 'secret': 1495, 'mile': 1078, 'hr': 856, 'sedentari': 1497, 'worst': 1936, 'verri': 1862, 'hi': 827, 'uk': 1822, 'anywher': 69, 'massag': 1051, 'keen': 931, 'ban': 117, 'sneaki': 1573, 'woke': 1923, 'sunk': 1699, 'tacki': 1723, 'surviv': 1714, 'brexit': 206, 'defintley': 448, 'die': 478, 'em': 556, 'pregnant': 1302, 'moisteris': 1098, 'lost': 1006, 'count': 387, 'streaki': 1663, 'capful': 246, 'common': 337, 'loyalti': 1014, 'sooner': 1594, 'mostur': 1110, 'definetli': 446, 
'mostureris': 1111, 'tissu': 1779, 'pain': 1211, 'pleasur': 1272, 'happili': 804, 'vera': 1859, 'directli': 486, 'allow': 39, 'subject': 1681, 'greenhous': 779, 'gase': 725, 'ultim': 1823, 'fight': 651, 'tire': 1778, 'drawn': 521, 'child': 291, 'dosag': 510, 'statement': 1640, 'worthwhil': 1938, 'sorbet': 1598, 'focus': 679, 'exact': 587, 'costco': 385, 'curiou': 411, 'baffl': 112, 'elviv': 555, 'realiz': 1365, 'silicon': 1533, 'hairdress': 795, 'appoint': 75, 'defenc': 439, 'purifi': 1336, 'pralin': 1299, 'choc': 293, 'solero': 1584}\n" 616 | ] 617 | } 618 | ], 619 | "source": [ 620 | "TFIDF_vectorizer = TfidfVectorizer(stop_words='english')\n", 621 | "\n", 622 | "TFIDF_vectorizer.fit(X_train)\n", 623 | "print('\\nVocabulary: \\n', TFIDF_vectorizer.vocabulary_)\n", 624 | "\n", 625 | "train_tf = TFIDF_vectorizer.fit_transform(X_train)\n", 626 | "test_tf = TFIDF_vectorizer.transform(X_test)" 627 | ] 628 | }, 629 | { 630 | "cell_type": "markdown", 631 | "metadata": {}, 632 | "source": [ 633 | "### Multinomial Naive Bayes model" 634 | ] 635 | }, 636 | { 637 | "cell_type": "code", 638 | "execution_count": 159, 639 | "metadata": {}, 640 | "outputs": [], 641 | "source": [ 642 | "#IMPLEMENTING AND RUNNING MNB MODEL - TFIDF\n", 643 | "mnb2 = MultinomialNB()\n", 644 | "mnb2.fit(train_tf, y_train)\n", 645 | "prediction = mnb2.predict(test_tf)" 646 | ] 647 | }, 648 | { 649 | "cell_type": "code", 650 | "execution_count": 160, 651 | "metadata": {}, 652 | "outputs": [], 653 | "source": [ 654 | "#EVALUATION\n", 655 | "mnb_a2 = accuracy_score(y_test, prediction)*100\n", 656 | "mnb_p2 = precision_score(y_test, prediction)* 100\n", 657 | "mnb_r2 = recall_score(y_test, prediction)*100\n", 658 | "mnb_f12 = f1_score(y_test, prediction)*100" 659 | ] 660 | }, 661 | { 662 | "cell_type": "code", 663 | "execution_count": 161, 664 | "metadata": {}, 665 | "outputs": [ 666 | { 667 | "data": { 668 | "text/plain": [ 669 | "" 670 | ] 671 | }, 672 | "execution_count": 161, 673 | "metadata": {}, 
674 | "output_type": "execute_result" 675 | }, 676 | { 677 | "data": { 678 | "image/png": "\n", 679 | "text/plain": [ 680 | "
" 681 | ] 682 | }, 683 | "metadata": { 684 | "needs_background": "light" 685 | }, 686 | "output_type": "display_data" 687 | } 688 | ], 689 | "source": [ 690 | "#CONFUSION MATRIX\n", 691 | "cm = confusion_matrix(y_test, prediction, labels=mnb2.classes_)\n", 692 | "display = ConfusionMatrixDisplay(confusion_matrix=cm,\n", 693 | " display_labels=mnb2.classes_) \n", 694 | "display.plot() " 695 | ] 696 | }, 697 | { 698 | "cell_type": "markdown", 699 | "metadata": {}, 700 | "source": [ 701 | "### Support Vector Machine model" 702 | ] 703 | }, 704 | { 705 | "cell_type": "code", 706 | "execution_count": 162, 707 | "metadata": {}, 708 | "outputs": [], 709 | "source": [ 710 | "#IMPLEMENTING AND RUNNING SVM MODEL - TFIDF \n", 711 | "svm2 = SVC(kernel='linear')\n", 712 | "svm2.fit(train_tf, y_train)\n", 713 | "prediction = svm2.predict(test_tf)" 714 | ] 715 | }, 716 | { 717 | "cell_type": "code", 718 | "execution_count": 163, 719 | "metadata": {}, 720 | "outputs": [], 721 | "source": [ 722 | "#EVALUATION\n", 723 | "svm_a2 = accuracy_score(y_test, prediction)*100\n", 724 | "svm_p2 = precision_score(y_test, prediction)* 100\n", 725 | "svm_r2 = recall_score(y_test, prediction)*100\n", 726 | "svm_f12 = f1_score(y_test, prediction)*100" 727 | ] 728 | }, 729 | { 730 | "cell_type": "code", 731 | "execution_count": 164, 732 | "metadata": {}, 733 | "outputs": [ 734 | { 735 | "data": { 736 | "text/plain": [ 737 | "" 738 | ] 739 | }, 740 | "execution_count": 164, 741 | "metadata": {}, 742 | "output_type": "execute_result" 743 | }, 744 | { 745 | "data": { 746 | "image/png": "\n", 747 | "text/plain": [ 748 | "
" 749 | ] 750 | }, 751 | "metadata": { 752 | "needs_background": "light" 753 | }, 754 | "output_type": "display_data" 755 | } 756 | ], 757 | "source": [ 758 | "#CONFUSION MATRIX\n", 759 | "cm = confusion_matrix(y_test, prediction, labels=svm2.classes_)\n", 760 | "display = ConfusionMatrixDisplay(confusion_matrix=cm,\n", 761 | " display_labels=svm2.classes_) \n", 762 | "display.plot() " 763 | ] 764 | }, 765 | { 766 | "cell_type": "markdown", 767 | "metadata": {}, 768 | "source": [ 769 | "### Logistic Regression model" 770 | ] 771 | }, 772 | { 773 | "cell_type": "code", 774 | "execution_count": 165, 775 | "metadata": {}, 776 | "outputs": [], 777 | "source": [ 778 | "#IMPLEMENTATION AND RUNNING LR MODEL - TFIDF \n", 779 | "lr2 = LogisticRegression()\n", 780 | "lr2.fit(train_tf, y_train)\n", 781 | "prediction = lr2.predict(test_tf)" 782 | ] 783 | }, 784 | { 785 | "cell_type": "code", 786 | "execution_count": 166, 787 | "metadata": {}, 788 | "outputs": [], 789 | "source": [ 790 | "#EVALUATION\n", 791 | "lr_a2 = accuracy_score(y_test, prediction)*100\n", 792 | "lr_p2 = precision_score(y_test, prediction)* 100\n", 793 | "lr_r2 = recall_score(y_test, prediction)*100\n", 794 | "lr_f12 = f1_score(y_test, prediction)*100" 795 | ] 796 | }, 797 | { 798 | "cell_type": "code", 799 | "execution_count": 167, 800 | "metadata": {}, 801 | "outputs": [ 802 | { 803 | "data": { 804 | "text/plain": [ 805 | "" 806 | ] 807 | }, 808 | "execution_count": 167, 809 | "metadata": {}, 810 | "output_type": "execute_result" 811 | }, 812 | { 813 | "data": { 814 | "image/png": "\n", 815 | "text/plain": [ 816 | "
" 817 | ] 818 | }, 819 | "metadata": { 820 | "needs_background": "light" 821 | }, 822 | "output_type": "display_data" 823 | } 824 | ], 825 | "source": [ 826 | "#CONFUSION MATRIX\n", 827 | "cm = confusion_matrix(y_test, prediction, labels=lr2.classes_)\n", 828 | "display = ConfusionMatrixDisplay(confusion_matrix=cm,\n", 829 | " display_labels=lr2.classes_) \n", 830 | "display.plot() " 831 | ] 832 | }, 833 | { 834 | "cell_type": "markdown", 835 | "metadata": {}, 836 | "source": [ 837 | "### COMPARING ACCURACY" 838 | ] 839 | }, 840 | { 841 | "cell_type": "code", 842 | "execution_count": 168, 843 | "metadata": {}, 844 | "outputs": [ 845 | { 846 | "data": { 847 | "text/html": [ 848 | "
\n", 849 | "\n", 862 | "\n", 863 | " \n", 864 | " \n", 865 | " \n", 866 | " \n", 867 | " \n", 868 | " \n", 869 | " \n", 870 | " \n", 871 | " \n", 872 | " \n", 873 | " \n", 874 | " \n", 875 | " \n", 876 | " \n", 877 | " \n", 878 | " \n", 879 | " \n", 880 | " \n", 881 | " \n", 882 | " \n", 883 | " \n", 884 | " \n", 885 | "
MNBSVMLR
Count Vectorizer80.084.085.0
Tfidf Vectorizer81.084.082.0
\n", 886 | "
" 887 | ], 888 | "text/plain": [ 889 | " MNB SVM LR\n", 890 | "Count Vectorizer 80.0 84.0 85.0\n", 891 | "Tfidf Vectorizer 81.0 84.0 82.0" 892 | ] 893 | }, 894 | "execution_count": 168, 895 | "metadata": {}, 896 | "output_type": "execute_result" 897 | } 898 | ], 899 | "source": [ 900 | "model_accuracy={'MNB': [round(mnb_a1), round(mnb_a2)],\n", 901 | " 'SVM': [round(svm_a1), round(svm_a2)],\n", 902 | " 'LR': [round(lr_a1), round(lr_a2)]\n", 903 | " }\n", 904 | "ma = pd.DataFrame(model_accuracy, columns = ['MNB','SVM','LR'], index=['Count Vectorizer','Tfidf Vectorizer'])\n", 905 | "ma" 906 | ] 907 | }, 908 | { 909 | "cell_type": "markdown", 910 | "metadata": {}, 911 | "source": [ 912 | "### COMPARING PRECISION" 913 | ] 914 | }, 915 | { 916 | "cell_type": "code", 917 | "execution_count": 169, 918 | "metadata": {}, 919 | "outputs": [ 920 | { 921 | "data": { 922 | "text/html": [ 923 | "
\n", 924 | "\n", 937 | "\n", 938 | " \n", 939 | " \n", 940 | " \n", 941 | " \n", 942 | " \n", 943 | " \n", 944 | " \n", 945 | " \n", 946 | " \n", 947 | " \n", 948 | " \n", 949 | " \n", 950 | " \n", 951 | " \n", 952 | " \n", 953 | " \n", 954 | " \n", 955 | " \n", 956 | " \n", 957 | " \n", 958 | " \n", 959 | " \n", 960 | "
MNBSVMLR
Count Vectorizer81.079.080.0
Tfidf Vectorizer85.082.081.0
\n", 961 | "
" 962 | ], 963 | "text/plain": [ 964 | " MNB SVM LR\n", 965 | "Count Vectorizer 81.0 79.0 80.0\n", 966 | "Tfidf Vectorizer 85.0 82.0 81.0" 967 | ] 968 | }, 969 | "execution_count": 169, 970 | "metadata": {}, 971 | "output_type": "execute_result" 972 | } 973 | ], 974 | "source": [ 975 | "model_precision={'MNB': [round(mnb_p1), round(mnb_p2)],\n", 976 | " 'SVM': [round(svm_p1), round(svm_p2)],\n", 977 | " 'LR': [round(lr_p1), round(lr_p2)]\n", 978 | " }\n", 979 | "mp = pd.DataFrame(model_precision, columns = ['MNB','SVM','LR'], index=['Count Vectorizer','Tfidf Vectorizer'])\n", 980 | "mp" 981 | ] 982 | }, 983 | { 984 | "cell_type": "markdown", 985 | "metadata": {}, 986 | "source": [ 987 | "### COMPARING RECALL" 988 | ] 989 | }, 990 | { 991 | "cell_type": "code", 992 | "execution_count": 170, 993 | "metadata": {}, 994 | "outputs": [ 995 | { 996 | "data": { 997 | "text/html": [ 998 | "
\n", 999 | "\n", 1012 | "\n", 1013 | " \n", 1014 | " \n", 1015 | " \n", 1016 | " \n", 1017 | " \n", 1018 | " \n", 1019 | " \n", 1020 | " \n", 1021 | " \n", 1022 | " \n", 1023 | " \n", 1024 | " \n", 1025 | " \n", 1026 | " \n", 1027 | " \n", 1028 | " \n", 1029 | " \n", 1030 | " \n", 1031 | " \n", 1032 | " \n", 1033 | " \n", 1034 | " \n", 1035 | "
MNBSVMLR
Count Vectorizer77.091.092.0
Tfidf Vectorizer74.084.081.0
\n", 1036 | "
" 1037 | ], 1038 | "text/plain": [ 1039 | " MNB SVM LR\n", 1040 | "Count Vectorizer 77.0 91.0 92.0\n", 1041 | "Tfidf Vectorizer 74.0 84.0 81.0" 1042 | ] 1043 | }, 1044 | "execution_count": 170, 1045 | "metadata": {}, 1046 | "output_type": "execute_result" 1047 | } 1048 | ], 1049 | "source": [ 1050 | "model_recall={'MNB': [round(mnb_r1), round(mnb_r2)],\n", 1051 | " 'SVM': [round(svm_r1), round(svm_r2)],\n", 1052 | " 'LR': [round(lr_r1), round(lr_r2)]\n", 1053 | " }\n", 1054 | "mr = pd.DataFrame(model_recall, columns = ['MNB','SVM','LR'], index=['Count Vectorizer','Tfidf Vectorizer'])\n", 1055 | "mr" 1056 | ] 1057 | }, 1058 | { 1059 | "cell_type": "markdown", 1060 | "metadata": {}, 1061 | "source": [ 1062 | "### COMPARING F1 SCORE" 1063 | ] 1064 | }, 1065 | { 1066 | "cell_type": "code", 1067 | "execution_count": 171, 1068 | "metadata": {}, 1069 | "outputs": [ 1070 | { 1071 | "data": { 1072 | "text/html": [ 1073 | "
\n", 1074 | "\n", 1087 | "\n", 1088 | " \n", 1089 | " \n", 1090 | " \n", 1091 | " \n", 1092 | " \n", 1093 | " \n", 1094 | " \n", 1095 | " \n", 1096 | " \n", 1097 | " \n", 1098 | " \n", 1099 | " \n", 1100 | " \n", 1101 | " \n", 1102 | " \n", 1103 | " \n", 1104 | " \n", 1105 | " \n", 1106 | " \n", 1107 | " \n", 1108 | " \n", 1109 | " \n", 1110 | "
MNBSVMLR
Count Vectorizer79.084.085.0
Tfidf Vectorizer79.083.081.0
\n", 1111 | "
" 1112 | ], 1113 | "text/plain": [ 1114 | " MNB SVM LR\n", 1115 | "Count Vectorizer 79.0 84.0 85.0\n", 1116 | "Tfidf Vectorizer 79.0 83.0 81.0" 1117 | ] 1118 | }, 1119 | "execution_count": 171, 1120 | "metadata": {}, 1121 | "output_type": "execute_result" 1122 | } 1123 | ], 1124 | "source": [ 1125 | "model_f1={'MNB': [round(mnb_f11), round(mnb_f12)],\n", 1126 | " 'SVM': [round(svm_f11), round(svm_f12)],\n", 1127 | " 'LR': [round(lr_f11), round(lr_f12)]\n", 1128 | " }\n", 1129 | "mf1 = pd.DataFrame(model_f1, columns = ['MNB','SVM','LR'], index=['Count Vectorizer','Tfidf Vectorizer'])\n", 1130 | "mf1" 1131 | ] 1132 | }, 1133 | { 1134 | "cell_type": "code", 1135 | "execution_count": 172, 1136 | "metadata": {}, 1137 | "outputs": [], 1138 | "source": [ 1139 | "#SAVING THE BEST MODEL WITH ITS RESPECTIVE VECTORIZER\n", 1140 | "pickle.dump(lr1, open('data and pickle files/data and pickle files/best_model.pkl', 'wb'))\n", 1141 | "pickle.dump(count_vectorizer, open('data and pickle files/count_vectorizer.pkl', 'wb'))" 1142 | ] 1143 | } 1144 | ], 1145 | "metadata": { 1146 | "kernelspec": { 1147 | "display_name": "Python 3", 1148 | "language": "python", 1149 | "name": "python3" 1150 | }, 1151 | "language_info": { 1152 | "codemirror_mode": { 1153 | "name": "ipython", 1154 | "version": 3 1155 | }, 1156 | "file_extension": ".py", 1157 | "mimetype": "text/x-python", 1158 | "name": "python", 1159 | "nbconvert_exporter": "python", 1160 | "pygments_lexer": "ipython3", 1161 | "version": "3.8.3" 1162 | } 1163 | }, 1164 | "nbformat": 4, 1165 | "nbformat_minor": 4 1166 | } 1167 | -------------------------------------------------------------------------------- /4.Deployment.py: -------------------------------------------------------------------------------- 1 | #LIBRARIES 2 | import streamlit as st 3 | import pickle 4 | import nltk 5 | from textblob import TextBlob 6 | from nltk.corpus import stopwords 7 | from nltk.stem import PorterStemmer 8 | import re 9 | 10 | 11 | #LOAD PICKLE FILES 
12 | model = pickle.load(open('data and pickle files/best_model.pkl','rb')) 13 | vectorizer = pickle.load(open('data and pickle files/count_vectorizer.pkl','rb')) 14 | 15 | #FOR STREAMLIT 16 | nltk.download('stopwords') 17 | 18 | #TEXT PREPROCESSING 19 | sw = set(stopwords.words('english')) 20 | def text_preprocessing(text): 21 | txt = TextBlob(text) 22 | result = txt.correct() 23 | removed_special_characters = re.sub("[^a-zA-Z]", " ", str(result)) 24 | tokens = removed_special_characters.lower().split() 25 | stemmer = PorterStemmer() 26 | 27 | cleaned = [] 28 | stemmed = [] 29 | 30 | for token in tokens: 31 | if token not in sw: 32 | cleaned.append(token) 33 | 34 | for token in cleaned: 35 | token = stemmer.stem(token) 36 | stemmed.append(token) 37 | 38 | return " ".join(stemmed) 39 | 40 | #TEXT CLASSIFICATION 41 | def text_classification(text): 42 | if len(text) < 1: 43 | st.write(" ") 44 | else: 45 | with st.spinner("Classification in progress..."): 46 | cleaned_review = text_preprocessing(text) 47 | process = vectorizer.transform([cleaned_review]).toarray() 48 | prediction = model.predict(process) 49 | p = ''.join(str(i) for i in prediction) 50 | 51 | if p == 'True': 52 | st.success("The review entered is Legitimate.") 53 | if p == 'False': 54 | st.error("The review entered is Fraudulent.") 55 | 56 | #PAGE FORMATTING AND APPLICATION 57 | def main(): 58 | st.title("Fraud Detection in Online Consumer Reviews Using Machine Learning Techniques") 59 | 60 | 61 | # --EXPANDERS-- 62 | abstract = st.expander("Abstract") 63 | if abstract: 64 | abstract.write("In today's world, both businesses and customers believe reviews to be quite beneficial. It's no surprise that review fraud has devalued the whole experience, from nasty reviews putting harm to the business's credibility to breaking international laws. 
This has been seen as a developing problem, and because it is related to natural language processing, it was critical to develop various machine learning methodologies and techniques to achieve a breakthrough in this sector. Many e-commerce sites, such as Amazon, have their systems in place, including Verified Purchase, which labels review language as accurate when items are purchased directly from the website. This work proposes to use Amazon's verified purchases label to train three classifiers for supervised training on Amazon’s labelled dataset. MNB, SVM, and LR were chosen as classifiers, and model tuning was done using two distinct vectorizers, Count Vectorizer and TF-IDF Vectorizers. Overall, all of the trained models had an accuracy rate of 80%, indicating that the vectorizers functioned admirably and that there are distinctions between false and actual reviews. Out of the two, the count vectorizer improved the models' performance more, and out of the three inside counts, LR performed the best, with an accuracy rate of 85% and a recall rate of 92%. 
The LR classifier was used, and it was accessible to the public to see if the reviews entered were genuine or not, with a probability score.") 65 | #st.write(abstract) 66 | 67 | links = st.expander("Related Links") 68 | if links: 69 | links.write("[Dataset utilized](https://www.kaggle.com/akudnaver/amazon-reviews-dataset)") 70 | links.write("[Github](https://github.com/kntb0107/Fraud-Detection-in-Online-Consumer-Reviews-Using-Machine-Learning-Techniques)") 71 | 72 | # --CHECKBOXES-- 73 | st.subheader("Information on the Classifier") 74 | if st.checkbox("About Classifer"): 75 | st.markdown('**Model:** Logistic Regression') 76 | st.markdown('**Vectorizer:** Count') 77 | st.markdown('**Test-Train splitting:** 40% - 60%') 78 | st.markdown('**Spelling Correction Library:** TextBlob') 79 | st.markdown('**Stemmer:** PorterStemmer') 80 | 81 | if st.checkbox("Evaluation Results"): 82 | st.markdown('**Accuracy:** 85%') 83 | st.markdown('**Precision:** 80%') 84 | st.markdown('**Recall:** 92%') 85 | st.markdown('**F-1 Score:** 85%') 86 | 87 | 88 | #--IMPLEMENTATION OF THE CLASSIFIER-- 89 | st.subheader("Fake Review Classifier") 90 | review = st.text_area("Enter Review: ") 91 | if st.button("Check"): 92 | text_classification(review) 93 | 94 | #RUN MAIN 95 | main() 96 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [![Streamlit App](https://static.streamlit.io/badges/streamlit_badge_black_white.svg)](https://fakereviewdetectorv1.streamlit.app/) 2 | 3 | 4 | # Fraud Detection in Online Consumer Reviews Using Machine Learning Techniques 5 | Final Year Project describing the path to creating classifiers which aids in identifying the fake reviews from the real one using an Amazon dataset. 6 | 7 | # Abstract 8 | In today's world, both businesses and customers believe reviews to be quite beneficial. 
It's no surprise that review fraud has devalued the whole experience, from nasty reviews harming the business's credibility to breaking international laws. This has been seen as a developing problem, and because it is related to natural language processing, it was critical to develop various machine learning methodologies and techniques to achieve a breakthrough in this sector. Many e-commerce sites, such as Amazon, have their own systems in place, including Verified Purchase, which labels review language as accurate when items are purchased directly from the website. This work proposes to use Amazon's verified purchases label to train three classifiers for supervised training on Amazon’s labelled dataset. MNB, SVM, and LR were chosen as classifiers, and model tuning was done using two distinct vectorizers, Count Vectorizer and TF-IDF Vectorizer. Overall, all of the trained models had an accuracy rate of at least 80%, indicating that the vectorizers functioned admirably and that there are distinctions between false and actual reviews. Out of the two, the count vectorizer improved the models' performance more, and out of the three classifiers trained with it, LR performed the best, with an accuracy rate of 85% and a recall rate of 92%. The LR classifier was deployed, and it was made accessible to the public to see if the reviews entered were genuine or not, with a probability score. 9 | 10 | # Proceeding with the files 11 | The notebooks and the Python file have been numbered in order; for easier readability, please go through them in that order. 
12 | 13 | This project has already been deployed online, and hence you can simply click the badge at the top to be taken to the web application. 14 | 15 | 16 | # MISC 17 | 18 | DATASET: https://bit.ly/2Rzvjqf [KAGGLE] 19 | 20 | main dataset used: amazon_reviews_2019.csv 21 | 22 | 23 | 24 | # INFO 25 | 26 | IDE: Jupyter Notebook 27 | 28 | Language: Python 29 | 30 | Models utilized: Logistic Regression, SVM, MNB 31 | 32 | Deployment Platform: Streamlit.io 33 | 34 | 35 | # Future Enhancements 36 | The dataset utilized in this research was found to be focused on reviews from the United Kingdom, namely from the grocery section of well-known supermarkets and pharmacies available on Amazon. Furthermore, because the reviews have mainly positive sentiments, it can be argued that the web application performs better at classifying reviews that fit the aforementioned criteria than product reviews from other areas and sites. 37 | 38 | This restriction is a symptom of a larger problem that this industry has been grappling with: the lack of a standardized dataset for the sole purpose of detecting and analysing fraudulent reviews. This introduces external elements that may have an impact on the classifiers' performance, and because authors in past works of literature have taken liberties, it is difficult to say which sort of setup and models operate best in which scenario. 39 | 40 | One of the future enhancements suggested is the development of a large dataset with various types of online reviews from various backgrounds so that the study and performance of these reviews can be as unbiased as possible, and researchers can focus entirely on developing automated detection techniques similar to modern email spam detection systems. 
41 | 42 | -------------------------------------------------------------------------------- /data and pickle files/amazon_reviews_2019.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kntb0107/fake_review_detector/eae6fcb8f804e6931281ca18efdb43eced7c8548/data and pickle files/amazon_reviews_2019.csv -------------------------------------------------------------------------------- /data and pickle files/best_model.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kntb0107/fake_review_detector/eae6fcb8f804e6931281ca18efdb43eced7c8548/data and pickle files/best_model.pkl -------------------------------------------------------------------------------- /data and pickle files/count_vectorizer.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kntb0107/fake_review_detector/eae6fcb8f804e6931281ca18efdb43eced7c8548/data and pickle files/count_vectorizer.pkl -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | pandas 2 | matplotlib 3 | seaborn 4 | textblob 5 | nltk 6 | regex 7 | scikit-learn 8 | pickle-mixin 9 | --------------------------------------------------------------------------------