├── GPNBSingularity.ipynb ├── GPSMSSpamDetector.ipynb ├── GPSentimentAnalyzer.ipynb ├── ID.py ├── NLProcLSAPy.ipynb ├── knnbreaksdown.py ├── knnnestedclass.py ├── knnsingularity.py ├── knnxorsingularity.py ├── nbsingularity.py ├── train.csv └── util.py /GPNBSingularity.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "# dataset: https://archive.ics.uci.edu/ml/datasets/Spambase" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 2, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "from sklearn.naive_bayes import MultinomialNB" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": 3, 24 | "metadata": {}, 25 | "outputs": [], 26 | "source": [ 27 | "import pandas as pd" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": 4, 33 | "metadata": {}, 34 | "outputs": [], 35 | "source": [ 36 | "import numpy as np" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": 5, 42 | "metadata": {}, 43 | "outputs": [], 44 | "source": [ 45 | "data = pd.read_csv('spambase.data').as_matrix() # use pandas for convenience" 46 | ] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "execution_count": 6, 51 | "metadata": {}, 52 | "outputs": [], 53 | "source": [ 54 | "np.random.shuffle(data) # shuffle each row in-place, but preserve the row" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": 7, 60 | "metadata": {}, 61 | "outputs": [], 62 | "source": [ 63 | "X = data[:,:48]" 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": 8, 69 | "metadata": {}, 70 | "outputs": [], 71 | "source": [ 72 | "Y = data[:,-1]" 73 | ] 74 | }, 75 | { 76 | "cell_type": "code", 77 | "execution_count": 9, 78 | "metadata": {}, 79 | "outputs": [], 80 | "source": [ 81 | "# last 100 rows will be test" 82 | ] 83 | }, 84 | { 85 | 
"cell_type": "code", 86 | "execution_count": 10, 87 | "metadata": {}, 88 | "outputs": [], 89 | "source": [ 90 | "Xtrain = X[:-100,]\n", 91 | "Ytrain = Y[:-100,]\n", 92 | "Xtest = X[-100:,]\n", 93 | "Ytest = Y[-100:,]" 94 | ] 95 | }, 96 | { 97 | "cell_type": "code", 98 | "execution_count": 11, 99 | "metadata": {}, 100 | "outputs": [ 101 | { 102 | "name": "stdout", 103 | "output_type": "stream", 104 | "text": [ 105 | "Classification rate for NB: 0.86\n" 106 | ] 107 | } 108 | ], 109 | "source": [ 110 | "\n", 111 | "model = MultinomialNB()\n", 112 | "model.fit(Xtrain, Ytrain)\n", 113 | "print(\"Classification rate for NB:\", model.score(Xtest, Ytest))" 114 | ] 115 | }, 116 | { 117 | "cell_type": "code", 118 | "execution_count": 12, 119 | "metadata": {}, 120 | "outputs": [], 121 | "source": [ 122 | "from sklearn.ensemble import AdaBoostClassifier" 123 | ] 124 | }, 125 | { 126 | "cell_type": "code", 127 | "execution_count": 13, 128 | "metadata": {}, 129 | "outputs": [], 130 | "source": [ 131 | "model = AdaBoostClassifier()" 132 | ] 133 | }, 134 | { 135 | "cell_type": "code", 136 | "execution_count": 14, 137 | "metadata": {}, 138 | "outputs": [ 139 | { 140 | "data": { 141 | "text/plain": [ 142 | "AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,\n", 143 | " learning_rate=1.0, n_estimators=50, random_state=None)" 144 | ] 145 | }, 146 | "execution_count": 14, 147 | "metadata": {}, 148 | "output_type": "execute_result" 149 | } 150 | ], 151 | "source": [ 152 | "model.fit(Xtrain, Ytrain)" 153 | ] 154 | }, 155 | { 156 | "cell_type": "code", 157 | "execution_count": 15, 158 | "metadata": {}, 159 | "outputs": [ 160 | { 161 | "name": "stdout", 162 | "output_type": "stream", 163 | "text": [ 164 | "Classification rate for AdaBoost: 0.93\n" 165 | ] 166 | } 167 | ], 168 | "source": [ 169 | "print(\"Classification rate for AdaBoost:\", model.score(Xtest, Ytest))" 170 | ] 171 | }, 172 | { 173 | "cell_type": "code", 174 | "execution_count": null, 175 | "metadata": {}, 176 | 
"outputs": [], 177 | "source": [] 178 | } 179 | ], 180 | "metadata": { 181 | "kernelspec": { 182 | "display_name": "Python 3", 183 | "language": "python", 184 | "name": "python3" 185 | }, 186 | "language_info": { 187 | "codemirror_mode": { 188 | "name": "ipython", 189 | "version": 3 190 | }, 191 | "file_extension": ".py", 192 | "mimetype": "text/x-python", 193 | "name": "python", 194 | "nbconvert_exporter": "python", 195 | "pygments_lexer": "ipython3", 196 | "version": "3.6.4" 197 | } 198 | }, 199 | "nbformat": 4, 200 | "nbformat_minor": 2 201 | } 202 | -------------------------------------------------------------------------------- /GPSentimentAnalyzer.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "from __future__ import print_function, division" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 2, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "from future.utils import iteritems" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": 3, 24 | "metadata": {}, 25 | "outputs": [], 26 | "source": [ 27 | "from builtins import range" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": 4, 33 | "metadata": {}, 34 | "outputs": [], 35 | "source": [ 36 | "import nltk" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": 5, 42 | "metadata": {}, 43 | "outputs": [], 44 | "source": [ 45 | "import numpy as np" 46 | ] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "execution_count": 6, 51 | "metadata": {}, 52 | "outputs": [], 53 | "source": [ 54 | "from nltk.stem import WordNetLemmatizer" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": 7, 60 | "metadata": {}, 61 | "outputs": [], 62 | "source": [ 63 | "from sklearn.linear_model import LogisticRegression" 64 | ] 65 | }, 66 | { 67 | 
"cell_type": "code", 68 | "execution_count": 57, 69 | "metadata": {}, 70 | "outputs": [], 71 | "source": [ 72 | "from bs4 import BeautifulSoup\n" 73 | ] 74 | }, 75 | { 76 | "cell_type": "code", 77 | "execution_count": 55, 78 | "metadata": {}, 79 | "outputs": [], 80 | "source": [] 81 | }, 82 | { 83 | "cell_type": "code", 84 | "execution_count": 9, 85 | "metadata": {}, 86 | "outputs": [], 87 | "source": [ 88 | "wordnet_lemmatizer = WordNetLemmatizer()" 89 | ] 90 | }, 91 | { 92 | "cell_type": "code", 93 | "execution_count": 10, 94 | "metadata": {}, 95 | "outputs": [], 96 | "source": [ 97 | "# Onix Text Retrieval Toolkit provides around 429 Stopwords \n", 98 | "# usually considered as what usually an user searches in a review" 99 | ] 100 | }, 101 | { 102 | "cell_type": "code", 103 | "execution_count": 11, 104 | "metadata": {}, 105 | "outputs": [], 106 | "source": [ 107 | "# The dataset for stopwords can be downloaded from http://www.lextek.com/manuals/onix/stopwords1.html" 108 | ] 109 | }, 110 | { 111 | "cell_type": "code", 112 | "execution_count": 13, 113 | "metadata": {}, 114 | "outputs": [], 115 | "source": [ 116 | "stopwords = set(w.rstrip() for w in open('/users/GP/PycharmProjects/Python/Electronics/stopwords.txt'))" 117 | ] 118 | }, 119 | { 120 | "cell_type": "code", 121 | "execution_count": 14, 122 | "metadata": {}, 123 | "outputs": [], 124 | "source": [ 125 | "# The positive reviews, negative reviews, and unlabled reviews\n", 126 | "# can be downloaded from the following Multi-Domain Sentiment Dataset\n", 127 | "# http://www.cs.jhu.edu/~mdredze/datasets/sentiment/index2.html" 128 | ] 129 | }, 130 | { 131 | "cell_type": "code", 132 | "execution_count": 59, 133 | "metadata": {}, 134 | "outputs": [ 135 | { 136 | "name": "stderr", 137 | "output_type": "stream", 138 | "text": [ 139 | "/anaconda3/lib/python3.6/site-packages/bs4/__init__.py:181: UserWarning: No parser was explicitly specified, so I'm using the best available HTML parser for this system (\"lxml\"). 
This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.\n", 140 | "\n", 141 | "The code that caused this warning is on line 193 of the file /anaconda3/lib/python3.6/runpy.py. To get rid of this warning, change code that looks like this:\n", 142 | "\n", 143 | " BeautifulSoup(YOUR_MARKUP})\n", 144 | "\n", 145 | "to this:\n", 146 | "\n", 147 | " BeautifulSoup(YOUR_MARKUP, \"lxml\")\n", 148 | "\n", 149 | " markup_type=markup_type))\n" 150 | ] 151 | } 152 | ], 153 | "source": [ 154 | "positive_reviews = BeautifulSoup(open('/users/GP/PycharmProjects/Python/Electronics/positive.review').read())\n", 155 | "positive_reviews = positive_reviews.findAll('review_text')" 156 | ] 157 | }, 158 | { 159 | "cell_type": "code", 160 | "execution_count": 17, 161 | "metadata": {}, 162 | "outputs": [ 163 | { 164 | "name": "stderr", 165 | "output_type": "stream", 166 | "text": [ 167 | "/anaconda3/lib/python3.6/site-packages/bs4/__init__.py:181: UserWarning: No parser was explicitly specified, so I'm using the best available HTML parser for this system (\"lxml\"). This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.\n", 168 | "\n", 169 | "The code that caused this warning is on line 193 of the file /anaconda3/lib/python3.6/runpy.py. 
To get rid of this warning, change code that looks like this:\n", 170 | "\n", 171 | " BeautifulSoup(YOUR_MARKUP})\n", 172 | "\n", 173 | "to this:\n", 174 | "\n", 175 | " BeautifulSoup(YOUR_MARKUP, \"lxml\")\n", 176 | "\n", 177 | " markup_type=markup_type))\n" 178 | ] 179 | } 180 | ], 181 | "source": [ 182 | "negative_reviews = BeautifulSoup(open('/users/GP/PycharmProjects/Python/Electronics/negative.review').read())" 183 | ] 184 | }, 185 | { 186 | "cell_type": "code", 187 | "execution_count": 18, 188 | "metadata": {}, 189 | "outputs": [], 190 | "source": [ 191 | "negative_reviews = negative_reviews.findAll('review_text')" 192 | ] 193 | }, 194 | { 195 | "cell_type": "code", 196 | "execution_count": 19, 197 | "metadata": {}, 198 | "outputs": [], 199 | "source": [ 200 | "# Based on the dataset, there are more positive reviews found than negative reviews\n" 201 | ] 202 | }, 203 | { 204 | "cell_type": "code", 205 | "execution_count": 20, 206 | "metadata": {}, 207 | "outputs": [], 208 | "source": [ 209 | "diff = len(positive_reviews) - len(negative_reviews)" 210 | ] 211 | }, 212 | { 213 | "cell_type": "code", 214 | "execution_count": 21, 215 | "metadata": {}, 216 | "outputs": [], 217 | "source": [ 218 | "idxs = np.random.choice(len(negative_reviews), size=diff)" 219 | ] 220 | }, 221 | { 222 | "cell_type": "code", 223 | "execution_count": 22, 224 | "metadata": {}, 225 | "outputs": [], 226 | "source": [ 227 | "extra = [negative_reviews[i] for i in idxs]" 228 | ] 229 | }, 230 | { 231 | "cell_type": "code", 232 | "execution_count": 23, 233 | "metadata": {}, 234 | "outputs": [], 235 | "source": [ 236 | "negative_reviews += extra" 237 | ] 238 | }, 239 | { 240 | "cell_type": "code", 241 | "execution_count": 25, 242 | "metadata": {}, 243 | "outputs": [], 244 | "source": [ 245 | "\n", 246 | "def my_tokenizer(s):\n", 247 | " s = s.lower() # downcase\n", 248 | " tokens = nltk.tokenize.word_tokenize(s) # split string into words (tokens)\n", 249 | " tokens = [t for t in tokens if 
len(t) > 2] # remove short words, they're probably not useful\n", 250 | " tokens = [wordnet_lemmatizer.lemmatize(t) for t in tokens] # put words into base form\n", 251 | " tokens = [t for t in tokens if t not in stopwords] # remove stopwords\n", 252 | " return tokens\n" 253 | ] 254 | }, 255 | { 256 | "cell_type": "code", 257 | "execution_count": 27, 258 | "metadata": {}, 259 | "outputs": [], 260 | "source": [ 261 | "word_index_map = {}\n", 262 | "current_index = 0\n", 263 | "positive_tokenized = []\n", 264 | "negative_tokenized = []\n", 265 | "\n", 266 | "for review in positive_reviews:\n", 267 | " tokens = my_tokenizer(review.text)\n", 268 | " positive_tokenized.append(tokens)\n", 269 | " for token in tokens:\n", 270 | " if token not in word_index_map:\n", 271 | " word_index_map[token] = current_index\n", 272 | " current_index += 1\n", 273 | "\n", 274 | "for review in negative_reviews:\n", 275 | " tokens = my_tokenizer(review.text)\n", 276 | " negative_tokenized.append(tokens)\n", 277 | " for token in tokens:\n", 278 | " if token not in word_index_map:\n", 279 | " word_index_map[token] = current_index\n", 280 | " current_index += 1" 281 | ] 282 | }, 283 | { 284 | "cell_type": "code", 285 | "execution_count": 28, 286 | "metadata": {}, 287 | "outputs": [], 288 | "source": [ 289 | "# creating input matrices" 290 | ] 291 | }, 292 | { 293 | "cell_type": "code", 294 | "execution_count": 29, 295 | "metadata": {}, 296 | "outputs": [], 297 | "source": [ 298 | "def tokens_to_vector(tokens, label):\n", 299 | " x = np.zeros(len(word_index_map) + 1) # The last element is for the label\n", 300 | " for t in tokens:\n", 301 | " i = word_index_map[t]\n", 302 | " x[i] += 1\n", 303 | " x = x / x.sum() # Perform normalization before setting the label\n", 304 | " x[-1] = label\n", 305 | " return x\n" 306 | ] 307 | }, 308 | { 309 | "cell_type": "code", 310 | "execution_count": 30, 311 | "metadata": {}, 312 | "outputs": [], 313 | "source": [ 314 | "N = len(positive_tokenized) + 
len(negative_tokenized)" 315 | ] 316 | }, 317 | { 318 | "cell_type": "code", 319 | "execution_count": 31, 320 | "metadata": {}, 321 | "outputs": [], 322 | "source": [ 323 | "data = np.zeros((N, len(word_index_map) + 1))" 324 | ] 325 | }, 326 | { 327 | "cell_type": "code", 328 | "execution_count": 32, 329 | "metadata": {}, 330 | "outputs": [], 331 | "source": [ 332 | "i = 0" 333 | ] 334 | }, 335 | { 336 | "cell_type": "code", 337 | "execution_count": 33, 338 | "metadata": {}, 339 | "outputs": [], 340 | "source": [ 341 | "for tokens in positive_tokenized:\n", 342 | " xy = tokens_to_vector(tokens, 1)\n", 343 | " data[i,:] = xy\n", 344 | " i += 1" 345 | ] 346 | }, 347 | { 348 | "cell_type": "code", 349 | "execution_count": 34, 350 | "metadata": {}, 351 | "outputs": [], 352 | "source": [ 353 | "for tokens in negative_tokenized:\n", 354 | " xy = tokens_to_vector(tokens, 0)\n", 355 | " data[i,:] = xy\n", 356 | " i += 1" 357 | ] 358 | }, 359 | { 360 | "cell_type": "code", 361 | "execution_count": 35, 362 | "metadata": {}, 363 | "outputs": [], 364 | "source": [ 365 | "# Perform random shuffle and create the data split for train and test" 366 | ] 367 | }, 368 | { 369 | "cell_type": "code", 370 | "execution_count": 36, 371 | "metadata": {}, 372 | "outputs": [], 373 | "source": [ 374 | "np.random.shuffle(data)" 375 | ] 376 | }, 377 | { 378 | "cell_type": "code", 379 | "execution_count": 37, 380 | "metadata": {}, 381 | "outputs": [], 382 | "source": [ 383 | "K = data[:,:-1]" 384 | ] 385 | }, 386 | { 387 | "cell_type": "code", 388 | "execution_count": 38, 389 | "metadata": {}, 390 | "outputs": [], 391 | "source": [ 392 | "Z = data[:,-1]" 393 | ] 394 | }, 395 | { 396 | "cell_type": "code", 397 | "execution_count": 39, 398 | "metadata": {}, 399 | "outputs": [], 400 | "source": [ 401 | "# Test the last 100 rows" 402 | ] 403 | }, 404 | { 405 | "cell_type": "code", 406 | "execution_count": 41, 407 | "metadata": {}, 408 | "outputs": [], 409 | "source": [ 410 | "Ktrain = K[:-100,]\n", 
411 | "Ztrain = Z[:-100,]\n", 412 | "Ktest = K[-100:,]\n", 413 | "Ztest = Z[-100:,]" 414 | ] 415 | }, 416 | { 417 | "cell_type": "code", 418 | "execution_count": 42, 419 | "metadata": {}, 420 | "outputs": [ 421 | { 422 | "name": "stdout", 423 | "output_type": "stream", 424 | "text": [ 425 | "Machine learning classification rate: 0.7\n" 426 | ] 427 | } 428 | ], 429 | "source": [ 430 | "model = LogisticRegression()\n", 431 | "model.fit(Ktrain, Ztrain)\n", 432 | "print(\"Machine learning classification rate:\", model.score(Ktest, Ztest))" 433 | ] 434 | }, 435 | { 436 | "cell_type": "code", 437 | "execution_count": 43, 438 | "metadata": {}, 439 | "outputs": [], 440 | "source": [ 441 | "# Set different threshold values by looking at the weight of each word" 442 | ] 443 | }, 444 | { 445 | "cell_type": "code", 446 | "execution_count": 44, 447 | "metadata": {}, 448 | "outputs": [ 449 | { 450 | "name": "stdout", 451 | "output_type": "stream", 452 | "text": [ 453 | "unit -0.6654115536910598\n", 454 | "bad -0.7530980513734542\n", 455 | "cable 0.7327088936910008\n", 456 | "time -0.655438206274543\n", 457 | "'ve 0.6890309629141371\n", 458 | "month -0.7354160723459667\n", 459 | "pro 0.5110164469382066\n", 460 | "sound 1.0284167296485074\n", 461 | "lot 0.7303453599700148\n", 462 | "you 0.9890614268534846\n", 463 | "n't -2.076753643831598\n", 464 | "easy 1.6668537424052763\n", 465 | "quality 1.5194027377291928\n", 466 | "company -0.5260024382677113\n", 467 | "item -0.9109250430895411\n", 468 | "wa -1.6313229127613524\n", 469 | "perfect 0.986642999168318\n", 470 | "fast 0.8901264571515678\n", 471 | "ha 0.6843005059287284\n", 472 | "price 2.7068565591464564\n", 473 | "value 0.567320022960696\n", 474 | "money -0.9689483718157904\n", 475 | "memory 0.9891753058871854\n", 476 | "picture 0.5836717714691185\n", 477 | "buy -0.8634162155707061\n", 478 | "bit 0.6022461095877204\n", 479 | "happy 0.6155667232771299\n", 480 | "pretty 0.7579533047223903\n", 481 | "doe -1.2796512008978298\n", 482 
| "highly 1.016011628872127\n", 483 | "recommend 0.6695073553233495\n", 484 | "customer -0.6553943268222613\n", 485 | "support -0.9050893406388059\n", 486 | "little 0.9707521309468207\n", 487 | "returned -0.7621062577476018\n", 488 | "excellent 1.3353095024737722\n", 489 | "love 1.2161006438692832\n", 490 | "home 0.5941637789618377\n", 491 | "week -0.7337195544879093\n", 492 | "using 0.6070028026246911\n", 493 | "laptop 0.5407051441572146\n", 494 | "video 0.5874050624543495\n", 495 | "poor -0.7759972517212054\n", 496 | "look 0.5244324095711806\n", 497 | "then -1.0796219610835414\n", 498 | "tried -0.804279251985134\n", 499 | "try -0.6625252016037377\n", 500 | "space 0.5814703505058424\n", 501 | "comfortable 0.6437374441148461\n", 502 | "hour -0.569256808466944\n", 503 | "expected 0.5852407130773088\n", 504 | "speaker 0.9706366980157677\n", 505 | "warranty -0.6224205849911729\n", 506 | "stopped -0.5407614907848408\n", 507 | "junk -0.5000688658821605\n", 508 | "returning -0.5326830881222238\n", 509 | "paper 0.6238682864052281\n", 510 | "return -1.1127370121422835\n", 511 | "waste -1.0132601986856526\n", 512 | "refund -0.6072124869094646\n" 513 | ] 514 | } 515 | ], 516 | "source": [ 517 | "threshold = 0.5\n", 518 | "for word, index in iteritems(word_index_map):\n", 519 | " weight = model.coef_[0][index]\n", 520 | " if weight > threshold or weight < -threshold:\n", 521 | " print(word, weight)" 522 | ] 523 | }, 524 | { 525 | "cell_type": "code", 526 | "execution_count": null, 527 | "metadata": {}, 528 | "outputs": [], 529 | "source": [] 530 | } 531 | ], 532 | "metadata": { 533 | "kernelspec": { 534 | "display_name": "Python 3", 535 | "language": "python", 536 | "name": "python3" 537 | }, 538 | "language_info": { 539 | "codemirror_mode": { 540 | "name": "ipython", 541 | "version": 3 542 | }, 543 | "file_extension": ".py", 544 | "mimetype": "text/x-python", 545 | "name": "python", 546 | "nbconvert_exporter": "python", 547 | "pygments_lexer": "ipython3", 548 | "version": 
"3.6.4" 549 | } 550 | }, 551 | "nbformat": 4, 552 | "nbformat_minor": 2 553 | } 554 | -------------------------------------------------------------------------------- /ID.py: -------------------------------------------------------------------------------- 1 | class Z(object): 2 | pass 3 | if __name__ == '__main__': 4 | z = Z() 5 | 6 | 7 | y = Z() 8 | print(id(z) == id(y)) 9 | print(z, y) 10 | 11 | -------------------------------------------------------------------------------- /NLProcLSAPy.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "from __future__ import print_function, division\n", 10 | "from builtins import range\n", 11 | "\n", 12 | "# Import NLKT and Numpy libraries\n", 13 | "import nltk\n", 14 | "import numpy as np\n", 15 | "import matplotlib.pyplot as plt\n", 16 | "from nltk.stem import WordNetLemmatizer\n", 17 | "from sklearn.decomposition import TruncatedSVD\n", 18 | "\n", 19 | "\n", 20 | "wordnet_lemmatizer = WordNetLemmatizer()\n", 21 | "\n", 22 | "#Load the file content into titles\n", 23 | "booktitles = [line.rstrip() for line in open('/users/GP/PycharmProjects/Python/LSA/all_book_titles.txt')]" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": 2, 29 | "metadata": {}, 30 | "outputs": [], 31 | "source": [ 32 | "# Load the stopwords content into stopwords\n", 33 | "stopwords = set(w.rstrip() for w in open('/users/GP/PycharmProjects/Python/LSA/stopwords.txt'))\n", 34 | "\n", 35 | "\n", 36 | "# Additional stopwords can be added in the program to address the problem\n", 37 | "stopwords = stopwords.union({\n", 38 | " 'introduction', 'edition', 'series', 'application',\n", 39 | " 'approach', 'card', 'access', 'package', 'plus', 'etext',\n", 40 | " 'brief', 'vol', 'fundamental', 'guide', 'essential', 'printed',\n", 41 | " 'third', 'second', 'fourth', })\n", 42 
| "def my_tokenizer(s):\n", 43 | " s = s.lower() # downcase\n", 44 | " tokens = nltk.tokenize.word_tokenize(s) # split string into words (tokens)\n", 45 | " tokens = [t for t in tokens if len(t) > 2] # remove short words, they're probably not useful\n", 46 | " tokens = [wordnet_lemmatizer.lemmatize(t) for t in tokens] # put words into base form\n", 47 | " tokens = [t for t in tokens if t not in stopwords] # remove stopwords\n", 48 | " tokens = [t for t in tokens if not any(c.isdigit() for c in t)] # remove any digits, i.e. \"3rd edition\"\n", 49 | " return tokens\n", 50 | "\n" 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": 3, 56 | "metadata": {}, 57 | "outputs": [ 58 | { 59 | "name": "stdout", 60 | "output_type": "stream", 61 | "text": [ 62 | "Display the number of errors from the file: 0 Total SCOL for the number of files: 2373\n" 63 | ] 64 | } 65 | ], 66 | "source": [ 67 | "# A wordl-to-index has to be created to build the word-frequency vectors.\n", 68 | "# Performing tokenization at this stage will save time not to retokenize later.\n", 69 | "\n", 70 | "word_index_map = {}\n", 71 | "current_index = 0\n", 72 | "all_tokens = []\n", 73 | "all_titles = []\n", 74 | "index_word_map = []\n", 75 | "error_count = 0\n", 76 | "for title in booktitles:\n", 77 | " try:\n", 78 | " title = title.encode('ascii', 'ignore').decode('utf-8') # Add UTF-8 to avoid any bad characters in the file\n", 79 | " all_titles.append(title)\n", 80 | " tokens = my_tokenizer(title)\n", 81 | " all_tokens.append(tokens)\n", 82 | " for token in tokens:\n", 83 | " if token not in word_index_map:\n", 84 | " word_index_map[token] = current_index\n", 85 | " current_index += 1\n", 86 | " index_word_map.append(token)\n", 87 | " except Exception as e:\n", 88 | " print(e)\n", 89 | " print(title)\n", 90 | " error_count += 1\n", 91 | "\n", 92 | "\n", 93 | "print(\"Display the number of errors from the file:\", error_count, \"Total SCOL for the number of files:\", len(booktitles))" 94 
| ] 95 | }, 96 | { 97 | "cell_type": "code", 98 | "execution_count": 4, 99 | "metadata": {}, 100 | "outputs": [ 101 | { 102 | "data": { 103 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYkAAAD8CAYAAACCRVh7AAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMS4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvNQv5yAAAIABJREFUeJzt3Xd8VMX6+PHP7G7aJoEACQFCCSCglIRAQAGVIsWCCoIClgt6kateUbyKgihivSoWFO/XK6Kg6EX80URRQZRiQSFIaNJbaEIghIQUkuw+vz9OshSztGyyJDzv12tf7J49OzNnwytPnpkzM0ZEUEoppYpj83cDlFJKXbg0SCillPJKg4RSSimvNEgopZTySoOEUkoprzRIKKWU8kqDhFJKKa80SCillPJKg4RSSimvHP6oNDIyUmJjY/1RtVJKlVsrVqw4KCJRZVmnX4JEbGwsSUlJ/qhaKaXKLWPMzrKuU7ublFJKeaVBQp2XHTt20Lx5878cHz16NAsWLPD6udmzZ/PHH3+UZtOUUj501kHCGPOhMeaAMWbtCceqGmO+M8ZsLvy3Suk0U5UXzz33HF27dvX6/vkEiYKCgpI2Syl1ns4lk5gMXHvKsRHA9yLSCPi+8LW6SLhcLu69916aNWtG9+7dycnJYdCgQUyfPh2AESNG0LRpU+Li4njsscf45ZdfmDNnDsOHD6dly5Zs3bqV5ORkrrjiCuLi4ujduzeHDx8GoFOnTjz55JN07NiRF198kfr165Ofnw9ARkYGsbGxntdKqdJz1gPXIrLEGBN7yuGbgU6Fzz8CFgFP+KBdqhzYvHkzU6dO5f333+e2225jxowZnvfS0tKYNWsWGzZswBhDeno6ERER3HTTTfTs2ZO+ffsCEBcXx/jx4+nYsSOjR4/m2WefZdy4cQCkp6ezePFiwOremjt3Lr169eKzzz6jT58+BAQElP1FK3WRKemYRLSI7AMo/Ld6yZukLlSzV+6hw8s/UH/EXPq8+wvVa9WhZcuWALRu3ZodO3Z4zq1UqRLBwcEMHjyYmTNn4nQ6/1LekSNHSE9Pp2PHjgAMHDiQJUuWeN7v16+f5/ngwYOZNGkSAJMmTeLuu+8ujUtUSp2izAaujTFDjDFJxpik1NTUsqpW+cjslXsYOXMNe9JzEGB/Ri6HcoXZK/cAYLfbTxo7cDgcLFu2jD59+jB79myuvfbUnsozCw0N9Tzv0KEDO3bsYPHixbhcrmIHzZVSvlfSILHfGFMToPDfA95OFJEJIpIoIolRUWU6F0T5wNh5G8nJd510TEQYO29jsecfPXqUI0eOcP311zNu3DiSk5MBCA8PJzMzE4DKlStTpUoVfvzxRwCmTJniySqK87e//Y0BAwZoFqFUGSppkJgDDCx8PhD4ooTlqQvU3vQcxO0ibeqjHNu78aTjxcnMzKRnz57ExcXRsWNH3nzzTQD69+/P2LFjSUhIYOvWrXz00UcMHz6cuLg4kpOTGT16tNc23HHHHRw+fJgBAwb49uKUUl4ZETm7E42ZijVIHQnsB54BZgOfA3WBFOBWEUk7U1mJiYmiM67Llw4v/8DGn+dy7PtxBNVqQljfsQDERITw84guZdKG6dOn88UXXzBlypQyqU+pC40xZoWIJJZlnedyd5O3P9+u8VFb1AXsX9c05O+vf8TnD7oZ9ME2ju1ZT0Rsc4b3aFIm9Q8dOpRvvvmGr7/+ukzqU0pZ/LJ2kyp/cjYs4bKoHK6Nhxd6H+OZnybz73/No1dCTJnUP378+DKpRyl1Ml2WQ51RQUEBz48ZyQu9sjEGBl4NgUe2E52b4u+mKaVK
mQYJdUafTZ1K9ZAjdGlmvQ50wKgbsnn26eH+bZhSqtRpkFCnVZRFjLnpKMYcPz7watiwbiVLly71X+OUUqVOg4Q6rVOziCKaTSh1cdAgobzylkUU0WxCqYpPg4TyylsWUUSzCaUqPg0SqlhnyiKKaDahVMWmQUIV60xZRBHNJpSq2HQynSrWtKkfs3RDFqGDz/xfxO0WClxLOXjwIJGRkWXQOqVUWdEgoYo1a8435OXlnfX5NpuN4ODgUmyRUsofNEioYjkcDhwO/e+h1MVOxySUUkp5pUFCKaWUVxoklFJKeaVBQimllFcaJJRSSnmlQUIppZRXGiSUUkp5pUFCKaWUVz4JEsaYR4wx64wxa40xU40xOvVWKaUqgBIHCWNMDPAQkCgizQE70L+k5SqllPI/X3U3OYAQY4wDcAJ7fVSuUkopPypxkBCRPcBrQAqwDzgiIvNPPc8YM8QYk2SMSUpNTS1ptUoppcqAL7qbqgA3A/WBWkCoMebOU88TkQkikigiiVFRUSWtVimlVBnwRXdTV2C7iKSKSD4wE2jvg3KVUkr5mS+CRApwhTHGaYwxwDXAeh+Uq5RSys98MSbxGzAd+B1YU1jmhJKWq5RSyv98squMiDwDPOOLspRSSl04dMa1UkoprzRIKKWU8kqDhFJKKa80SCillPJKg4RSSimvNEgopZTySoOEUkoprzRIKKWU8kqDhFJKKa80SCillPJKg4RSSimvNEgopZTySoOEUkoprzRIKKWU8kqDhFJKKa80SCillPJKg4RSSimvNEhcJMaNG0d2dra/m6GUKmc0SFwEXC6XBgml1HnxSZAwxkQYY6YbYzYYY9YbY9r5olx1dnr16kXr1q1p1qwZEyZMACAsLIzRo0dz+eWX8+KLL7J37146d+5M586d/dxapVR5YkSk5IUY8xHwo4hMNMYEAk4RSfd2fmJioiQlJZW4XmVJS0ujatWq5OTk0KZNGxYvXkxkZCTTpk3jtttuAyA2NpakpCQiIyP93Fql1PkyxqwQkcSyrNNR0gKMMZWAq4FBACKSB+SVtFx19t5++21mzZoFwK5du9i8eTN2u50+ffr4uWVKqfKuxEECaACkApOMMfHACuBhEcnyQdmqGLNX7mHsvI3sTc8hNG0jruVfs2LpUpxOJ506dSI3N5fg4GDsdru/m6qUKud8MSbhAFoB74pIApAFjDj1JGPMEGNMkjEmKTU11QfVXpxmr9zDyJlr2JOegwAHDh1mV5Zh/sbDbNiwgV9//bXYz4WHh5OZmVm2jVVKlXu+CBK7gd0i8lvh6+lYQeMkIjJBRBJFJDEqKsoH1V6cxs7bSE6+CxEh78A2Quq3xlXg4o7rr+Lpp5/miiuuKPZzQ4YM4brrrtOBa6XUOfHVwPWPwGAR2WiMGQOEishwb+frwPX5qz9iLgLkbF/Jgc+fpuagtwiMbogBtr98g7+bp5QqRf4YuPbVPImhwKfGmNVAS+AlH5WrTlErIgQRoWDpB3RqCnm/fOg5rpRSvuaTICEiyYVdSXEi0ktEDvuiXPVXw3s0QXavJiz/T756FGT/esyh7Qzv0cTfTVNKVUA647qcubllLSqvmcLLfXIJDYanb8qn5qZp9EqI8XfTlFIVkAaJcmbBggUUHNnFbYXj0//oIuz8YwUrV670b8OUUhWSBolyREQYM+pRRt+Yhb3wJxcSCE9cn8uzT3u9T0Appc6bBolyZMGCBaTt3+bJIorc21lY/tsvmk0opXxOg0Q5UVwWUUSzCaVUadEgUU54yyKKaDahlCoNGiTKgdNlEUU0m1BKlQYNEuXAmbKIIppNKKV8zSfLcpwrXZbj3HRoG8clwWu4Pv7M5077DdzVr2H2VwtKv2FKqTJVLveTUKWvRXwChw/FMHPPmc8NqA316jcq/UYppS4KGiTKgf++/5G/m6CUukjpmIRSSimvNEgopZTySoOEUkoprzRIKKWU8kqDhFJKKa80SCillPJKg4RSSimv
NEgopZTyymdBwhhjN8asNMZ85asylVJK+ZcvM4mHgfU+LE8ppZSf+SRIGGNqAzcAE31RnlJKqQuDrzKJccDjgNtH5SmllLoAlDhIGGN6AgdEZMUZzhtijEkyxiSlpqaWtFqllFJlwBeZRAfgJmPMDuAzoIsx5pNTTxKRCSKSKCKJUVFRPqhWKaVUaStxkBCRkSJSW0Rigf7ADyJyZ4lbppRSyu90noRSSimvfLrpkIgsAhb5skyllFL+o5mEUkoprzRIKKWU8kqDhFJKKa80SJSBHTt20Lx5c383QymlzpkGCaWUUl5pkDhPTzzxBP/3f//neT1mzBhef/11hg8fTvPmzWnRogXTpk37y+cmT57Mgw8+6Hnds2dPFi1aBEBYWBhPPPEErVu3pmvXrixbtoxOnTrRoEED5syZA4DL5WL48OG0adOGuLg43nvvvdK9UKXURU2DxHnq37//SUHg888/JzIykuTkZFatWsWCBQsYPnw4+/btO+sys7Ky6NSpEytWrCA8PJynnnqK7777jlmzZjF69GgAPvjgAypXrszy5ctZvnw577//Ptu3b/f59SmlFPh4nsTFYPbKPYydt5G96TnsX7+DD+evoHW0gypVqpCcnMyAAQOw2+1ER0fTsWNHli9fTlxc3FmVHRgYyLXXXgtAixYtCAoKIiAggBYtWrBjxw4A5s+fz+rVq5k+fToAR44cYfPmzdSvX79UrlcpdXHTIHEOZq/cw8iZa8jJdwEQeEk7Rr4+katiHPTv35+tW7eesQyHw4HbfXyx3NzcXM/zgIAAjDEA2Gw2goKCPM8LCgoAEBHGjx9Pjx49fHZdSinljXY3nYOx8zZ6AgSA87KrObJ2EXPnzKJv375cffXVTJs2DZfLRWpqKkuWLKFt27YnlREbG0tycjJut5tdu3axbNmyc2pDjx49ePfdd8nPzwdg06ZNZGVllfzilFKqGJpJnIO96TkA5CybCsZBSJtbceflYA+tSs2aNenduzdLly4lPj4eYwyvvvoqNWrU8HQVAXTo0IH69evTokULmjdvTqtWrc6pDYMHD2bHjh20atUKESEqKorZs2f78jKVUsrDiEiZV5qYmChJSUllXm9JdXj5B1L2/knaB/cgQLXBk7GHhBMTEcLPI7r4u3lKqQrOGLNCRBLLsk7tbjoHw3s0If/36dx2uZverYWcpBmEBNgZ3qOJv5umlFKlQoPEObiyThC5q7/h2V75PH9LHjnJX/LkNbXplRDj76YppVSp0CBxDl5/9d8MaCfUjYQG1aH/FYbNC/6yCZ9SSlUYGiTO0sGDB5kw4V1G3nDMc2zUjbn8553xpKWl+bFlSilVejRInKXXX/03t13upm7k8WMNqkOv1m7GvfGq/xqmlFKlSIPEWSguiyii2YRSqiLTIHEWissiimg2oZSqyDRInMHpsogimk0opSqqEgcJY0wdY8xCY8x6Y8w6Y8zDvmjYheJ0WUQRX2UT7du3L9HnlVLK10o849oYUxOoKSK/G2PCgRVALxH5w9tnysuM64MHD9LkkrqsfC7ntEECYNsBaDPGyeZtu6hateo51eNyubDb7SVoqVLqYlAuZ1yLyD4R+b3weSawHqgQs8tmzphBemYurZ8JpvK9NioNNlQabKj6jwCi/hlM+N+hyhAHlQYbEkbZSM/MoX379me1SdCiRYvo3Lkzt99+Oy1atACsTYeKvPrqq7Ro0YL4+HhGjBgBwPvvv0+bNm2Ij4+nT58+ZGdnAzBo0CAeeughT91Fy4grpVSJiYjPHkAskAJUKua9IUASkFS3bl0pD1wulxw4cEAOHDggGzdulAMHDsjOnTulSZMmsmHDBgFk6tSpcuDAAbnuuuukY8eOkpubK8nJyRIfHy8iIu+99548//zzIiKSm5srrVu3lm3btsnChQvF6XTKtm3bPPWFhoaKiMjXX38t7dq1k6ysLBEROXTokIiIHDx40HPuqFGj5O233xYRkYEDB0rfvn3F
5XLJunXrpGHDhqX/5SilyhyQJD78nX02D5+tAmuMCQNmAMNEJKOYYDQBmABWd5Ov6i1NNpuNqKgoAP7zn/8wa9YsAPbt28fhw4cJDAykX79+GGNITEwkKCiIoKCgs9okKDAwkLZt2xa7WdCCBQu4++67cTqdAJ7uq7Vr1/LUU0+Rnp7O0aNHT9pTolevXthsNpo2bcr+/ftL7TtRSl1cfBIkjDEBWAHiUxGZ6Ysy/a3LTbexJ/wy8uq0JTRtI67lX7Ni6VKcTiedOnUiNzeX/Px8Dh06RGRk5FltEjR58mSSkpLo3r07ixYtIjQ0tNi6RcSz+dCJBg0axOzZs4mPj2fy5MmevbEBT91Fn1dKKV/wxd1NBvgAWC8ib5S8Sf43e+UeVuw8TFpWHgIcOHSYXVmG+RsPs2HDBn799dezLuvUTYLS09PPuElQ9+7d+fDDDz1jDkW31mZmZlKzZk3y8/P59NNPz+/ilFLqHPgik+gA3AWsMcYkFx57UkS+9kHZZeL555/n008/pU6dOkRGRrL0SGVcbiFv/1YCo+phbAFk71zDLW3rU692zEm7zY0dO5aFCxeyZ88e7rrrLgC+/PJLcnJySEhIoFq1ajRr1oxWrVp5ftkXZRneXHvttSQnJ5OYmEhgYCDXX389L730Es8//zyXX3459erVo0WLFmRmZpbel6KUUuimQyQlJTF48GCWLl1KQUEBrVq14lDtqzm2fysFW38mrFYDjhxKI7r/iwRWjeGqvdNo1aoVw4YNIzY2lnvvvZdRo0bx8ccf8/DDD3P48GEOHz5MREQExhgmTpzI+vXref311z3dTe+8846/L1spVQ6Vy1tgy7uffvqJm2++mZCQEMLDw+nYsSOZSbNwpe2ieW3g8HZsweEEVI1h33t/p1evXixZssTz+QEDBuByuRgwYAA2m/V17t69G6fTSdOmTRk7dizr1q3z09UppVTJXDR7XNesWZMjR47gdru5osv15F/9IL881QObzY4RNxMnTqR9+/b89NNPFGSlU3D0MKnVIMhA2qEU9n/8CAVHDzF//nzAmufw559/MmzYMLZs2cKqVas83UlDhw7FbrcjIhQUFPDHH9a8wsWLFzN37lyWLFlCo0aNmDJliucOJqWUuhBV+O6movGGyMhI6tSpAxExfPbeOAKqxZB/MAVHtboYu42g3HTcBXk4HA4yMqw7eGOqwMFMOFYA9Zs0Z/e2jRQUFHDppZdy9OhRdu3ahdPpxG63U6lSJfbu3cvatWtJTEwkJyeHNm3aUKlSJRYuXMjll19OcHAwmzZtol+/foSEhBAdHc3QoUNZtGgRr732Gl999VWZfCdKqfJJu5t8aPbKPcQ/+C7PvzOJiDve5NDRPKZNm8ZnE8aBuAio3gAAZ+MrsIdWIzsnh/z8/JNuJc3IgbzCMebtG9eSn5+P0+kkIyOD3bt3Y7fbyc/Px2azERgYiM1mo1evXlxxxRXYbDYyMzP58ccfcTgcNG7cmJSUFFwuF/Pnz+fTTz/Vbiil1AWvQgaJgQ+P4p6hj7N20khcmYf4bcxNbFjzO42atcTYrB62nPWLAMhcPhubsxIBMU3Jz88nNTXVU86xAhDAYQMDBAQE0K1bN44cOYKI4Ha7ERGcTidutxu3283mzZtZtGgRbrebTZs2eWYtrl+/nkOHDlGzZk127NhBbm4uKSkpnroyMjJo3bo1VatW5b777sPtdpfhN6aUUsWrkEFicUYkOfs24s4+gjv3KIgL3C62bt2CsVmT1AzWL39bYAjZ65dwbMdKz+cdNutxWS0IDYQCNwQ4oCA/n/nz59OwYUMCAwOpXbs2brcbl8vF/v37ERECAgL43//+Z9VhDE6nk+joaJKTk8nPz2fGjBkkJCRQvXp1li1b5qlz2bJl9OvXj/79+7N161ZmzqwQcxKVUuVchQsST81eg0Q2IO/PrYABEZxNu4Cx4crJxLis/qPKIfDPbuDKPgLiBpu1CmslpxUUCtyw
Zhdk5VkBw+W2sors7GzWrVtHXl4eu3btwu1207lzZ4KDgwFrDsTf/vY3jDFUrlyZgoICdu/ejTGGgIAAmjVrxs8//0xKSgrp6elMmjSJO++8k6CgIDZv3ozNZmPAgAH89NNPfvoGlVLquAoVJJ6avYZPfk3B2B2IuDEIgXYhZ/MvmABrrKF3714EB8LRY7BiB4QEYr3ndmMM3BBvlVU1FCKcYLeB3Q43t7ayD4CYGGuR2zp16hAeHs6MGTPIyMggMDCQyZMne7qh0tPTCQkJAazVYNu3b88DDzzAlVdeyTfffAPAM888wzvvvEN8fLznLiig2GU5lFKqrFWYIDF75R5eHngNu965k51v3EpAfgahQRBTFWzuY4jLBcCCb7/k0eugcU24vCFUcYLk5QJCtVBo18gqLz0bClwQEgCxkdD5MqsssPaZMMYQExNDbGwswcHBGGPIz8/n559/JiIigujoaESEAQMGAFC7dm0WL17M6tWr+f3337nppptwuVx06tSJiIgIli9fTteuXRERpk2bxpVXXumHb1EppU5WYYLEmDnrcGUcBFsAQSaP61taWUCEE3C7wZVHYFAwuI9xZ+Hv39fugB1vWd1J8XUhIxcen2q9Vy0MqoRBTj6kHIR/fQphQRAeDFlZWYgIW7duZevWrWRlZdGwYUMA/t//+3/k5OSQk5ODiNC4cWOMMYSGhhIWFsa6deswxtCgQQPsdrsnY2jXrh1ffvklU6dOpX79+vTu3dsP36JSSp2swgSJTf97FhDsWanYxMXrd8CIm6xf6nWqWefk5+USGwUtR0KtCPh1MzzyidXltCoF7AZyrXX4MAZ2HrTGInLyId8FG/ZBZu7xOg8dOkR2djYi4rmTKT09nd69e3vGKEaPHo2IsHfvXtLS0oiKiqJhw4bcd999tGvXjkWLFtGiRQvmzZtHaGgot99+O//97389s7eVUsqfKsxvopxN1sqsdhv0b2/tOz20O6zaBQ92szIDERjdGy6pARGh8NDHMOEHyDoGVzWxgkURR1gtbCGVrReFg9oEBEPg8RnS77//vud5o0aN6N69OwDfffcd3bp1A+Cjjz7Cbrdz7bXX4nQ6mTx5Mps2bcJms2G32xkzZgzt2rWja9eutGrVqhS/IaWUOg9lvcuRiNC6detz3I+peKGhoTLr990SULWWYN18JCEByNY3EfnUevy7HxIWjNgNEmBHGlS3zouujOczvnoEBgZ6nttsNgGkYcOG4nA4pE6dOmKMEafTKQ6HQ2rUqCF16tSRTz75RJo0aSLh4eGyZcuWYq/zxRdf9Mn3pZQq3/DDznTlOpPIysrigf/MIT9tLwA2Ay3rWVlEkaHdrQyhaph1W+v+I9bxon+dhdlDkA9WscrLywOsvaqLJsNt3boVu91OvXr1EBGys7MJCAggKiqKXbt28eWXX9K3b1/q1KnD+PHjSUpK4qGHHjqp3JdeeqnkjVNKqfNR1lFJfJBJ/Pjjj8X+JR8cgEQ4kaubIAtHIZ0vQ5rVRmpG+D5r8NXDGCOBgYHidDqlWrVq0qpVK2natKkEBgaKiMgtt9wixhiJj4+X22+/XUREpkyZInFxcRIcHCxDhgyRlJQU6dOnT4m+U6XUhQ8/ZBLlMkj4+xd7eXkUdX8ZY8Rms4ndbpfo6GiJiooSY4x88MEHIiLSpk0bCQ8Pl5iYGElNTZU333xTsrKyJDQ0VEREnnnmGU8w+/nnn+Xvf/+7TJo0SaKiouTxxx8XEfF8pqS2b98uzZo1K3E5SlVE/ggS5bq7SZ1eUfeX3W7H7XZTu3Zt7HY7ERERiAh//PEH7dq1448//mDcuHE4HA5EhHHjxjF9+vST1o8Ssdaq+umnn5g4cSKxsbG0bduWV155BYBx48Z5tls9F2fapU8p5V/lbqnwbm8sYsGjnX3cIlUSNpvtpIBijMEY4zlW9H7Dhg15/PHHeeSRR8jPz8ftdlO9enVq1KjBpZdeyubNm8nMzCQzM5M9
e/b463KUumDpUuFnEDtiLpsPZPm7GeoURX9oOBwO7Ha7JyMBCA4OJiwsDIAGDRrw8MMPExQUROfOndm7dy8pKSn07t2bxYsXs3z5cr744gvS0tLYvn27365HKXWcT3amM8ZcC7wF2IGJIvKyL8o9Uf0Rc31dpPKRooBQUFDgWfk2MzOToKAgXC4XV111Fd9++y0rV65ERMjJyeGBBx7g0UcfJTk5mV27dpGRkUHLli3Jy8vD5XKxefNm6tev7+crU0qVOJMwxtiB/wDXAU2BAcaYpiUt91Rl3ymmzofdbueSSy456VhAQADGGDIyMnC5XERGRjJ37lyio6NZtWoVnTt3xhhDcnIyX3/9NY0bN/ZMTFRK+ZcvupvaAltEZJuI5AGfATf7oFyP2BOyiJ2v9PRl0crHjDFs2LCB4OBg8vLycLvdpKSkICJER0dTUFBAeHg4R48epWrVqrjdbipVqoTb7SY/31oTJS8vj6ws7VZU6kLgiyARA+w64fXuwmM+t/OVnugK2heeE5c1z8/PJycnh5CQEJxOJy6Xy7Oo4SOPPEJAQAC7d+/ml19+4YUXXqBdu3bUrFmTgIAAWrVqRffu3dmzZ4/e9aTUBcIXQaK4X9t/6R0yxgwxxiQZY5JO3CL0fIQFnfmcYJ+Mtlw8HA4HxhgCAwMREeLi4ggJCcEYQ6NGjQgNDaVJkyb06dOHiRMnIiLs27cPm83G8OHDufPOOxERMjMzERFWrVpFdHQ0+/btY/LkyVx//fVs3bqVd999l4yMDHbs2EF2djbLly/n5ZdfJi8vjzVr1rBp0yaysrKoXLmyv78SpRS+GbjeDdQ54XVtYO+pJ4nIBGACWLfAnmslO1/pSaUQGHgVvL/w9OfWjIB96edaw8XLGMMll1zChg0byMvLIygoiKZNmzJ27Fh+/fVXpkyZAkDr1q05cOAAd955J+np6bRq1Qqn08mMGTP4+uuvAejZsyfp6emkp6fjcrno3r07ERERpKWlcfjwYV5//XV/XqpS6hz5IkgsBxoZY+oDe4D+wO0+KPcvROCFW62VXicutHaXq1EJ9mdY79tt1l7UPz1z8vpNLjc0GAYph0qjVRcuu92Oy+XyZAjHjh0DrHkLDRs2ZPPmzYSGhhISEsKWLVsICAigS5cufPfddzRu3Jg333zT88sfYMWKFZ7njz32GE6nk0WLFtG4cWPP8UWLFpXZ9SmlSl+Ju5tEpAB4EJgHrAc+F5F1JS33REVZxOM9rT2oR94ErsJc5HC2lTnE17VEBVKCAAAYmElEQVR2m+t3xckBAqzg8coAa2+JxjVgwUgIDrAegTYIdMC4uwr3nijcfS44AL4bAdUrWe+XJ0VjBAEBAYB1i2pRgABrl7wtW7YA1iKJR44cISQkhIYNGxIYGEjdunUZPnw4W7ZsOSkAnGj8+PGnfV8pVTH4ZDKdiHwtIo1FpKGIvOiLMv9aBzzUw3pevTL8o4s1NnGscHzzj72wNw2e9rKh262XQ5VQqB8FHRpbGUe1cGuriOkPW+UVBYMqodAjDrq2gC1vFB53BBBYuxnhbXpT/cp+hNS8hKDazag1+D3slaLAEUTMfR8AEBJVF+yB1n4UtgBsBggMwxFREwC783h/e7Vq1TzP7777bs9sZWMM4eHh2Gw2HA4HTqeTpk2tO4sTExNxOBwEBQWddB5YYwv/+Mc/PGMJYE1oS0lJITAw0LNL3rBhw3A4HISHh+N2u8nJyaF79+5UrVrVBz8tpVRFccHPuDbGnJRFFDkxmzh01NpUqE61v2YRRYqyiaVb4I2vod0l8PF9EBQA18VDcCCMvsXa2/pYPvy7n/W58BAYcSM47fnU6PcCYXHdSFu3hGp9xlDjjleQglxcGanY3cdwZR3BBIVyLPMwuPKwR9SgUqid5rUh2HaMkEvaAiBuN0Xj/RkZGdjt1qZG33//PaGhoQQFBXkGgd1uNwUFBRQUFNCrVy/A6vZxu9243W5rAS6bzRMk3G43
q1atsuopfK9q1arUqVOH2rVrExoaypIlS9i/fz9ut5vQ0FDPuk5vvfUW6enppKSk+PAnqJQqzy74IAEnZxFFTs0mDLD1AOw+zbjDrZdbe16/OAee6wudm0LTGJhmbWrHkM5W1lC7Klx2wk28D/Ww9sE+OP0pcLuIuOou/pzyKLvfuYODX71BWGgAdapC+uzRhNSLp8FNDxIU4iSq3S1EOIV5I8C48zm6ej7Y7ATVaUb121/CHlaNAjG4XG6MMYSEhJCQkODJHho0aEDTpk0xxpCXl8fs2bMBqFu3Lk6n0zOvIDQ0FKfTicPhwGazsXnzZmw2G/fffz8nrs0VGRlJbm4uV199NbNmzcLpdLJy5UpsNhuBgYHExcWxdu1a6tWrp3cXKaWACzxIFGURw0/JIoqcmk1EhMLjU2Hd7uIfG/bC4M7W7bFtG1r7WL9wKzw3yxrcLsomDmRA1gl7WYcXZjLsW0fekndxNumAM6AAR/5RwlvfRJCtgIVPgSnIo9b1/2TsiH+Sk3WUKptm8sqtx6gRAfd3Bac9j+DYBKp2u5/gmk2o/c+PCLnkckAwNjs2m41Vq1bRrFkzRITg4GDPcr3t27cnPj4egNzcXJ588knq1atHYGAgtWvXJjs7m+3btxMTE+MZVG7dujUBAQHExcVRUFDAb7/9Ro0aNfjyyy8JCQkhOjqaWbNmERYWRs+ePVm9ejXx8fFkZmZSr169Uv/5KqXKgbJem1zOYT8JsLYePfI+nu1IT30MuxYJC7L2T6gVgQQ6kEY1kKYxSNVQJKYKcmktazOixjWQJjWR0CBkW+EWp+5PkKuaIJ88YL3OmYREhiMP9Ti5noyJSJVQJCLMLpWvvF3iGwbLv26wS0TVSvLOQOuc+3s45Obb/yYiIvPmzZPL6oVKwRTrvQPvIs5AJMgZIlF9nxFHZF2xV44WW2hVMUFOCYq5TBwBgRIZGSnr1q2TkJAQCQoKkrCwMAkNDZVBgwaJiIjD4ZCYmBjp1KmT1KtXTwIDA+W7776TTp06SVBQkNhsNrnsssukQYMGsnz5cmnYsKFUqVJFnE6nxMXFSZUqVSQ1NVVGjRolISEhYoyR8PBwCQkJkbi4OImIiJAnn3zyrH4+ZWXlypUyd+5cfzdDKb9D95M4zttYxKlOzSYiw6Fbc1j3Kky8F3LzYe3L8Oj1Vvaw4TVrrsW7C4rqgTF9Ts4mnu4NU376azbx2A1wWQ0XJH/O2Ftz6dHchT0vg8GFK5c/fWMBS776nH379jHmqUcZfWMW9sJvOKoSDOkCTpOD62galS9th2QeIKLjXUh+HlG9R9H2uW/YuHEjqampNGrUiOzsbLZs2UJoaCgdO3YEICYmhuTkZBYuXMiOHTvo3r07drudhQsXkpubS506dViyZAlbt24FrDuZ0tLSyMrKYtWqVaSlpREZGckLL7xAdnY2brebjIwMsrOzWbVqFYcPH+bFF0vl3oPzVrSm07nQGdtK+cYFGyQA3AIP9zj9OX+500ngox+tsYmbW1t7W4/6HIb2gG9Xw8a9MOImmLgIDhTuc925KURXgs+WWq+HdLYGup/8/OS6hnaHLfvB5i4gPBjemgfP9rUGvwFqVoG/XenmwfuHkH5gO7defvLnn7zZGl/J+XECOStn0/8KoWD9d9hDI9j/+dMsf+PvdOvWjaCgIBISEmjWrBn33HMPHTp08Hr9gwYN4r777qNly5bk5OSc9Xd7Lj7++GPi4uKIj4/nrrvuYufOnVxzzTXExcVxzTXXeAa6Bw0axP3330/nzp1p0KABixcv5p577uGyyy5j0KBBnvLCwsJ49NFHadWqFddccw1FM/A7depE0T4jBw8eJDY2lry8PEaPHs20adNo2bIl06ZNIysri3vuuYc2bdqQkJDAF198AcDkyZO59dZbufHGG3WBQKV8paxTFzmL7qYnn3xSwoKQ+lHIwKvO/OiZgNhtVpdTUABSqwryQFerm2fmMKRaGFLwMfJ8X+TO
DtbxB7oiw2843p30/ZNI45p4uofe+pvVvXT0g5O7nV68DWl3CdKuEVKnGpI7+eT3d49HKjuNTH2w+O6xx3ta5fZrZ5PDE5DQYCO1hkyQek98Je3//f255Z5lYO3atdK4cWNJTU0VEZFDhw5Jz549ZfLkySIi8sEHH8jNN98sIiIDBw6Ufv36idvtltmzZ0t4eLisXr1aXC6XtGrVSlauXCkiVjfiJ598IiIizz77rPzzn/8UEZGOHTvK8uXLRUQkNTVV6tWrJyIikyZN8pwjIjJy5EiZMmWKiIgcPnxYGjVqJEePHpVJkyZJTEyMHDp0qJS/FaX8Az90N12Q08SGDBnCSy+9xNFU2H6OyzwF2GHvYZiwEO7tAk1qWncsjfocRt4Ml/zreDYRP9LqQqpe+eRs4o4OVjbx3Ezo+hJcG3+8/Mxc+G2r1f300m3Hs4gi63ZDtTD5SxZR5N7OMH4+vNDXTUQoPNxDeHfR/1Hptn8zvEeTc7vYMvDDDz/Qt29fIiMjAahatSpLly5l5syZANx11108/vjjnvNvvPFGjDG0aNGC6OhoWrRoAUCzZs3YsWMHLVu2xGaz0a+fdY/xnXfeyS233HJObZo/fz5z5szhtddeA6yB/KJsplu3bjrXQykfuiCDRL169ZATbt3cu3cvLVu25MSFAYPsYIrpLCtwQUiANdbQ8fnjxyf/aC3jUS0MXpgNbw+Em1rBa3Ph1duPj030egMenGy9PpoLm/fDzh+staAGXgnTfrNmdX+/Dv7e6eS6RWDMTHjxNjxjEaf63y9wSyJcUsN6PfwGeGveKh5KDKNXQqksnnvOZq/cw9h5G9mbnoNZt5HW0fbTnn/iKrBBQdbqizabzfO86LW3cYKizzscDs+Wp7m5ucWeC1b2O2PGDJo0OTmo/vbbb4SGhp62rUqpc3NBj0kUqVWrFgcOHDgpBfr5tySatYgnumZNgkLCOFZgDVLnFViPY/nWwHPRI+0ofLAIdh6EWcuh/sMwOwl++AM27bMeMVWsCXnP9YHV/4Ztb8Lu8fDpA9ZM7QmD4YGusGIHPHPLX7OI79ZAehZes4j0LCuLGNPn+LGibOLb958rra/vnMxeuYeRM9ewJz0HAXKrN+WLWTP4eOFaANLS0mjfvj2fffYZAJ9++ilXXnnlOdXhdruZPn06AP/73/88n4+NjfWsD1X0PkB4eDiZmZme1z169GD8+PGePyRWrlx5fherlDqjCzKTOButW7cmaUXyScdyc3PJz89n9erV5Ofns3jxYj75ZAoH9u3EiAu3+3h2UvR0T5rVpXTiPhVLt0D9E2ZuC9ag+A2vWdnCoUzvWcT1LeGXTcW3+aMlcH388SyiyPAboO7D37Nly5a/7OpW1sbO20hOvoucHcnIwS04E/tS6YrbuG/AjbweXYmEhATefvtt7rnnHsaOHUtUVBSTJk06pzpCQ0NZt24drVu3pnLlykybNg2wFg287bbbmDJlCl26dPGc37lzZ15++WVatmzJyJEjefrppxk2bBhxcXGICLGxsXz11Vc+/R6UUhZzYrdOWUlMTJSiu1jKitvtJi8vj9zcXHbv3s2WLVt4euQj4C4g99gx8vPzTjo/OyuLEEcBlUKscQ6AFnVgVYo1RnFqkMh3wf0fQs4Jxew/Ajn51rhIgQvW7ILkl/4aJMAaM1lb0I0vvprv2ws/R/VHzMUtbo5MHkxO+iGiBn+II7waBtj+8g0+qSMsLIyjR4/6pCylLibGmBUikliWdZbbTOJc2Ww2goODCQ4OJiIigubNm3vWQjrV2rVr6dqxLX+8WkBo8PHjbjckPg0zllkPsIJCbr61KKBbIDXTWsIjwG4dT6gHi562BsEviS4+QMCFk03Uighh06/fEePMoHOC4bNlUwm75kFqRYT4rU1KKf+5aILEuXh73Ou43QVcP64yBQUFHM3KOmkgOvyE35fBgbB2F1SLCCUk0M6fR47SoWNXqlWLJGXldBaNyvOMRSwd
473OiFAY2t3No8Me8Gs28Wi3Rgwedztj/5ZLq1iY9NgPRHS4k+E9WvqsDs0ilCo/NEgU4+lnnuOOuwYB4HK52LlzJ6frlrPZbMTFxeFwWF9nnTp1aJPQlNHX5bH9AIyfZy097i2LKHIhZBPubb8SG57JdfHWOM09ndzs/PMbeiXc4Zf2KKX866IZkyhL27dvp/s1V1GQfwy3WziQeoj+7SA28syfnbEcLknowcwvvi39hp7C7XYT36whr9y4g+sLE4c/06HpyGDWrt9KrVq1yrxNSqnj/DEmUS5ugS1v6tevz+Ztu9m+K5X1m1Pofu2N/LTZgVs446NWFdi1a49f2j1zxgxC3Ae57oTJgzUi4O6r3bzy0rN+aZNSyr80kygDXTtfyQ+Lf8bubYbdCUQEh8NBauohwsPDy6B1luKyiCKaTSh1YdC7myqo+d8vweVynfX5xhjP+EZZKS6LKHJiNvHWO++VabuUUv6lmYQ6bRZRRLMJpfyv3I1JGGPGGmM2GGNWG2NmGWMifNUwVXZOl0UU0bEJpS5OJR24/g5oLiJxwCZgZMmbpMqS2+3m2dGPM+bmoyctTVKc4dflMWXKx+zdu7dsGqeU8rsSdXyLyImzvn4F+pasOaqsLVy4kLUbdvCcqczzZ7H8UUFBFhPe+y9jnr0wFiRUSpUuX46O3gNM8/amMWYIMASgbt26PqxWlcRVV13F0qVLTztZ8FSNGzcuxRYppS4kZxy4NsYsAIqbKzxKRL4oPGcUkAjcImfx20YHrpVS6txdkLfAikjX071vjBkI9ASuOZsAoZRSqvwoUXeTMeZa4Amgo4hk+6ZJSimlLhQlvbvpHSAc+M4Yk2yM+a8P2qSUUuoCUdK7m/y7jZpSSqlSpQv8KaWU8kqDhFJKKa80SCillPJKg4RSSimvNEgopZTySoOEUkoprzRIKKWU8kqDhFJKKa80SCillPJKg4RSSimvNEgopZTySoOEUkoprzRIKKWU8kqDhFJKKa80SCillPJKg4RSSimvNEgopdQFKCkpiYceesjfzSjZznRKKaVKR2JiIomJif5uhmYSSilVlrKysrjhhhuIj4+nefPmTJs2jeXLl9O+fXvi4+Np27YtmZmZLFq0iJ49e3o+c8899wBcZoxZaYy5GcAYM8gYM9MY860xZrMx5tWieowx1xpjfjfGrDLGfF94LNQY86ExZvmJ5ZyOTzIJY8xjwFggSkQO+qJMpZSqiL799ltq1arF3LlzAThy5AgJCQlMmzaNNm3akJGRQUhIyEmfefHFF+nSpQuTJk1aD3QFlhljFhS+3RJIAI4BG40x44Fc4H3gahHZboypWnjuKOAHEbnHGBNRVI6IZHlrb4mDhDGmDtANSClpWUopVdG1aNGCxx57jCeeeIKePXsSERFBzZo1adOmDQCVKlX6y2fmz5/PnDlzAJoCi4BgoG7h29+LyBEAY8wfQD2gCrBERLYDiEha4bndgZsK/7DnhHLWe2uvLzKJN4HHgS98UJZSSlU4s1fuYey8jexNz6FWRAjPTfoSszuZkSNH0r17d4wxp/28iDBjxgwuvfTSP0TEM1BhjLkcK4Mo4sL6vW4AKaYoA/QRkY1n2/YSjUkYY24C9ojIqpKUo5RSFdXslXsYOXMNe9JzEGDnrt28MG8bYc0689hjj/Hrr7+yd+9eli9fDkBmZiYFBQUnldGjRw/Gjx/veW2MSThDtUuBjsaY+oXnF3U3zQOGmsKodBblnDmTKOz3qlHMW6OAJ7HSlzMyxgwBhgDUrVv3DGcrpVTFMHbeRnLyXRQcOUDuxiXYI+uz/f9N4o6P7DSNqcK7776LiDB06FBycnIICQlhwYIFJ5Xx9NNPM2zYMICmxpi1wA6gp7c6RSS18HfuTGOMDTiANSzwPDAOWF0YKE5bDoARKS4jOTNjTAvgeyC78FBtYC/QVkT+PN1nExMTJSkp6bzqVUqp8qT+iLkIcPSbVzi0+kdq3v02gdUbYIDt
L99wTmUZY1ac2N1UFs67u0lE1ohIdRGJFZFYYDfQ6kwBQimlLia1IkLIP7yXY1t/46nekLd0sud4eaDzJJRSqhQN79EE17JPGNbdxcgbwbV3LSZtJ8N7NPF3086Kz4JEYUahcySUUuoEzcNzyNv2G49e58IZBKNuzCdm8+f0Sojxd9POimYSSilVil58dhQPd3cREWq9fuAaYeuaZaxaVT5uCtUgoZRSpWTLli18+eUcHu5+/JZWZxA8fl0uz41+wo8tO3saJJRSqpS8+OwohnYr8GQRRf7RRfjl5yXlIpvQIKGUUqWguCyiSHnKJjRIKKVUKfCWRRQpL9mEBgmllPKx02URRcpLNnHeM65LVKkxqcDO8/x4JFDRb7Wt6NdY0a8PKv416vWdRkggDf51PREv3MppV+7LPga1/okcyWE9kHMWRdcTkajzbdf58EuQKAljTFJZT0svaxX9Giv69UHFv0a9vtN+toYx7Gl3CdnhIbjPdP7qFIIzcph9NFf6nU99pU23L1VKKd86JEKvXzYTcA6f2VxqrSkhDRJKKeVDIpIPfOnvdvhKeRy4nuDvBpSBin6NFf36oOJfo17fRaLcjUkopZQqO+Uxk1BKKVVGylWQMMZca4zZaIzZYowZ4e/2+JIxpo4xZqExZr0xZp0x5mF/t6k0GGPsxpiVxpiv/N2W0mCMiTDGTDfGbCj8Wbbzd5t8yRjzSOH/z7XGmKnGmGB/t6mkjDEfGmMOFO74VnSsqjHmO2PM5sJ/q/izjf5UboKEMcYO/Ae4DmgKDDDGNPVvq3yqAHhURC4DrgD+WcGur8jDwHp/N6IUvQV8KyKXAvFUoGs1xsQADwGJItIcsAP9/dsqn5gMXHvKsRHA9yLSCGsHzgr1R+m5KDdBAmgLbBGRbSKSB3wG3OznNvmMiOwTkd8Ln2di/XIpHwvOnyVjTG3gBmCiv9tSGowxlYCrgQ8ARCRPRNL92yqfcwAhxhgH4MTasrhcE5ElQNoph28GPip8/hHQq0wbdQEpT0EiBth1wuvdVLBfokWMMbFAAvCbf1vic+OAx+HME4zKqQZAKjCpsEttojHGy8o95Y+I7AFeA1KAfcAREZnv31aVmmgR2QfWH3BAdT+3x2/KU5Aobnp7hbs1yxgTBswAholIhr/b4yvGmJ7AARFZ4e+2lCIH0Ap4V0QSgCwqUDdFYb/8zUB9oBYQaoy507+tUqWtPAWJ3UCdE17XpgKkuicyxgRgBYhPRWSmv9vjYx2Am4wxO7C6CrsYYz7xb5N8bjewW0SKMsDpWEGjougKbBeR1MIJYzOB9n5uU2nZb4ypCVD47wE/t8dvylOQWA40MsbUN8YEYg2YzfFzm3zGGGOw+rLXi8gb/m6Pr4nISBGpLSKxWD+7H0SkQv0VKiJ/AruMMUU73F8D/OHHJvlaCnCFMcZZ+P/1GirQwPwp5gADC58PBL7wY1v8qtwsyyEiBcaYB4F5WHdVfCgi6/zcLF/qANwFrDHGJBcee1JEvvZjm9S5Gwp8WviHzDbgbj+3x2dE5DdjzHTgd6y78VZSAWYmG2OmAp2ASGPMbuAZ4GXgc2PM37GC463+a6F/6YxrpZRSXpWn7iallFJlTIOEUkoprzRIKKWU8kqDhFJKKa80SCillPJKg4RSSimvNEgopZTySoOEUkopr/4/PJcxSovGpx8AAAAASUVORK5CYII=\n", 104 | "text/plain": [ 105 | "" 106 | ] 107 | }, 108 | "metadata": {}, 109 | "output_type": "display_data" 110 | } 111 | ], 112 | "source": [ 113 | "if error_count == len(booktitles):\n", 114 | " print(\"There is no data to do anything with! 
Quitting...\")\n", 115 | " exit()\n", 116 | "\n", 117 | "# creation of input matrices\n", 118 | "\n", 119 | "def tokens_to_vector(tokens):\n", 120 | " x = np.zeros(len(word_index_map))\n", 121 | " for t in tokens:\n", 122 | " i = word_index_map[t]\n", 123 | " x[i] = 1\n", 124 | " return x\n", 125 | "\n", 126 | "N = len(all_tokens)\n", 127 | "D = len(word_index_map)\n", 128 | "X = np.zeros((D, N)) # terms will go along rows, documents along columns\n", 129 | "i = 0\n", 130 | "for tokens in all_tokens:\n", 131 | " X[:,i] = tokens_to_vector(tokens)\n", 132 | " i += 1\n", 133 | "\n", 134 | "def main():\n", 135 | " svd = TruncatedSVD()\n", 136 | " Z = svd.fit_transform(X)\n", 137 | " plt.scatter(Z[:,0], Z[:,1])\n", 138 | " for i in range(D):\n", 139 | " plt.annotate(s=index_word_map[i], xy=(Z[i,0], Z[i,1]),arrowprops=dict(facecolor='orange', shrink=0.05))\n", 140 | " plt.show()\n", 141 | "\n", 142 | "if __name__ == '__main__':\n", 143 | " main()\n" 144 | ] 145 | }, 146 | { 147 | "cell_type": "code", 148 | "execution_count": null, 149 | "metadata": {}, 150 | "outputs": [], 151 | "source": [] 152 | } 153 | ], 154 | "metadata": { 155 | "kernelspec": { 156 | "display_name": "Python 3", 157 | "language": "python", 158 | "name": "python3" 159 | }, 160 | "language_info": { 161 | "codemirror_mode": { 162 | "name": "ipython", 163 | "version": 3 164 | }, 165 | "file_extension": ".py", 166 | "mimetype": "text/x-python", 167 | "name": "python", 168 | "nbconvert_exporter": "python", 169 | "pygments_lexer": "ipython3", 170 | "version": "3.6.4" 171 | } 172 | }, 173 | "nbformat": 4, 174 | "nbformat_minor": 2 175 | } 176 | -------------------------------------------------------------------------------- /knnbreaksdown.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | from knnsingularity import KNN 4 | 5 | def get_data(): 6 | width = 8 7 | height = 8 8 | N = width * height 9 | X = 
# ================================================================
# knnbreaksdown.py (continued -- get_data is restated in full here
# because the preceding dump line is truncated mid-statement)
# ================================================================
import numpy as np


def get_data():
    """Return an 8x8 checkerboard grid.

    X holds the (i, j) integer grid coordinates of all 64 cells and Y
    alternates 0/1 so that every 4-adjacent neighbour has the opposite
    class -- the pathological case that makes KNN with k > 1 "break down".
    """
    width = 8
    height = 8
    n_points = width * height
    X = np.zeros((n_points, 2))
    Y = np.zeros(n_points)
    n = 0
    start_t = 0
    for i in range(width):
        t = start_t
        for j in range(height):
            X[n] = [i, j]
            Y[n] = t
            n += 1
            t = (t + 1) % 2          # flip class between vertical neighbours
        start_t = (start_t + 1) % 2  # offset each column -> checkerboard
    return X, Y


if __name__ == '__main__':
    # Plotting is only needed when run as a script, so import it lazily.
    import matplotlib.pyplot as plt

    X, Y = get_data()
    plt.scatter(X[:, 0], X[:, 1], s=100, c=Y, alpha=0.5)
    plt.show()

    # KNN is imported from knnsingularity when this file runs standalone.
    model = KNN(3)
    model.fit(X, Y)
    print("Train accuracy:", model.score(X, Y))

# ================================================================
# knnnestedclass.py
# ================================================================
if __name__ == '__main__':
    import matplotlib.pyplot as plt
    from util import get_donut

    X, Y = get_donut()
    plt.scatter(X[:, 0], X[:, 1], s=100, c=Y, alpha=0.5)
    plt.show()

    model = KNN(3)
    model.fit(X, Y)
    print("Accuracy:", model.score(X, Y))

# ================================================================
# knnsingularity.py
# ================================================================
import heapq
from datetime import datetime


class KNN(object):
    """Brute-force k-nearest-neighbours classifier (squared Euclidean distance)."""

    def __init__(self, k):
        # k: number of neighbours that vote for the predicted class.
        self.k = k

    def fit(self, X, y):
        """Lazy learner: just memorise the training set."""
        self.X = X
        self.y = y

    def predict(self, X):
        """Return the majority-vote label of the k nearest training points
        for every row of X.

        BUG FIX: the original used sortedcontainers.SortedList(load=self.k);
        the `load` keyword argument was removed in sortedcontainers 2.x, so
        that call now raises TypeError. heapq.nsmallest from the standard
        library gives the same "k smallest (distance, label) pairs in
        ascending order" and drops the third-party dependency entirely.
        """
        y = np.zeros(len(X))
        for i, x in enumerate(X):
            # Squared distances to every training point (no sqrt needed
            # for ranking by nearness).
            diffs = self.X - x
            dists = (diffs * diffs).sum(axis=1)
            nearest = heapq.nsmallest(self.k, zip(dists, self.y))

            # Majority vote among the k nearest; on a vote tie the class
            # encountered first (i.e. with the closest member) wins.
            votes = {}
            for _, label in nearest:
                votes[label] = votes.get(label, 0) + 1
            max_votes = 0
            max_votes_class = -1
            for label, count in votes.items():
                if count > max_votes:
                    max_votes = count
                    max_votes_class = label
            y[i] = max_votes_class
        return y

    def score(self, X, Y):
        """Mean accuracy of predict(X) against the true labels Y."""
        P = self.predict(X)
        return np.mean(P == Y)


if __name__ == '__main__':
    # util's MNIST loader is only needed for the standalone benchmark;
    # aliased so it cannot clash with the checkerboard get_data above.
    from util import get_data as get_mnist_data

    X, Y = get_mnist_data(2000)
    Ntrain = 1000
    Xtrain, Ytrain = X[:Ntrain], Y[:Ntrain]
    Xtest, Ytest = X[Ntrain:], Y[Ntrain:]
    for k in (1, 2, 3, 4, 5):
        knn = KNN(k)
        t0 = datetime.now()
        knn.fit(Xtrain, Ytrain)
        print("Training time:", (datetime.now() - t0))

        t0 = datetime.now()
        print("Train accuracy:", knn.score(Xtrain, Ytrain))
        print("Time to compute train accuracy:", (datetime.now() - t0),
              "Train size:", len(Ytrain))

        t0 = datetime.now()
        print("Test accuracy:", knn.score(Xtest, Ytest))
        print("Time to compute test accuracy:", (datetime.now() - t0),
              "Test size:", len(Ytest))

# ================================================================
# knnxorsingularity.py
# ================================================================
if __name__ == '__main__':
    import matplotlib.pyplot as plt
    from util import get_xor

    X, Y = get_xor()
    plt.scatter(X[:, 0], X[:, 1], s=100, c=Y, alpha=0.5)
    plt.show()

    model = KNN(3)
    model.fit(X, Y)
    print("Accuracy:", model.score(X, Y))

# (next file in the dump: nbsingularity.py)
# ================================================================
# nbsingularity.py
# ================================================================
import numpy as np
from datetime import datetime
from scipy.stats import multivariate_normal as mvn


class NaiveBayes(object):
    """Gaussian naive Bayes: one diagonal Gaussian per class plus a class prior.

    Assumes the class labels are the integers 0..K-1, because predict()
    uses each label directly as a column index into the score matrix.
    """

    def fit(self, X, Y, smoothing=1e-2):
        """Estimate a per-class feature mean/variance and class prior.

        smoothing is added to every variance so no feature collapses to
        zero variance (1e-2 is the same value the original spelled 10e-3).
        """
        self.gaussians = dict()
        self.priors = dict()
        labels = set(Y)
        for c in labels:
            current_x = X[Y == c]
            self.gaussians[c] = {
                'mean': current_x.mean(axis=0),
                'var': current_x.var(axis=0) + smoothing,
            }
            self.priors[c] = float(len(Y[Y == c])) / len(Y)

    def score(self, X, Y):
        """Mean accuracy of predict(X) against the true labels Y."""
        P = self.predict(X)
        return np.mean(P == Y)

    def predict(self, X):
        """Return argmax_c [ log p(x | c) + log p(c) ] for every row of X."""
        N, D = X.shape
        K = len(self.gaussians)
        P = np.zeros((N, K))
        for c, g in self.gaussians.items():
            mean, var = g['mean'], g['var']
            # Passing the variance vector as cov makes scipy treat it as a
            # diagonal covariance matrix.  int(c) guards against float
            # label arrays, which are invalid as numpy column indices.
            P[:, int(c)] = mvn.logpdf(X, mean=mean, cov=var) + np.log(self.priors[c])
        return np.argmax(P, axis=1)


if __name__ == '__main__':
    # Project-local loader only needed for the standalone benchmark.
    from util import get_data

    X, Y = get_data(10000)
    # BUG FIX: len(Y) / 2 is a float in Python 3 and cannot be used as a
    # slice index; use integer (floor) division instead.
    Ntrain = len(Y) // 2
    Xtrain, Ytrain = X[:Ntrain], Y[:Ntrain]
    Xtest, Ytest = X[Ntrain:], Y[Ntrain:]

    model = NaiveBayes()
    t0 = datetime.now()
    model.fit(Xtrain, Ytrain)
    print("Training time:", (datetime.now() - t0))

    t0 = datetime.now()
    print("Train accuracy:", model.score(Xtrain, Ytrain))
    print("Time to compute train accuracy:", (datetime.now() - t0),
          "Train size:", len(Ytrain))

    t0 = datetime.now()
    print("Test accuracy:", model.score(Xtest, Ytest))
    print("Time to compute test accuracy:", (datetime.now() - t0),
          "Test size:", len(Ytest))

# (next file in the dump: util.py)
# ================================================================
# util.py
# ================================================================
import numpy as np
import pandas as pd


def get_data(limit=None, path='/Users/GPnew/mnist_csv/train.csv'):
    """Load the MNIST digits CSV and return (X, Y), optionally truncated.

    X is the pixel block scaled into [0, 1]; Y is the label column.
    `path` defaults to the original hard-coded location but can now point
    anywhere (e.g. the repo's own train.csv).
    """
    print("Reading in and transforming data...")
    df = pd.read_csv(path)
    # BUG FIX: DataFrame.as_matrix() was removed in pandas 1.0;
    # to_numpy() is the supported replacement.
    data = df.to_numpy()
    np.random.shuffle(data)  # shuffle the row order in place
    X = data[:, 1:] / 255.0  # pixel values are 0..255
    Y = data[:, 0]
    if limit is not None:
        X, Y = X[:limit], Y[:limit]
    return X, Y


def get_xor():
    """Return 200 2-D points in an XOR layout.

    Label 0 fills the lower-left and upper-right quadrants of the unit
    square, label 1 the other two, so the classes are not linearly
    separable.
    """
    X = np.zeros((200, 2))
    X[:50] = np.random.random((50, 2)) / 2 + 0.5                       # (0.5-1, 0.5-1)
    X[50:100] = np.random.random((50, 2)) / 2                          # (0-0.5, 0-0.5)
    X[100:150] = np.random.random((50, 2)) / 2 + np.array([[0, 0.5]])  # (0-0.5, 0.5-1)
    X[150:] = np.random.random((50, 2)) / 2 + np.array([[0.5, 0]])     # (0.5-1, 0-0.5)
    Y = np.array([0] * 100 + [1] * 100)
    return X, Y


def get_donut():
    """Return 200 2-D points forming two concentric rings.

    The inner ring (label 0) sits at radius ~5, the outer ring (label 1)
    at radius ~10 -- another classic non-linearly-separable data set.
    """
    N = 200
    R_inner = 5
    R_outer = 10
    # BUG FIX: N / 2 is a float in Python 3, which breaks np.random.randn
    # and the [0] * (N / 2) label construction; use integer division.
    half = N // 2

    # distance from origin is radius + random normal;
    # angle theta is uniformly distributed between (0, 2pi)
    R1 = np.random.randn(half) + R_inner
    theta = 2 * np.pi * np.random.random(half)
    X_inner = np.concatenate([[R1 * np.cos(theta)], [R1 * np.sin(theta)]]).T

    R2 = np.random.randn(half) + R_outer
    theta = 2 * np.pi * np.random.random(half)
    X_outer = np.concatenate([[R2 * np.cos(theta)], [R2 * np.sin(theta)]]).T

    X = np.concatenate([X_inner, X_outer])
    Y = np.array([0] * half + [1] * half)
    return X, Y