├── .gitattributes ├── .gitignore ├── Chapter 01 └── Python ML Blueprints - Ch. 1.ipynb ├── Chapter 02 └── Python Machine Learning Blueprints Ch. 2.ipynb ├── Chapter 03 ├── Python Machine Learning Blueprints - Ch 3.ipynb └── fare_alerter.py ├── Chapter 04 └── Python Machine Learning Blueprints - Ch. 4.ipynb ├── Chapter 05 ├── Python ML Blueprints - Ch. 5.ipynb └── custom_feed.py ├── Chapter 06 └── Python ML Blueprints - Ch. 6.ipynb ├── Chapter 07 └── Python ML Blueprints - Ch 7.ipynb ├── Chapter 08 ├── Python ML Blueprints - Ch 8 - Python 2.ipynb └── Python ML Blueprints - Ch 8 - Python 3.ipynb ├── Chapter 09 ├── Python ML Blueprints - Ch 9.ipynb └── run_flask.py ├── Chapter 10 └── Python ML Blueprints - Ch 10.ipynb ├── LICENSE └── README.md /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | 4 | # Custom for Visual Studio 5 | *.cs diff=csharp 6 | 7 | # Standard to msysgit 8 | *.doc diff=astextplain 9 | *.DOC diff=astextplain 10 | *.docx diff=astextplain 11 | *.DOCX diff=astextplain 12 | *.dot diff=astextplain 13 | *.DOT diff=astextplain 14 | *.pdf diff=astextplain 15 | *.PDF diff=astextplain 16 | *.rtf diff=astextplain 17 | *.RTF diff=astextplain 18 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Windows image file caches 2 | Thumbs.db 3 | ehthumbs.db 4 | 5 | # Folder config file 6 | Desktop.ini 7 | 8 | # Recycle Bin used on file shares 9 | $RECYCLE.BIN/ 10 | 11 | # Windows Installer files 12 | *.cab 13 | *.msi 14 | *.msm 15 | *.msp 16 | 17 | # Windows shortcuts 18 | *.lnk 19 | 20 | # ========================= 21 | # Operating System Files 22 | # ========================= 23 | 24 | # OSX 25 | # ========================= 26 | 27 | .DS_Store 28 | .AppleDouble 29 | .LSOverride 30 | 31 | # Thumbnails 32 | ._* 33 | 
34 | # Files that might appear in the root of a volume 35 | .DocumentRevisions-V100 36 | .fseventsd 37 | .Spotlight-V100 38 | .TemporaryItems 39 | .Trashes 40 | .VolumeIcon.icns 41 | 42 | # Directories potentially created on remote AFP share 43 | .AppleDB 44 | .AppleDesktop 45 | Network Trash Folder 46 | Temporary Items 47 | .apdisk 48 | -------------------------------------------------------------------------------- /Chapter 03/fare_alerter.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import pandas as pd 3 | import numpy as np 4 | import requests 5 | from selenium import webdriver 6 | from selenium.webdriver.common.desired_capabilities import DesiredCapabilities 7 | from selenium.webdriver.common.by import By 8 | from selenium.webdriver.support.ui import WebDriverWait 9 | from selenium.webdriver.support import expected_conditions as EC 10 | from bs4 import BeautifulSoup 11 | from sklearn.cluster import DBSCAN 12 | from sklearn.preprocessing import StandardScaler 13 | import schedule 14 | import time 15 | 16 | def check_flights(): 17 | URL="https://www.google.com/flights/explore/#explore;f=JFK,EWR,LGA;t=HND,NRT,TPE,HKG,KIX;s=1;li=8;lx=12;d=2017-06-01" 18 | driver = webdriver.PhantomJS(PJS_PATH) 19 | dcap = dict(DesiredCapabilities.PHANTOMJS) 20 | dcap["phantomjs.page.settings.userAgent"] = ("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.80 Safari/537.36") 21 | driver = webdriver.PhantomJS(desired_capabilities=dcap, executable_path=PJS_PATH) 22 | driver.implicitly_wait(20) 23 | driver.get(URL) 24 | wait = WebDriverWait(driver, 20) 25 | wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, "span.FTWFGDB-v-c"))) 26 | 27 | s = BeautifulSoup(driver.page_source, "lxml") 28 | 29 | best_price_tags = s.findAll('div', 'FTWFGDB-w-e') 30 | 31 | # check if scrape worked - alert if it fails and shutdown 32 | if len(best_price_tags) < 4: 33 | print('Failed 
to Load Page Data') 34 | requests.post('https://maker.ifttt.com/trigger/fare_alert/with/key/MY_SECRET_KEY',\ 35 | data={ "value1" : "script", "value2" : "failed", "value3" : "" }) 36 | sys.exit(0) 37 | else: 38 | print('Successfully Loaded Page Data') 39 | 40 | best_prices = [] 41 | for tag in best_price_tags: 42 | best_prices.append(int(tag.text.replace('$',''))) 43 | 44 | best_price = best_prices[0] 45 | 46 | best_height_tags = s.findAll('div', 'FTWFGDB-w-f') 47 | best_heights = [] 48 | for t in best_height_tags: 49 | best_heights.append(float(t.attrs['style']\ 50 | .split('height:')[1].replace('px;',''))) 51 | 52 | best_height = best_heights[0] 53 | 54 | # price per pixel of height 55 | pph = np.array(best_price)/np.array(best_height) 56 | 57 | cities = s.findAll('div', 'FTWFGDB-w-o') 58 | 59 | hlist=[] 60 | for bar in cities[0]\ 61 | .findAll('div', 'FTWFGDB-w-x'): 62 | hlist.append(float(bar['style']\ 63 | .split('height: ')[1].replace('px;','')) * pph) 64 | 65 | fares = pd.DataFrame(hlist, columns=['price']) 66 | px = [x for x in fares['price']] 67 | ff = pd.DataFrame(px, columns=['fare']).reset_index() 68 | 69 | # begin the clustering 70 | X = StandardScaler().fit_transform(ff) 71 | db = DBSCAN(eps=1.5, min_samples=1).fit(X) 72 | 73 | labels = db.labels_ 74 | clusters = len(set(labels)) 75 | 76 | pf = pd.concat([ff,pd.DataFrame(db.labels_, 77 | columns=['cluster'])], axis=1) 78 | 79 | rf = pf.groupby('cluster')['fare'].agg(['min','count']).sort_values('min', ascending=True) 80 | 81 | # set up our rules 82 | # must have more than one cluster 83 | # cluster min must be equal to lowest price fare 84 | # cluster size must be less than 10th percentile 85 | # cluster must be $100 less the next lowest-priced cluster 86 | if clusters > 1 and ff['fare'].min() == rf.iloc[0]['min']\ 87 | and rf.iloc[0]['count'] < rf['count'].quantile(.10)\ 88 | and rf.iloc[0]['fare'] + 100 < rf.iloc[1]['fare']: 89 | city = s.find('span','FTWFGDB-v-c').text 90 | fare = 
s.find('div','FTWFGDB-w-e').text 91 | r = requests.post('https://maker.ifttt.com/trigger/fare_alert/with/key/MY_SECRET_KEY',\ 92 | data={ "value1" : city, "value2" : fare, "value3" : "" }) 93 | else: 94 | print('no alert triggered') 95 | 96 | # set up the scheduler to run our code every 60 min 97 | schedule.every(60).minutes.do(check_flights) 98 | 99 | while 1: 100 | schedule.run_pending() 101 | time.sleep(1) 102 | 103 | 104 | 105 | -------------------------------------------------------------------------------- /Chapter 05/Python ML Blueprints - Ch. 5.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import requests\n", 12 | "import pandas as pd\n", 13 | "import json\n", 14 | "pd.set_option('display.max_colwidth', 200)" 15 | ] 16 | }, 17 | { 18 | "cell_type": "markdown", 19 | "metadata": {}, 20 | "source": [ 21 | "## Get Stories from Pocket" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": null, 27 | "metadata": { 28 | "collapsed": true 29 | }, 30 | "outputs": [], 31 | "source": [ 32 | "auth_params = {'consumer_key': 'CONSUMER_KEY', 'redirect_uri': 'https://twitter.com/acombs'}" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": null, 38 | "metadata": { 39 | "collapsed": true 40 | }, 41 | "outputs": [], 42 | "source": [ 43 | "tkn = requests.post('https://getpocket.com/v3/oauth/request', data=auth_params)" 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": null, 49 | "metadata": { 50 | "collapsed": false 51 | }, 52 | "outputs": [], 53 | "source": [ 54 | "tkn.content" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": null, 60 | "metadata": { 61 | "collapsed": true 62 | }, 63 | "outputs": [], 64 | "source": [ 65 | "usr_params = {'consumer_key':'CONSUMER_KEY', 'code': 'CODE'}" 
66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": null, 71 | "metadata": { 72 | "collapsed": true 73 | }, 74 | "outputs": [], 75 | "source": [ 76 | "usr = requests.post('https://getpocket.com/v3/oauth/authorize', data=usr_params)" 77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": null, 82 | "metadata": { 83 | "collapsed": false 84 | }, 85 | "outputs": [], 86 | "source": [ 87 | "usr.content" 88 | ] 89 | }, 90 | { 91 | "cell_type": "markdown", 92 | "metadata": {}, 93 | "source": [ 94 | "### Get 'no' stories" 95 | ] 96 | }, 97 | { 98 | "cell_type": "code", 99 | "execution_count": null, 100 | "metadata": { 101 | "collapsed": true 102 | }, 103 | "outputs": [], 104 | "source": [ 105 | "no_params = {'consumer_key': 'CONSUMER_KEY',\n", 106 | "'access_token': 'SOME_SUPER_LONG_TOKEN',\n", 107 | "'tag': 'n'}" 108 | ] 109 | }, 110 | { 111 | "cell_type": "code", 112 | "execution_count": null, 113 | "metadata": { 114 | "collapsed": true 115 | }, 116 | "outputs": [], 117 | "source": [ 118 | "no_result = requests.post('https://getpocket.com/v3/get', data=no_params)" 119 | ] 120 | }, 121 | { 122 | "cell_type": "code", 123 | "execution_count": null, 124 | "metadata": { 125 | "collapsed": false 126 | }, 127 | "outputs": [], 128 | "source": [ 129 | "no_result.text" 130 | ] 131 | }, 132 | { 133 | "cell_type": "code", 134 | "execution_count": null, 135 | "metadata": { 136 | "collapsed": false 137 | }, 138 | "outputs": [], 139 | "source": [ 140 | "no_jf = json.loads(no_result.text)" 141 | ] 142 | }, 143 | { 144 | "cell_type": "code", 145 | "execution_count": null, 146 | "metadata": { 147 | "collapsed": false 148 | }, 149 | "outputs": [], 150 | "source": [ 151 | "no_jd = no_jf['list']" 152 | ] 153 | }, 154 | { 155 | "cell_type": "code", 156 | "execution_count": null, 157 | "metadata": { 158 | "collapsed": false 159 | }, 160 | "outputs": [], 161 | "source": [ 162 | "no_jd" 163 | ] 164 | }, 165 | { 166 | "cell_type": "code", 167 | 
"execution_count": null, 168 | "metadata": { 169 | "collapsed": false 170 | }, 171 | "outputs": [], 172 | "source": [ 173 | "no_urls=[]\n", 174 | "for i in no_jd.values():\n", 175 | " no_urls.append(i.get('resolved_url'))" 176 | ] 177 | }, 178 | { 179 | "cell_type": "code", 180 | "execution_count": null, 181 | "metadata": { 182 | "collapsed": false, 183 | "scrolled": true 184 | }, 185 | "outputs": [], 186 | "source": [ 187 | "no_urls" 188 | ] 189 | }, 190 | { 191 | "cell_type": "code", 192 | "execution_count": null, 193 | "metadata": { 194 | "collapsed": false 195 | }, 196 | "outputs": [], 197 | "source": [ 198 | "len(no_urls)" 199 | ] 200 | }, 201 | { 202 | "cell_type": "code", 203 | "execution_count": null, 204 | "metadata": { 205 | "collapsed": false, 206 | "scrolled": true 207 | }, 208 | "outputs": [], 209 | "source": [ 210 | "no_uf = pd.DataFrame(no_urls, columns=['urls'])" 211 | ] 212 | }, 213 | { 214 | "cell_type": "code", 215 | "execution_count": null, 216 | "metadata": { 217 | "collapsed": false, 218 | "scrolled": true 219 | }, 220 | "outputs": [], 221 | "source": [ 222 | "no_uf" 223 | ] 224 | }, 225 | { 226 | "cell_type": "code", 227 | "execution_count": null, 228 | "metadata": { 229 | "collapsed": true 230 | }, 231 | "outputs": [], 232 | "source": [ 233 | "no_uf = no_uf.assign(wanted = lambda x: 'n')" 234 | ] 235 | }, 236 | { 237 | "cell_type": "code", 238 | "execution_count": null, 239 | "metadata": { 240 | "collapsed": false, 241 | "scrolled": true 242 | }, 243 | "outputs": [], 244 | "source": [ 245 | "no_uf" 246 | ] 247 | }, 248 | { 249 | "cell_type": "markdown", 250 | "metadata": {}, 251 | "source": [ 252 | "### Get 'yes' stories" 253 | ] 254 | }, 255 | { 256 | "cell_type": "code", 257 | "execution_count": null, 258 | "metadata": { 259 | "collapsed": true 260 | }, 261 | "outputs": [], 262 | "source": [ 263 | "ye_params = {'consumer_key': 'CONSUMER_KEY',\n", 264 | "'access_token': 'SOME_SUPER_LONG_KEY',\n", 265 | "'tag': 'y',\n", 266 | "'state': 
'archive'}" 267 | ] 268 | }, 269 | { 270 | "cell_type": "code", 271 | "execution_count": null, 272 | "metadata": { 273 | "collapsed": true 274 | }, 275 | "outputs": [], 276 | "source": [ 277 | "yes_result = requests.post('https://getpocket.com/v3/get', data=yes_params)" 278 | ] 279 | }, 280 | { 281 | "cell_type": "code", 282 | "execution_count": null, 283 | "metadata": { 284 | "collapsed": false 285 | }, 286 | "outputs": [], 287 | "source": [ 288 | "yes_result.text" 289 | ] 290 | }, 291 | { 292 | "cell_type": "code", 293 | "execution_count": null, 294 | "metadata": { 295 | "collapsed": true 296 | }, 297 | "outputs": [], 298 | "source": [ 299 | "yes_jf = json.loads(yes_result.text)" 300 | ] 301 | }, 302 | { 303 | "cell_type": "code", 304 | "execution_count": null, 305 | "metadata": { 306 | "collapsed": true 307 | }, 308 | "outputs": [], 309 | "source": [ 310 | "yes_jd = yes_jf['list']" 311 | ] 312 | }, 313 | { 314 | "cell_type": "code", 315 | "execution_count": null, 316 | "metadata": { 317 | "collapsed": false 318 | }, 319 | "outputs": [], 320 | "source": [ 321 | "yes_jf" 322 | ] 323 | }, 324 | { 325 | "cell_type": "code", 326 | "execution_count": null, 327 | "metadata": { 328 | "collapsed": false 329 | }, 330 | "outputs": [], 331 | "source": [ 332 | "yes_urls=[]\n", 333 | "for i in yes_jd.values():\n", 334 | " yes_urls.append(i.get('resolved_url'))" 335 | ] 336 | }, 337 | { 338 | "cell_type": "code", 339 | "execution_count": null, 340 | "metadata": { 341 | "collapsed": false 342 | }, 343 | "outputs": [], 344 | "source": [ 345 | "len(yes_urls)" 346 | ] 347 | }, 348 | { 349 | "cell_type": "code", 350 | "execution_count": null, 351 | "metadata": { 352 | "collapsed": false, 353 | "scrolled": true 354 | }, 355 | "outputs": [], 356 | "source": [ 357 | "yes_urls" 358 | ] 359 | }, 360 | { 361 | "cell_type": "code", 362 | "execution_count": null, 363 | "metadata": { 364 | "collapsed": true 365 | }, 366 | "outputs": [], 367 | "source": [ 368 | "yes_uf = 
pd.DataFrame(yes_urls, columns=['urls'])" 369 | ] 370 | }, 371 | { 372 | "cell_type": "code", 373 | "execution_count": null, 374 | "metadata": { 375 | "collapsed": false, 376 | "scrolled": true 377 | }, 378 | "outputs": [], 379 | "source": [ 380 | "yes_uf" 381 | ] 382 | }, 383 | { 384 | "cell_type": "code", 385 | "execution_count": null, 386 | "metadata": { 387 | "collapsed": false, 388 | "scrolled": true 389 | }, 390 | "outputs": [], 391 | "source": [ 392 | "yes_uf = yes_uf.assign(wanted = lambda x: 'y')" 393 | ] 394 | }, 395 | { 396 | "cell_type": "code", 397 | "execution_count": null, 398 | "metadata": { 399 | "collapsed": false, 400 | "scrolled": true 401 | }, 402 | "outputs": [], 403 | "source": [ 404 | "yes_uf" 405 | ] 406 | }, 407 | { 408 | "cell_type": "code", 409 | "execution_count": null, 410 | "metadata": { 411 | "collapsed": false 412 | }, 413 | "outputs": [], 414 | "source": [ 415 | "df = pd.concat([yes_uf, no_uf])" 416 | ] 417 | }, 418 | { 419 | "cell_type": "code", 420 | "execution_count": null, 421 | "metadata": { 422 | "collapsed": false, 423 | "scrolled": true 424 | }, 425 | "outputs": [], 426 | "source": [ 427 | "df.dropna(inplace=1)" 428 | ] 429 | }, 430 | { 431 | "cell_type": "code", 432 | "execution_count": null, 433 | "metadata": { 434 | "collapsed": false, 435 | "scrolled": true 436 | }, 437 | "outputs": [], 438 | "source": [ 439 | "df" 440 | ] 441 | }, 442 | { 443 | "cell_type": "markdown", 444 | "metadata": {}, 445 | "source": [ 446 | "## Download Articles to Run Through Model" 447 | ] 448 | }, 449 | { 450 | "cell_type": "code", 451 | "execution_count": null, 452 | "metadata": { 453 | "collapsed": true 454 | }, 455 | "outputs": [], 456 | "source": [ 457 | "import urllib" 458 | ] 459 | }, 460 | { 461 | "cell_type": "code", 462 | "execution_count": null, 463 | "metadata": { 464 | "collapsed": false 465 | }, 466 | "outputs": [], 467 | "source": [ 468 | "def get_html(x):\n", 469 | " qurl = urllib.parse.quote(x)\n", 470 | " rhtml = 
requests.get('https://api.embedly.com/1/extract?url=' + qurl + '&key=SOME_KEY')\n", 471 | " ctnt = json.loads(rhtml.text).get('content')\n", 472 | " return ctnt" 473 | ] 474 | }, 475 | { 476 | "cell_type": "code", 477 | "execution_count": null, 478 | "metadata": { 479 | "collapsed": false, 480 | "scrolled": true 481 | }, 482 | "outputs": [], 483 | "source": [ 484 | "df.loc[:,'html'] = df['urls'].map(get_html)" 485 | ] 486 | }, 487 | { 488 | "cell_type": "code", 489 | "execution_count": null, 490 | "metadata": { 491 | "collapsed": false, 492 | "scrolled": true 493 | }, 494 | "outputs": [], 495 | "source": [ 496 | "df.dropna(inplace=1)" 497 | ] 498 | }, 499 | { 500 | "cell_type": "code", 501 | "execution_count": null, 502 | "metadata": { 503 | "collapsed": false, 504 | "scrolled": true 505 | }, 506 | "outputs": [], 507 | "source": [ 508 | "df" 509 | ] 510 | }, 511 | { 512 | "cell_type": "markdown", 513 | "metadata": {}, 514 | "source": [ 515 | "### Extract the text" 516 | ] 517 | }, 518 | { 519 | "cell_type": "code", 520 | "execution_count": null, 521 | "metadata": { 522 | "collapsed": true 523 | }, 524 | "outputs": [], 525 | "source": [ 526 | "from bs4 import BeautifulSoup" 527 | ] 528 | }, 529 | { 530 | "cell_type": "code", 531 | "execution_count": null, 532 | "metadata": { 533 | "collapsed": true 534 | }, 535 | "outputs": [], 536 | "source": [ 537 | "def get_text(x):\n", 538 | " soup = BeautifulSoup(x, 'lxml')\n", 539 | " text = soup.get_text()\n", 540 | " return text" 541 | ] 542 | }, 543 | { 544 | "cell_type": "code", 545 | "execution_count": null, 546 | "metadata": { 547 | "collapsed": false 548 | }, 549 | "outputs": [], 550 | "source": [ 551 | "df.loc[:,'text'] = df['html'].map(get_text)" 552 | ] 553 | }, 554 | { 555 | "cell_type": "code", 556 | "execution_count": null, 557 | "metadata": { 558 | "collapsed": false 559 | }, 560 | "outputs": [], 561 | "source": [ 562 | "df" 563 | ] 564 | }, 565 | { 566 | "cell_type": "markdown", 567 | "metadata": {}, 568 | 
"source": [ 569 | "# Implement Tfid Vectorization & Fit Model" 570 | ] 571 | }, 572 | { 573 | "cell_type": "code", 574 | "execution_count": null, 575 | "metadata": { 576 | "collapsed": true 577 | }, 578 | "outputs": [], 579 | "source": [ 580 | "from sklearn.feature_extraction.text import TfidfVectorizer\n", 581 | "from sklearn.svm import LinearSVC" 582 | ] 583 | }, 584 | { 585 | "cell_type": "code", 586 | "execution_count": null, 587 | "metadata": { 588 | "collapsed": false 589 | }, 590 | "outputs": [], 591 | "source": [ 592 | "vect = TfidfVectorizer(ngram_range=(1,3), stop_words='english', min_df=3)" 593 | ] 594 | }, 595 | { 596 | "cell_type": "code", 597 | "execution_count": null, 598 | "metadata": { 599 | "collapsed": false 600 | }, 601 | "outputs": [], 602 | "source": [ 603 | "tv = vect.fit_transform(df['text'])" 604 | ] 605 | }, 606 | { 607 | "cell_type": "code", 608 | "execution_count": null, 609 | "metadata": { 610 | "collapsed": true 611 | }, 612 | "outputs": [], 613 | "source": [ 614 | "clf = LinearSVC()" 615 | ] 616 | }, 617 | { 618 | "cell_type": "code", 619 | "execution_count": null, 620 | "metadata": { 621 | "collapsed": false 622 | }, 623 | "outputs": [], 624 | "source": [ 625 | "model = clf.fit(tv, df['wanted'])" 626 | ] 627 | }, 628 | { 629 | "cell_type": "markdown", 630 | "metadata": {}, 631 | "source": [ 632 | "## Pull New Articles from Google Drive Sheet to Evaluate" 633 | ] 634 | }, 635 | { 636 | "cell_type": "code", 637 | "execution_count": null, 638 | "metadata": { 639 | "collapsed": true 640 | }, 641 | "outputs": [], 642 | "source": [ 643 | "import gspread\n", 644 | "from oauth2client.client import SignedJwtAssertionCredentials" 645 | ] 646 | }, 647 | { 648 | "cell_type": "code", 649 | "execution_count": null, 650 | "metadata": { 651 | "collapsed": true 652 | }, 653 | "outputs": [], 654 | "source": [ 655 | "json_key = json.load(open(r'/Users/alexcombs/Downloads/API_KEY.json'))\n", 656 | "scope = ['https://spreadsheets.google.com/feeds']\n", 
657 | "credentials = SignedJwtAssertionCredentials(json_key['client_email'], json_key['private_key'].encode(), scope)\n", 658 | "gc = gspread.authorize(credentials)" 659 | ] 660 | }, 661 | { 662 | "cell_type": "code", 663 | "execution_count": null, 664 | "metadata": { 665 | "collapsed": true 666 | }, 667 | "outputs": [], 668 | "source": [ 669 | "# must share with client_email in json api key file\n", 670 | "ws = gc.open(\"NewStories\")" 671 | ] 672 | }, 673 | { 674 | "cell_type": "code", 675 | "execution_count": null, 676 | "metadata": { 677 | "collapsed": true 678 | }, 679 | "outputs": [], 680 | "source": [ 681 | "sh = ws.sheet1" 682 | ] 683 | }, 684 | { 685 | "cell_type": "code", 686 | "execution_count": null, 687 | "metadata": { 688 | "collapsed": false 689 | }, 690 | "outputs": [], 691 | "source": [ 692 | "zd = list(zip(sh.col_values(2),sh.col_values(3), sh.col_values(4)))" 693 | ] 694 | }, 695 | { 696 | "cell_type": "code", 697 | "execution_count": null, 698 | "metadata": { 699 | "collapsed": false 700 | }, 701 | "outputs": [], 702 | "source": [ 703 | "zf = pd.DataFrame(zd, columns=['title','urls','html'])" 704 | ] 705 | }, 706 | { 707 | "cell_type": "code", 708 | "execution_count": null, 709 | "metadata": { 710 | "collapsed": false, 711 | "scrolled": true 712 | }, 713 | "outputs": [], 714 | "source": [ 715 | "zf.replace('', pd.np.nan, inplace=True)\n", 716 | "zf.dropna(inplace=True)" 717 | ] 718 | }, 719 | { 720 | "cell_type": "code", 721 | "execution_count": null, 722 | "metadata": { 723 | "collapsed": false, 724 | "scrolled": true 725 | }, 726 | "outputs": [], 727 | "source": [ 728 | "zf" 729 | ] 730 | }, 731 | { 732 | "cell_type": "code", 733 | "execution_count": null, 734 | "metadata": { 735 | "collapsed": false 736 | }, 737 | "outputs": [], 738 | "source": [ 739 | "zf.loc[:,'text'] = zf['html'].map(get_text)" 740 | ] 741 | }, 742 | { 743 | "cell_type": "code", 744 | "execution_count": null, 745 | "metadata": { 746 | "collapsed": false 747 | }, 748 | 
"outputs": [], 749 | "source": [ 750 | "zf.reset_index(drop=True, inplace=True)" 751 | ] 752 | }, 753 | { 754 | "cell_type": "code", 755 | "execution_count": null, 756 | "metadata": { 757 | "collapsed": false 758 | }, 759 | "outputs": [], 760 | "source": [ 761 | "test_matrix = vect.transform(zf['text'])" 762 | ] 763 | }, 764 | { 765 | "cell_type": "code", 766 | "execution_count": null, 767 | "metadata": { 768 | "collapsed": false 769 | }, 770 | "outputs": [], 771 | "source": [ 772 | "test_matrix" 773 | ] 774 | }, 775 | { 776 | "cell_type": "code", 777 | "execution_count": null, 778 | "metadata": { 779 | "collapsed": false, 780 | "scrolled": true 781 | }, 782 | "outputs": [], 783 | "source": [ 784 | "results = pd.DataFrame(model.predict(test_matrix), columns=['wanted'])" 785 | ] 786 | }, 787 | { 788 | "cell_type": "code", 789 | "execution_count": null, 790 | "metadata": { 791 | "collapsed": false, 792 | "scrolled": true 793 | }, 794 | "outputs": [], 795 | "source": [ 796 | "results" 797 | ] 798 | }, 799 | { 800 | "cell_type": "code", 801 | "execution_count": null, 802 | "metadata": { 803 | "collapsed": false 804 | }, 805 | "outputs": [], 806 | "source": [ 807 | "rez = pd.merge(results,zf, left_index=True, right_index=True)" 808 | ] 809 | }, 810 | { 811 | "cell_type": "code", 812 | "execution_count": null, 813 | "metadata": { 814 | "collapsed": false, 815 | "scrolled": true 816 | }, 817 | "outputs": [], 818 | "source": [ 819 | "rez" 820 | ] 821 | }, 822 | { 823 | "cell_type": "code", 824 | "execution_count": null, 825 | "metadata": { 826 | "collapsed": false, 827 | "scrolled": true 828 | }, 829 | "outputs": [], 830 | "source": [ 831 | "for i, w, t in zip(rez[rez['wanted']=='y'].index, rez[rez['wanted']=='y']['wanted'], rez[rez['wanted']=='y']['title']):\n", 832 | " print(i, w, t)" 833 | ] 834 | }, 835 | { 836 | "cell_type": "code", 837 | "execution_count": null, 838 | "metadata": { 839 | "collapsed": true 840 | }, 841 | "outputs": [], 842 | "source": [ 843 | 
"change_to_no = [130, 145, 148, 163, 178, 199, 219, 222, 223, 226, 235, 279, 348, 357, 427, 440, 542, 544, 546, 568, 614, 619, 660, 668, 679, 686, 740, 829]" 844 | ] 845 | }, 846 | { 847 | "cell_type": "code", 848 | "execution_count": null, 849 | "metadata": { 850 | "collapsed": false 851 | }, 852 | "outputs": [], 853 | "source": [ 854 | "for i, w, t in zip(rez[rez['wanted']=='n'].index, rez[rez['wanted']=='n']['wanted'], rez[rez['wanted']=='n']['title']):\n", 855 | " print(i, w, t)" 856 | ] 857 | }, 858 | { 859 | "cell_type": "code", 860 | "execution_count": null, 861 | "metadata": { 862 | "collapsed": false 863 | }, 864 | "outputs": [], 865 | "source": [ 866 | "change_to_yes = [0, 9, 29, 35, 42, 71, 110, 190, 319, 335, 344, 371, 385, 399, 408, 409, 422, 472, 520, 534, 672]" 867 | ] 868 | }, 869 | { 870 | "cell_type": "code", 871 | "execution_count": null, 872 | "metadata": { 873 | "collapsed": false 874 | }, 875 | "outputs": [], 876 | "source": [ 877 | "rez" 878 | ] 879 | }, 880 | { 881 | "cell_type": "code", 882 | "execution_count": null, 883 | "metadata": { 884 | "collapsed": false 885 | }, 886 | "outputs": [], 887 | "source": [ 888 | "for i in rez.iloc[change_to_yes].index:\n", 889 | " rez.iloc[i]['wanted'] = 'y'" 890 | ] 891 | }, 892 | { 893 | "cell_type": "code", 894 | "execution_count": null, 895 | "metadata": { 896 | "collapsed": true 897 | }, 898 | "outputs": [], 899 | "source": [ 900 | "for i in rez.iloc[change_to_no].index:\n", 901 | " rez.iloc[i]['wanted'] = 'n'" 902 | ] 903 | }, 904 | { 905 | "cell_type": "code", 906 | "execution_count": null, 907 | "metadata": { 908 | "collapsed": false 909 | }, 910 | "outputs": [], 911 | "source": [ 912 | "rez" 913 | ] 914 | }, 915 | { 916 | "cell_type": "code", 917 | "execution_count": null, 918 | "metadata": { 919 | "collapsed": false 920 | }, 921 | "outputs": [], 922 | "source": [ 923 | "df" 924 | ] 925 | }, 926 | { 927 | "cell_type": "code", 928 | "execution_count": null, 929 | "metadata": { 930 | "collapsed": 
false 931 | }, 932 | "outputs": [], 933 | "source": [ 934 | "combined = pd.concat([df[['wanted', 'text']], rez[['wanted', 'text']]])" 935 | ] 936 | }, 937 | { 938 | "cell_type": "code", 939 | "execution_count": null, 940 | "metadata": { 941 | "collapsed": false 942 | }, 943 | "outputs": [], 944 | "source": [ 945 | "combined" 946 | ] 947 | }, 948 | { 949 | "cell_type": "code", 950 | "execution_count": null, 951 | "metadata": { 952 | "collapsed": true 953 | }, 954 | "outputs": [], 955 | "source": [ 956 | "tvcomb = vect.fit_transform(combined['text'], combined['wanted'])" 957 | ] 958 | }, 959 | { 960 | "cell_type": "code", 961 | "execution_count": null, 962 | "metadata": { 963 | "collapsed": true 964 | }, 965 | "outputs": [], 966 | "source": [ 967 | "model = clf.fit(tvcomb, combined['wanted'])" 968 | ] 969 | }, 970 | { 971 | "cell_type": "code", 972 | "execution_count": null, 973 | "metadata": { 974 | "collapsed": false 975 | }, 976 | "outputs": [], 977 | "source": [ 978 | "model" 979 | ] 980 | }, 981 | { 982 | "cell_type": "code", 983 | "execution_count": null, 984 | "metadata": { 985 | "collapsed": true 986 | }, 987 | "outputs": [], 988 | "source": [ 989 | "import pickle" 990 | ] 991 | }, 992 | { 993 | "cell_type": "code", 994 | "execution_count": null, 995 | "metadata": { 996 | "collapsed": false 997 | }, 998 | "outputs": [], 999 | "source": [ 1000 | "pickle.dump(model, open(r'/Users/alexcombs/Downloads/news_model_pickle.p', 'wb'))" 1001 | ] 1002 | }, 1003 | { 1004 | "cell_type": "code", 1005 | "execution_count": null, 1006 | "metadata": { 1007 | "collapsed": true 1008 | }, 1009 | "outputs": [], 1010 | "source": [ 1011 | "pickle.dump(vect, open(r'/Users/alexcombs/Downloads/news_vect_pickle.p', 'wb'))" 1012 | ] 1013 | }, 1014 | { 1015 | "cell_type": "code", 1016 | "execution_count": null, 1017 | "metadata": { 1018 | "collapsed": true 1019 | }, 1020 | "outputs": [], 1021 | "source": [] 1022 | } 1023 | ], 1024 | "metadata": { 1025 | "kernelspec": { 1026 | 
"display_name": "Python 3", 1027 | "language": "python", 1028 | "name": "python3" 1029 | }, 1030 | "language_info": { 1031 | "codemirror_mode": { 1032 | "name": "ipython", 1033 | "version": 3 1034 | }, 1035 | "file_extension": ".py", 1036 | "mimetype": "text/x-python", 1037 | "name": "python", 1038 | "nbconvert_exporter": "python", 1039 | "pygments_lexer": "ipython3", 1040 | "version": "3.5.0" 1041 | } 1042 | }, 1043 | "nbformat": 4, 1044 | "nbformat_minor": 0 1045 | } 1046 | -------------------------------------------------------------------------------- /Chapter 05/custom_feed.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from sklearn.feature_extraction.text import TfidfVectorizer 3 | from sklearn.svm import LinearSVC 4 | import schedule 5 | import time 6 | import pickle 7 | import json 8 | import gspread 9 | import requests 10 | from oauth2client.client import SignedJwtAssertionCredentials 11 | from bs4 import BeautifulSoup 12 | 13 | pd.set_option('display.max_colwidth', 250) 14 | 15 | def fetch_news(): 16 | try: 17 | vect = pickle.load(open(r'/Users/alexcombs/Downloads/news_vect_pickle.p', 'rb')) 18 | model = pickle.load(open(r'/Users/alexcombs/Downloads/news_model_pickle.p', 'rb')) 19 | 20 | json_key = json.load(open(r'/Users/alexcombs/Downloads/API Project-5d8d50bccf0b.json')) 21 | scope = ['https://spreadsheets.google.com/feeds'] 22 | credentials = SignedJwtAssertionCredentials(json_key['client_email'], json_key['private_key'].encode(), scope) 23 | gc = gspread.authorize(credentials) 24 | 25 | 26 | ws = gc.open("NewStories") 27 | sh = ws.sheet1 28 | zd = list(zip(sh.col_values(2),sh.col_values(3), sh.col_values(4))) 29 | zf = pd.DataFrame(zd, columns=['title','urls','html']) 30 | zf.replace('', pd.np.nan, inplace=True) 31 | zf.dropna(inplace=True) 32 | 33 | def get_text(x): 34 | soup = BeautifulSoup(x, 'lxml') 35 | text = soup.get_text() 36 | return text 37 | 38 | zf.loc[:,'text'] = 
zf['html'].map(get_text) 39 | 40 | tv = vect.transform(zf['text']) 41 | res = model.predict(tv) 42 | 43 | rf = pd.DataFrame(res, columns=['wanted']) 44 | rez = pd.merge(rf, zf, left_index=True, right_index=True) 45 | 46 | news_str = '' 47 | for t, u in zip(rez[rez['wanted']=='y']['title'], rez[rez['wanted']=='y']['urls']): 48 | news_str = news_str + t + '\n' + u + '\n' 49 | 50 | payload = {"value1" : news_str} 51 | r = requests.post('https://maker.ifttt.com/trigger/news_event/with/key/banZCjMLOotibc4WguJx0B', data=payload) 52 | 53 | # clean up worksheet 54 | lenv = len(sh.col_values(1)) 55 | cell_list = sh.range('A1:F' + str(lenv)) 56 | for cell in cell_list: 57 | cell.value = "" 58 | sh.update_cells(cell_list) 59 | print(r.text) 60 | 61 | except: 62 | print('Failed') 63 | 64 | schedule.every(480).minutes.do(fetch_news) 65 | 66 | while 1: 67 | schedule.run_pending() 68 | time.sleep(1) -------------------------------------------------------------------------------- /Chapter 08/Python ML Blueprints - Ch 8 - Python 3.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": false 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "from sklearn import datasets\n", 12 | "import matplotlib.pyplot as plt\n", 13 | "import numpy as np\n", 14 | "%matplotlib inline" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 2, 20 | "metadata": { 21 | "collapsed": false 22 | }, 23 | "outputs": [], 24 | "source": [ 25 | "digits = datasets.load_digits()" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": 3, 31 | "metadata": { 32 | "collapsed": false 33 | }, 34 | "outputs": [], 35 | "source": [ 36 | "def display_img(img_no):\n", 37 | " fig, ax = plt.subplots()\n", 38 | " ax.set_xticklabels([])\n", 39 | " ax.set_yticklabels([])\n", 40 | " ax.matshow(digits.images[img_no], cmap = plt.cm.binary);" 41 | ] 42 | }, 
43 | { 44 | "cell_type": "code", 45 | "execution_count": 4, 46 | "metadata": { 47 | "collapsed": false 48 | }, 49 | "outputs": [ 50 | { 51 | "data": { 52 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAO0AAADtCAYAAABTTfKPAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAABPNJREFUeJzt3TFuE2sYQNHxEwWUoaSkpSMpKdJ4CWQrUIbOa2AJziay\ngGQBIAV6KFxC59e8Aj2RgK3kZ645p8QZPgvmaizhn2+x3W4noOOfP/0GgN2IFmJECzGihRjRQoxo\nIebRXS8uFgv/HgR/yHa7Xfzs1++M9r8Ldx52fn4+nZ+f73zdvvadd3Fxsde89Xo9nZ2d7Xzdmzdv\n9pq32Wymo6Ojna9bLpd7zbu6uppOTk52vm61Wu01b7VaTW/fvt35un3+TKZp7P2576zF4qe9TtPk\n4zHkiBZiHiTa09PTh/htZzPvxYsXQ+c9fvx46Lxnz54Nnffq1auh80beLw8xS7R7GB3tkydPhs4T\n7bxn+XgMMaKFGNFCjGghRrQQI1qIES3EiBZifnlg4McvO5+eng7/IgP8DS4vL6fLy8vf+tmdogUe\nxv8fiO/evbv1Z308hhjRQoxoIUa0ECNaiBEtxIgWYkQLMaKFGNFCjGgh5pffPT5k+/6P//v6/Pnz\n0HmbzWbovKdPnw6dt16vh857/fr10Hm38aSFGNFCjGghRrQQI1qIES3EiBZiRAsxooUY0UKMaCFG\ntBAjWoixFgRmwFoQiLEWBA6YaCFGtBAjWogRLcSIFmJECzGihRjRQoxoIUa0ECNaiJnVLp/r6+uh\n80bv1rm5uRk67/nz50PnLZfLofNG3y92+QB7ES3EiBZiRAsxooUY0UKMaCFGtBAjWogRLcSIFmJE\nCzGihRi7fGAG7PKBGLt84ICJFmJECzGihRjRQoxoIUa0ECNaiBEtxIgWYkQLMaKFmFnt8tlsNkPn\nvXz5cui80bt1Rjs+Pv7Tb+Gv4EkLMaKFGNFCjGghRrQQI1qIES3EiBZiRAsxooUY0UKMaCFGtBBj\nlw/MgF0+EGOXDxww0UKMaCFGtBAjWogRLcSIFmJECzGihRjRQoxoIUa0EPNX7/JZLpdD5x260X9/\nR0dHQ+fNhSctxIgWYkQLMaKFGNFCjGghRrQQI1qIES3EiBZiRAsxooUY0UKMXT4wA3b5QIxdPnDA\nRAsxooUY0UKMaCFGtBAjWogRLcSIFmJECzGihRjRQsysdvmM3s1yfX09dN5oo3frXF1dDZ13dnY2\ndN5ceNJCjGghRrQQI1qIES3EiBZiRAsxooUY0UKMaCFGtBAjWogRLcTY5QMzYJcPxNjlAwdMtBAj\nWogRLcSIFmJECzGihRjRQoxoIUa0ECNaiBEtxCy22+3tLy4W27tev2+fPn0aNmuapun4+HjovPfv\n3w+dd3FxMXTezc3N0HmHvItpsVhM2+128bPXPGkhRrQQI1qIES3EiBZiRAsxooUY0UKMaCFGtBAj\nWogRLcSIFmLs8oEZ2GWXj6N5Azmad78czQMSRAsxooUY0UKMaCFGtBAjWogRLcSIFmJECzGihRjR\nQsysDgyMNvoL/KvVaui8k5OTofPW6/XQeYfMgQE4IKKFGNFCjGghRrQQI1qIES3EiBZiRAsxooUY\n0UKMaCFGtBBjlw/MQHaXz2iO5t0vR/Puj6N5cEBECzGihRjRQoxoIUa0ECNaiBEtxDxItL/7zY7q\nvA8fPgyd9+3bt6Hzvnz5MnTeId8vDzFLtHv4+PHj0Hnfv38fOu/r169D5x3y/ZKJFng4ooWYXx4Y\nGPhegB/cdmDgzmiB+fHxGGJECzGihRjRQoxo
IeZffu8xKwIJF6EAAAAASUVORK5CYII=\n", 53 | "text/plain": [ 54 | "" 55 | ] 56 | }, 57 | "metadata": {}, 58 | "output_type": "display_data" 59 | } 60 | ], 61 | "source": [ 62 | "display_img(0)" 63 | ] 64 | }, 65 | { 66 | "cell_type": "code", 67 | "execution_count": 5, 68 | "metadata": { 69 | "collapsed": false 70 | }, 71 | "outputs": [ 72 | { 73 | "data": { 74 | "text/plain": [ 75 | "array([[ 0., 0., 5., 13., 9., 1., 0., 0.],\n", 76 | " [ 0., 0., 13., 15., 10., 15., 5., 0.],\n", 77 | " [ 0., 3., 15., 2., 0., 11., 8., 0.],\n", 78 | " [ 0., 4., 12., 0., 0., 8., 8., 0.],\n", 79 | " [ 0., 5., 8., 0., 0., 9., 8., 0.],\n", 80 | " [ 0., 4., 11., 0., 1., 12., 7., 0.],\n", 81 | " [ 0., 2., 14., 5., 10., 12., 0., 0.],\n", 82 | " [ 0., 0., 6., 13., 10., 0., 0., 0.]])" 83 | ] 84 | }, 85 | "execution_count": 5, 86 | "metadata": {}, 87 | "output_type": "execute_result" 88 | } 89 | ], 90 | "source": [ 91 | "digits.images[0]" 92 | ] 93 | }, 94 | { 95 | "cell_type": "code", 96 | "execution_count": 6, 97 | "metadata": { 98 | "collapsed": false 99 | }, 100 | "outputs": [ 101 | { 102 | "data": { 103 | "text/plain": [ 104 | "(64,)" 105 | ] 106 | }, 107 | "execution_count": 6, 108 | "metadata": {}, 109 | "output_type": "execute_result" 110 | } 111 | ], 112 | "source": [ 113 | "digits.data[0].shape" 114 | ] 115 | }, 116 | { 117 | "cell_type": "code", 118 | "execution_count": 7, 119 | "metadata": { 120 | "collapsed": false 121 | }, 122 | "outputs": [ 123 | { 124 | "data": { 125 | "text/plain": [ 126 | "0" 127 | ] 128 | }, 129 | "execution_count": 7, 130 | "metadata": {}, 131 | "output_type": "execute_result" 132 | } 133 | ], 134 | "source": [ 135 | "digits.target[0]" 136 | ] 137 | }, 138 | { 139 | "cell_type": "code", 140 | "execution_count": 8, 141 | "metadata": { 142 | "collapsed": true 143 | }, 144 | "outputs": [], 145 | "source": [ 146 | "import pandas as pd\n", 147 | "from sklearn.metrics.pairwise import cosine_similarity\n", 148 | "from sklearn.metrics.pairwise import chi2_kernel" 149 
| ] 150 | }, 151 | { 152 | "cell_type": "code", 153 | "execution_count": 9, 154 | "metadata": { 155 | "collapsed": true 156 | }, 157 | "outputs": [], 158 | "source": [ 159 | "X = digits.data\n", 160 | "y = digits.target" 161 | ] 162 | }, 163 | { 164 | "cell_type": "code", 165 | "execution_count": 10, 166 | "metadata": { 167 | "collapsed": true 168 | }, 169 | "outputs": [], 170 | "source": [ 171 | "k_sim = chi2_kernel(X[0].reshape(1,-1), X)" 172 | ] 173 | }, 174 | { 175 | "cell_type": "code", 176 | "execution_count": 11, 177 | "metadata": { 178 | "collapsed": false 179 | }, 180 | "outputs": [ 181 | { 182 | "data": { 183 | "text/plain": [ 184 | "array([[ 1.00000000e+000, 7.57695024e-116, 1.95599924e-105, ...,\n", 185 | " 1.29644889e-083, 2.49956726e-051, 1.10169569e-079]])" 186 | ] 187 | }, 188 | "execution_count": 11, 189 | "metadata": {}, 190 | "output_type": "execute_result" 191 | } 192 | ], 193 | "source": [ 194 | "k_sim" 195 | ] 196 | }, 197 | { 198 | "cell_type": "code", 199 | "execution_count": 12, 200 | "metadata": { 201 | "collapsed": true 202 | }, 203 | "outputs": [], 204 | "source": [ 205 | "kf = pd.DataFrame(k_sim).T\n", 206 | "kf.columns = ['similarity']" 207 | ] 208 | }, 209 | { 210 | "cell_type": "code", 211 | "execution_count": 13, 212 | "metadata": { 213 | "collapsed": false, 214 | "scrolled": true 215 | }, 216 | "outputs": [ 217 | { 218 | "data": { 219 | "text/html": [ 220 | "
\n", 221 | "\n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | " \n", 260 | " \n", 261 | " \n", 262 | " \n", 263 | " \n", 264 | " \n", 265 | " \n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | " \n", 273 | " \n", 274 | " \n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | " \n", 289 | " \n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | " \n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | " \n", 300 | " \n", 301 | " \n", 302 | " \n", 303 | " \n", 304 | " \n", 305 | " \n", 306 | " \n", 307 | " \n", 308 | " \n", 309 | " \n", 310 | " \n", 311 | " \n", 312 | " \n", 313 | " \n", 314 | " \n", 315 | " \n", 316 | " \n", 317 | " \n", 318 | " \n", 319 | " \n", 320 | " \n", 321 | " \n", 322 | " \n", 323 | " \n", 324 | " \n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | " \n", 372 | " \n", 373 | " \n", 374 | 
" \n", 375 | " \n", 376 | " \n", 377 | " \n", 378 | " \n", 379 | " \n", 380 | " \n", 381 | " \n", 382 | " \n", 383 | " \n", 384 | " \n", 385 | " \n", 386 | " \n", 387 | " \n", 388 | " \n", 389 | " \n", 390 | " \n", 391 | " \n", 392 | " \n", 393 | " \n", 394 | " \n", 395 | " \n", 396 | " \n", 397 | " \n", 398 | " \n", 399 | " \n", 400 | " \n", 401 | " \n", 402 | " \n", 403 | " \n", 404 | " \n", 405 | " \n", 406 | " \n", 407 | " \n", 408 | " \n", 409 | " \n", 410 | " \n", 411 | " \n", 412 | " \n", 413 | " \n", 414 | " \n", 415 | " \n", 416 | " \n", 417 | " \n", 418 | " \n", 419 | " \n", 420 | " \n", 421 | " \n", 422 | " \n", 423 | " \n", 424 | " \n", 425 | " \n", 426 | " \n", 427 | " \n", 428 | " \n", 429 | " \n", 430 | " \n", 431 | " \n", 432 | " \n", 433 | " \n", 434 | " \n", 435 | " \n", 436 | " \n", 437 | " \n", 438 | " \n", 439 | " \n", 440 | " \n", 441 | " \n", 442 | " \n", 443 | " \n", 444 | " \n", 445 | " \n", 446 | " \n", 447 | " \n", 448 | " \n", 449 | " \n", 450 | " \n", 451 | " \n", 452 | " \n", 453 | " \n", 454 | " \n", 455 | " \n", 456 | " \n", 457 | " \n", 458 | " \n", 459 | " \n", 460 | " \n", 461 | " \n", 462 | " \n", 463 | " \n", 464 | " \n", 465 | " \n", 466 | " \n", 467 | " \n", 468 | " \n", 469 | " \n", 470 | " \n", 471 | " \n", 472 | " \n", 473 | " \n", 474 | "
similarity
01.000000e+00
11671.644255e-07
8771.040593e-07
4641.232666e-08
15418.598399e-09
13658.274881e-09
10291.907361e-09
8551.487874e-10
16971.191874e-10
9571.870301e-11
14631.714631e-12
12361.528919e-13
6468.264444e-14
3357.758213e-14
8124.250581e-14
2762.589843e-14
3051.141329e-14
8061.101042e-14
3116.290698e-15
11285.081774e-15
14944.847325e-15
1664.115303e-15
5162.819098e-15
6421.384852e-15
2291.246194e-15
10021.177180e-15
6768.403675e-16
17457.786381e-16
3967.525816e-16
9416.983304e-16
......
5581.580173e-124
5371.045260e-124
17188.244835e-125
9873.545141e-125
8321.385241e-125
9471.010415e-125
15518.810739e-126
10004.658142e-126
16483.785987e-126
5723.661237e-126
9943.042558e-126
3122.429941e-126
9533.466028e-127
14371.789318e-127
9867.574662e-128
2155.942922e-128
16401.382383e-128
3365.279808e-129
13802.082809e-129
13294.867742e-131
5171.673149e-131
13347.601630e-132
16264.137893e-132
3412.398919e-132
6236.193922e-133
15851.176835e-133
9164.820881e-134
12131.319706e-134
16315.139275e-138
6092.381570e-138
\n", 475 | "

1797 rows × 1 columns

\n", 476 | "
" 477 | ], 478 | "text/plain": [ 479 | " similarity\n", 480 | "0 1.000000e+00\n", 481 | "1167 1.644255e-07\n", 482 | "877 1.040593e-07\n", 483 | "464 1.232666e-08\n", 484 | "1541 8.598399e-09\n", 485 | "1365 8.274881e-09\n", 486 | "1029 1.907361e-09\n", 487 | "855 1.487874e-10\n", 488 | "1697 1.191874e-10\n", 489 | "957 1.870301e-11\n", 490 | "1463 1.714631e-12\n", 491 | "1236 1.528919e-13\n", 492 | "646 8.264444e-14\n", 493 | "335 7.758213e-14\n", 494 | "812 4.250581e-14\n", 495 | "276 2.589843e-14\n", 496 | "305 1.141329e-14\n", 497 | "806 1.101042e-14\n", 498 | "311 6.290698e-15\n", 499 | "1128 5.081774e-15\n", 500 | "1494 4.847325e-15\n", 501 | "166 4.115303e-15\n", 502 | "516 2.819098e-15\n", 503 | "642 1.384852e-15\n", 504 | "229 1.246194e-15\n", 505 | "1002 1.177180e-15\n", 506 | "676 8.403675e-16\n", 507 | "1745 7.786381e-16\n", 508 | "396 7.525816e-16\n", 509 | "941 6.983304e-16\n", 510 | "... ...\n", 511 | "558 1.580173e-124\n", 512 | "537 1.045260e-124\n", 513 | "1718 8.244835e-125\n", 514 | "987 3.545141e-125\n", 515 | "832 1.385241e-125\n", 516 | "947 1.010415e-125\n", 517 | "1551 8.810739e-126\n", 518 | "1000 4.658142e-126\n", 519 | "1648 3.785987e-126\n", 520 | "572 3.661237e-126\n", 521 | "994 3.042558e-126\n", 522 | "312 2.429941e-126\n", 523 | "953 3.466028e-127\n", 524 | "1437 1.789318e-127\n", 525 | "986 7.574662e-128\n", 526 | "215 5.942922e-128\n", 527 | "1640 1.382383e-128\n", 528 | "336 5.279808e-129\n", 529 | "1380 2.082809e-129\n", 530 | "1329 4.867742e-131\n", 531 | "517 1.673149e-131\n", 532 | "1334 7.601630e-132\n", 533 | "1626 4.137893e-132\n", 534 | "341 2.398919e-132\n", 535 | "623 6.193922e-133\n", 536 | "1585 1.176835e-133\n", 537 | "916 4.820881e-134\n", 538 | "1213 1.319706e-134\n", 539 | "1631 5.139275e-138\n", 540 | "609 2.381570e-138\n", 541 | "\n", 542 | "[1797 rows x 1 columns]" 543 | ] 544 | }, 545 | "execution_count": 13, 546 | "metadata": {}, 547 | "output_type": "execute_result" 548 | } 549 | ], 550 | "source": [ 551 | 
"kf.sort_values('similarity', ascending=False)" 552 | ] 553 | }, 554 | { 555 | "cell_type": "code", 556 | "execution_count": 14, 557 | "metadata": { 558 | "collapsed": false 559 | }, 560 | "outputs": [ 561 | { 562 | "data": { 563 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAO0AAADtCAYAAABTTfKPAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAABM5JREFUeJzt3bFx1GoUgNHVmxcQUgCBS9iAgAyXsCU4JKQE0wEhISVA\nBw7JcEABLsGhM5G8gHmDjVfj/dEnnxOyXq4x+kYrkOZO8zzvgI5//vY3ABxHtBAjWogRLcSIFmJE\nCzH/PvTiNE3+Pwj+knmep9/9+oPR/vfGo4ddXl7uLi8vj37fUkvnXV9fL5r36dOn3bt3745+3+Fw\nWDTv9vZ29/Lly2Hzvn37tnvz5s3R77u4uFg0b+nPc7/fL5o38vhcOmuaftvrbrfz8RhyRAsxJ4n2\n/Pz8FL/taua9fv166LwXL14Mnffq1auh80b/PEceL6eYNT10zTpN07zle5OXXtMutfQaszJv6TXt\nUkuvaQumabr3H6J8PIYY0UKMaCFGtBAjWogRLcSIFmJECzF/fGDg15udz8/Ph999BM/B1dXV7urq\n6lFf646ogdwR9bTcEQUkiBZiRAsxooUY0UKMaCFGtBAjWogRLcSIFmJECzF/fGBgy0ZuQdjtdou2\nBJTmjb7X+bE32D+Vs7OzofPu40wLMaKFGNFCjGghRrQQI1qIES3EiBZiRAsxooUY0UKMaCFGtBBj\nLQiswDFrQY6KFjiN/58QP3z4cO/X+ngMMaKFGNFCjGghRrQQI1qIES3EiBZiRAsxooUY0UKMaCFm\nVbt8bm5uhs77+vXr0Hnfv38fOm+/3w+dd3t7O3Te6OPFLh9gEdFCjGghRrQQI1qIES3EiBZiRAsx\nooUY0UKMaCFGtBAjWoixywdWwC4fiLHLBzZMtBAjWogRLcSIFmJECzGihRjRQoxoIUa0ECNaiBEt\nxKxql8/19fXQeW/fvh06b/RundEOh8PQeY99KuaprOUJN2daiBEtxIgWYkQLMaKFGNFCjGghRrQQ\nI1qIES3EiBZiRAsxooUYu3xgBezygRi7fGDDRAsxooUY0UKMaCFGtBAjWogRLcSIFmJECzGihRjR\nQsyz3uXjiaWnNfrv77lypoUY0UKMaCFGtBAjWogRLcSIFmJECzGihRjRQoxoIUa0ECNaiLHLB1bA\nLh+IscsHNky0ECNaiBEtxIgWYkQLMaKFGNFCjGghRrQQI1qIES3ErGqXz36/Hzrv8+fPQ+eNdnNz\nM3Te6IdLHvtUzNY400KMaCFGtBAjWogRLcSIFmJECzGihRjRQoxoIUa0ECNaiBEtxNjlAytglw/E\n2OUDGyZaiBEtxIgWYkQLMaKFGNFCjGghRrQQI1qIES3EiBZipnme739xmuaHXq87OzsbOu/jx49D\n541+2OPi4mLovPfv3w+dN9I0Tbt5nqffveZMCzGihRjRQoxoIUa0ECNaiBEtxIgWYkQLMaKFGNFC\njGghRrQQY5cPrIBdPhBjlw9smGghRrQQI1qIES3EiBZiRAsxooUY0UKMaCFGtBAjWoh51rt8vnz5\nMnTe6N0zo3cHHQ6HofO2zC4f2BDRQoxoIUa0ECNaiBEtxIgWYkQLMaKFGNFCjGghRrQQI1qIscsH\nVsAuH4ixywc2TLQQI1qIES3EiBZiRAsxooUY0ULMSaJ97J0d1Xk/fvwYOu/u7m7ovNF/vi0fL6eY\nJdoFRPu0tny8ZKIFTke0EPPHXT4DvxfgF/ft8n
kwWmB9fDyGGNFCjGghRrQQI1qI+QlR1yOA76Xr\nFAAAAABJRU5ErkJggg==\n", 564 | "text/plain": [ 565 | "" 566 | ] 567 | }, 568 | "metadata": {}, 569 | "output_type": "display_data" 570 | } 571 | ], 572 | "source": [ 573 | "display_img(1167)" 574 | ] 575 | }, 576 | { 577 | "cell_type": "code", 578 | "execution_count": 15, 579 | "metadata": { 580 | "collapsed": false 581 | }, 582 | "outputs": [], 583 | "source": [ 584 | "co_sim = cosine_similarity(X[0].reshape(1,-1), X)" 585 | ] 586 | }, 587 | { 588 | "cell_type": "code", 589 | "execution_count": 16, 590 | "metadata": { 591 | "collapsed": false 592 | }, 593 | "outputs": [], 594 | "source": [ 595 | "cosf = pd.DataFrame(co_sim).T\n", 596 | "cosf.columns = ['similarity']" 597 | ] 598 | }, 599 | { 600 | "cell_type": "code", 601 | "execution_count": 17, 602 | "metadata": { 603 | "collapsed": false, 604 | "scrolled": true 605 | }, 606 | "outputs": [ 607 | { 608 | "data": { 609 | "text/html": [ 610 | "
\n", 611 | "\n", 612 | " \n", 613 | " \n", 614 | " \n", 615 | " \n", 616 | " \n", 617 | " \n", 618 | " \n", 619 | " \n", 620 | " \n", 621 | " \n", 622 | " \n", 623 | " \n", 624 | " \n", 625 | " \n", 626 | " \n", 627 | " \n", 628 | " \n", 629 | " \n", 630 | " \n", 631 | " \n", 632 | " \n", 633 | " \n", 634 | " \n", 635 | " \n", 636 | " \n", 637 | " \n", 638 | " \n", 639 | " \n", 640 | " \n", 641 | " \n", 642 | " \n", 643 | " \n", 644 | " \n", 645 | " \n", 646 | " \n", 647 | " \n", 648 | " \n", 649 | " \n", 650 | " \n", 651 | " \n", 652 | " \n", 653 | " \n", 654 | " \n", 655 | " \n", 656 | " \n", 657 | " \n", 658 | " \n", 659 | " \n", 660 | " \n", 661 | " \n", 662 | " \n", 663 | " \n", 664 | " \n", 665 | " \n", 666 | " \n", 667 | " \n", 668 | " \n", 669 | " \n", 670 | " \n", 671 | " \n", 672 | " \n", 673 | " \n", 674 | " \n", 675 | " \n", 676 | " \n", 677 | " \n", 678 | " \n", 679 | " \n", 680 | " \n", 681 | " \n", 682 | " \n", 683 | " \n", 684 | " \n", 685 | " \n", 686 | " \n", 687 | " \n", 688 | " \n", 689 | " \n", 690 | " \n", 691 | " \n", 692 | " \n", 693 | " \n", 694 | " \n", 695 | " \n", 696 | " \n", 697 | " \n", 698 | " \n", 699 | " \n", 700 | " \n", 701 | " \n", 702 | " \n", 703 | " \n", 704 | " \n", 705 | " \n", 706 | " \n", 707 | " \n", 708 | " \n", 709 | " \n", 710 | " \n", 711 | " \n", 712 | " \n", 713 | " \n", 714 | " \n", 715 | " \n", 716 | " \n", 717 | " \n", 718 | " \n", 719 | " \n", 720 | " \n", 721 | " \n", 722 | " \n", 723 | " \n", 724 | " \n", 725 | " \n", 726 | " \n", 727 | " \n", 728 | " \n", 729 | " \n", 730 | " \n", 731 | " \n", 732 | " \n", 733 | " \n", 734 | " \n", 735 | " \n", 736 | " \n", 737 | " \n", 738 | " \n", 739 | " \n", 740 | " \n", 741 | " \n", 742 | " \n", 743 | " \n", 744 | " \n", 745 | " \n", 746 | " \n", 747 | " \n", 748 | " \n", 749 | " \n", 750 | " \n", 751 | " \n", 752 | " \n", 753 | " \n", 754 | " \n", 755 | " \n", 756 | " \n", 757 | " \n", 758 | " \n", 759 | " \n", 760 | " \n", 761 | " \n", 762 | " \n", 763 | " \n", 764 | 
" \n", 765 | " \n", 766 | " \n", 767 | " \n", 768 | " \n", 769 | " \n", 770 | " \n", 771 | " \n", 772 | " \n", 773 | " \n", 774 | " \n", 775 | " \n", 776 | " \n", 777 | " \n", 778 | " \n", 779 | " \n", 780 | " \n", 781 | " \n", 782 | " \n", 783 | " \n", 784 | " \n", 785 | " \n", 786 | " \n", 787 | " \n", 788 | " \n", 789 | " \n", 790 | " \n", 791 | " \n", 792 | " \n", 793 | " \n", 794 | " \n", 795 | " \n", 796 | " \n", 797 | " \n", 798 | " \n", 799 | " \n", 800 | " \n", 801 | " \n", 802 | " \n", 803 | " \n", 804 | " \n", 805 | " \n", 806 | " \n", 807 | " \n", 808 | " \n", 809 | " \n", 810 | " \n", 811 | " \n", 812 | " \n", 813 | " \n", 814 | " \n", 815 | " \n", 816 | " \n", 817 | " \n", 818 | " \n", 819 | " \n", 820 | " \n", 821 | " \n", 822 | " \n", 823 | " \n", 824 | " \n", 825 | " \n", 826 | " \n", 827 | " \n", 828 | " \n", 829 | " \n", 830 | " \n", 831 | " \n", 832 | " \n", 833 | " \n", 834 | " \n", 835 | " \n", 836 | " \n", 837 | " \n", 838 | " \n", 839 | " \n", 840 | " \n", 841 | " \n", 842 | " \n", 843 | " \n", 844 | " \n", 845 | " \n", 846 | " \n", 847 | " \n", 848 | " \n", 849 | " \n", 850 | " \n", 851 | " \n", 852 | " \n", 853 | " \n", 854 | " \n", 855 | " \n", 856 | " \n", 857 | " \n", 858 | " \n", 859 | " \n", 860 | " \n", 861 | " \n", 862 | " \n", 863 | " \n", 864 | "
similarity
01.000000
8770.980739
4640.974474
13650.974188
15410.971831
11670.971130
10290.970858
3960.968793
16970.966019
6460.965490
13420.963990
1600.961824
9570.960468
3350.959937
14630.958401
8550.958079
2290.957180
6420.956975
6820.956633
8120.954502
2760.953733
3110.953675
7250.953565
300.953453
6660.952949
5160.952674
3050.952255
7240.951774
14940.951671
4580.951614
......
13720.486735
16130.486612
5170.485804
9720.484865
10000.479740
13570.479682
6230.476078
2150.473342
1070.472412
16340.471961
7770.469736
850.466485
9940.466440
15900.463689
13800.456986
6090.456557
13770.449912
2670.449633
3360.446742
16210.442270
15510.440442
3410.430761
16480.426239
13290.425764
16400.420540
13340.420014
15850.402730
12130.393677
16310.368377
16260.361120
\n", 865 | "

1797 rows × 1 columns

\n", 866 | "
" 867 | ], 868 | "text/plain": [ 869 | " similarity\n", 870 | "0 1.000000\n", 871 | "877 0.980739\n", 872 | "464 0.974474\n", 873 | "1365 0.974188\n", 874 | "1541 0.971831\n", 875 | "1167 0.971130\n", 876 | "1029 0.970858\n", 877 | "396 0.968793\n", 878 | "1697 0.966019\n", 879 | "646 0.965490\n", 880 | "1342 0.963990\n", 881 | "160 0.961824\n", 882 | "957 0.960468\n", 883 | "335 0.959937\n", 884 | "1463 0.958401\n", 885 | "855 0.958079\n", 886 | "229 0.957180\n", 887 | "642 0.956975\n", 888 | "682 0.956633\n", 889 | "812 0.954502\n", 890 | "276 0.953733\n", 891 | "311 0.953675\n", 892 | "725 0.953565\n", 893 | "30 0.953453\n", 894 | "666 0.952949\n", 895 | "516 0.952674\n", 896 | "305 0.952255\n", 897 | "724 0.951774\n", 898 | "1494 0.951671\n", 899 | "458 0.951614\n", 900 | "... ...\n", 901 | "1372 0.486735\n", 902 | "1613 0.486612\n", 903 | "517 0.485804\n", 904 | "972 0.484865\n", 905 | "1000 0.479740\n", 906 | "1357 0.479682\n", 907 | "623 0.476078\n", 908 | "215 0.473342\n", 909 | "107 0.472412\n", 910 | "1634 0.471961\n", 911 | "777 0.469736\n", 912 | "85 0.466485\n", 913 | "994 0.466440\n", 914 | "1590 0.463689\n", 915 | "1380 0.456986\n", 916 | "609 0.456557\n", 917 | "1377 0.449912\n", 918 | "267 0.449633\n", 919 | "336 0.446742\n", 920 | "1621 0.442270\n", 921 | "1551 0.440442\n", 922 | "341 0.430761\n", 923 | "1648 0.426239\n", 924 | "1329 0.425764\n", 925 | "1640 0.420540\n", 926 | "1334 0.420014\n", 927 | "1585 0.402730\n", 928 | "1213 0.393677\n", 929 | "1631 0.368377\n", 930 | "1626 0.361120\n", 931 | "\n", 932 | "[1797 rows x 1 columns]" 933 | ] 934 | }, 935 | "execution_count": 17, 936 | "metadata": {}, 937 | "output_type": "execute_result" 938 | } 939 | ], 940 | "source": [ 941 | "cosf.sort_values('similarity', ascending=False)" 942 | ] 943 | }, 944 | { 945 | "cell_type": "code", 946 | "execution_count": 18, 947 | "metadata": { 948 | "collapsed": false 949 | }, 950 | "outputs": [ 951 | { 952 | "data": { 953 | "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAO0AAADtCAYAAABTTfKPAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAABMdJREFUeJzt3bFRG2kYgOHdmwsdOHRIQAGUQOiQDqAElQAduASXQOhQ\nBTigBEKHInO2F9wFNx7Algb93ld+nhBp50MavbM7g5ZvXpZlAjr++t2/ALAf0UKMaCFGtBAjWogR\nLcT8/dqD8zz7exD8JsuyzM/9/NVo/ztw72G3t7fT7e3t3scd6tB5nz59Omjely9fpo8fP+593OfP\nnw+a9+3bt+nDhw97H3d1dXXQvO12O11eXu593CHHTNO/78vNzc2weSM/n4fOmudne52myeUx5IgW\nYo4S7aGXLZV55+fnQ+e9e/du6Lyzs7Oh8y4uLobOG/l5OcYs0R5AtG9LtPtxeQwxooUY0UKMaCFG\ntBAjWogRLcSIFmJ+esPA/7/sfHl5OfyLDPAn2G6303a7/aXn7hUtcBw/nhDv7u5efK7LY4gRLcSI\nFmJECzGihRjRQoxoIUa0ECNaiBEtxIgWYubXNgjM87yM3BS/2+2GzZqm8f91cPTNFqNf36EbFA71\n8PAwdN7I93Oe5xfXgjjTQoxoIUa0ECNaiBEtxIgWYkQLMaKFGNFCjGghRrQQI1qIES3EWAsCK7DP\nWhC35g3k1ry35dY8IEG0ECNaiBEtxIgWYkQLMaKFGNFCjGghRrQQI1qIES3E/PQun5FGfwH86elp\n6Lyrq6uh825ubobOe//+/dB59/f3Q+dtNpuh817iTAsxooUY0UKMaCFGtBAjWogRLcSIFmJECzGi\nhRjRQoxoIUa0EGOXD6zAPrt89ooWOI4fT4h3d3cvPtflMcSIFmJECzGihRjRQoxoIUa0ECNaiBEt\nxIgWYkQLMaKFmFXt8hnt+vp66LzRu3VGG73LZ7fbDZ23Fs60ECNaiBEtxIgWYkQLMaKFGNFCjGgh\nRrQQI1qIES3EiBZiRAsxdvnACtjlAzF2+cAJEy3EiBZiRAsxooUY0UKMaCFGtBAjWogRLcSIFmJE\nCzF/9C4f3tafultnNGdaiBEtxIgWYkQLMaKFGNFCjGghRrQQI1qIES3EiBZiRAsxooUYu3xgBezy\ngRi7fOCEiRZiRAsxooUY0UKMaCFGtBAjWogRLcSIFmJECzGihZhV7fIZvQvm8fFx6LxT96t3qbyV\nzWYzdN5aONNCjGghRrQQI1qIES3EiBZiRAsxooUY0UKMaCFGtBAjWogRLcTY5QMrYJcPxNjlAydM\ntBAjWogRLcSIFmJECzGihRjRQoxoIUa0ECNaiBEtxKxql8/Z2dnQeQ8PD0Pn3d/fD503erfO6N1I\nFxcXQ+ethTMtxIgWYkQLMaKFGNFCjGghRrQQI1qIES3EiBZiRAsxooUY0UKMXT6wAnb5QIxdPnDC\nRAsxooUY0UKMaCFGtBAjWogRLcSIFmJECzGihRjRQsyqdvmM3s2y2WxOet7o3UijdxWNfn1r4UwL\nMaKFGNFCjGghRrQQI1qIES3EiBZiRAsxooUY0UKMaCFGtBBjlw+sgF0+EGOXD5ww0UKMaCFGtBAj\nWogRLcSIFmJECzFHifZXv9lRnff4+Dh03vfv34fO2+12Q+d9/fp16LyRn5djzBLtAUT7tkS7H5fH\nECNaiJmXZXn5wXl++UHgqJZlmZ/7+avRAuvj8hhiRAsxooUY0UKMaCHmH68rElEe1RxYAAAAAElF\nTkSuQmCC\n", 954 | "text/plain": [ 955 | "" 956 | ] 957 | }, 958 | "metadata": {}, 959 | "output_type": "display_data" 960 | } 961 | ], 962 | "source": [ 963 | "display_img(877)" 964 | ] 965 | }, 966 | { 
967 | "cell_type": "code", 968 | "execution_count": 19, 969 | "metadata": { 970 | "collapsed": false 971 | }, 972 | "outputs": [ 973 | { 974 | "data": { 975 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAO0AAADtCAYAAABTTfKPAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAABIJJREFUeJzt3c1NW1sUgFHfpzTgKTOgBRfAhBZogTZogxZogQkF0IIZ\nMsQl+E3eINIj/Fj4+H43aw2TOPtK1ifj5Bztab/fr4COf079AMD3iBZiRAsxooUY0UKMaCHm10e/\nOU2T/w+CE9nv99N7v/5htP+98NvD7u7uVnd3d99+3aEq866vrw+at91uV5eXl99+3cXFxUHznp+f\nV5vN5tuvu7+/P2he5f0bOWua3u11tVr58RhyRAsxR4n26urqGH/tXztvvV4PnXd2djZ03pLfv2PM\nmj76zjpN097Z5J9z6HfaQx36nfZQh36n5f+mafrjP0T58RhiRAsxooUY0UKMaCFGtBAjWogRLcR8\nemHg98POV1dXw0+vwN/g6elp9fT09KU/60TUQE5E8VVORMGCiBZiRAsxooUY0UKMaCFGtBAjWogR\nLcSIFmJECzHOHg80+uzx8/Pz0Hm73W7ovCVz9hgWRLQQI1qIES3EiBZiRAsxooUY0UKMaCFGtBAj\nWogRLcSIFmKsBYEZsBZkplzN46tczYMFES3EiBZiRAsxooUY0UKMaCFGtBAjWogRLcSIFmJECzGf\n3vKha71en/oROAKftBAjWogRLcSIFmJECzGihRjRQoxoIUa0ECNaiBEtxIgWYkQLMXb5wAzY5TNT\no3f5bLfbofNeXl6Gzlsyu3xgQUQLMaKFGNFCjGghRrQQI1qIES3EiBZiRAsxooUY0UKMCwMDTdO7\n57+P5vz8fOg8FwZ+jgsDsCCihRjRQoxoIUa0ECNaiBEtxIgWYkQLMaKFGNFCjGghRrQQY5cPzIBd\nPjPlah5f5WoeLIhoIUa0ECNaiBEtxIgWYkQLMaKFGNFCjGghRrQQI1qI+fSWDz9n9AF+lsknLcSI\nFmJECzGihRjRQoxoIUa0ECNaiBEtxIgWYkQLMaKFGNFCjF0+MAN2+czUxcXFqR/hqOzy+Tl2+cCC\niBZiRAsxooUY0UKMaCFGtBAjWogRLcSIFmJECzGihRi7fBbs+vr61I/AEfikhRjRQoxoIUa0ECNa\niBEtxIgWYkQLMaKFGNFCjGghRrQQI1qIscsHZsAun5kavctn9NW8+/v7ofOWzC4fWBDRQoxoIUa0\nECNaiBEtxIgWYkQLMaKFGNFCjGghRrQQY5fPQJvNZui83W43dB5j+KSFGNFCjGghRrQQI1qIES3E\niBZiRAsxooUY0UKMaCFGtBAjWoixywdmwC6fmbq5uTn1IxzVw8PDqR9hMezygQURLcSIFmJECzGi\nhRjRQoxoIUa0ECNaiBEtxIgWYkQLMXb5DLRer4fOe3l5GTqPMXzSQoxoIUa0ECNaiBEtxIgWYkQL\nMaKFGNFCjGghRrQQI1qIES3E2OUDM2CXz0zd3t4OnTf6at7j4+PQeUtmlw8siGghRrQQI1qIES3E\niBZiRAsxooWYo0T71ZMd5n3N6+vr0Hlvb29D5y35/TvGLNEG5o2OdrfbDZ235PcvEy1wPKKFmE8v\nDAx8FuA3f7ow8GG0wPz48RhiRAsxooUY0UKMaCHmX1ydOmft43HTAAAAAElFTkSuQmCC\n", 976 | "text/plain": [ 977 | "" 978 | ] 979 | }, 980 | "metadata": {}, 981 | "output_type": "display_data" 982 | } 
983 | ], 984 | "source": [ 985 | "display_img(1626)" 986 | ] 987 | }, 988 | { 989 | "cell_type": "code", 990 | "execution_count": null, 991 | "metadata": { 992 | "collapsed": true 993 | }, 994 | "outputs": [], 995 | "source": [] 996 | } 997 | ], 998 | "metadata": { 999 | "kernelspec": { 1000 | "display_name": "Python 3", 1001 | "language": "python", 1002 | "name": "python3" 1003 | }, 1004 | "language_info": { 1005 | "codemirror_mode": { 1006 | "name": "ipython", 1007 | "version": 3 1008 | }, 1009 | "file_extension": ".py", 1010 | "mimetype": "text/x-python", 1011 | "name": "python", 1012 | "nbconvert_exporter": "python", 1013 | "pygments_lexer": "ipython3", 1014 | "version": "3.5.0" 1015 | } 1016 | }, 1017 | "nbformat": 4, 1018 | "nbformat_minor": 0 1019 | } 1020 | -------------------------------------------------------------------------------- /Chapter 09/run_flask.py: -------------------------------------------------------------------------------- 1 | from flask import Flask, request, redirect 2 | import twilio.twiml 3 | import pandas as pd 4 | import re 5 | from sklearn.feature_extraction.text import TfidfVectorizer 6 | from sklearn.metrics.pairwise import cosine_similarity 7 | 8 | app = Flask(__name__) 9 | 10 | PATH_TO_CSV = 'path/to/file.csv' 11 | df = pd.read_csv(PATH_TO_CSV) 12 | 13 | convo = df.iloc[:,0] 14 | 15 | clist = [] 16 | def qa_pairs(x): 17 | cpairs = re.findall(": (.*?)(?:$|\n)", x) 18 | clist.extend(list(zip(cpairs, cpairs[1:]))) 19 | 20 | convo.map(qa_pairs); 21 | 22 | convo_frame = pd.Series(dict(clist)).to_frame().reset_index() 23 | convo_frame.columns = ['q', 'a'] 24 | 25 | vectorizer = TfidfVectorizer(ngram_range=(1,3)) 26 | vec = vectorizer.fit_transform(convo_frame['q']) 27 | 28 | @app.route("/", methods=['GET', 'POST']) 29 | def get_response(): 30 | input_str = request.values.get('Body') 31 | 32 | def get_response(q): 33 | my_q = vectorizer.transform([input_str]) 34 | cs = cosine_similarity(my_q, vec) 35 | rs = 
pd.Series(cs[0]).sort_values(ascending=0) 36 | rsi = rs.index[0] 37 | return convo_frame.iloc[rsi]['a'] 38 | 39 | resp = twilio.twiml.Response() 40 | if input_str: 41 | resp.message(get_response(input_str)) 42 | return str(resp) 43 | else: 44 | resp.message('Something bad happened here.') 45 | return str(resp) -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2016 Packt 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | # Python Machine Learning Blueprints 5 | 6 | This is the code repository for [Python Machine Learning Blueprints](https://www.packtpub.com/big-data-and-business-intelligence/python-machine-learning-blueprints?utm_source=github&utm_medium=repository&utm_campaign=9781784394752), published by Packt. It contains all the supporting project files necessary to work through the book from start to finish. 7 | 8 | ## Instructions and Navigation: 9 | 10 | All of the code is organized into folders. Each folder starts with a number followed by the application name. For example, `Chapter02`. 11 | 12 | To install the software for code testing, kindly refer to the Software and Hardware Requirement section. To run the code given in the code bundle, follow the steps/instructions given in the book. 13 | 14 | ### Software and Hardware Requirement: 15 | 16 | * Software: 17 | 1. All(example): [Anaconda, Python 3.5-Free](https://www.continuum.io/downloads) 18 | 2. Chapter 3: [PhantomJS-Free](http://phantomjs.org/download.html) 19 | 20 | * Hardware: 21 | 1. ~350MB of disk space. 22 | 2. 16-23MB of disk space. 
23 | 24 | * OS: Mac OS X, Windows, or Linux 25 | 26 | 27 | ## Related Python books and videos: 28 | 29 | * [Designing Machine Learning Systems with Python](https://www.packtpub.com/big-data-and-business-intelligence/designing-machine-learning-systems-python?utm_source=github&utm_medium=repository&utm_campaign=9781785882951) 30 | * [Functional Python Programming](https://www.packtpub.com/application-development/functional-python-programming?utm_source=github&utm_medium=repository&utm_campaign=9781784396992) 31 | * [Beginning Python [Video]](https://www.packtpub.com/application-development/beginning-python-video?utm_source=github&utm_medium=repository&utm_campaign=9781786468994) 32 | 33 | 34 | 35 | 36 | --------------------------------------------------------------------------------