├── Classification.ipynb ├── WebScrapingYoutube.ipynb └── df_new.csv /Classification.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 350, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import numpy as np\n", 10 | "import math\n", 11 | "import seaborn as sns\n", 12 | "from sklearn import svm\n", 13 | "import matplotlib.pyplot as plt\n", 14 | "import pandas as pd\n", 15 | "from sklearn.model_selection import train_test_split\n", 16 | "from sklearn.preprocessing import StandardScaler\n", 17 | "from sklearn.model_selection import *\n", 18 | "from sklearn.preprocessing import MinMaxScaler\n", 19 | "from sklearn import linear_model\n", 20 | "from sklearn.metrics import *" 21 | ] 22 | }, 23 | { 24 | "cell_type": "markdown", 25 | "metadata": {}, 26 | "source": [ 27 | "# Loading the dataset " 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": 229, 33 | "metadata": {}, 34 | "outputs": [], 35 | "source": [ 36 | "dataset = pd.read_csv('/home/shubhamsingh/Desktop/df_orignal.csv')" 37 | ] 38 | }, 39 | { 40 | "cell_type": "markdown", 41 | "metadata": {}, 42 | "source": [ 43 | "# Getting all the features separately " 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": null, 49 | "metadata": {}, 50 | "outputs": [], 51 | "source": [ 52 | "dflink = dataset[['links']]  # double brackets: each feature stays a one-column DataFrame\n", 53 | "df_title = dataset[['title']]\n", 54 | "df_description = dataset[['description']]\n", 55 | "dfcategory = dataset[['category']]" 56 | ] 57 | }, 58 | { 59 | "cell_type": "markdown", 60 | "metadata": {}, 61 | "source": [ 62 | "# Importing libraries for data cleaning " 63 | ] 64 | }, 65 | { 66 | "cell_type": "code", 67 | "execution_count": 232, 68 | "metadata": {}, 69 | "outputs": [ 70 | { 71 | "name": "stderr", 72 | "output_type": "stream", 73 | "text": [ 74 | "[nltk_data] Downloading package stopwords to\n", 75 | "[nltk_data] 
/home/shubhamsingh/nltk_data...\n", 76 | "[nltk_data] Package stopwords is already up-to-date!\n" 77 | ] 78 | } 79 | ], 80 | "source": [ 81 | "import re\n", 82 | "import nltk\n", 83 | "nltk.download('stopwords')\n", 84 | "from nltk.corpus import stopwords\n", 85 | "from nltk.stem.porter import PorterStemmer" 86 | ] 87 | }, 88 | { 89 | "cell_type": "markdown", 90 | "metadata": {}, 91 | "source": [ 92 | "# Cleaning the data and storing it into a list " 93 | ] 94 | }, 95 | { 96 | "cell_type": "code", 97 | "execution_count": 233, 98 | "metadata": {}, 99 | "outputs": [], 100 | "source": [ 101 | "# Clean the titles: keep letters only, lowercase, drop English stopwords, stem what is left.\n", 102 | "stop_words = set(stopwords.words('english'))  # build the set once instead of once per word\n", 103 | "ps = PorterStemmer()\n", 104 | "\n", 105 | "def clean_text(text):\n", 106 | "    \"\"\"Return `text` as a space-joined string of lowercase, stemmed, non-stopword words.\"\"\"\n", 107 | "    words = re.sub('[^a-zA-Z]', ' ', text).lower().split()\n", 108 | "    return ' '.join(ps.stem(word) for word in words if word not in stop_words)\n", 109 | "\n", 110 | "corpus = [clean_text(title) for title in dataset['title']]" 111 | ] 112 | }, 113 | { 114 | "cell_type": "code", 115 | "execution_count": 234, 116 | "metadata": {}, 117 | "outputs": [], 118 | "source": [ 119 | "# Same cleaning for the descriptions; corpus1[i] and corpus[i] both describe dataset row i.\n", 120 | "corpus1 = [clean_text(description) for description in dataset['description']]" 128 | ] 129 | }, 130 | { 131 | "cell_type": "markdown", 132 | "metadata": {}, 133 | "source": [ 134 | "# Creating dataframes from the lists " 135 | ] 136 | }, 137 | { 138 | "cell_type": "code", 139 | "execution_count": 237, 140 | "metadata": {}, 141 | "outputs": [], 142 | "source": [ 143 | "dftitle = pd.DataFrame({'title':corpus})\n", 144 
| "dfdescription = pd.DataFrame({'description':corpus1})" 145 | ] 146 | }, 147 | { 148 | "cell_type": "markdown", 149 | "metadata": {}, 150 | "source": [ 151 | "# Performing label encoding on the category feature" 152 | ] 153 | }, 154 | { 155 | "cell_type": "code", 156 | "execution_count": 355, 157 | "metadata": {}, 158 | "outputs": [], 159 | "source": [ 160 | "from sklearn.preprocessing import LabelEncoder " 161 | ] 162 | }, 163 | { 164 | "cell_type": "code", 165 | "execution_count": 247, 166 | "metadata": {}, 167 | "outputs": [], 168 | "source": [ 169 | "dfcategory1 = dataset[['category']].apply(LabelEncoder().fit_transform)  # one-column DataFrame of integer codes" 170 | ] 171 | }, 172 | { 173 | "cell_type": "markdown", 174 | "metadata": {}, 175 | "source": [ 176 | "# Creating a new dataset after cleaning the data and label encoding the categories" 177 | ] 178 | }, 179 | { 180 | "cell_type": "code", 181 | "execution_count": 294, 182 | "metadata": {}, 183 | "outputs": [], 184 | "source": [ 185 | "df_new = pd.concat([dataset[['links']], dftitle, dfdescription, dfcategory1], axis=1).reindex(dataset.index)  # reindex replaces join_axes, which pandas 1.0 removed" 186 | ] 187 | }, 188 | { 189 | "cell_type": "code", 190 | "execution_count": 296, 191 | "metadata": {}, 192 | "outputs": [], 193 | "source": [ 194 | "df_new.to_csv(\"/home/shubhamsingh/Desktop/df_new.csv\", encoding='utf-8', index=False)" 195 | ] 196 | }, 197 | { 198 | "cell_type": "code", 199 | "execution_count": 295, 200 | "metadata": { 201 | "scrolled": true 202 | }, 203 | "outputs": [ 204 | { 205 | "data": { 206 | "text/html": [ 207 | "
\n", 208 | "\n", 221 | "\n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | " \n", 260 | " \n", 261 | " \n", 262 | " \n", 263 | " \n", 264 | " \n", 265 | " \n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | " \n", 273 | " \n", 274 | " \n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | " \n", 289 | " \n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | " \n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | " \n", 300 | " \n", 301 | " \n", 302 | " \n", 303 | " \n", 304 | " \n", 305 | " \n", 306 | " \n", 307 | " \n", 308 | " \n", 309 | " \n", 310 | " \n", 311 | " \n", 312 | " \n", 313 | " \n", 314 | " \n", 315 | " \n", 316 | " \n", 317 | " \n", 318 | " \n", 319 | " \n", 320 | " \n", 321 | " \n", 322 | " \n", 323 | " \n", 324 | " \n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | " \n", 372 | " \n", 373 | " 
\n", 374 | " \n", 375 | " \n", 376 | " \n", 377 | " \n", 378 | " \n", 379 | " \n", 380 | " \n", 381 | " \n", 382 | " \n", 383 | " \n", 384 | " \n", 385 | " \n", 386 | " \n", 387 | " \n", 388 | " \n", 389 | " \n", 390 | " \n", 391 | " \n", 392 | " \n", 393 | " \n", 394 | " \n", 395 | " \n", 396 | " \n", 397 | " \n", 398 | " \n", 399 | " \n", 400 | " \n", 401 | " \n", 402 | " \n", 403 | " \n", 404 | " \n", 405 | " \n", 406 | " \n", 407 | " \n", 408 | " \n", 409 | " \n", 410 | " \n", 411 | " \n", 412 | " \n", 413 | " \n", 414 | " \n", 415 | " \n", 416 | " \n", 417 | " \n", 418 | " \n", 419 | " \n", 420 | " \n", 421 | " \n", 422 | " \n", 423 | " \n", 424 | " \n", 425 | " \n", 426 | " \n", 427 | " \n", 428 | " \n", 429 | " \n", 430 | " \n", 431 | " \n", 432 | " \n", 433 | " \n", 434 | " \n", 435 | " \n", 436 | " \n", 437 | " \n", 438 | " \n", 439 | " \n", 440 | " \n", 441 | " \n", 442 | " \n", 443 | " \n", 444 | " \n", 445 | " \n", 446 | " \n", 447 | " \n", 448 | " \n", 449 | " \n", 450 | " \n", 451 | " \n", 452 | " \n", 453 | " \n", 454 | " \n", 455 | " \n", 456 | " \n", 457 | " \n", 458 | " \n", 459 | " \n", 460 | " \n", 461 | " \n", 462 | " \n", 463 | " \n", 464 | " \n", 465 | " \n", 466 | " \n", 467 | " \n", 468 | " \n", 469 | " \n", 470 | " \n", 471 | " \n", 472 | " \n", 473 | " \n", 474 | " \n", 475 | " \n", 476 | " \n", 477 | " \n", 478 | " \n", 479 | " \n", 480 | " \n", 481 | " \n", 482 | " \n", 483 | " \n", 484 | " \n", 485 | " \n", 486 | " \n", 487 | " \n", 488 | " \n", 489 | " \n", 490 | " \n", 491 | " \n", 492 | " \n", 493 | " \n", 494 | " \n", 495 | " \n", 496 | " \n", 497 | " \n", 498 | " \n", 499 | " \n", 500 | " \n", 501 | " \n", 502 | " \n", 503 | " \n", 504 | " \n", 505 | " \n", 506 | " \n", 507 | " \n", 508 | " \n", 509 | " \n", 510 | " \n", 511 | " \n", 512 | " \n", 513 | " \n", 514 | " \n", 515 | " \n", 516 | " \n", 517 | " \n", 518 | " \n", 519 | " \n", 520 | " \n", 521 | " \n", 522 | " \n", 523 | " \n", 524 | " \n", 525 | " \n", 526 | " \n", 527 | 
" \n", 528 | " \n", 529 | " \n", 530 | " \n", 531 | " \n", 532 | " \n", 533 | " \n", 534 | " \n", 535 | " \n", 536 | " \n", 537 | " \n", 538 | " \n", 539 | " \n", 540 | " \n", 541 | " \n", 542 | " \n", 543 | " \n", 544 | " \n", 545 | " \n", 546 | " \n", 547 | " \n", 548 | " \n", 549 | " \n", 550 | " \n", 551 | " \n", 552 | " \n", 553 | " \n", 554 | " \n", 555 | " \n", 556 | " \n", 557 | " \n", 558 | " \n", 559 | " \n", 560 | " \n", 561 | " \n", 562 | " \n", 563 | " \n", 564 | " \n", 565 | " \n", 566 | " \n", 567 | " \n", 568 | " \n", 569 | " \n", 570 | " \n", 571 | " \n", 572 | " \n", 573 | " \n", 574 | " \n", 575 | " \n", 576 | " \n", 577 | " \n", 578 | " \n", 579 | " \n", 580 | " \n", 581 | " \n", 582 | " \n", 583 | " \n", 584 | " \n", 585 | " \n", 586 | " \n", 587 | " \n", 588 | " \n", 589 | " \n", 590 | " \n", 591 | " \n", 592 | " \n", 593 | " \n", 594 | " \n", 595 | " \n", 596 | " \n", 597 | " \n", 598 | " \n", 599 | " \n", 600 | " \n", 601 | " \n", 602 | " \n", 603 | " \n", 604 | " \n", 605 | " \n", 606 | " \n", 607 | " \n", 608 | " \n", 609 | " \n", 610 | " \n", 611 | " \n", 612 | " \n", 613 | " \n", 614 | " \n", 615 | " \n", 616 | " \n", 617 | " \n", 618 | " \n", 619 | " \n", 620 | " \n", 621 | " \n", 622 | " \n", 623 | " \n", 624 | " \n", 625 | " \n", 626 | " \n", 627 | " \n", 628 | " \n", 629 | " \n", 630 | " \n", 631 | " \n", 632 | " \n", 633 | " \n", 634 | " \n", 635 | " \n", 636 | " \n", 637 | " \n", 638 | " \n", 639 | " \n", 640 | " \n", 641 | " \n", 642 | " \n", 643 | " \n", 644 | " \n", 645 | " \n", 646 | " \n", 647 | " \n", 648 | " \n", 649 | " \n", 650 | " \n", 651 | " \n", 652 | " \n", 653 | " \n", 654 | " \n", 655 | " \n", 656 | " \n", 657 | " \n", 658 | " \n", 659 | " \n", 660 | "
linkstitledescriptioncategory
06bBQ3pd0YU8american tap danc orchestra strike train chore...atdo perform strike train joyc theater nyc cho...0
1JLU0c0mmvxgrobonaut space station nasa space scienc hd videovisit websit http www junglejoel com robonaut ...4
2IojqhtUwz50european spacecraft pass key reentri test esa ...visit websit http www junglejoel com european ...4
3-zgGVyADnFEjordan bouri frontrow world danc franc qualififirst perform world danc0
4ZZXWS0n0MCAscienc univers space satellit hindihello bodhaguru learn proudli present anim vid...4
5Hz029D4wn1Ihot young star creat bright red nebula eso spa...space news info http www coconutsciencelab com...4
60jnuiRot6d0aaja ko bigyaan episod school astronomispace scienc technolog4
7FgBhMVgLtgdanc african danc zehil rugaro nekutamba happietienn cakpo guest perform profession dancer c...0
80o90mJe21Htip travel indiaindia massiv countri overwhelm plan trip spend...5
9PB3E_C1608kkorean food buffet eattravel korea choic food overwhelm want tri eve...1
102qwCB42_C1Ispacecraft take pictur planet nasa space scien...visit websit http www junglejoel com video des...4
11Ev1gpq51ntgnagareyamalove travel japan5
12xJSoETckuBYheat temperatur physic gk question hindi rrb ntpcrrb je4
13MnuRX73YSFgbhimkund waterfalmayurbhanj5
14YVtElThaNpIultim usa food battl fridgecamepisod fridgecam show bring yet anoth ultim ba...1
15lo0X2ZdElQ4conscious final frontier dada gunamuktananda t...dada gunamuktananda yogi medit teacher bio dad...4
168c5YY9DcoiEprofession dancer tri fortnit danc challeng pr...pick squat kick game credit fortnit epic game ...0
17MXexpBitoFIblack bean noodlspici rice cake kimbap korean food mukbang eat...1
18UNEi_TeKd5Uindia iceland travel k day budget tripwatch budget iceland trip hindi heard peopl sa...5
19Ww5L5cXUjKEhappen earth sun die space scienc documentariregist facebook happen earth sun die space sci...4
20i5QeyztIIT8brief histori colorado time geolog coloradominut movi illustr geolog evolut colorado time...2
21CRJqiMlGLkhighlight night sky novemb astronomi space sci...space news info http www coconutsciencelab com...4
22EvopK4qTEgdanc india danc season novembswarali0
23LTS_VWTE7znsd react bt boy w luv danc practic thing didn...sign patreon ahl run bt bon voyag http www pat...0
24itU6dp5tlAirish war independ minutfind tumultu time ireland led independ britain...2
25OMmer1JRrvUdanc india asia pacifdanc india asia pacif return singapor third ti...0
26rMiel4nt434question onsen japan japan travel guidquestion onsen japan japan travel guid http ww...5
27IGCVTSQw7WUbrief histori univers crash cours astronomithank wonder physic astronom map timelin unive...2
28_XxecflRFKUindian reaction pakistan tourism summit food t...join us premium benefit http www youtub com ch...5
29dLkPiY3i1qMhalal korean food singapor eatbook vlog epsinc korean dish contain pork may tough muslim...1
...............
8345NFN803DvgBQtravel japan day shibuya knu reunionspent second day shibuya walk around shop met ...5
8346APC_jD95TH8top tourist attract state main travel guid usahttp ultramodern home ru top tourist attract s...5
8347YvGDafHwVjamerican tri bizarr russian food first timelove pickl check awesom video buzzfeedvideo mu...1
8348_eFZM-fQgdAporsch boxster sport car manufactur fiber opti...watch porsch boxster sport car manufactur fibe...3
8349JKm3uzL_A4wit day asteroid struck jaw drop virtual reali...asteroidday friday june wit extraordinari jour...4
8350FwFqvOFefqgauto clinic bust myth women car patric bank te...find femal mechan becom one patric bank discus...3
8351B14BiB-Bv3bin process explan ncix tech tipepisod ncix tech tip linu explain bin process ...3
8352BU5t9m5SAiUtravel india delhitravel india delhi video first hour day india ...5
8353fenZhUxLZrQhindi top futur weapon india drdo space scienchindi video space scienc told futur project mi...4
8354FD3MPQyEub0zotac game pax east highlightwatch day pax east game event featur game acti...3
8355SZQWtSgcO4tip travel japanwatch month long road trip across japan http w...5
8356Wyncghonest trailer solo star war storitoday episod brought u armi join team make dif...2
8357UNpyF58BYfourier transform visual introductanim introduct fourier transform home page htt...4
8358NBSv_0yHnB0nyc center space scienc educnyc center space scienc educ experienti space ...4
8359YJjL82-KORAhubbl telescop show spiral black hole power je...visit websit http www junglejoel com hubbl spa...4
83600pBm2DjkzaQsalif lasourc michael jackson danc compilsalif lasourc michael jackson danc compil tag ...0
83619e6Oi__HGIAhotel se acha aur ghar ka maza jaipur travel g...travel paaji show uniqu beauti airbnb properti...5
8362qUz25YgfTgIkorean food seawe soup recip miyeokgukhealthi comfort soup multi task nourish stapl ...1
8363Ov-hREl5wEYhuman evolut historibiolog human histori evolut2
8364RVDidS5Ynknorth indian thali indian food delhi street fo...super delici thali street delhi india delhi fo...1
83655-tktFyIzballet piec francchoreograph adi morgan perform adi morgan edit...0
8366jSJhesy2ztravel japan ep vlog departurhello guy japan probabl favourit place go visi...5
83670tim0_WzXYtop fifteen femal ballet dancerlist top fifteen favorit femal ballet dancer r...0
8368NSAgLvKOPLQmodel atom timelinsee chemistri video check http socrat org chem...4
8369JzZ4u5XxvFUprobabl find electron beyond bohr radiu hydrog...solut question chapter quantum physic hc verma4
8370Tohl-nFCugpeni fish korean street food noryangjin fisher...peni fish krw usd noryangjin fisheri wholesal ...1
8371Fr48ud9Yipaneer kofta recip hindi indian food made easipaneer kofta recip hindi indian food made easi...1
8372LEkSQGzYn4Eamerican danc american ambidextr sbsm school s...american danc american ambidextr sbsm school s...0
83738YuKbcHn1Igwendymin cook mini korean bbq mini cookmake mini edibl korean bbq use everyth miniatur1
8374J4jRR5r4-Apawn star star war collect season historiowner valuabl origin star war figurin throw hi...2
\n", 661 | "

8375 rows × 4 columns

\n", 662 | "
" 663 | ], 664 | "text/plain": [ 665 | " links title \\\n", 666 | "0 6bBQ3pd0YU8 american tap danc orchestra strike train chore... \n", 667 | "1 JLU0c0mmvxg robonaut space station nasa space scienc hd video \n", 668 | "2 IojqhtUwz50 european spacecraft pass key reentri test esa ... \n", 669 | "3 -zgGVyADnFE jordan bouri frontrow world danc franc qualifi \n", 670 | "4 ZZXWS0n0MCA scienc univers space satellit hindi \n", 671 | "5 Hz029D4wn1I hot young star creat bright red nebula eso spa... \n", 672 | "6 0jnuiRot6d0 aaja ko bigyaan episod school astronomi \n", 673 | "7 FgBhMVgLtg danc african danc zehil rugaro nekutamba happi \n", 674 | "8 0o90mJe21H tip travel india \n", 675 | "9 PB3E_C1608k korean food buffet eat \n", 676 | "10 2qwCB42_C1I spacecraft take pictur planet nasa space scien... \n", 677 | "11 Ev1gpq51ntg nagareyama \n", 678 | "12 xJSoETckuBY heat temperatur physic gk question hindi rrb ntpc \n", 679 | "13 MnuRX73YSFg bhimkund waterfal \n", 680 | "14 YVtElThaNpI ultim usa food battl fridgecam \n", 681 | "15 lo0X2ZdElQ4 conscious final frontier dada gunamuktananda t... \n", 682 | "16 8c5YY9DcoiE profession dancer tri fortnit danc challeng pr... \n", 683 | "17 MXexpBitoFI black bean noodl \n", 684 | "18 UNEi_TeKd5U india iceland travel k day budget trip \n", 685 | "19 Ww5L5cXUjKE happen earth sun die space scienc documentari \n", 686 | "20 i5QeyztIIT8 brief histori colorado time geolog colorado \n", 687 | "21 CRJqiMlGLk highlight night sky novemb astronomi space sci... \n", 688 | "22 EvopK4qTEg danc india danc season novemb \n", 689 | "23 LTS_VWTE7z nsd react bt boy w luv danc practic thing didn... \n", 690 | "24 itU6dp5tlA irish war independ minut \n", 691 | "25 OMmer1JRrvU danc india asia pacif \n", 692 | "26 rMiel4nt434 question onsen japan japan travel guid \n", 693 | "27 IGCVTSQw7WU brief histori univers crash cours astronomi \n", 694 | "28 _XxecflRFKU indian reaction pakistan tourism summit food t... 
\n", 695 | "29 dLkPiY3i1qM halal korean food singapor eatbook vlog ep \n", 696 | "... ... ... \n", 697 | "8345 NFN803DvgBQ travel japan day shibuya knu reunion \n", 698 | "8346 APC_jD95TH8 top tourist attract state main travel guid usa \n", 699 | "8347 YvGDafHwVj american tri bizarr russian food first time \n", 700 | "8348 _eFZM-fQgdA porsch boxster sport car manufactur fiber opti... \n", 701 | "8349 JKm3uzL_A4 wit day asteroid struck jaw drop virtual reali... \n", 702 | "8350 FwFqvOFefqg auto clinic bust myth women car patric bank te... \n", 703 | "8351 B14BiB-Bv3 bin process explan ncix tech tip \n", 704 | "8352 BU5t9m5SAiU travel india delhi \n", 705 | "8353 fenZhUxLZrQ hindi top futur weapon india drdo space scienc \n", 706 | "8354 FD3MPQyEub0 zotac game pax east highlight \n", 707 | "8355 SZQWtSgcO4 tip travel japan \n", 708 | "8356 Wyncg honest trailer solo star war stori \n", 709 | "8357 UNpyF58BY fourier transform visual introduct \n", 710 | "8358 NBSv_0yHnB0 nyc center space scienc educ \n", 711 | "8359 YJjL82-KORA hubbl telescop show spiral black hole power je... \n", 712 | "8360 0pBm2DjkzaQ salif lasourc michael jackson danc compil \n", 713 | "8361 9e6Oi__HGIA hotel se acha aur ghar ka maza jaipur travel g... \n", 714 | "8362 qUz25YgfTgI korean food seawe soup recip miyeokguk \n", 715 | "8363 Ov-hREl5wEY human evolut histori \n", 716 | "8364 RVDidS5Ynk north indian thali indian food delhi street fo... \n", 717 | "8365 5-tktFyIz ballet piec franc \n", 718 | "8366 jSJhesy2z travel japan ep vlog departur \n", 719 | "8367 0tim0_WzXY top fifteen femal ballet dancer \n", 720 | "8368 NSAgLvKOPLQ model atom timelin \n", 721 | "8369 JzZ4u5XxvFU probabl find electron beyond bohr radiu hydrog... \n", 722 | "8370 Tohl-nFCug peni fish korean street food noryangjin fisher... \n", 723 | "8371 Fr48ud9Yi paneer kofta recip hindi indian food made easi \n", 724 | "8372 LEkSQGzYn4E american danc american ambidextr sbsm school s... 
\n", 725 | "8373 8YuKbcHn1Ig wendymin cook mini korean bbq mini cook \n", 726 | "8374 J4jRR5r4-A pawn star star war collect season histori \n", 727 | "\n", 728 | " description category \n", 729 | "0 atdo perform strike train joyc theater nyc cho... 0 \n", 730 | "1 visit websit http www junglejoel com robonaut ... 4 \n", 731 | "2 visit websit http www junglejoel com european ... 4 \n", 732 | "3 first perform world danc 0 \n", 733 | "4 hello bodhaguru learn proudli present anim vid... 4 \n", 734 | "5 space news info http www coconutsciencelab com... 4 \n", 735 | "6 space scienc technolog 4 \n", 736 | "7 etienn cakpo guest perform profession dancer c... 0 \n", 737 | "8 india massiv countri overwhelm plan trip spend... 5 \n", 738 | "9 travel korea choic food overwhelm want tri eve... 1 \n", 739 | "10 visit websit http www junglejoel com video des... 4 \n", 740 | "11 love travel japan 5 \n", 741 | "12 rrb je 4 \n", 742 | "13 mayurbhanj 5 \n", 743 | "14 episod fridgecam show bring yet anoth ultim ba... 1 \n", 744 | "15 dada gunamuktananda yogi medit teacher bio dad... 4 \n", 745 | "16 pick squat kick game credit fortnit epic game ... 0 \n", 746 | "17 spici rice cake kimbap korean food mukbang eat... 1 \n", 747 | "18 watch budget iceland trip hindi heard peopl sa... 5 \n", 748 | "19 regist facebook happen earth sun die space sci... 4 \n", 749 | "20 minut movi illustr geolog evolut colorado time... 2 \n", 750 | "21 space news info http www coconutsciencelab com... 4 \n", 751 | "22 swarali 0 \n", 752 | "23 sign patreon ahl run bt bon voyag http www pat... 0 \n", 753 | "24 find tumultu time ireland led independ britain... 2 \n", 754 | "25 danc india asia pacif return singapor third ti... 0 \n", 755 | "26 question onsen japan japan travel guid http ww... 5 \n", 756 | "27 thank wonder physic astronom map timelin unive... 2 \n", 757 | "28 join us premium benefit http www youtub com ch... 5 \n", 758 | "29 sinc korean dish contain pork may tough muslim... 1 \n", 759 | "... ... 
... \n", 760 | "8345 spent second day shibuya walk around shop met ... 5 \n", 761 | "8346 http ultramodern home ru top tourist attract s... 5 \n", 762 | "8347 love pickl check awesom video buzzfeedvideo mu... 1 \n", 763 | "8348 watch porsch boxster sport car manufactur fibe... 3 \n", 764 | "8349 asteroidday friday june wit extraordinari jour... 4 \n", 765 | "8350 find femal mechan becom one patric bank discus... 3 \n", 766 | "8351 episod ncix tech tip linu explain bin process ... 3 \n", 767 | "8352 travel india delhi video first hour day india ... 5 \n", 768 | "8353 hindi video space scienc told futur project mi... 4 \n", 769 | "8354 watch day pax east game event featur game acti... 3 \n", 770 | "8355 watch month long road trip across japan http w... 5 \n", 771 | "8356 today episod brought u armi join team make dif... 2 \n", 772 | "8357 anim introduct fourier transform home page htt... 4 \n", 773 | "8358 nyc center space scienc educ experienti space ... 4 \n", 774 | "8359 visit websit http www junglejoel com hubbl spa... 4 \n", 775 | "8360 salif lasourc michael jackson danc compil tag ... 0 \n", 776 | "8361 travel paaji show uniqu beauti airbnb properti... 5 \n", 777 | "8362 healthi comfort soup multi task nourish stapl ... 1 \n", 778 | "8363 biolog human histori evolut 2 \n", 779 | "8364 super delici thali street delhi india delhi fo... 1 \n", 780 | "8365 choreograph adi morgan perform adi morgan edit... 0 \n", 781 | "8366 hello guy japan probabl favourit place go visi... 5 \n", 782 | "8367 list top fifteen favorit femal ballet dancer r... 0 \n", 783 | "8368 see chemistri video check http socrat org chem... 4 \n", 784 | "8369 solut question chapter quantum physic hc verma 4 \n", 785 | "8370 peni fish krw usd noryangjin fisheri wholesal ... 1 \n", 786 | "8371 paneer kofta recip hindi indian food made easi... 1 \n", 787 | "8372 american danc american ambidextr sbsm school s... 
0 \n", 788 | "8373 make mini edibl korean bbq use everyth miniatur 1 \n", 789 | "8374 owner valuabl origin star war figurin throw hi... 2 \n", 790 | "\n", 791 | "[8375 rows x 4 columns]" 792 | ] 793 | }, 794 | "execution_count": 295, 795 | "metadata": {}, 796 | "output_type": "execute_result" 797 | } 798 | ], 799 | "source": [ 800 | "df_new" 801 | ] 802 | }, 803 | { 804 | "cell_type": "markdown", 805 | "metadata": {}, 806 | "source": [ 807 | "# Creating the bag of words model" 808 | ] 809 | }, 810 | { 811 | "cell_type": "code", 812 | "execution_count": 250, 813 | "metadata": {}, 814 | "outputs": [], 815 | "source": [ 816 | "# Bag of words over the cleaned titles only; the descriptions in corpus1 are not vectorised.\n", 817 | "from sklearn.feature_extraction.text import CountVectorizer\n", 818 | "cv = CountVectorizer(max_features=1500)\n", 819 | "X = cv.fit_transform(corpus).toarray()  # fit_transform's 2nd positional argument is y, which CountVectorizer ignores\n", 820 | "y = df_new['category'].values" 821 | ] 822 | }, 823 | { 824 | "cell_type": "markdown", 825 | "metadata": {}, 826 | "source": [ 827 | "# Splitting the dataset into the Training set and Test set " 828 | ] 829 | }, 830 | { 831 | "cell_type": "code", 832 | "execution_count": 251, 833 | "metadata": {}, 834 | "outputs": [], 835 | "source": [ 836 | "from sklearn.model_selection import train_test_split\n", 837 | "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=0)" 838 | ] 839 | }, 840 | { 841 | "cell_type": "markdown", 842 | "metadata": {}, 843 | "source": [ 844 | "# Random Forest " 845 | ] 846 | }, 847 | { 848 | "cell_type": "code", 849 | "execution_count": 462, 850 | "metadata": {}, 851 | "outputs": [ 852 | { 853 | "data": { 854 | "text/plain": [ 855 | "RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',\n", 856 | " max_depth=None, max_features='auto', max_leaf_nodes=None,\n", 857 | " min_impurity_decrease=0.0, min_impurity_split=None,\n", 858 | " min_samples_leaf=1, min_samples_split=2,\n", 859 | " min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=None,\n", 860 | " oob_score=False, 
random_state=None, verbose=0,\n", 861 | " warm_start=False)" 862 | ] 863 | }, 864 | "execution_count": 462, 865 | "metadata": {}, 866 | "output_type": "execute_result" 867 | } 868 | ], 869 | "source": [ 870 | "from sklearn.ensemble import RandomForestClassifier\n", 871 | "classifier = RandomForestClassifier(n_estimators=1000, criterion='entropy', random_state=0)  # seeded so the forest is reproducible\n", 872 | "classifier.fit(X_train, y_train)" 873 | ] 874 | }, 875 | { 876 | "cell_type": "code", 877 | "execution_count": 463, 878 | "metadata": {}, 879 | "outputs": [], 880 | "source": [ 881 | "y_pred = classifier.predict(X_test)" 882 | ] 883 | }, 884 | { 885 | "cell_type": "code", 886 | "execution_count": 464, 887 | "metadata": {}, 888 | "outputs": [ 889 | { 890 | "data": { 891 | "text/plain": [ 892 | "0.9605970149253731" 893 | ] 894 | }, 895 | "execution_count": 464, 896 | "metadata": {}, 897 | "output_type": "execute_result" 898 | } 899 | ], 900 | "source": [ 901 | "classifier.score(X_test, y_test)" 902 | ] 903 | }, 904 | { 905 | "cell_type": "code", 906 | "execution_count": 485, 907 | "metadata": {}, 908 | "outputs": [ 909 | { 910 | "name": "stdout", 911 | "output_type": "stream", 912 | "text": [ 913 | " precision recall f1-score support\n", 914 | "\n", 915 | " Art & Dance 0.95 0.97 0.96 313\n", 916 | " Food 0.96 0.99 0.98 272\n", 917 | " History 0.96 0.98 0.97 287\n", 918 | "Manufacturing 0.95 0.94 0.94 241\n", 919 | " Science 0.97 0.94 0.96 289\n", 920 | " Travel 0.98 0.93 0.96 273\n", 921 | "\n", 922 | " micro avg 0.96 0.96 0.96 1675\n", 923 | " macro avg 0.96 0.96 0.96 1675\n", 924 | " weighted avg 0.96 0.96 0.96 1675\n", 925 | "\n" 926 | ] 927 | } 928 | ], 929 | "source": [ 930 | "print(classification_report(y_test, y_pred, target_names=['Art & Dance', 'Food', 'History', 'Manufacturing', 'Science', 'Travel']))  # names listed in encoded-label order 0-5" 931 | ] 932 | }, 933 | { 934 | "cell_type": "code", 935 | "execution_count": 465, 936 | "metadata": {}, 937 | "outputs": [], 938 | "source": [ 939 | "# Making the Confusion Matrix\n", 940 | "from sklearn.metrics import confusion_matrix\n", 941 | "cm = confusion_matrix(y_test, y_pred)" 
942 | ] 943 | }, 944 | { 945 | "cell_type": "code", 946 | "execution_count": 466, 947 | "metadata": {}, 948 | "outputs": [ 949 | { 950 | "data": { 951 | "text/plain": [ 952 | "array([[305, 0, 2, 3, 0, 3],\n", 953 | " [ 0, 269, 0, 1, 0, 2],\n", 954 | " [ 2, 1, 281, 1, 2, 0],\n", 955 | " [ 5, 1, 3, 226, 5, 1],\n", 956 | " [ 6, 0, 6, 4, 273, 0],\n", 957 | " [ 3, 8, 2, 3, 2, 255]])" 958 | ] 959 | }, 960 | "execution_count": 466, 961 | "metadata": {}, 962 | "output_type": "execute_result" 963 | } 964 | ], 965 | "source": [ 966 | "cm" 967 | ] 968 | }, 969 | { 970 | "cell_type": "markdown", 971 | "metadata": {}, 972 | "source": [ 973 | "# SVM " 974 | ] 975 | }, 976 | { 977 | "cell_type": "code", 978 | "execution_count": 467, 979 | "metadata": {}, 980 | "outputs": [ 981 | { 982 | "data": { 983 | "text/plain": [ 984 | "SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,\n", 985 | " decision_function_shape='ovr', degree=3, gamma='auto_deprecated',\n", 986 | " kernel='linear', max_iter=-1, probability=False, random_state=0,\n", 987 | " shrinking=True, tol=0.001, verbose=False)" 988 | ] 989 | }, 990 | "execution_count": 467, 991 | "metadata": {}, 992 | "output_type": "execute_result" 993 | } 994 | ], 995 | "source": [ 996 | "from sklearn.svm import SVC\n", 997 | "classifier1 = SVC(kernel='linear', random_state=0)  # linear-kernel SVM on the bag-of-words counts\n", 998 | "classifier1.fit(X_train, y_train)" 999 | ] 1000 | }, 1001 | { 1002 | "cell_type": "code", 1003 | "execution_count": 468, 1004 | "metadata": {}, 1005 | "outputs": [], 1006 | "source": [ 1007 | "y_pred1 = classifier1.predict(X_test)" 1008 | ] 1009 | }, 1010 | { 1011 | "cell_type": "code", 1012 | "execution_count": 469, 1013 | "metadata": {}, 1014 | "outputs": [ 1015 | { 1016 | "data": { 1017 | "text/plain": [ 1018 | "0.9564179104477611" 1019 | ] 1020 | }, 1021 | "execution_count": 469, 1022 | "metadata": {}, 1023 | "output_type": "execute_result" 1024 | } 1025 | ], 1026 | "source": [ 1027 | "classifier1.score(X_test, y_test)" 1028 | ] 1029 | }, 1030 
| { 1031 | "cell_type": "code", 1032 | "execution_count": 470, 1033 | "metadata": {}, 1034 | "outputs": [], 1035 | "source": [ 1036 | "# Making the Confusion Matrix\n", 1037 | "cm1 = confusion_matrix(y_test, y_pred1)" 1038 | ] 1039 | }, 1040 | { 1041 | "cell_type": "code", 1042 | "execution_count": 471, 1043 | "metadata": {}, 1044 | "outputs": [ 1045 | { 1046 | "data": { 1047 | "text/plain": [ 1048 | "array([[301, 0, 4, 6, 0, 2],\n", 1049 | " [ 0, 266, 0, 1, 0, 5],\n", 1050 | " [ 2, 1, 278, 3, 2, 1],\n", 1051 | " [ 0, 1, 4, 229, 4, 3],\n", 1052 | " [ 1, 0, 9, 9, 270, 0],\n", 1053 | " [ 2, 4, 1, 6, 2, 258]])" 1054 | ] 1055 | }, 1056 | "execution_count": 471, 1057 | "metadata": {}, 1058 | "output_type": "execute_result" 1059 | } 1060 | ], 1061 | "source": [ 1062 | "cm1" 1063 | ] 1064 | }, 1065 | { 1066 | "cell_type": "markdown", 1067 | "metadata": {}, 1068 | "source": [ 1069 | "# Gaussian Naive Bayes " 1070 | ] 1071 | }, 1072 | { 1073 | "cell_type": "code", 1074 | "execution_count": 472, 1075 | "metadata": {}, 1076 | "outputs": [ 1077 | { 1078 | "data": { 1079 | "text/plain": [ 1080 | "GaussianNB(priors=None, var_smoothing=1e-09)" 1081 | ] 1082 | }, 1083 | "execution_count": 472, 1084 | "metadata": {}, 1085 | "output_type": "execute_result" 1086 | } 1087 | ], 1088 | "source": [ 1089 | "from sklearn.naive_bayes import GaussianNB\n", 1090 | "classifier2 = GaussianNB()\n", 1091 | "classifier2.fit(X_train, y_train)" 1092 | ] 1093 | }, 1094 | { 1095 | "cell_type": "code", 1096 | "execution_count": 473, 1097 | "metadata": {}, 1098 | "outputs": [], 1099 | "source": [ 1100 | "y_pred2 = classifier2.predict(X_test)" 1101 | ] 1102 | }, 1103 | { 1104 | "cell_type": "code", 1105 | "execution_count": 474, 1106 | "metadata": {}, 1107 | "outputs": [ 1108 | { 1109 | "data": { 1110 | "text/plain": [ 1111 | "0.8107462686567164" 1112 | ] 1113 | }, 1114 | "execution_count": 474, 1115 | "metadata": {}, 1116 | "output_type": "execute_result" 1117 | } 1118 | ], 1119 | "source": [ 1120 | 
"classifier2.score(X_test, y_test)" 1121 | ] 1122 | }, 1123 | { 1124 | "cell_type": "code", 1125 | "execution_count": 475, 1126 | "metadata": {}, 1127 | "outputs": [], 1128 | "source": [ 1129 | "# Making the Confusion Matrix\n", 1130 | "cm2 = confusion_matrix(y_test, y_pred2)" 1131 | ] 1132 | }, 1133 | { 1134 | "cell_type": "code", 1135 | "execution_count": 476, 1136 | "metadata": {}, 1137 | "outputs": [ 1138 | { 1139 | "data": { 1140 | "text/plain": [ 1141 | "array([[289, 4, 10, 2, 0, 8],\n", 1142 | " [ 0, 253, 1, 0, 0, 18],\n", 1143 | " [ 40, 4, 194, 16, 10, 23],\n", 1144 | " [ 1, 7, 4, 219, 4, 6],\n", 1145 | " [ 4, 6, 59, 15, 199, 6],\n", 1146 | " [ 3, 55, 4, 0, 7, 204]])" 1147 | ] 1148 | }, 1149 | "execution_count": 476, 1150 | "metadata": {}, 1151 | "output_type": "execute_result" 1152 | } 1153 | ], 1154 | "source": [ 1155 | "cm2" 1156 | ] 1157 | }, 1158 | { 1159 | "cell_type": "markdown", 1160 | "metadata": {}, 1161 | "source": [ 1162 | "# XGBoost " 1163 | ] 1164 | }, 1165 | { 1166 | "cell_type": "code", 1167 | "execution_count": 477, 1168 | "metadata": {}, 1169 | "outputs": [ 1170 | { 1171 | "data": { 1172 | "text/plain": [ 1173 | "XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,\n", 1174 | " colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,\n", 1175 | " max_depth=3, min_child_weight=1, missing=None, n_estimators=100,\n", 1176 | " n_jobs=1, nthread=None, objective='multi:softprob', random_state=0,\n", 1177 | " reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,\n", 1178 | " silent=True, subsample=1)" 1179 | ] 1180 | }, 1181 | "execution_count": 477, 1182 | "metadata": {}, 1183 | "output_type": "execute_result" 1184 | } 1185 | ], 1186 | "source": [ 1187 | "from xgboost import XGBClassifier\n", 1188 | "classifier3 = XGBClassifier()\n", 1189 | "classifier3.fit(X_train, y_train)" 1190 | ] 1191 | }, 1192 | { 1193 | "cell_type": "code", 1194 | "execution_count": 293, 1195 | "metadata": {}, 1196 | "outputs": [], 1197 | 
"source": [ 1198 | "y_pred3 = classifier3.predict(X_test)" 1199 | ] 1200 | }, 1201 | { 1202 | "cell_type": "code", 1203 | "execution_count": 271, 1204 | "metadata": {}, 1205 | "outputs": [ 1206 | { 1207 | "data": { 1208 | "text/plain": [ 1209 | "0.937910447761194" 1210 | ] 1211 | }, 1212 | "execution_count": 271, 1213 | "metadata": {}, 1214 | "output_type": "execute_result" 1215 | } 1216 | ], 1217 | "source": [ 1218 | "classifier3.score(X_test, y_test)" 1219 | ] 1220 | }, 1221 | { 1222 | "cell_type": "code", 1223 | "execution_count": 458, 1224 | "metadata": {}, 1225 | "outputs": [], 1226 | "source": [ 1227 | "# Making the Confusion Matrix\n", 1228 | "cm3 = confusion_matrix(y_test, y_pred3)" 1229 | ] 1230 | }, 1231 | { 1232 | "cell_type": "code", 1233 | "execution_count": 459, 1234 | "metadata": {}, 1235 | "outputs": [ 1236 | { 1237 | "data": { 1238 | "text/plain": [ 1239 | "array([[287, 0, 3, 20, 0, 3],\n", 1240 | " [ 0, 264, 1, 4, 0, 3],\n", 1241 | " [ 0, 1, 275, 9, 2, 0],\n", 1242 | " [ 1, 1, 2, 235, 2, 0],\n", 1243 | " [ 0, 0, 3, 28, 258, 0],\n", 1244 | " [ 0, 7, 0, 14, 0, 252]])" 1245 | ] 1246 | }, 1247 | "execution_count": 459, 1248 | "metadata": {}, 1249 | "output_type": "execute_result" 1250 | } 1251 | ], 1252 | "source": [ 1253 | "cm3" 1254 | ] 1255 | } 1256 | ], 1257 | "metadata": { 1258 | "kernelspec": { 1259 | "display_name": "Python 3", 1260 | "language": "python", 1261 | "name": "python3" 1262 | }, 1263 | "language_info": { 1264 | "codemirror_mode": { 1265 | "name": "ipython", 1266 | "version": 2 1267 | }, 1268 | "file_extension": ".py", 1269 | "mimetype": "text/x-python", 1270 | "name": "python", 1271 | "nbconvert_exporter": "python", 1272 | "pygments_lexer": "ipython2", 1273 | "version": "2.7.15rc1" 1274 | } 1275 | }, 1276 | "nbformat": 4, 1277 | "nbformat_minor": 2 1278 | } 1279 | --------------------------------------------------------------------------------