├── Classification.ipynb
├── WebScrapingYoutube.ipynb
└── df_new.csv
/Classification.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 350,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "import numpy as np\n",
10 | "import math\n",
11 | "import seaborn as sns\n",
12 | "from sklearn import svm\n",
13 | "import matplotlib.pyplot as plt\n",
14 | "import pandas as pd\n",
15 | "from sklearn.model_selection import train_test_split\n",
16 | "from sklearn.preprocessing import StandardScaler\n",
17 | "from sklearn.model_selection import *\n",
18 | "from sklearn.preprocessing import MinMaxScaler\n",
19 | "from sklearn import linear_model\n",
20 | "from sklearn.metrics import *"
21 | ]
22 | },
23 | {
24 | "cell_type": "markdown",
25 | "metadata": {},
26 | "source": [
27 | "# Loading the dataset "
28 | ]
29 | },
30 | {
31 | "cell_type": "code",
32 | "execution_count": 229,
33 | "metadata": {},
34 | "outputs": [],
35 | "source": [
36 | "dataset = pd.read_csv('/home/shubhamsingh/Desktop/df_orignal.csv')"
37 | ]
38 | },
39 | {
40 | "cell_type": "markdown",
41 | "metadata": {},
42 | "source": [
43 | "# Getting all the features separately "
44 | ]
45 | },
46 | {
47 | "cell_type": "code",
48 | "execution_count": null,
49 | "metadata": {},
50 | "outputs": [],
51 | "source": [
52 | "# Pull each column of interest out of the raw frame as its own Series.\n",
53 | "df_link, df_title = dataset['links'], dataset['title']\n",
54 | "df_description = dataset['description']\n",
55 | "df_category = dataset['category']"
56 | ]
57 | },
58 | {
59 | "cell_type": "markdown",
60 | "metadata": {},
61 | "source": [
62 | "# Importing libraries for data cleaning "
63 | ]
64 | },
65 | {
66 | "cell_type": "code",
67 | "execution_count": 232,
68 | "metadata": {},
69 | "outputs": [
70 | {
71 | "name": "stderr",
72 | "output_type": "stream",
73 | "text": [
74 | "[nltk_data] Downloading package stopwords to\n",
75 | "[nltk_data] /home/shubhamsingh/nltk_data...\n",
76 | "[nltk_data] Package stopwords is already up-to-date!\n"
77 | ]
78 | }
79 | ],
80 | "source": [
81 | "import re\n",
82 | "import nltk\n",
83 | "nltk.download('stopwords')\n",
84 | "from nltk.corpus import stopwords\n",
85 | "from nltk.stem.porter import PorterStemmer"
86 | ]
87 | },
88 | {
89 | "cell_type": "markdown",
90 | "metadata": {},
91 | "source": [
92 | "# Cleaning the data and storing it into a list "
93 | ]
94 | },
95 | {
96 | "cell_type": "code",
97 | "execution_count": 233,
98 | "metadata": {},
99 | "outputs": [],
100 | "source": [
101 | "# Clean every video title: keep letters only, lowercase, drop English stopwords, stem each word.\n",
102 | "ps = PorterStemmer()\n",
103 | "stop_words = set(stopwords.words('english'))  # built once instead of once per word\n",
104 | "corpus = []\n",
105 | "for raw_title in df_title:  # df_title is a Series: iterate it directly, no hard-coded row count\n",
106 | "    review = re.sub('[^a-zA-Z]', ' ', raw_title)\n",
107 | "    review = review.lower().split()\n",
108 | "    review = [ps.stem(word) for word in review if word not in stop_words]\n",
109 | "    review = ' '.join(review)\n",
110 | "    corpus.append(review)"
111 | ]
112 | },
113 | {
114 | "cell_type": "code",
115 | "execution_count": 234,
116 | "metadata": {},
117 | "outputs": [],
118 | "source": [
119 | "# Clean every video description the same way: letters only, lowercase, no stopwords, stemmed.\n",
120 | "ps, stop_words = PorterStemmer(), set(stopwords.words('english'))  # hoisted out of the loop\n",
121 | "corpus1 = []\n",
122 | "for raw_description in df_description:  # iterate the Series itself; no hard-coded 8375\n",
123 | "    review = re.sub('[^a-zA-Z]', ' ', raw_description)\n",
124 | "    review = review.lower().split()\n",
125 | "    review = [ps.stem(word) for word in review if word not in stop_words]\n",
126 | "    review = ' '.join(review)\n",
127 | "    corpus1.append(review)"
128 | ]
129 | },
130 | {
131 | "cell_type": "markdown",
132 | "metadata": {},
133 | "source": [
134 | "# Creating dataframes from the lists "
135 | ]
136 | },
137 | {
138 | "cell_type": "code",
139 | "execution_count": 237,
140 | "metadata": {},
141 | "outputs": [],
142 | "source": [
143 | "dftitle = pd.DataFrame(corpus, columns=['title'])  # one cleaned title per row\n",
144 | "dfdescription = pd.DataFrame(corpus1, columns=['description'])  # one cleaned description per row"
145 | ]
146 | },
147 | {
148 | "cell_type": "markdown",
149 | "metadata": {},
150 | "source": [
151 | "# Performing label encoding on the category feature"
152 | ]
153 | },
154 | {
155 | "cell_type": "code",
156 | "execution_count": 355,
157 | "metadata": {},
158 | "outputs": [],
159 | "source": [
160 | "from sklearn.preprocessing import LabelEncoder "
161 | ]
162 | },
163 | {
164 | "cell_type": "code",
165 | "execution_count": 247,
166 | "metadata": {},
167 | "outputs": [],
168 | "source": [
169 | "dfcategory1 = df_category.to_frame().apply(LabelEncoder().fit_transform)  # df_category is a Series; encode it column-wise so the result keeps a 'category' column"
170 | ]
171 | },
172 | {
173 | "cell_type": "markdown",
174 | "metadata": {},
175 | "source": [
176 | "# Creating a new dataset after cleaning the data and label encoding the categories"
177 | ]
178 | },
179 | {
180 | "cell_type": "code",
181 | "execution_count": 294,
182 | "metadata": {},
183 | "outputs": [],
184 | "source": [
185 | "df_new = pd.concat([df_link, dftitle, dfdescription, dfcategory1], axis=1).reindex(df_link.index)  # join_axes was removed in pandas 1.0; reindex is the documented replacement"
186 | ]
187 | },
188 | {
189 | "cell_type": "code",
190 | "execution_count": 296,
191 | "metadata": {},
192 | "outputs": [],
193 | "source": [
194 | "df_new.to_csv(\"df_new.csv\", encoding='utf-8', index=False)  # relative path: write next to the notebook instead of a machine-specific home directory"
195 | ]
196 | },
197 | {
198 | "cell_type": "code",
199 | "execution_count": 295,
200 | "metadata": {
201 | "scrolled": true
202 | },
203 | "outputs": [
204 | {
205 | "data": {
206 | "text/html": [
207 | "
\n",
208 | "\n",
221 | "
\n",
222 | " \n",
223 | " \n",
224 | " | \n",
225 | " links | \n",
226 | " title | \n",
227 | " description | \n",
228 | " category | \n",
229 | "
\n",
230 | " \n",
231 | " \n",
232 | " \n",
233 | " 0 | \n",
234 | " 6bBQ3pd0YU8 | \n",
235 | " american tap danc orchestra strike train chore... | \n",
236 | " atdo perform strike train joyc theater nyc cho... | \n",
237 | " 0 | \n",
238 | "
\n",
239 | " \n",
240 | " 1 | \n",
241 | " JLU0c0mmvxg | \n",
242 | " robonaut space station nasa space scienc hd video | \n",
243 | " visit websit http www junglejoel com robonaut ... | \n",
244 | " 4 | \n",
245 | "
\n",
246 | " \n",
247 | " 2 | \n",
248 | " IojqhtUwz50 | \n",
249 | " european spacecraft pass key reentri test esa ... | \n",
250 | " visit websit http www junglejoel com european ... | \n",
251 | " 4 | \n",
252 | "
\n",
253 | " \n",
254 | " 3 | \n",
255 | " -zgGVyADnFE | \n",
256 | " jordan bouri frontrow world danc franc qualifi | \n",
257 | " first perform world danc | \n",
258 | " 0 | \n",
259 | "
\n",
260 | " \n",
261 | " 4 | \n",
262 | " ZZXWS0n0MCA | \n",
263 | " scienc univers space satellit hindi | \n",
264 | " hello bodhaguru learn proudli present anim vid... | \n",
265 | " 4 | \n",
266 | "
\n",
267 | " \n",
268 | " 5 | \n",
269 | " Hz029D4wn1I | \n",
270 | " hot young star creat bright red nebula eso spa... | \n",
271 | " space news info http www coconutsciencelab com... | \n",
272 | " 4 | \n",
273 | "
\n",
274 | " \n",
275 | " 6 | \n",
276 | " 0jnuiRot6d0 | \n",
277 | " aaja ko bigyaan episod school astronomi | \n",
278 | " space scienc technolog | \n",
279 | " 4 | \n",
280 | "
\n",
281 | " \n",
282 | " 7 | \n",
283 | " FgBhMVgLtg | \n",
284 | " danc african danc zehil rugaro nekutamba happi | \n",
285 | " etienn cakpo guest perform profession dancer c... | \n",
286 | " 0 | \n",
287 | "
\n",
288 | " \n",
289 | " 8 | \n",
290 | " 0o90mJe21H | \n",
291 | " tip travel india | \n",
292 | " india massiv countri overwhelm plan trip spend... | \n",
293 | " 5 | \n",
294 | "
\n",
295 | " \n",
296 | " 9 | \n",
297 | " PB3E_C1608k | \n",
298 | " korean food buffet eat | \n",
299 | " travel korea choic food overwhelm want tri eve... | \n",
300 | " 1 | \n",
301 | "
\n",
302 | " \n",
303 | " 10 | \n",
304 | " 2qwCB42_C1I | \n",
305 | " spacecraft take pictur planet nasa space scien... | \n",
306 | " visit websit http www junglejoel com video des... | \n",
307 | " 4 | \n",
308 | "
\n",
309 | " \n",
310 | " 11 | \n",
311 | " Ev1gpq51ntg | \n",
312 | " nagareyama | \n",
313 | " love travel japan | \n",
314 | " 5 | \n",
315 | "
\n",
316 | " \n",
317 | " 12 | \n",
318 | " xJSoETckuBY | \n",
319 | " heat temperatur physic gk question hindi rrb ntpc | \n",
320 | " rrb je | \n",
321 | " 4 | \n",
322 | "
\n",
323 | " \n",
324 | " 13 | \n",
325 | " MnuRX73YSFg | \n",
326 | " bhimkund waterfal | \n",
327 | " mayurbhanj | \n",
328 | " 5 | \n",
329 | "
\n",
330 | " \n",
331 | " 14 | \n",
332 | " YVtElThaNpI | \n",
333 | " ultim usa food battl fridgecam | \n",
334 | " episod fridgecam show bring yet anoth ultim ba... | \n",
335 | " 1 | \n",
336 | "
\n",
337 | " \n",
338 | " 15 | \n",
339 | " lo0X2ZdElQ4 | \n",
340 | " conscious final frontier dada gunamuktananda t... | \n",
341 | " dada gunamuktananda yogi medit teacher bio dad... | \n",
342 | " 4 | \n",
343 | "
\n",
344 | " \n",
345 | " 16 | \n",
346 | " 8c5YY9DcoiE | \n",
347 | " profession dancer tri fortnit danc challeng pr... | \n",
348 | " pick squat kick game credit fortnit epic game ... | \n",
349 | " 0 | \n",
350 | "
\n",
351 | " \n",
352 | " 17 | \n",
353 | " MXexpBitoFI | \n",
354 | " black bean noodl | \n",
355 | " spici rice cake kimbap korean food mukbang eat... | \n",
356 | " 1 | \n",
357 | "
\n",
358 | " \n",
359 | " 18 | \n",
360 | " UNEi_TeKd5U | \n",
361 | " india iceland travel k day budget trip | \n",
362 | " watch budget iceland trip hindi heard peopl sa... | \n",
363 | " 5 | \n",
364 | "
\n",
365 | " \n",
366 | " 19 | \n",
367 | " Ww5L5cXUjKE | \n",
368 | " happen earth sun die space scienc documentari | \n",
369 | " regist facebook happen earth sun die space sci... | \n",
370 | " 4 | \n",
371 | "
\n",
372 | " \n",
373 | " 20 | \n",
374 | " i5QeyztIIT8 | \n",
375 | " brief histori colorado time geolog colorado | \n",
376 | " minut movi illustr geolog evolut colorado time... | \n",
377 | " 2 | \n",
378 | "
\n",
379 | " \n",
380 | " 21 | \n",
381 | " CRJqiMlGLk | \n",
382 | " highlight night sky novemb astronomi space sci... | \n",
383 | " space news info http www coconutsciencelab com... | \n",
384 | " 4 | \n",
385 | "
\n",
386 | " \n",
387 | " 22 | \n",
388 | " EvopK4qTEg | \n",
389 | " danc india danc season novemb | \n",
390 | " swarali | \n",
391 | " 0 | \n",
392 | "
\n",
393 | " \n",
394 | " 23 | \n",
395 | " LTS_VWTE7z | \n",
396 | " nsd react bt boy w luv danc practic thing didn... | \n",
397 | " sign patreon ahl run bt bon voyag http www pat... | \n",
398 | " 0 | \n",
399 | "
\n",
400 | " \n",
401 | " 24 | \n",
402 | " itU6dp5tlA | \n",
403 | " irish war independ minut | \n",
404 | " find tumultu time ireland led independ britain... | \n",
405 | " 2 | \n",
406 | "
\n",
407 | " \n",
408 | " 25 | \n",
409 | " OMmer1JRrvU | \n",
410 | " danc india asia pacif | \n",
411 | " danc india asia pacif return singapor third ti... | \n",
412 | " 0 | \n",
413 | "
\n",
414 | " \n",
415 | " 26 | \n",
416 | " rMiel4nt434 | \n",
417 | " question onsen japan japan travel guid | \n",
418 | " question onsen japan japan travel guid http ww... | \n",
419 | " 5 | \n",
420 | "
\n",
421 | " \n",
422 | " 27 | \n",
423 | " IGCVTSQw7WU | \n",
424 | " brief histori univers crash cours astronomi | \n",
425 | " thank wonder physic astronom map timelin unive... | \n",
426 | " 2 | \n",
427 | "
\n",
428 | " \n",
429 | " 28 | \n",
430 | " _XxecflRFKU | \n",
431 | " indian reaction pakistan tourism summit food t... | \n",
432 | " join us premium benefit http www youtub com ch... | \n",
433 | " 5 | \n",
434 | "
\n",
435 | " \n",
436 | " 29 | \n",
437 | " dLkPiY3i1qM | \n",
438 | " halal korean food singapor eatbook vlog ep | \n",
439 | " sinc korean dish contain pork may tough muslim... | \n",
440 | " 1 | \n",
441 | "
\n",
442 | " \n",
443 | " ... | \n",
444 | " ... | \n",
445 | " ... | \n",
446 | " ... | \n",
447 | " ... | \n",
448 | "
\n",
449 | " \n",
450 | " 8345 | \n",
451 | " NFN803DvgBQ | \n",
452 | " travel japan day shibuya knu reunion | \n",
453 | " spent second day shibuya walk around shop met ... | \n",
454 | " 5 | \n",
455 | "
\n",
456 | " \n",
457 | " 8346 | \n",
458 | " APC_jD95TH8 | \n",
459 | " top tourist attract state main travel guid usa | \n",
460 | " http ultramodern home ru top tourist attract s... | \n",
461 | " 5 | \n",
462 | "
\n",
463 | " \n",
464 | " 8347 | \n",
465 | " YvGDafHwVj | \n",
466 | " american tri bizarr russian food first time | \n",
467 | " love pickl check awesom video buzzfeedvideo mu... | \n",
468 | " 1 | \n",
469 | "
\n",
470 | " \n",
471 | " 8348 | \n",
472 | " _eFZM-fQgdA | \n",
473 | " porsch boxster sport car manufactur fiber opti... | \n",
474 | " watch porsch boxster sport car manufactur fibe... | \n",
475 | " 3 | \n",
476 | "
\n",
477 | " \n",
478 | " 8349 | \n",
479 | " JKm3uzL_A4 | \n",
480 | " wit day asteroid struck jaw drop virtual reali... | \n",
481 | " asteroidday friday june wit extraordinari jour... | \n",
482 | " 4 | \n",
483 | "
\n",
484 | " \n",
485 | " 8350 | \n",
486 | " FwFqvOFefqg | \n",
487 | " auto clinic bust myth women car patric bank te... | \n",
488 | " find femal mechan becom one patric bank discus... | \n",
489 | " 3 | \n",
490 | "
\n",
491 | " \n",
492 | " 8351 | \n",
493 | " B14BiB-Bv3 | \n",
494 | " bin process explan ncix tech tip | \n",
495 | " episod ncix tech tip linu explain bin process ... | \n",
496 | " 3 | \n",
497 | "
\n",
498 | " \n",
499 | " 8352 | \n",
500 | " BU5t9m5SAiU | \n",
501 | " travel india delhi | \n",
502 | " travel india delhi video first hour day india ... | \n",
503 | " 5 | \n",
504 | "
\n",
505 | " \n",
506 | " 8353 | \n",
507 | " fenZhUxLZrQ | \n",
508 | " hindi top futur weapon india drdo space scienc | \n",
509 | " hindi video space scienc told futur project mi... | \n",
510 | " 4 | \n",
511 | "
\n",
512 | " \n",
513 | " 8354 | \n",
514 | " FD3MPQyEub0 | \n",
515 | " zotac game pax east highlight | \n",
516 | " watch day pax east game event featur game acti... | \n",
517 | " 3 | \n",
518 | "
\n",
519 | " \n",
520 | " 8355 | \n",
521 | " SZQWtSgcO4 | \n",
522 | " tip travel japan | \n",
523 | " watch month long road trip across japan http w... | \n",
524 | " 5 | \n",
525 | "
\n",
526 | " \n",
527 | " 8356 | \n",
528 | " Wyncg | \n",
529 | " honest trailer solo star war stori | \n",
530 | " today episod brought u armi join team make dif... | \n",
531 | " 2 | \n",
532 | "
\n",
533 | " \n",
534 | " 8357 | \n",
535 | " UNpyF58BY | \n",
536 | " fourier transform visual introduct | \n",
537 | " anim introduct fourier transform home page htt... | \n",
538 | " 4 | \n",
539 | "
\n",
540 | " \n",
541 | " 8358 | \n",
542 | " NBSv_0yHnB0 | \n",
543 | " nyc center space scienc educ | \n",
544 | " nyc center space scienc educ experienti space ... | \n",
545 | " 4 | \n",
546 | "
\n",
547 | " \n",
548 | " 8359 | \n",
549 | " YJjL82-KORA | \n",
550 | " hubbl telescop show spiral black hole power je... | \n",
551 | " visit websit http www junglejoel com hubbl spa... | \n",
552 | " 4 | \n",
553 | "
\n",
554 | " \n",
555 | " 8360 | \n",
556 | " 0pBm2DjkzaQ | \n",
557 | " salif lasourc michael jackson danc compil | \n",
558 | " salif lasourc michael jackson danc compil tag ... | \n",
559 | " 0 | \n",
560 | "
\n",
561 | " \n",
562 | " 8361 | \n",
563 | " 9e6Oi__HGIA | \n",
564 | " hotel se acha aur ghar ka maza jaipur travel g... | \n",
565 | " travel paaji show uniqu beauti airbnb properti... | \n",
566 | " 5 | \n",
567 | "
\n",
568 | " \n",
569 | " 8362 | \n",
570 | " qUz25YgfTgI | \n",
571 | " korean food seawe soup recip miyeokguk | \n",
572 | " healthi comfort soup multi task nourish stapl ... | \n",
573 | " 1 | \n",
574 | "
\n",
575 | " \n",
576 | " 8363 | \n",
577 | " Ov-hREl5wEY | \n",
578 | " human evolut histori | \n",
579 | " biolog human histori evolut | \n",
580 | " 2 | \n",
581 | "
\n",
582 | " \n",
583 | " 8364 | \n",
584 | " RVDidS5Ynk | \n",
585 | " north indian thali indian food delhi street fo... | \n",
586 | " super delici thali street delhi india delhi fo... | \n",
587 | " 1 | \n",
588 | "
\n",
589 | " \n",
590 | " 8365 | \n",
591 | " 5-tktFyIz | \n",
592 | " ballet piec franc | \n",
593 | " choreograph adi morgan perform adi morgan edit... | \n",
594 | " 0 | \n",
595 | "
\n",
596 | " \n",
597 | " 8366 | \n",
598 | " jSJhesy2z | \n",
599 | " travel japan ep vlog departur | \n",
600 | " hello guy japan probabl favourit place go visi... | \n",
601 | " 5 | \n",
602 | "
\n",
603 | " \n",
604 | " 8367 | \n",
605 | " 0tim0_WzXY | \n",
606 | " top fifteen femal ballet dancer | \n",
607 | " list top fifteen favorit femal ballet dancer r... | \n",
608 | " 0 | \n",
609 | "
\n",
610 | " \n",
611 | " 8368 | \n",
612 | " NSAgLvKOPLQ | \n",
613 | " model atom timelin | \n",
614 | " see chemistri video check http socrat org chem... | \n",
615 | " 4 | \n",
616 | "
\n",
617 | " \n",
618 | " 8369 | \n",
619 | " JzZ4u5XxvFU | \n",
620 | " probabl find electron beyond bohr radiu hydrog... | \n",
621 | " solut question chapter quantum physic hc verma | \n",
622 | " 4 | \n",
623 | "
\n",
624 | " \n",
625 | " 8370 | \n",
626 | " Tohl-nFCug | \n",
627 | " peni fish korean street food noryangjin fisher... | \n",
628 | " peni fish krw usd noryangjin fisheri wholesal ... | \n",
629 | " 1 | \n",
630 | "
\n",
631 | " \n",
632 | " 8371 | \n",
633 | " Fr48ud9Yi | \n",
634 | " paneer kofta recip hindi indian food made easi | \n",
635 | " paneer kofta recip hindi indian food made easi... | \n",
636 | " 1 | \n",
637 | "
\n",
638 | " \n",
639 | " 8372 | \n",
640 | " LEkSQGzYn4E | \n",
641 | " american danc american ambidextr sbsm school s... | \n",
642 | " american danc american ambidextr sbsm school s... | \n",
643 | " 0 | \n",
644 | "
\n",
645 | " \n",
646 | " 8373 | \n",
647 | " 8YuKbcHn1Ig | \n",
648 | " wendymin cook mini korean bbq mini cook | \n",
649 | " make mini edibl korean bbq use everyth miniatur | \n",
650 | " 1 | \n",
651 | "
\n",
652 | " \n",
653 | " 8374 | \n",
654 | " J4jRR5r4-A | \n",
655 | " pawn star star war collect season histori | \n",
656 | " owner valuabl origin star war figurin throw hi... | \n",
657 | " 2 | \n",
658 | "
\n",
659 | " \n",
660 | "
\n",
661 | "
8375 rows × 4 columns
\n",
662 | "
"
663 | ],
664 | "text/plain": [
665 | " links title \\\n",
666 | "0 6bBQ3pd0YU8 american tap danc orchestra strike train chore... \n",
667 | "1 JLU0c0mmvxg robonaut space station nasa space scienc hd video \n",
668 | "2 IojqhtUwz50 european spacecraft pass key reentri test esa ... \n",
669 | "3 -zgGVyADnFE jordan bouri frontrow world danc franc qualifi \n",
670 | "4 ZZXWS0n0MCA scienc univers space satellit hindi \n",
671 | "5 Hz029D4wn1I hot young star creat bright red nebula eso spa... \n",
672 | "6 0jnuiRot6d0 aaja ko bigyaan episod school astronomi \n",
673 | "7 FgBhMVgLtg danc african danc zehil rugaro nekutamba happi \n",
674 | "8 0o90mJe21H tip travel india \n",
675 | "9 PB3E_C1608k korean food buffet eat \n",
676 | "10 2qwCB42_C1I spacecraft take pictur planet nasa space scien... \n",
677 | "11 Ev1gpq51ntg nagareyama \n",
678 | "12 xJSoETckuBY heat temperatur physic gk question hindi rrb ntpc \n",
679 | "13 MnuRX73YSFg bhimkund waterfal \n",
680 | "14 YVtElThaNpI ultim usa food battl fridgecam \n",
681 | "15 lo0X2ZdElQ4 conscious final frontier dada gunamuktananda t... \n",
682 | "16 8c5YY9DcoiE profession dancer tri fortnit danc challeng pr... \n",
683 | "17 MXexpBitoFI black bean noodl \n",
684 | "18 UNEi_TeKd5U india iceland travel k day budget trip \n",
685 | "19 Ww5L5cXUjKE happen earth sun die space scienc documentari \n",
686 | "20 i5QeyztIIT8 brief histori colorado time geolog colorado \n",
687 | "21 CRJqiMlGLk highlight night sky novemb astronomi space sci... \n",
688 | "22 EvopK4qTEg danc india danc season novemb \n",
689 | "23 LTS_VWTE7z nsd react bt boy w luv danc practic thing didn... \n",
690 | "24 itU6dp5tlA irish war independ minut \n",
691 | "25 OMmer1JRrvU danc india asia pacif \n",
692 | "26 rMiel4nt434 question onsen japan japan travel guid \n",
693 | "27 IGCVTSQw7WU brief histori univers crash cours astronomi \n",
694 | "28 _XxecflRFKU indian reaction pakistan tourism summit food t... \n",
695 | "29 dLkPiY3i1qM halal korean food singapor eatbook vlog ep \n",
696 | "... ... ... \n",
697 | "8345 NFN803DvgBQ travel japan day shibuya knu reunion \n",
698 | "8346 APC_jD95TH8 top tourist attract state main travel guid usa \n",
699 | "8347 YvGDafHwVj american tri bizarr russian food first time \n",
700 | "8348 _eFZM-fQgdA porsch boxster sport car manufactur fiber opti... \n",
701 | "8349 JKm3uzL_A4 wit day asteroid struck jaw drop virtual reali... \n",
702 | "8350 FwFqvOFefqg auto clinic bust myth women car patric bank te... \n",
703 | "8351 B14BiB-Bv3 bin process explan ncix tech tip \n",
704 | "8352 BU5t9m5SAiU travel india delhi \n",
705 | "8353 fenZhUxLZrQ hindi top futur weapon india drdo space scienc \n",
706 | "8354 FD3MPQyEub0 zotac game pax east highlight \n",
707 | "8355 SZQWtSgcO4 tip travel japan \n",
708 | "8356 Wyncg honest trailer solo star war stori \n",
709 | "8357 UNpyF58BY fourier transform visual introduct \n",
710 | "8358 NBSv_0yHnB0 nyc center space scienc educ \n",
711 | "8359 YJjL82-KORA hubbl telescop show spiral black hole power je... \n",
712 | "8360 0pBm2DjkzaQ salif lasourc michael jackson danc compil \n",
713 | "8361 9e6Oi__HGIA hotel se acha aur ghar ka maza jaipur travel g... \n",
714 | "8362 qUz25YgfTgI korean food seawe soup recip miyeokguk \n",
715 | "8363 Ov-hREl5wEY human evolut histori \n",
716 | "8364 RVDidS5Ynk north indian thali indian food delhi street fo... \n",
717 | "8365 5-tktFyIz ballet piec franc \n",
718 | "8366 jSJhesy2z travel japan ep vlog departur \n",
719 | "8367 0tim0_WzXY top fifteen femal ballet dancer \n",
720 | "8368 NSAgLvKOPLQ model atom timelin \n",
721 | "8369 JzZ4u5XxvFU probabl find electron beyond bohr radiu hydrog... \n",
722 | "8370 Tohl-nFCug peni fish korean street food noryangjin fisher... \n",
723 | "8371 Fr48ud9Yi paneer kofta recip hindi indian food made easi \n",
724 | "8372 LEkSQGzYn4E american danc american ambidextr sbsm school s... \n",
725 | "8373 8YuKbcHn1Ig wendymin cook mini korean bbq mini cook \n",
726 | "8374 J4jRR5r4-A pawn star star war collect season histori \n",
727 | "\n",
728 | " description category \n",
729 | "0 atdo perform strike train joyc theater nyc cho... 0 \n",
730 | "1 visit websit http www junglejoel com robonaut ... 4 \n",
731 | "2 visit websit http www junglejoel com european ... 4 \n",
732 | "3 first perform world danc 0 \n",
733 | "4 hello bodhaguru learn proudli present anim vid... 4 \n",
734 | "5 space news info http www coconutsciencelab com... 4 \n",
735 | "6 space scienc technolog 4 \n",
736 | "7 etienn cakpo guest perform profession dancer c... 0 \n",
737 | "8 india massiv countri overwhelm plan trip spend... 5 \n",
738 | "9 travel korea choic food overwhelm want tri eve... 1 \n",
739 | "10 visit websit http www junglejoel com video des... 4 \n",
740 | "11 love travel japan 5 \n",
741 | "12 rrb je 4 \n",
742 | "13 mayurbhanj 5 \n",
743 | "14 episod fridgecam show bring yet anoth ultim ba... 1 \n",
744 | "15 dada gunamuktananda yogi medit teacher bio dad... 4 \n",
745 | "16 pick squat kick game credit fortnit epic game ... 0 \n",
746 | "17 spici rice cake kimbap korean food mukbang eat... 1 \n",
747 | "18 watch budget iceland trip hindi heard peopl sa... 5 \n",
748 | "19 regist facebook happen earth sun die space sci... 4 \n",
749 | "20 minut movi illustr geolog evolut colorado time... 2 \n",
750 | "21 space news info http www coconutsciencelab com... 4 \n",
751 | "22 swarali 0 \n",
752 | "23 sign patreon ahl run bt bon voyag http www pat... 0 \n",
753 | "24 find tumultu time ireland led independ britain... 2 \n",
754 | "25 danc india asia pacif return singapor third ti... 0 \n",
755 | "26 question onsen japan japan travel guid http ww... 5 \n",
756 | "27 thank wonder physic astronom map timelin unive... 2 \n",
757 | "28 join us premium benefit http www youtub com ch... 5 \n",
758 | "29 sinc korean dish contain pork may tough muslim... 1 \n",
759 | "... ... ... \n",
760 | "8345 spent second day shibuya walk around shop met ... 5 \n",
761 | "8346 http ultramodern home ru top tourist attract s... 5 \n",
762 | "8347 love pickl check awesom video buzzfeedvideo mu... 1 \n",
763 | "8348 watch porsch boxster sport car manufactur fibe... 3 \n",
764 | "8349 asteroidday friday june wit extraordinari jour... 4 \n",
765 | "8350 find femal mechan becom one patric bank discus... 3 \n",
766 | "8351 episod ncix tech tip linu explain bin process ... 3 \n",
767 | "8352 travel india delhi video first hour day india ... 5 \n",
768 | "8353 hindi video space scienc told futur project mi... 4 \n",
769 | "8354 watch day pax east game event featur game acti... 3 \n",
770 | "8355 watch month long road trip across japan http w... 5 \n",
771 | "8356 today episod brought u armi join team make dif... 2 \n",
772 | "8357 anim introduct fourier transform home page htt... 4 \n",
773 | "8358 nyc center space scienc educ experienti space ... 4 \n",
774 | "8359 visit websit http www junglejoel com hubbl spa... 4 \n",
775 | "8360 salif lasourc michael jackson danc compil tag ... 0 \n",
776 | "8361 travel paaji show uniqu beauti airbnb properti... 5 \n",
777 | "8362 healthi comfort soup multi task nourish stapl ... 1 \n",
778 | "8363 biolog human histori evolut 2 \n",
779 | "8364 super delici thali street delhi india delhi fo... 1 \n",
780 | "8365 choreograph adi morgan perform adi morgan edit... 0 \n",
781 | "8366 hello guy japan probabl favourit place go visi... 5 \n",
782 | "8367 list top fifteen favorit femal ballet dancer r... 0 \n",
783 | "8368 see chemistri video check http socrat org chem... 4 \n",
784 | "8369 solut question chapter quantum physic hc verma 4 \n",
785 | "8370 peni fish krw usd noryangjin fisheri wholesal ... 1 \n",
786 | "8371 paneer kofta recip hindi indian food made easi... 1 \n",
787 | "8372 american danc american ambidextr sbsm school s... 0 \n",
788 | "8373 make mini edibl korean bbq use everyth miniatur 1 \n",
789 | "8374 owner valuabl origin star war figurin throw hi... 2 \n",
790 | "\n",
791 | "[8375 rows x 4 columns]"
792 | ]
793 | },
794 | "execution_count": 295,
795 | "metadata": {},
796 | "output_type": "execute_result"
797 | }
798 | ],
799 | "source": [
800 | "df_new"
801 | ]
802 | },
803 | {
804 | "cell_type": "markdown",
805 | "metadata": {},
806 | "source": [
807 | "# Creating the bag of words model"
808 | ]
809 | },
810 | {
811 | "cell_type": "code",
812 | "execution_count": 250,
813 | "metadata": {},
814 | "outputs": [],
815 | "source": [
816 | "from sklearn.feature_extraction.text import CountVectorizer\n",
817 | "cv = CountVectorizer(max_features = 1500)\n",
818 | "# CountVectorizer.fit_transform ignores its 2nd argument (y), so join title + description per video into one document\n",
819 | "X = cv.fit_transform([title + ' ' + desc for title, desc in zip(corpus, corpus1)]).toarray()\n",
820 | "y = df_new.iloc[:, 3].values"
821 | ]
822 | },
823 | {
824 | "cell_type": "markdown",
825 | "metadata": {},
826 | "source": [
827 | "# Splitting the dataset into the Training set and Test set "
828 | ]
829 | },
830 | {
831 | "cell_type": "code",
832 | "execution_count": 251,
833 | "metadata": {},
834 | "outputs": [],
835 | "source": [
836 | "from sklearn.model_selection import train_test_split\n",
837 | "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)"
838 | ]
839 | },
840 | {
841 | "cell_type": "markdown",
842 | "metadata": {},
843 | "source": [
844 | "# Random Forest "
845 | ]
846 | },
847 | {
848 | "cell_type": "code",
849 | "execution_count": 462,
850 | "metadata": {},
851 | "outputs": [
852 | {
853 | "data": {
854 | "text/plain": [
855 | "RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',\n",
856 | " max_depth=None, max_features='auto', max_leaf_nodes=None,\n",
857 | " min_impurity_decrease=0.0, min_impurity_split=None,\n",
858 | " min_samples_leaf=1, min_samples_split=2,\n",
859 | " min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=None,\n",
860 | " oob_score=False, random_state=None, verbose=0,\n",
861 | " warm_start=False)"
862 | ]
863 | },
864 | "execution_count": 462,
865 | "metadata": {},
866 | "output_type": "execute_result"
867 | }
868 | ],
869 | "source": [
870 | "from sklearn.ensemble import RandomForestClassifier\n",
871 | "classifier = RandomForestClassifier(n_estimators = 1000, criterion = 'entropy', random_state = 0)  # seeded so the forest (and its reported score) is reproducible\n",
872 | "classifier.fit(X_train, y_train)"
873 | ]
874 | },
875 | {
876 | "cell_type": "code",
877 | "execution_count": 463,
878 | "metadata": {},
879 | "outputs": [],
880 | "source": [
881 | "y_pred = classifier.predict(X_test)"
882 | ]
883 | },
884 | {
885 | "cell_type": "code",
886 | "execution_count": 464,
887 | "metadata": {},
888 | "outputs": [
889 | {
890 | "data": {
891 | "text/plain": [
892 | "0.9605970149253731"
893 | ]
894 | },
895 | "execution_count": 464,
896 | "metadata": {},
897 | "output_type": "execute_result"
898 | }
899 | ],
900 | "source": [
901 | "classifier.score(X_test, y_test)"
902 | ]
903 | },
904 | {
905 | "cell_type": "code",
906 | "execution_count": 485,
907 | "metadata": {},
908 | "outputs": [
909 | {
910 | "name": "stdout",
911 | "output_type": "stream",
912 | "text": [
913 | " precision recall f1-score support\n",
914 | "\n",
915 | " Art & Dance 0.95 0.97 0.96 313\n",
916 | " Food 0.96 0.99 0.98 272\n",
917 | " History 0.96 0.98 0.97 287\n",
918 | "Manufacturing 0.95 0.94 0.94 241\n",
919 | " Science 0.97 0.94 0.96 289\n",
920 | " Travel 0.98 0.93 0.96 273\n",
921 | "\n",
922 | " micro avg 0.96 0.96 0.96 1675\n",
923 | " macro avg 0.96 0.96 0.96 1675\n",
924 | " weighted avg 0.96 0.96 0.96 1675\n",
925 | "\n"
926 | ]
927 | }
928 | ],
929 | "source": [
930 | "print(classification_report(y_test, y_pred))"
931 | ]
932 | },
933 | {
934 | "cell_type": "code",
935 | "execution_count": 465,
936 | "metadata": {},
937 | "outputs": [],
938 | "source": [
939 | "# Making the Confusion Matrix\n",
940 | "from sklearn.metrics import confusion_matrix\n",
941 | "cm = confusion_matrix(y_test, y_pred)"
942 | ]
943 | },
944 | {
945 | "cell_type": "code",
946 | "execution_count": 466,
947 | "metadata": {},
948 | "outputs": [
949 | {
950 | "data": {
951 | "text/plain": [
952 | "array([[305, 0, 2, 3, 0, 3],\n",
953 | " [ 0, 269, 0, 1, 0, 2],\n",
954 | " [ 2, 1, 281, 1, 2, 0],\n",
955 | " [ 5, 1, 3, 226, 5, 1],\n",
956 | " [ 6, 0, 6, 4, 273, 0],\n",
957 | " [ 3, 8, 2, 3, 2, 255]])"
958 | ]
959 | },
960 | "execution_count": 466,
961 | "metadata": {},
962 | "output_type": "execute_result"
963 | }
964 | ],
965 | "source": [
966 | "cm"
967 | ]
968 | },
969 | {
970 | "cell_type": "markdown",
971 | "metadata": {},
972 | "source": [
973 | "# SVM "
974 | ]
975 | },
976 | {
977 | "cell_type": "code",
978 | "execution_count": 467,
979 | "metadata": {},
980 | "outputs": [
981 | {
982 | "data": {
983 | "text/plain": [
984 | "SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,\n",
985 | " decision_function_shape='ovr', degree=3, gamma='auto_deprecated',\n",
986 | " kernel='linear', max_iter=-1, probability=False, random_state=0,\n",
987 | " shrinking=True, tol=0.001, verbose=False)"
988 | ]
989 | },
990 | "execution_count": 467,
991 | "metadata": {},
992 | "output_type": "execute_result"
993 | }
994 | ],
995 | "source": [
996 | "from sklearn.svm import SVC\n",
997 | "classifier1 = SVC(kernel = 'linear', random_state = 0)\n",
998 | "classifier1.fit(X_train, y_train)"
999 | ]
1000 | },
1001 | {
1002 | "cell_type": "code",
1003 | "execution_count": 468,
1004 | "metadata": {},
1005 | "outputs": [],
1006 | "source": [
1007 | "y_pred1 = classifier1.predict(X_test)"
1008 | ]
1009 | },
1010 | {
1011 | "cell_type": "code",
1012 | "execution_count": 469,
1013 | "metadata": {},
1014 | "outputs": [
1015 | {
1016 | "data": {
1017 | "text/plain": [
1018 | "0.9564179104477611"
1019 | ]
1020 | },
1021 | "execution_count": 469,
1022 | "metadata": {},
1023 | "output_type": "execute_result"
1024 | }
1025 | ],
1026 | "source": [
1027 | "classifier1.score(X_test, y_test)"
1028 | ]
1029 | },
1030 | {
1031 | "cell_type": "code",
1032 | "execution_count": 470,
1033 | "metadata": {},
1034 | "outputs": [],
1035 | "source": [
1036 | "# Making the Confusion Matrix\n",
1037 | "cm1 = confusion_matrix(y_test, y_pred1)"
1038 | ]
1039 | },
1040 | {
1041 | "cell_type": "code",
1042 | "execution_count": 471,
1043 | "metadata": {},
1044 | "outputs": [
1045 | {
1046 | "data": {
1047 | "text/plain": [
1048 | "array([[301, 0, 4, 6, 0, 2],\n",
1049 | " [ 0, 266, 0, 1, 0, 5],\n",
1050 | " [ 2, 1, 278, 3, 2, 1],\n",
1051 | " [ 0, 1, 4, 229, 4, 3],\n",
1052 | " [ 1, 0, 9, 9, 270, 0],\n",
1053 | " [ 2, 4, 1, 6, 2, 258]])"
1054 | ]
1055 | },
1056 | "execution_count": 471,
1057 | "metadata": {},
1058 | "output_type": "execute_result"
1059 | }
1060 | ],
1061 | "source": [
1062 | "cm1"
1063 | ]
1064 | },
1065 | {
1066 | "cell_type": "markdown",
1067 | "metadata": {},
1068 | "source": [
1069 | "# Naive Bayes "
1070 | ]
1071 | },
1072 | {
1073 | "cell_type": "code",
1074 | "execution_count": 472,
1075 | "metadata": {},
1076 | "outputs": [
1077 | {
1078 | "data": {
1079 | "text/plain": [
1080 | "GaussianNB(priors=None, var_smoothing=1e-09)"
1081 | ]
1082 | },
1083 | "execution_count": 472,
1084 | "metadata": {},
1085 | "output_type": "execute_result"
1086 | }
1087 | ],
1088 | "source": [
1089 | "from sklearn.naive_bayes import GaussianNB\n",
1090 | "classifier2 = GaussianNB()\n",
1091 | "classifier2.fit(X_train, y_train)"
1092 | ]
1093 | },
1094 | {
1095 | "cell_type": "code",
1096 | "execution_count": 473,
1097 | "metadata": {},
1098 | "outputs": [],
1099 | "source": [
1100 | "y_pred2 = classifier2.predict(X_test)"
1101 | ]
1102 | },
1103 | {
1104 | "cell_type": "code",
1105 | "execution_count": 474,
1106 | "metadata": {},
1107 | "outputs": [
1108 | {
1109 | "data": {
1110 | "text/plain": [
1111 | "0.8107462686567164"
1112 | ]
1113 | },
1114 | "execution_count": 474,
1115 | "metadata": {},
1116 | "output_type": "execute_result"
1117 | }
1118 | ],
1119 | "source": [
1120 | "classifier2.score(X_test, y_test)"
1121 | ]
1122 | },
1123 | {
1124 | "cell_type": "code",
1125 | "execution_count": 475,
1126 | "metadata": {},
1127 | "outputs": [],
1128 | "source": [
1129 | "# Confusion matrix comparing the Gaussian Naive Bayes predictions (y_pred2) with y_test\n",
1130 | "cm2 = confusion_matrix(y_test, y_pred2)"
1131 | ]
1132 | },
1133 | {
1134 | "cell_type": "code",
1135 | "execution_count": 476,
1136 | "metadata": {},
1137 | "outputs": [
1138 | {
1139 | "data": {
1140 | "text/plain": [
1141 | "array([[289, 4, 10, 2, 0, 8],\n",
1142 | " [ 0, 253, 1, 0, 0, 18],\n",
1143 | " [ 40, 4, 194, 16, 10, 23],\n",
1144 | " [ 1, 7, 4, 219, 4, 6],\n",
1145 | " [ 4, 6, 59, 15, 199, 6],\n",
1146 | " [ 3, 55, 4, 0, 7, 204]])"
1147 | ]
1148 | },
1149 | "execution_count": 476,
1150 | "metadata": {},
1151 | "output_type": "execute_result"
1152 | }
1153 | ],
1154 | "source": [
1155 | "cm2"
1156 | ]
1157 | },
1158 | {
1159 | "cell_type": "markdown",
1160 | "metadata": {},
1161 | "source": [
1162 | "# XGBoost"
1163 | ]
1164 | },
1165 | {
1166 | "cell_type": "code",
1167 | "execution_count": 477,
1168 | "metadata": {},
1169 | "outputs": [
1170 | {
1171 | "data": {
1172 | "text/plain": [
1173 | "XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,\n",
1174 | " colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,\n",
1175 | " max_depth=3, min_child_weight=1, missing=None, n_estimators=100,\n",
1176 | " n_jobs=1, nthread=None, objective='multi:softprob', random_state=0,\n",
1177 | " reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,\n",
1178 | " silent=True, subsample=1)"
1179 | ]
1180 | },
1181 | "execution_count": 477,
1182 | "metadata": {},
1183 | "output_type": "execute_result"
1184 | }
1185 | ],
1186 | "source": [
1187 | "from xgboost import XGBClassifier\n",
1188 | "classifier3 = XGBClassifier()\n",
1189 | "classifier3.fit(X_train, y_train)"
1190 | ]
1191 | },
1192 | {
1193 | "cell_type": "code",
1194 | "execution_count": 293,
1195 | "metadata": {},
1196 | "outputs": [],
1197 | "source": [
1198 | "y_pred3 = classifier3.predict(X_test)"
1199 | ]
1200 | },
1201 | {
1202 | "cell_type": "code",
1203 | "execution_count": 271,
1204 | "metadata": {},
1205 | "outputs": [
1206 | {
1207 | "data": {
1208 | "text/plain": [
1209 | "0.937910447761194"
1210 | ]
1211 | },
1212 | "execution_count": 271,
1213 | "metadata": {},
1214 | "output_type": "execute_result"
1215 | }
1216 | ],
1217 | "source": [
1218 | "classifier3.score(X_test, y_test)"
1219 | ]
1220 | },
1221 | {
1222 | "cell_type": "code",
1223 | "execution_count": 458,
1224 | "metadata": {},
1225 | "outputs": [],
1226 | "source": [
1227 | "# Confusion matrix comparing the XGBoost predictions (y_pred3) with y_test\n",
1228 | "cm3 = confusion_matrix(y_test, y_pred3)"
1229 | ]
1230 | },
1231 | {
1232 | "cell_type": "code",
1233 | "execution_count": 459,
1234 | "metadata": {},
1235 | "outputs": [
1236 | {
1237 | "data": {
1238 | "text/plain": [
1239 | "array([[287, 0, 3, 20, 0, 3],\n",
1240 | " [ 0, 264, 1, 4, 0, 3],\n",
1241 | " [ 0, 1, 275, 9, 2, 0],\n",
1242 | " [ 1, 1, 2, 235, 2, 0],\n",
1243 | " [ 0, 0, 3, 28, 258, 0],\n",
1244 | " [ 0, 7, 0, 14, 0, 252]])"
1245 | ]
1246 | },
1247 | "execution_count": 459,
1248 | "metadata": {},
1249 | "output_type": "execute_result"
1250 | }
1251 | ],
1252 | "source": [
1253 | "cm3"
1254 | ]
1255 | }
1256 | ],
1257 | "metadata": {
1258 | "kernelspec": {
1259 | "display_name": "Python 3",
1260 | "language": "python",
1261 | "name": "python3"
1262 | },
1263 | "language_info": {
1264 | "codemirror_mode": {
1265 | "name": "ipython",
1266 | "version": 2
1267 | },
1268 | "file_extension": ".py",
1269 | "mimetype": "text/x-python",
1270 | "name": "python",
1271 | "nbconvert_exporter": "python",
1272 | "pygments_lexer": "ipython2",
1273 | "version": "2.7.15rc1"
1274 | }
1275 | },
1276 | "nbformat": 4,
1277 | "nbformat_minor": 2
1278 | }
1279 |
--------------------------------------------------------------------------------