├── README.md
└── Twitter User Personality Prediction
    ├── .idea
    │   ├── .name
    │   ├── Twitter User Personality Prediction.iml
    │   ├── encodings.xml
    │   ├── misc.xml
    │   ├── modules.xml
    │   └── workspace.xml
    ├── .ipynb_checkpoints
    │   └── Personality Predictor and Visualizer-checkpoint.ipynb
    ├── Personality Prediction from Twitter Data.pdf
    ├── Personality Predictor and Visualizer.ipynb
    ├── TwitterData
    │   ├── StopWords.txt
    │   ├── UserTweets.txt
    │   ├── k_means_geo_gt_8_out
    │   ├── labeledPersonalityTweets.csv
    │   ├── survey_dump.csv
    │   ├── survey_dump_geo_gt_8_1
    │   ├── survey_dump_with_geo
    │   ├── survey_dump_with_geo_gt_8
    │   └── survey_dump_with_tweet_count
    └── mmds
        ├── __init__.py
        ├── supervised
        │   ├── __init__.py
        │   ├── classification_algos.py
        │   ├── feature_engineering.py
        │   ├── filter_stop_words.py
        │   ├── personality_predictor_and_visualizer.py
        │   ├── preprocess_tweets.py
        │   └── tweet_analysis.py
        ├── unsupervised
        │   ├── __init__.py
        │   ├── k_means_estimator.py
        │   └── k_means_plot.py
        └── utils
            ├── __init__.py
            ├── plot_utils.py
            └── time_utils.py
/README.md:
--------------------------------------------------------------------------------
1 | # Twitter User Personality Prediction
2 | Predicts a person's personality from their Twitter data, using sentiment analysis and unigram word counts as features.
3 |
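4 | A minimal sketch of the feature pipeline described above (unigram bag-of-words plus one sentiment score per user, feeding a k-NN classifier with k = 25 as chosen in the notebook). It assumes scikit-learn, TextBlob, and NumPy are installed; the function name and the `docs`/`labels` inputs are illustrative, not part of this repository's API:
5 |
6 | ```python
7 | import numpy as np
8 | from textblob import TextBlob
9 | from sklearn.feature_extraction.text import CountVectorizer
10 | from sklearn.neighbors import KNeighborsClassifier
11 |
12 | def build_features(docs):
13 |     """Unigram bag-of-words counts stacked with one sentiment polarity score per document."""
14 |     vectorizer = CountVectorizer(ngram_range=(1, 1))  # unigram counts only
15 |     bag = vectorizer.fit_transform(docs).toarray()
16 |     sentiment = np.array([TextBlob(d).sentiment.polarity for d in docs])
17 |     return np.column_stack((bag, sentiment)), vectorizer
18 |
19 | # Usage (illustrative): docs is a list of cleaned, concatenated tweets per user,
20 | # labels holds the personality class ids derived from the survey responses.
21 | # X, vec = build_features(docs)
22 | # clf = KNeighborsClassifier(n_neighbors=25).fit(X, labels)
23 | ```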
--------------------------------------------------------------------------------
/Twitter User Personality Prediction/.idea/.name:
--------------------------------------------------------------------------------
1 | Twitter User Personality Prediction
--------------------------------------------------------------------------------
/Twitter User Personality Prediction/.idea/Twitter User Personality Prediction.iml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/Twitter User Personality Prediction/.idea/encodings.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/Twitter User Personality Prediction/.idea/misc.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/Twitter User Personality Prediction/.idea/modules.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/Twitter User Personality Prediction/.idea/workspace.xml:
--------------------------------------------------------------------------------
(IDE workspace state: the XML markup was stripped in this export, leaving only stray values such as run timestamps and a breakpoint reference to FeatureEngineering.py)
--------------------------------------------------------------------------------
/Twitter User Personality Prediction/Personality Prediction from Twitter Data.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vishalbhalla/Twitter-User-Personality-Prediction/4e0f8641aaea01b550151b150bf4f54437b72179/Twitter User Personality Prediction/Personality Prediction from Twitter Data.pdf
--------------------------------------------------------------------------------
/Twitter User Personality Prediction/Personality Predictor and Visualizer.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "## Twitter User Personality Prediction"
8 | ]
9 | },
10 | {
11 | "cell_type": "markdown",
12 | "metadata": {},
13 | "source": [
14 | "### Read the input data files."
15 | ]
16 | },
17 | {
18 | "cell_type": "code",
19 | "execution_count": 14,
20 | "metadata": {
21 | "collapsed": false
22 | },
23 | "outputs": [],
24 | "source": [
25 | "import numpy as np\n",
26 | "import os\n",
27 | "# XTrain = np.loadtxt('training_data.csv', skiprows=1, usecols=(0,1), delimiter=',', dtype='str')\n",
28 | "# print XTrain[0]\n",
29 | "# # XTest = np.loadtxt('test_data_public_new.csv', skiprows=1, usecols=(1,), delimiter=',',dtype='str')\n",
30 | "# # print XTest[0]\n",
31 | "# YTrain = XTrain[:,-1]\n",
32 | "# XTrain = XTrain[:,0]\n",
33 | "# print XTrain.shape\n",
34 | "# YTrain = YTrain.astype(np.int)\n",
35 | "# # YTrain = YTrain.reshape(-1,1)\n",
36 | "# print type(YTrain[0]),YTrain[0],YTrain.shape\n",
37 | "# # print XTest.shape"
38 | ]
39 | },
40 | {
41 | "cell_type": "code",
42 | "execution_count": 15,
43 | "metadata": {
44 | "collapsed": false
45 | },
46 | "outputs": [
47 | {
48 | "name": "stdout",
49 | "output_type": "stream",
50 | "text": [
51 | "80\n"
52 | ]
53 | }
54 | ],
55 | "source": [
56 | "#Read the tweets one by one and process it\n",
57 | "import csv\n",
58 | "# inpTweets = csv.reader(open('TwitterData/survey_dump.csv', 'rb'), delimiter=',')\n",
59 | "inpTweets = csv.reader(open('TwitterData/survey_dump_with_tweet_count', 'rb'), delimiter=',')\n",
60 | "i = 0\n",
61 | "for row in inpTweets:\n",
62 | " i+=1;\n",
63 | "\n",
64 | "print i\n"
65 | ]
66 | },
67 | {
68 | "cell_type": "markdown",
69 | "metadata": {},
70 | "source": [
71 | "### Pre-process Tweets"
72 | ]
73 | },
74 | {
75 | "cell_type": "code",
76 | "execution_count": 16,
77 | "metadata": {
78 | "collapsed": true
79 | },
80 | "outputs": [],
81 | "source": [
82 | "import re\n",
83 | "\n",
84 | "class PreprocessTweets:\n",
85 | "\n",
86 | " def __init__(self):\n",
87 | " self.name = 'PreprocessTweets'\n",
88 | "\n",
89 | " #start process_tweet\n",
90 | " def processTweet(self, tweet):\n",
91 | " \n",
92 | " #Convert to lower case\n",
93 | " tweet = tweet.lower()\n",
94 | " #Convert www.* or https?://* to URL\n",
95 | " tweet = re.sub('((www\\.[^\\s]+)|(https?://[^\\s]+))','URL',tweet)\n",
96 | " #Convert @username to AT_USER\n",
97 | " tweet = re.sub('@[^\\s]+','AT_USER',tweet)\n",
98 | " #Remove additional white spaces\n",
99 | " tweet = re.sub('[\\s]+', ' ', tweet)\n",
100 | " #Remove special characters\n",
101 | " #tweet = re.sub('*\\[\\]%\\(\\)', '', tweet)\n",
102 | " #Replace #word with word\n",
103 | " tweet = re.sub(r'#([^\\s]+)', r'\\1', tweet)\n",
104 | " #trim\n",
105 | " tweet = tweet.strip('\\'\"')\n",
106 | "\n",
107 | " # Remove all Non-ASCII characters\n",
108 | " tweet = re.sub(r'[^\\x00-\\x7F]+',' ', tweet)\n",
109 | "\n",
110 | " return tweet\n",
111 | "\n"
112 | ]
113 | },
114 | {
115 | "cell_type": "code",
116 | "execution_count": 17,
117 | "metadata": {
118 | "collapsed": false
119 | },
120 | "outputs": [],
121 | "source": [
122 | "\n",
123 | "# import PreprocessTweets\n",
124 | "\n",
125 | "class FilterStopWords:\n",
126 | "\n",
127 | " # stopWords = []\n",
128 | " def __init__(self):\n",
129 | " self.name = 'FilterStopWords'\n",
130 | " #initialize stopWords\n",
131 | " self.stopWords = []\n",
132 | "\n",
133 | " #start replaceTwoOrMore\n",
134 | " # def replaceTwoOrMore(s):\n",
135 | " # #look for 2 or more repetitions of character and replace with the character itself\n",
136 | " # pattern = re.compile(r\"(.)\\1{1,}\", re.DOTALL)\n",
137 | " # return pattern.sub(r\"\\1\\1\", s)\n",
138 | " #end\n",
139 | "\n",
140 | " def getStopWordList(self, stopWordListFileName):\n",
141 | " #read the stopwords file and build a list\n",
142 | " stopWords = []\n",
143 | " stopWords.append('AT_USER')\n",
144 | " stopWords.append('URL')\n",
145 | " stopWords.append('[')\n",
146 | " stopWords.append(']')\n",
147 | "\n",
148 | " fp = open(stopWordListFileName, 'r')\n",
149 | " line = fp.readline()\n",
150 | " while line:\n",
151 | " word = line.strip()\n",
152 | " stopWords.append(word)\n",
153 | " line = fp.readline()\n",
154 | " fp.close()\n",
155 | " return stopWords\n",
156 | " \n",
157 | " def getFeatureVector(self, tweet, stopWords):\n",
158 | " featureVector = []\n",
159 | " #split tweet into words\n",
160 | " words = tweet.split()\n",
161 | " for w in words:\n",
162 | " #replace two or more with two occurrences\n",
163 | " #w = replaceTwoOrMore(w)\n",
164 | " #strip punctuation\n",
165 | " w = w.strip('\\'\"?,.')\n",
166 | " #check if the word starts with a letter\n",
167 | " val = re.search(r\"^[a-zA-Z][a-zA-Z0-9]*$\", w)\n",
168 | " #ignore if it is a stop word\n",
169 | " if (w in stopWords or val is None):\n",
170 | " continue\n",
171 | " else:\n",
172 | " featureVector.append(w.lower())\n",
173 | " return featureVector\n"
174 | ]
175 | },
176 | {
177 | "cell_type": "markdown",
178 | "metadata": {},
179 | "source": [
180 | "### Feature Engineering"
181 | ]
182 | },
183 | {
184 | "cell_type": "code",
185 | "execution_count": 18,
186 | "metadata": {
187 | "collapsed": false
188 | },
189 | "outputs": [],
190 | "source": [
191 | "# XTrain = []\n",
192 | "# YTrain = []\n",
193 | "# XTrainFeatures = []\n",
194 | "# XTrainSentiment = []\n",
195 | "# XTrainFreqTweets = []\n",
196 | "# geo_latitude = []\n",
197 | "# geo_longitude = []\n",
198 | "\n",
199 | "# # sample = \"{\"\"“@AMBITIOUS_SLIM: Fresh could have deleted my one of my double haves” huh\"\",\"\"@DJFreshery cause I can see u snappin off but doing it lowkey\"\",\"\"“@RAWmartini: The #WellCumThruThenMovement” best movement ever\"\",\"\"@HennessyBronze I do what I want tho\"\"}\"\n",
200 | "# # sample = sample.replace('\"\",\"\"',\" \")\n",
201 | "# # sample = sample.replace('\"\"',\" \")\n",
202 | "# # print sample\n",
203 | "# # wordsList = sample.split()\n",
204 | "\n",
205 | "\n",
206 | "# # newwordsList = [word.split() for word in wordsList]\n",
207 | "# # print newwordsList\n",
208 | "# # filtered_words = [word for word in newwordsList if word not in stopwords.words('english')]\n",
209 | "# # print filtered_words[0]\n",
210 | "# # filteredTweets = ' '.join(filtered_words)\n",
211 | "# # print filteredTweets\n",
212 | "\n",
213 | "\n",
214 | "# # from PreprocessTweets import PreprocessTweets\n",
215 | "# # from FilterStopWords import FilterStopWords\n",
216 | "# import nltk\n",
217 | "# # from nltk.sentiment.vader import SentimentIntensityAnalyzer\n",
218 | "# from textblob import TextBlob\n",
219 | "\n",
220 | "# # nltk.download()\n",
221 | "# from nltk.corpus import stopwords\n",
222 | "\n",
223 | "# class FeatureEngineering:\n",
224 | "\n",
225 | "# def __init__(self):\n",
226 | "# self.name = 'FeatureEngineering'\n",
227 | "# self.featureList = []\n",
228 | "# # self.sid = SentimentIntensityAnalyzer()\n",
229 | "\n",
230 | "\n",
231 | "# #start extract_features\n",
232 | "# def extract_features(self,tweet):\n",
233 | "# tweet_words = set(tweet)\n",
234 | "# features = {}\n",
235 | "# for word in self.featureList:\n",
236 | "# features['contains(%s)' % word] = (word in tweet_words)\n",
237 | "# return features\n",
238 | "\n",
239 | "# ## Create New Training set based on personality labels predicted from Survey results\n",
240 | "\n",
241 | "# def createNewTrainingSet(self):\n",
242 | "\n",
243 | "# objFilterStopWords = FilterStopWords()\n",
244 | "# objPreprocessTweets = PreprocessTweets()\n",
245 | "\n",
246 | "# stopWords = objFilterStopWords.getStopWordList('TwitterData/StopWords.txt')\n",
247 | " \n",
248 | " \n",
249 | "# #Read the tweets one by one and process it\n",
250 | "# # inpTweets = csv.reader(open('TwitterData/survey_dump.csv', 'rb'), delimiter=',') #, quotechar='|')\n",
251 | "# inpTweets = csv.reader(open('TwitterData/survey_dump_with_tweet_count', 'rb'), delimiter=',')\n",
252 | "# inpTweets.next()\n",
253 | "# tweets = []\n",
254 | "# i = 0\n",
255 | "# for row in inpTweets:\n",
256 | "# # print row\n",
257 | "# personality = row[5]\n",
258 | "# # print personality\n",
259 | "# tweet = row[1]\n",
260 | "# cleanTweet = tweet.replace('\"\",\"\"',\" \")\n",
261 | "# cleanTweet = cleanTweet.replace('\"\"',\" \")\n",
262 | "# # print tweet\n",
263 | "# processedTweet = objPreprocessTweets.processTweet(cleanTweet)\n",
264 | "# # print processedTweet\n",
265 | "\n",
266 | "# XTrainFreqTweets.append(int(row[4]))\n",
267 | "# wordsList = processedTweet.split()\n",
268 | "# # print wordsList\n",
269 | " \n",
270 | "# # Remove stop words\n",
271 | "# # filtered_words = [word for word in processedTweet if word not in stopwords.words('english')]\n",
272 | "# filtered_words = [word for word in wordsList if word not in stopwords.words('english')]\n",
273 | "# # print filtered_words\n",
274 | "# filteredTweets = ' '.join(filtered_words)\n",
275 | " \n",
276 | "# featureVector = objFilterStopWords.getFeatureVector(processedTweet, stopWords)\n",
277 | " \n",
278 | "# geo_latitude.append(float(row[2]))\n",
279 | "# geo_longitude.append(float(row[3]))\n",
280 | " \n",
281 | "# # Append to feature list to collect total words\n",
282 | "# # for word in featureVector:\n",
283 | "# # self.featureList.append(word)\n",
284 | "# # featureList.append([featureVector[i] for i in xrange(len(featureVector))])\n",
285 | "\n",
286 | "# # Use NLTK Vader for Sentiment Analysis\n",
287 | "\n",
288 | "# # Citation: Hutto, C.J. & Gilbert, E.E. (2014). VADER: A Parsimonious Rule-based Model for Sentiment Analysis of Social Media Text.\n",
289 | "# # Eighth International Conference on Weblogs and Social Media (ICWSM-14). Ann Arbor, MI, June 2014.\n",
290 | "# # Extract sentiment based on the tweet.\n",
291 | "# # ss = self.sid.polarity_scores(row)\n",
292 | "# # for k in sorted(ss):\n",
293 | "# # print('{0}: {1}, '.format(k, ss[k]))\n",
294 | "# #\n",
295 | "# # totSentiment = sorted(ss)[0]\n",
296 | "\n",
297 | "# # Use TextBlob for Sentiment Analysis\n",
298 | "# # print tweet\n",
299 | "# # blob = TextBlob(tweet)\n",
300 | " \n",
301 | "# blob = TextBlob(processedTweet)\n",
302 | "# # print blob\n",
303 | "# sentiment = 0\n",
304 | "# for sentence in blob.sentences:\n",
305 | "# # print sentence\n",
306 | "# sentiment += sentence.sentiment.polarity\n",
307 | "# # print sentiment\n",
308 | "\n",
309 | "# totSentiment = sentiment/ len(blob.sentences)\n",
310 | "# # featureVector.append(totSentiment)\n",
311 | "\n",
312 | "# XTrainSentiment.append(totSentiment)\n",
313 | " \n",
314 | "# # strFeatures = [item.lower() for item in featureVector]\n",
315 | " \n",
316 | "# # XTrainFeatures.append(processedTweet)\n",
317 | "# XTrainFeatures.append(filteredTweets)\n",
318 | " \n",
319 | "# YTrain.append(personality)\n",
320 | " \n",
321 | "# tweets.append((featureVector, personality))\n",
322 | " \n",
323 | "# # i+=1\n",
324 | "# # if i==3:\n",
325 | "# # break\n",
326 | " \n",
327 | "# #end loop\n",
328 | "# # print tweets\n",
329 | "# # print self.featureList\n",
330 | "# # Remove featureList duplicates\n",
331 | "# # featureList = list(set(self.featureList))\n",
332 | "\n",
333 | "# # Extract feature vector for all tweets in one shot\n",
334 | "# training_set = nltk.classify.util.apply_features(self.extract_features, tweets)\n",
335 | "\n",
336 | "# # print self.featureList\n",
337 | "# # print training_set\n",
338 | "\n",
339 | "\n",
340 | " \n",
341 | "# return training_set\n"
342 | ]
343 | },
344 | {
345 | "cell_type": "code",
346 | "execution_count": 19,
347 | "metadata": {
348 | "collapsed": true
349 | },
350 | "outputs": [],
351 | "source": [
352 | "\n",
353 | "import nltk\n",
354 | "from textblob import TextBlob\n",
355 | "# nltk.download()\n",
356 | "from nltk.corpus import stopwords\n",
357 | "\n",
358 | "class FeatureEngineering:\n",
359 | "\n",
360 | " def __init__(self):\n",
361 | " self.name = 'FeatureEngineering'\n",
362 | " self.featureList = []\n",
363 | " # self.sid = SentimentIntensityAnalyzer()\n",
364 | "\n",
365 | "\n",
366 | " #start extract_features\n",
367 | " def extract_features(self,tweet):\n",
368 | " tweet_words = set(tweet)\n",
369 | " features = {}\n",
370 | " for word in self.featureList:\n",
371 | " features['contains(%s)' % word] = (word in tweet_words)\n",
372 | " return features\n",
373 | "\n",
374 | "## Create New Training set based on personality labels predicted from Survey results\n",
375 | "\n",
376 | " def createNewTrainingSet(self, fileName):\n",
377 | " XTrain = []\n",
378 | " YTrain = []\n",
379 | " XTrainFeatures = []\n",
380 | " XTrainSentiment = []\n",
381 | " XTrainFreqTweets = []\n",
382 | " geo_latitude = []\n",
383 | " geo_longitude = []\n",
384 | " \n",
385 | " objFilterStopWords = FilterStopWords()\n",
386 | " objPreprocessTweets = PreprocessTweets()\n",
387 | "\n",
388 | " stopWords = objFilterStopWords.getStopWordList('TwitterData/StopWords.txt')\n",
389 | " \n",
390 | " #Read the tweets one by one and process it\n",
391 | " inpTweets = csv.reader(open(fileName, 'rb'), delimiter=',')\n",
392 | " inpTweets.next()\n",
393 | " tweets = []\n",
394 | " i = 0\n",
395 | " for row in inpTweets:\n",
396 | "# print row\n",
397 | " personality = row[5]\n",
398 | " tweet = row[1]\n",
399 | " cleanTweet = tweet.replace('\"\",\"\"',\" \")\n",
400 | " cleanTweet = cleanTweet.replace('\"\"',\" \")\n",
401 | " processedTweet = objPreprocessTweets.processTweet(cleanTweet)\n",
402 | "\n",
403 | " XTrainFreqTweets.append(int(row[4]))\n",
404 | " wordsList = processedTweet.split()\n",
405 | " \n",
406 | " # Remove stop words\n",
407 | " filtered_words = [word for word in wordsList if word not in stopwords.words('english')]\n",
408 | " filteredTweets = ' '.join(filtered_words)\n",
409 | " \n",
410 | " featureVector = objFilterStopWords.getFeatureVector(processedTweet, stopWords)\n",
411 | " \n",
412 | " geo_latitude.append(float(row[2]))\n",
413 | " geo_longitude.append(float(row[3]))\n",
414 | " \n",
415 | " blob = TextBlob(processedTweet)\n",
416 | " sentiment = 0\n",
417 | " for sentence in blob.sentences:\n",
418 | " sentiment += sentence.sentiment.polarity\n",
419 | "\n",
420 | " totSentiment = sentiment/ len(blob.sentences)\n",
421 | "\n",
422 | " XTrainSentiment.append(totSentiment)\n",
423 | "\n",
424 | " XTrainFeatures.append(filteredTweets)\n",
425 | " \n",
426 | " YTrain.append(personality)\n",
427 | " \n",
428 | "# i+=1\n",
429 | "# if i==3:\n",
430 | "# break\n",
431 | " \n",
432 | "\n",
433 | " return XTrain, YTrain, XTrainFeatures, XTrainSentiment, XTrainFreqTweets, geo_latitude, geo_longitude\n"
434 | ]
435 | },
436 | {
437 | "cell_type": "code",
438 | "execution_count": 20,
439 | "metadata": {
440 | "collapsed": false
441 | },
442 | "outputs": [],
443 | "source": [
444 | "objFeatureEngineering = FeatureEngineering()\n",
445 | "fileName = 'TwitterData/survey_dump_with_tweet_count'\n",
446 | "XTrain, YTrain, XTrainFeatures, XTrainSentiment, XTrainFreqTweets, geo_latitude, geo_longitude = objFeatureEngineering.createNewTrainingSet(fileName)"
447 | ]
448 | },
449 | {
450 | "cell_type": "code",
451 | "execution_count": 21,
452 | "metadata": {
453 | "collapsed": false
454 | },
455 | "outputs": [],
456 | "source": [
457 | "fileName = 'TwitterData/survey_dump_geo_gt_8_1'\n",
458 | "XEval, YEval, XEvalFeatures, XEvalSentiment, XEvalFreqTweets, eval_geo_latitude, eval_geo_longitude = objFeatureEngineering.createNewTrainingSet(fileName)"
459 | ]
460 | },
461 | {
462 | "cell_type": "markdown",
463 | "metadata": {
464 | "collapsed": true
465 | },
466 | "source": [
467 | "### Get Feature vector"
468 | ]
469 | },
470 | {
471 | "cell_type": "code",
472 | "execution_count": 22,
473 | "metadata": {
474 | "collapsed": false
475 | },
476 | "outputs": [],
477 | "source": [
478 | "\n",
479 | "# # from PreprocessTweets import PreprocessTweets\n",
480 | "# # from FilterStopWords import FilterStopWords\n",
481 | "# # from FeatureEngineering import FeatureEngineering\n",
482 | "# import nltk\n",
483 | "\n",
484 | "\n",
485 | "# objFilterStopWords = FilterStopWords()\n",
486 | "# objPreprocessTweets = PreprocessTweets()\n",
487 | "# objFeatureEngineering = FeatureEngineering()\n",
488 | "\n",
489 | "# #trainingSet = objFeatureEngineering.createTrainingSet()\n",
490 | "# trainingSet = objFeatureEngineering.createNewTrainingSet()\n",
491 | "\n",
492 | "# stopWordListFileName = 'TwitterData/StopWords.txt'\n",
493 | "# stopWords = objFilterStopWords.getStopWordList(stopWordListFileName)\n",
494 | "\n",
495 | "# # Train the classifier\n",
496 | "# NBClassifier = nltk.NaiveBayesClassifier.train(trainingSet)\n",
497 | "\n",
498 | "# # Test the classifier\n",
499 | "# testTweet = 'Hurray, I am working on a project on personality prediction on twitter data using sentiment analysis!'\n",
500 | "# processedTestTweet = objPreprocessTweets.processTweet(testTweet)\n",
501 | "# featureVector = objFilterStopWords.getFeatureVector(processedTestTweet, stopWords)\n",
502 | "# # print NBClassifier.classify(objFeatureEngineering.extract_features(featureVector))\n",
503 | "\n",
504 | "\n",
505 | "# # # print informative features about the classifier\n",
506 | "# # print NBClassifier.show_most_informative_features(10)\n",
507 | "\n",
508 | "\n",
509 | "# # testTweet = 'I have successfully completed this project.'\n",
510 | "# # processedTestTweet = objPreprocessTweets.processTweet(testTweet)\n",
511 | "# # featureVector = objFilterStopWords.getFeatureVector(processedTestTweet, stopWords)\n",
512 | "# # print NBClassifier.classify(objFeatureEngineering.extract_features(featureVector))\n"
513 | ]
514 | },
515 | {
516 | "cell_type": "code",
517 | "execution_count": 23,
518 | "metadata": {
519 | "collapsed": false
520 | },
521 | "outputs": [],
522 | "source": [
523 | "newYTrain = []\n",
524 | "# print YTrain\n",
525 | "for item in YTrain:\n",
526 | " temp = item.replace('[', '')\n",
527 | " temp = temp.replace('\\\"', '')\n",
528 | " newItem = temp.replace(']', '')\n",
529 | " newYTrain.append(newItem)\n",
530 | " \n",
531 | "YTrain = newYTrain\n",
532 | "# print YTrain\n",
533 | "# print XTrainFeatures[0]"
534 | ]
535 | },
536 | {
537 | "cell_type": "markdown",
538 | "metadata": {},
539 | "source": [
540 | "### Map the class labels to numbers"
541 | ]
542 | },
543 | {
544 | "cell_type": "code",
545 | "execution_count": 24,
546 | "metadata": {
547 | "collapsed": false
548 | },
549 | "outputs": [],
550 | "source": [
551 | "\n",
552 | "def mapLabels(className):\n",
553 | " if className == 'Conscientiousness':\n",
554 | " return 0\n",
555 | " elif className == 'Extrovert':\n",
556 | " return 1\n",
557 | " elif className == 'Agreeable':\n",
558 | " return 2\n",
559 | " elif className == 'Empathetic':\n",
560 | " return 3\n",
561 | " elif className == 'Novelty Seeking':\n",
562 | " return 4\n",
563 | " elif className == 'Perfectionist':\n",
564 | " return 5\n",
565 | " elif className == 'Rigid':\n",
566 | " return 6\n",
567 | " elif className == 'Impulsive':\n",
568 | " return 7\n",
569 | " elif className == 'Psychopath':\n",
570 | " return 8\n",
571 | " elif className == 'Obsessive':\n",
572 | " return 9\n",
573 | "# elif className == None:\n",
574 | "# return 10\n",
575 | " else:\n",
576 | " pass\n",
577 | "\n",
578 | "YTrain = [mapLabels(x) for x in YTrain]\n",
579 | "YEval = [mapLabels(x) for x in YEval]"
580 | ]
581 | },
582 | {
583 | "cell_type": "code",
584 | "execution_count": 25,
585 | "metadata": {
586 | "collapsed": false
587 | },
588 | "outputs": [
589 | {
590 | "name": "stdout",
591 | "output_type": "stream",
592 | "text": [
593 | "[None, None, None, None]\n"
594 | ]
595 | }
596 | ],
597 | "source": [
598 | "print YEval[1:5]"
599 | ]
600 | },
601 | {
602 | "cell_type": "code",
603 | "execution_count": 26,
604 | "metadata": {
605 | "collapsed": false
606 | },
607 | "outputs": [
608 | {
609 | "name": "stdout",
610 | "output_type": "stream",
611 | "text": [
612 | "79\n",
613 | "79\n",
614 | "{\"2 please\",\" AT_USER 1dmoviepremiere today! :d :d\",\"AT_USER getting follow would blessing :) love moon back <3 j\",\"AT_USER getting follow would blessing :) love moon back <3 k\",\"AT_USER thank bby :))\",\"AT_USER getting follow would blessing :) love moon back <3 37\",\"AT_USER getting follow would blessing :) love moon back <3 40\",\"AT_USER getting follow would blessing :) love moon back <3 30\",\"AT_USER idk it's little awkward tbh\",\"AT_USER *google images* big booty hoes\",\"AT_USER afternoon good sir\",\"AT_USER think ketchup enjoy it\",\"AT_USER heart aw aw aw\",\"AT_USER ugh haven't heard yet :(\",\"AT_USER getting follow would blessing :) love moon back <3 69\",\"AT_USER can't wait till come back tour us :))\",\" please follow fuckers love much AT_USER AT_USER AT_USER AT_USER 35\",\"calum hood stop dick follow please\",\"uk x factor different compared us version like it's much better lol\",\"AT_USER he's jealous\",\"AT_USER cause anxiety\",\"AT_USER you're angel sahar lol ily <3 liking new @ btw\",\"AT_USER hey calum see follow dont ignore please ily 127\",\"AT_USER dogs cats?\",\"AT_USER youtuber....\",\"AT_USER you're one us\",\"lol icon me\",\"AT_USER hi how's recording?\",\"AT_USER hi calum! hope wonderful day you're staying healthy :) please follow me, love know 8\",\"idk still \\\"what's goodie\\ ,\"AT_USER hey fuck you\",\" AT_USER please follow me! love much <33 t\",\"AT_USER lol caused enough drama flirting shit us\",\"AT_USER actually think bendy before?\",\" AT_USER we're late guys lol suck hold would've though\",\"this follow party getting hand\",\"AT_USER it's late try get follows, wait till tomorrow afternoon man\",\"AT_USER holy crap, you're gonna peel much\",\"AT_USER join marijuana movement, it's joint effort ha that's funny follow pls ily \",\"AT_USER bet could totally dude!! give twitter name stuff hopefully they'll follow you!\",\"he's holding balls\",\"AT_USER skype part hahaha i'm crying\",\" AT_USER AT_USER AT_USER AT_USER follow motherfuckers ily i'm 8\",\"ill keep eyes wide opennn\",\" AT_USER pineapple cool refreshing drink inside. turn up\",\"never forget calum's lip piercing\",\"AT_USER hey calum see follow dont dick please ily 307\",\"being added list one best feelings ( ) ily\",\"AT_USER mean better first one claps \",\"AT_USER meow bitch\",\"AT_USER hey calum see follow dont dick 218\",\"AT_USER hey calum see follow dont dick please 379\",\"i can't even watch damn hug without wanting stab throat\",\"AT_USER what's favorite thing bus?\",\"AT_USER god bless america\",\"AT_USER man things\",\"the live tweets boys friends best ones\",\"AT_USER can't hahaa\",\"AT_USER love much don't even know. thank much everything <3 goodnight boo\",\"i wonder shirt ashton ripped...\",\"AT_USER ...they already did....\",\"i'm uncomfortable luke creeps see holla me? maybe follow me? idk choice\",\"AT_USER wow you're gonna confused lol\",\"AT_USER AT_USER gonna buy strippers?\",\" AT_USER hi bby! please follow me! love much <33 138\",\" AT_USER hi bby! please follow me! love much <33 106\"}\n",
615 | "8\n",
616 | "{\"i wanna see cousin today dont feel like going mondawmin man \",\" AT_USER yo wanted smack shit freshman today yo. !!!!!!\",\" AT_USER vma's sunday!!! \",\"ya bitches asking asking nigga do\",\" AT_USER ain't get locked summer im proud \",\"AT_USER lol could've said thanks & tell ya name then.\",\" AT_USER joy luck club boringgggg! !!!\",\"AT_USER omg, you're going too?!\",\"ima things right time ya, born ride \",\"AT_USER AT_USER lmfao go 'head & get fcked shit.\",\"i wanna go kona grill.\",\"AT_USER sorry ya lost, keep ya head up.\",\"i cant stop smiling \",\"yo really know every vine \",\"pussy, money, weed got lat...steal ya girl call hijack\",\" AT_USER don't know wear tomorrow . goin? lol\",\"damn. ima bitch \",\"AT_USER get hair done got that\",\" AT_USER smiley fucked fucked way \",\"single & nobody attention.\",\"morning store walk..lol\",\" AT_USER could really go pizza & wings right now! !!!!\",\" AT_USER AT_USER AT_USER got lied shit crazy \",\"bad azz nigga. coming home soon nigga.\",\"yall using videos sluts.\",\"i videos doe \",\"i guess means nie go sleep lol\",\"man, miss dad everyday.\",\"worldwide coast coast love get.\",\" AT_USER AT_USER lol fine fine. lol\",\"AT_USER need ya help \",\"i really looking forward going mall\",\"AT_USER dont that? lol\",\"the closer get \",\"lol done got dropped steps,tvs fell head everything else thats act act.\",\"im like meek never sleep\",\"i feel terrible.\",\" AT_USER dfl nie flashy avi lmao already man.\",\"that's lor corey running niggas try fight \",\"va life gonna good \",\" AT_USER can't wait go home \",\"AT_USER lee coming tonight lol\",\" AT_USER AT_USER lol rude eat 20 piece mcdouble hungry ass lmao thats somebody hungry\",\"AT_USER really dont lol right, whatever \",\"AT_USER what?\",\"so ajayasia like nah? lol\",\"AT_USER picture lmao\",\"AT_USER lol smart?\",\" AT_USER bitchesssssssssss get rings next school year yaaasss lmfao remember said that?!\",\"when finally find somebody >\",\" AT_USER yup white-tee \",\"AT_USER text eddie \",\"my phone dry \",\"AT_USER goodmorning \",\" AT_USER AT_USER nah hate guess thats cool \",\"i bust nut im back thinking money\",\"he really said girls trynna freak says, \\\"wanna ride face use eats handle bars?\\\" like ?!\",\" AT_USER never apologize feel. that's like saying sorry real \",\"i wasn't even sleep long felt like forever \",\"i wanna love .. \",\" AT_USER really hate people . \"}\n",
617 | "7\n"
618 | ]
619 | }
620 | ],
621 | "source": [
622 | "XTrain = np.array(XTrainFeatures)\n",
623 | "YTrain = np.array(YTrain)\n",
624 | "\n",
625 | "print len(XTrain)\n",
626 | "print len(YTrain)\n",
627 | "\n",
628 | "print XTrain[1]\n",
629 | "print YTrain[1]\n",
630 | "\n",
631 | "print XTrain[15]\n",
632 | "print YTrain[15]\n",
633 | "\n",
634 | "XEval = np.array(XEvalFeatures)\n",
635 | "YEval = np.array(YEval)"
636 | ]
637 | },
638 | {
639 | "cell_type": "markdown",
640 | "metadata": {
641 | "collapsed": true
642 | },
643 | "source": [
644 | "### Split Train and Test data"
645 | ]
646 | },
647 | {
648 | "cell_type": "code",
649 | "execution_count": 27,
650 | "metadata": {
651 | "collapsed": false
652 | },
653 | "outputs": [
654 | {
655 | "name": "stdout",
656 | "output_type": "stream",
657 | "text": [
658 | "60\n",
659 | "19\n",
660 | "79\n"
661 | ]
662 | }
663 | ],
664 | "source": [
665 | "trainSamples = XTrain[0:60]\n",
666 | "YtrainSamples = YTrain[0:60]\n",
667 | "\n",
668 | "testSamples = XTrain[60:]\n",
669 | "YtestSamples = YTrain[60:]\n",
670 | "\n",
671 | "print len(trainSamples)\n",
672 | "print len(testSamples)\n",
673 | "\n",
674 | "# print XTrain[60:63]\n",
675 | "print len(XTrain)\n",
676 | "\n",
677 | "\n",
678 | "trainSentimentSamples = np.array(XTrainSentiment[0:60])\n",
679 | "testSentimentSamples = np.array(XTrainSentiment[60:])\n",
680 | "trainFreqTweetSamples = np.array(XTrainFreqTweets[0:60])\n",
681 | "testFreqTweetSamples = np.array(XTrainFreqTweets[60:])"
682 | ]
683 | },
684 | {
685 | "cell_type": "markdown",
686 | "metadata": {},
687 | "source": [
688 | "### Bag of Words as Features"
689 | ]
690 | },
691 | {
692 | "cell_type": "code",
693 | "execution_count": 28,
694 | "metadata": {
695 | "collapsed": false
696 | },
697 | "outputs": [
698 | {
699 | "name": "stdout",
700 | "output_type": "stream",
701 | "text": [
702 | "4914\n",
703 | "(60, 4914)\n",
704 | "(19, 4914)\n"
705 | ]
706 | }
707 | ],
708 | "source": [
709 | "from sklearn.feature_extraction.text import CountVectorizer\n",
710 | "vectorizer = CountVectorizer()\n",
711 | "XTr = vectorizer.fit_transform(trainSamples)\n",
712 | "print len(vectorizer.get_feature_names())\n",
713 | "trainBagVector = XTr.toarray()\n",
714 | "print trainBagVector.shape\n",
715 | "XTe = vectorizer.transform(testSamples)\n",
716 | "testBagVector = XTe.toarray()\n",
717 | "print testBagVector.shape"
718 | ]
719 | },
720 | {
721 | "cell_type": "code",
722 | "execution_count": 29,
723 | "metadata": {
724 | "collapsed": false
725 | },
726 | "outputs": [
727 | {
728 | "name": "stdout",
729 | "output_type": "stream",
730 | "text": [
731 | "37012\n",
732 | "(3995, 37012)\n"
733 | ]
734 | }
735 | ],
736 | "source": [
737 | "XEv = vectorizer.fit_transform(XEval)\n",
738 | "print len(vectorizer.get_feature_names())\n",
739 | "evalBagVector = XEv.toarray()\n",
740 | "print evalBagVector.shape"
741 | ]
742 | },
743 | {
744 | "cell_type": "code",
745 | "execution_count": 30,
746 | "metadata": {
747 | "collapsed": false
748 | },
749 | "outputs": [
750 | {
751 | "name": "stdout",
752 | "output_type": "stream",
753 | "text": [
754 | "(3995, 4914)\n"
755 | ]
756 | }
757 | ],
758 | "source": [
759 | "evalBagVector = evalBagVector[:,0:4914]\n",
760 | "print evalBagVector.shape"
761 | ]
762 | },
763 | {
764 | "cell_type": "code",
765 | "execution_count": 31,
766 | "metadata": {
767 | "collapsed": false
768 | },
769 | "outputs": [],
770 | "source": [
771 | "# from sklearn.decomposition import PCA as sklearnPCA\n",
772 | "# sklearn_pca = sklearnPCA(n_components=4914)\n",
773 | "# evalBagVectorPCA = sklearn_pca.fit_transform(evalBagVector.T)\n",
774 | "# print evalBagVectorPCA.shape"
775 | ]
776 | },
777 | {
778 | "cell_type": "markdown",
779 | "metadata": {},
780 | "source": [
781 | "### TF-IDF"
782 | ]
783 | },
784 | {
785 | "cell_type": "code",
786 | "execution_count": 32,
787 | "metadata": {
788 | "collapsed": false
789 | },
790 | "outputs": [],
791 | "source": [
792 | "# trainBagVector = trainSamples\n",
793 | "# testBagVector = testSamples\n",
794 | "\n",
795 | "# from sklearn.feature_extraction.text import TfidfTransformer\n",
796 | "# transformer = TfidfTransformer()\n",
797 | "# # print transformer \n",
798 | "# tfidfTrain = transformer.fit_transform(trainBagVector)\n",
799 | "# tfidfTrain = tfidfTrain.toarray()\n",
800 | "# tfidfTest = transformer.fit_transform(testBagVector)\n",
801 | "# tfidfTest = tfidfTest.toarray()\n",
802 | "# print tfidfTrain.shape, tfidfTest.shape\n",
803 | "# print tfidfTrain[0]\n",
804 | "# print tfidfTest[0]"
805 | ]
806 | },
807 | {
808 | "cell_type": "code",
809 | "execution_count": 33,
810 | "metadata": {
811 | "collapsed": false
812 | },
813 | "outputs": [],
814 | "source": [
815 | "# f=open(\"trainBagVector.txt\",'w')\n",
816 | "# f.write(trainBagVector)\n",
817 | "# np.savetxt(\"trainBagVector.txt\",trainBagVector)"
818 | ]
819 | },
820 | {
821 | "cell_type": "markdown",
822 | "metadata": {},
823 | "source": [
824 | "### State Transitions"
825 | ]
826 | },
827 | {
828 | "cell_type": "code",
829 | "execution_count": 34,
830 | "metadata": {
831 | "collapsed": false
832 | },
833 | "outputs": [
834 | {
835 | "name": "stdout",
836 | "output_type": "stream",
837 | "text": [
838 | "37012 37012\n"
839 | ]
840 | }
841 | ],
842 | "source": [
843 | "stateDict = {}\n",
844 | "featureVectors = vectorizer.get_feature_names()\n",
845 | "for i in xrange(len(featureVectors)):\n",
846 | " stateDict[featureVectors[i]] = i+1\n",
847 | "print len(stateDict), len(featureVectors) #, stateDict"
848 | ]
849 | },
850 | {
851 | "cell_type": "code",
852 | "execution_count": 35,
853 | "metadata": {
854 | "collapsed": false
855 | },
856 | "outputs": [],
857 | "source": [
858 | "def createStateTransitionVector(categoricalState, stateDict, maxLength):\n",
859 | " if categoricalState:\n",
860 | " feature = []\n",
861 | " for state in categoricalState.split(' '):\n",
862 | " try:\n",
863 | " feature.append(stateDict[state.lower()])\n",
864 | " except KeyError:\n",
865 | " pass\n",
866 | "# print state\n",
867 | " if len(feature) != maxLength:\n",
868 | " for i in xrange(maxLength-len(feature)):\n",
869 | " feature.append(0)\n",
870 | " assert(len(feature)==maxLength)\n",
871 | " return feature\n",
872 | " else:\n",
873 | " return [0] * maxLength"
874 | ]
875 | },
876 | {
877 | "cell_type": "code",
878 | "execution_count": 36,
879 | "metadata": {
880 | "collapsed": true
881 | },
882 | "outputs": [],
883 | "source": [
884 | "def createStateVectors(XStates, stateDict, maxLength):\n",
885 | " XFeatures = []\n",
886 | " for state in XStates:\n",
887 | " XFeatures.append(createStateTransitionVector(state, stateDict, maxLength))\n",
888 | " return XFeatures"
889 | ]
890 | },
891 | {
892 | "cell_type": "code",
893 | "execution_count": 37,
894 | "metadata": {
895 | "collapsed": false
896 | },
897 | "outputs": [],
898 | "source": [
899 | "trainStateTransitionVector = createStateVectors(trainSamples, stateDict,9353)\n",
900 | "testStateTransitionVector = createStateVectors(testSamples, stateDict,9353)\n",
901 | "# print trainStateTransitionVector[:2], testStateTransitionVector[:2]"
902 | ]
903 | },
904 | {
905 | "cell_type": "code",
906 | "execution_count": 38,
907 | "metadata": {
908 | "collapsed": false
909 | },
910 | "outputs": [
911 | {
912 | "name": "stdout",
913 | "output_type": "stream",
914 | "text": [
915 | "9353\n",
916 | "9353\n"
917 | ]
918 | }
919 | ],
920 | "source": [
921 | "print max([len(i) for i in trainStateTransitionVector])\n",
922 | "print max([len(i) for i in testStateTransitionVector])"
923 | ]
924 | },
925 | {
926 | "cell_type": "markdown",
927 | "metadata": {},
928 | "source": [
929 | "### N Grams as features"
930 | ]
931 | },
932 | {
933 | "cell_type": "code",
934 | "execution_count": 39,
935 | "metadata": {
936 | "collapsed": false
937 | },
938 | "outputs": [
939 | {
940 | "name": "stdout",
941 | "output_type": "stream",
942 | "text": [
943 | "CountVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',\n",
944 | " dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',\n",
945 | " lowercase=True, max_df=1.0, max_features=None, min_df=1,\n",
946 | " ngram_range=(1, 3), preprocessor=None, stop_words=None,\n",
947 | " strip_accents=None, token_pattern=u'(?u)\\\\b\\\\w\\\\w+\\\\b',\n",
948 | " tokenizer=None, vocabulary=None)\n",
949 | "44678\n",
950 | "(60, 44678)\n",
951 | "(19, 44678)\n"
952 | ]
953 | }
954 | ],
955 | "source": [
956 | "import scipy as sp\n",
957 | "noNGram = 3\n",
958 | "vectorizerNGram = CountVectorizer(ngram_range=(1, noNGram))\n",
959 | "XTrainNGram = vectorizerNGram.fit_transform(trainSamples)\n",
960 | "\n",
961 | "print vectorizerNGram\n",
962 | "\n",
963 | "\n",
964 | "print len(vectorizerNGram.get_feature_names())\n",
965 | "trainNGramVector = XTrainNGram.toarray()\n",
966 | "print trainNGramVector.shape\n",
967 | "XTestNGram = vectorizerNGram.transform(testSamples)\n",
968 | "testNGramVector = XTestNGram.toarray()\n",
969 | "print testNGramVector.shape"
970 | ]
971 | },
972 | {
973 | "cell_type": "code",
974 | "execution_count": 40,
975 | "metadata": {
976 | "collapsed": false
977 | },
978 | "outputs": [],
979 | "source": [
980 | "# from helper import *\n",
981 | "# import utilities\n",
982 | "# from utilities import build_matrices\n",
983 | "\n",
984 | "# (matrix_train, matrix_eval, ngram_list) = build_matrices(max_ngram_length=3)\n",
985 | "# len(ngram_list)\n",
986 | "# matrix_train = sp.sparse.csr_matrix(matrix_train)\n",
987 | "# matrix_eval = sp.sparse.csr_matrix(matrix_test)"
988 | ]
989 | },
990 | {
991 | "cell_type": "code",
992 | "execution_count": 41,
993 | "metadata": {
994 | "collapsed": false
995 | },
996 | "outputs": [],
997 | "source": [
998 | "# noNGram = 3\n",
999 | "# vectorizerNGram = CountVectorizer(ngram_range=(1, noNGram))\n",
1000 | "# XEvalNGram = vectorizerNGram.fit_transform(XEval)\n",
1001 | "# print vectorizerNGram\n",
1002 | "\n",
1003 | "# print len(vectorizerNGram.get_feature_names())\n",
1004 | "# evalNGramVector = XEvalNGram.toarray()\n",
1005 | "# print evalNGramVector.shape\n",
1006 | "\n",
1007 | "# matrix_eval = sp.sparse.csr_matrix(evalNGramVector)"
1008 | ]
1009 | },
1010 | {
1011 | "cell_type": "markdown",
1012 | "metadata": {},
1013 | "source": [
1014 | "### Stack or concatenate all features together"
1015 | ]
1016 | },
1017 | {
1018 | "cell_type": "code",
1019 | "execution_count": 42,
1020 | "metadata": {
1021 | "collapsed": false
1022 | },
1023 | "outputs": [
1024 | {
1025 | "name": "stdout",
1026 | "output_type": "stream",
1027 | "text": [
1028 | "(60, 4914)\n",
1029 | "(60,)\n",
1030 | "(60, 4915)\n",
1031 | "(19, 4915)\n",
1032 | "(60, 4916)\n"
1033 | ]
1034 | }
1035 | ],
1036 | "source": [
1037 | "XTrainWordFeatures = trainBagVector #trainNGramVector\n",
1038 | "print XTrainWordFeatures.shape\n",
1039 | "print trainSentimentSamples.shape\n",
1040 | "\n",
1041 | "temp = np.column_stack((XTrainWordFeatures, trainSentimentSamples))\n",
1042 | "print temp.shape\n",
1043 | "XTrainAllFeatures = np.column_stack((temp, trainFreqTweetSamples))\n",
1044 | "\n",
1045 | "\n",
1046 | "XTestWordFeatures = testBagVector #testNGramVector\n",
1047 | "temp = np.column_stack((XTestWordFeatures, testSentimentSamples))\n",
1048 | "print temp.shape\n",
1049 | "XTestAllFeatures = np.column_stack((temp, testFreqTweetSamples))\n",
1050 | "\n",
1051 | "\n",
1052 | "print XTrainAllFeatures.shape"
1053 | ]
1054 | },
1055 | {
1056 | "cell_type": "code",
1057 | "execution_count": 43,
1058 | "metadata": {
1059 | "collapsed": false
1060 | },
1061 | "outputs": [
1062 | {
1063 | "name": "stdout",
1064 | "output_type": "stream",
1065 | "text": [
1066 | "(3995, 4916)\n"
1067 | ]
1068 | }
1069 | ],
1070 | "source": [
1071 | "# XEvalWordFeatures = evalBagVector #evalNGramVector\n",
1072 | "# temp = np.column_stack((XEvalWordFeatures, XEvalSentiment))\n",
1073 | "XEvalAllFeatures = np.column_stack((np.column_stack((evalBagVector, XEvalSentiment)), XEvalFreqTweets))\n",
1074 | "\n",
1075 | "print XEvalAllFeatures.shape"
1076 | ]
1077 | },
1078 | {
1079 | "cell_type": "markdown",
1080 | "metadata": {},
1081 | "source": [
1082 | "### Write Predicted Output Labels to File"
1083 | ]
1084 | },
1085 | {
1086 | "cell_type": "code",
1087 | "execution_count": 44,
1088 | "metadata": {
1089 | "collapsed": true
1090 | },
1091 | "outputs": [],
1092 | "source": [
1093 | "def writePredictedLabelFile(YPred):\n",
1094 | " f = open(\"Predictions.csv\",\"w\")\n",
1095 | " f.write(\"Id,Label\" + \"\\n\")\n",
1096 | " for i in xrange(len(YPred)):\n",
1097 | " f.write(str(i) + \",\" + str(int(YPred[i]))+ \"\\n\")\n",
1098 | " f.close()"
1099 | ]
1100 | },
1101 | {
1102 | "cell_type": "markdown",
1103 | "metadata": {
1104 | "collapsed": true
1105 | },
1106 | "source": [
1107 | "### Classifiers"
1108 | ]
1109 | },
1110 | {
1111 | "cell_type": "code",
1112 | "execution_count": 45,
1113 | "metadata": {
1114 | "collapsed": true
1115 | },
1116 | "outputs": [],
1117 | "source": [
1118 | "# Random Forest Classifier\n",
1119 | "from sklearn.ensemble import RandomForestClassifier\n",
1120 | "# def classifyRandomForestClassifier(XTrain, XTest, YTrain, YTest,trees=100,crit='gini'):\n",
1121 | "def classifyRandomForestClassifier(XTrain, XTest, YTrain, YTest, params):\n",
1122 | " trees = params['trees']\n",
1123 | " crit = params['criterion']\n",
1124 | " seed = params['random_state']\n",
1125 | " clf = RandomForestClassifier(n_estimators=trees,criterion=crit,random_state=seed)\n",
1126 | " clf.fit(XTrain, YTrain)\n",
1127 | " YPred = clf.predict(XTest)\n",
1128 | " diff = YPred - YTest\n",
1129 | " score = diff[diff == 0].size\n",
1130 | " return (100.0 * score)/(YPred.size)"
1131 | ]
1132 | },
1133 | {
1134 | "cell_type": "code",
1135 | "execution_count": 46,
1136 | "metadata": {
1137 | "collapsed": true
1138 | },
1139 | "outputs": [],
1140 | "source": [
1141 | "#Multi Class SVM\n",
1142 | "from sklearn import svm\n",
1143 | "def classifyMultiClassSVMClassifier(XTrain, XTest, YTrain, YTest, params):\n",
1144 | " ker = params['kernel']\n",
1145 | " YPred = svm.SVC(kernel=ker).fit(XTrain, YTrain).predict(XTest)\n",
1146 | " diff = YPred - YTest\n",
1147 | " score = diff[diff == 0].size\n",
1148 | " return (100.0 * score)/(YPred.size)"
1149 | ]
1150 | },
1151 | {
1152 | "cell_type": "code",
1153 | "execution_count": 47,
1154 | "metadata": {
1155 | "collapsed": true
1156 | },
1157 | "outputs": [],
1158 | "source": [
1159 | "#K Nearest Neighbours Classifier\n",
1160 | "from sklearn.neighbors import KNeighborsClassifier\n",
1161 | "def classifyKNNClassifier(XTrain, XTest, YTrain, YTest, params):\n",
1162 | "# print XTrain.shape, XTest.shape\n",
1163 | " neighbours = params['neighbours']\n",
1164 | " neigh = KNeighborsClassifier(n_neighbors=neighbours)\n",
1165 | " YPred = neigh.fit(XTrain, YTrain).predict(XTest)\n",
1166 | " diff = YPred - YTest\n",
1167 | " score = diff[diff == 0].size\n",
1168 | " return (100.0 * score)/(YPred.size)"
1169 | ]
1170 | },
1171 | {
1172 | "cell_type": "code",
1173 | "execution_count": 48,
1174 | "metadata": {
1175 | "collapsed": false
1176 | },
1177 | "outputs": [],
1178 | "source": [
1179 | "# Logistic Regression\n",
1180 | "from sklearn import linear_model\n",
1181 | "def classifyLogisticRegression(XTrain, XTest, YTrain, YTest, params):\n",
1182 | " LogReg = linear_model.LogisticRegression(penalty='l2', dual=False, tol=0.0001, C=1.0, fit_intercept=True, intercept_scaling=1, class_weight=None, random_state=None)\n",
1183 | " LogReg.fit(XTrain, YTrain)\n",
1184 | " # Finds the optimal model parameters using a least squares method.\n",
1185 | " # To get the parameter values:\n",
1186 | " # LogReg.get_params()\n",
1187 | " # To predict a new input XTest,\n",
1188 | " YPred = LogReg.predict(XTest)\n",
1189 | " diff = YPred - YTest\n",
1190 | " score = diff[diff == 0].size\n",
1191 | " return (100.0 * score)/(YPred.size)"
1192 | ]
1193 | },
1194 | {
1195 | "cell_type": "code",
1196 | "execution_count": 49,
1197 | "metadata": {
1198 | "collapsed": true
1199 | },
1200 | "outputs": [],
1201 | "source": [
1202 | "# AdaBoost Classifier\n",
1203 | "from sklearn.ensemble import AdaBoostClassifier\n",
1204 | "from sklearn.tree import DecisionTreeClassifier\n",
1205 | "def classifyAdaboostClassifier(XTrain, XTest, YTrain, YTest, params):\n",
1206 | " depth = params['max_depth']\n",
1207 | " algo = params['algorithm']\n",
1208 | " estimators = params['n_estimators']\n",
1209 | " \n",
1210 | " # Create and fit an AdaBoosted decision tree\n",
1211 | " bdt = AdaBoostClassifier(DecisionTreeClassifier(max_depth = depth),\n",
1212 | " algorithm = algo,\n",
1213 | " n_estimators=estimators)\n",
1214 | "\n",
1215 | " bdt.fit(XTrain, YTrain)\n",
1216 | " YPred = bdt.predict(XTest)\n",
1217 | "\n",
1218 | " diff = YPred - YTest\n",
1219 | " score = diff[diff == 0].size\n",
1220 | " return (100.0 * score)/(YPred.size)"
1221 | ]
1222 | },
1223 | {
1224 | "cell_type": "code",
1225 | "execution_count": 50,
1226 | "metadata": {
1227 | "collapsed": true
1228 | },
1229 | "outputs": [],
1230 | "source": [
1231 | "# Neural Networks\n",
1232 | "try:\n",
1233 | " from sknn.mlp import Classifier, Layer\n",
1234 | "except ImportError:\n",
1235 | " print 'Please install scikit-neuralnetwork(pip install scikit-neuralnetwork)'\n",
1236 | "\n",
1237 | "def classifyNeuralNetworkClassifier(XTrain, XTest, YTrain, YTest, params):\n",
1238 | " activation = params['activation']\n",
1239 | " actLastLayer = params['actLastLayer']\n",
1240 | " rule = params['rule']\n",
1241 | " noOfUnits = params['units']\n",
1242 | " rate = params['rate']\n",
1243 | " noOfIter = params['iter']\n",
1244 | " nn = Classifier(layers=[Layer(activation, units=noOfUnits),Layer(actLastLayer)], learning_rule=rule,\n",
1245 | " learning_rate=0.02,\n",
1246 | " n_iter=10)\n",
1247 | " nn.fit(XTrain, YTrain)\n",
1248 | " YPred = nn.predict(XTest)\n",
1249 | " diff = YPred - YTest.reshape(YPred.shape)\n",
1250 | " score = diff[diff == 0].size\n",
1251 | " score = (100.0 * score)/(YPred.size)\n",
1252 | " return score"
1253 | ]
1254 | },
1255 | {
1256 | "cell_type": "markdown",
1257 | "metadata": {},
1258 | "source": [
1259 | "### Stratified K Fold Cross Validation"
1260 | ]
1261 | },
1262 | {
1263 | "cell_type": "code",
1264 | "execution_count": 51,
1265 | "metadata": {
1266 | "collapsed": true
1267 | },
1268 | "outputs": [],
1269 | "source": [
1270 | "from sklearn.cross_validation import StratifiedKFold\n",
1271 | "def stratifiedKFoldVal(XTrain, YTrain, classify, params):\n",
1272 | " n_folds = 5\n",
1273 | " score = 0.0\n",
1274 | " skf = StratifiedKFold(YTrain, n_folds)\n",
1275 | " try:\n",
1276 | " multi = params['multi']\n",
1277 | " except KeyError:\n",
1278 | " multi = False\n",
1279 | " for train_index, test_index in skf:\n",
1280 | " y_train, y_test = YTrain[train_index], YTrain[test_index]\n",
1281 | " if not multi:\n",
1282 | " X_train, X_test = XTrain[train_index], XTrain[test_index]\n",
1283 | " score += classify(X_train, X_test, y_train, y_test, params)\n",
1284 | " else:\n",
1285 | " X_train, X_test = [XTrain[i] for i in train_index], [XTrain[i] for i in test_index]\n",
1286 | " score += classify(np.array(X_train), np.array(X_test), y_train, y_test, params)\n",
1287 | " \n",
1288 | " return score/n_folds"
1289 | ]
1290 | },
1291 | {
1292 | "cell_type": "markdown",
1293 | "metadata": {},
1294 | "source": [
1295 | "### Normalisation of Feature Vectors"
1296 | ]
1297 | },
1298 | {
1299 | "cell_type": "code",
1300 | "execution_count": 52,
1301 | "metadata": {
1302 | "collapsed": false
1303 | },
1304 | "outputs": [],
1305 | "source": [
1306 | "from sklearn import preprocessing\n",
1307 | "def NormalizeVector(XTestFeatures,XTrainFeatures):\n",
1308 | " XTestFeaturesNorm = preprocessing.normalize(XTestFeatures, norm='l2')\n",
1309 | " XTrainFeaturesNorm = preprocessing.normalize(XTrainFeatures, norm='l2')\n",
1310 | " print XTrainFeaturesNorm.shape,XTestFeaturesNorm.shape\n",
1311 | "# print XTrainFeaturesNorm[0],XTestFeaturesNorm[0]\n",
1312 | " return XTrainFeaturesNorm, XTestFeaturesNorm"
1313 | ]
1314 | },
1315 | {
1316 | "cell_type": "markdown",
1317 | "metadata": {},
1318 | "source": [
1319 | "### Assign Train features for cross validation based on the feature encoding"
1320 | ]
1321 | },
1322 | {
1323 | "cell_type": "code",
1324 | "execution_count": 53,
1325 | "metadata": {
1326 | "collapsed": false
1327 | },
1328 | "outputs": [
1329 | {
1330 | "name": "stdout",
1331 | "output_type": "stream",
1332 | "text": [
1333 | "<type 'numpy.ndarray'> <type 'list'>\n",
1334 | "(60, 4916)\n",
1335 | "(60,)\n"
1336 | ]
1337 | }
1338 | ],
1339 | "source": [
1340 | "train = XTrainAllFeatures\n",
1341 | "# train = tfidfTrain\n",
1342 | "# train = trainStateTransitionVector\n",
1343 | "print type(trainBagVector), type(trainStateTransitionVector)\n",
1344 | "# train = []\n",
1345 | "# for i in xrange(len(trainBagVector)):\n",
1346 | "# train.append(trainBagVector[i]+trainStateTransitionVector[i])\n",
1347 | "# print len(train)\n",
1348 | "# train = np.hstack([tfidfTrain, np.array(trainStateTransitionVector)])\n",
1349 | "# train = np.hstack([trainBagVector, np.array(trainStateTransitionVector)])\n",
1350 | "\n",
1351 | "print train.shape\n",
1352 | "YTrain = YtrainSamples\n",
1353 | "print YTrain.shape\n",
1354 | "YTest = YtestSamples"
1355 | ]
1356 | },
1357 | {
1358 | "cell_type": "markdown",
1359 | "metadata": {},
1360 | "source": [
1361 | "### Selection of Nearest Neighbours for KNN"
1362 | ]
1363 | },
1364 | {
1365 | "cell_type": "code",
1366 | "execution_count": 54,
1367 | "metadata": {
1368 | "collapsed": false
1369 | },
1370 | "outputs": [],
1371 | "source": [
1372 | "# selectNeighbourScores = []\n",
1373 | "\n",
1374 | "# params = {'neighbours':2}\n",
1375 | "# score = stratifiedKFoldVal(train, YTrain, classifyKNNClassifier, params)\n",
1376 | "# print score\n",
1377 | "# selectNeighbourScores.append(score)\n",
1378 | "\n",
1379 | "# params = {'neighbours':3}\n",
1380 | "# score = stratifiedKFoldVal(train, YTrain, classifyKNNClassifier, params)\n",
1381 | "# print score\n",
1382 | "# selectNeighbourScores.append(score)\n",
1383 | "\n",
1384 | "# params = {'neighbours':4}\n",
1385 | "# score = stratifiedKFoldVal(train, YTrain, classifyKNNClassifier, params)\n",
1386 | "# print score\n",
1387 | "# selectNeighbourScores.append(score)\n",
1388 | "\n",
1389 | "# params = {'neighbours':5}\n",
1390 | "# score = stratifiedKFoldVal(train, YTrain, classifyKNNClassifier, params)\n",
1391 | "# print score\n",
1392 | "# selectNeighbourScores.append(score)\n",
1393 | "\n",
1394 | "# params = {'neighbours':10}\n",
1395 | "# score = stratifiedKFoldVal(train, YTrain, classifyKNNClassifier, params)\n",
1396 | "# print score\n",
1397 | "# selectNeighbourScores.append(score)\n",
1398 | "\n",
1399 | "# params = {'neighbours':25}\n",
1400 | "# score = stratifiedKFoldVal(train, YTrain, classifyKNNClassifier, params)\n",
1401 | "# print score\n",
1402 | "# selectNeighbourScores.append(score)\n",
1403 | "\n",
1404 | "# params = {'neighbours':40}\n",
1405 | "# score = stratifiedKFoldVal(train, YTrain, classifyKNNClassifier, params)\n",
1406 | "# print score\n",
1407 | "# selectNeighbourScores.append(score)\n",
1408 | "\n",
1409 | "# print selectNeighbourScores"
1410 | ]
1411 | },
1412 | {
1413 | "cell_type": "code",
1414 | "execution_count": 55,
1415 | "metadata": {
1416 | "collapsed": false
1417 | },
1418 | "outputs": [],
1419 | "source": [
1420 | "# #Plotting the results\n",
1421 | "# import matplotlib.pyplot as plt\n",
1422 | "# %matplotlib inline\n",
1423 | "# plt.plot(selectNeighbourScores, label = \"Neighbors in k-Nearest Neighbor (kNN) Classifier\")\n",
1424 | "# plt.title(\"Neighbors in k-Nearest Neighbor (kNN) Classifier\")\n",
1425 | "\n",
1426 | "# labels = [2,3,4,5,6,8,10]\n",
1427 | "# plt.xticks(np.arange(len(labels)), labels, rotation='horizontal')\n",
1428 | "# # plt.title(\"Optimal choice of Neighbors in k-Nearest Neighbor (kNN) Classifier\")\n",
1429 | "# plt.ylabel('Categorization Accuracy')\n",
1430 | "# plt.xlabel('No. of Neighbours')\n",
1431 | "# plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)\n",
1432 | "# plt.show()"
1433 | ]
1434 | },
1435 | {
1436 | "cell_type": "markdown",
1437 | "metadata": {},
1438 | "source": [
1439 | "#### Hence, we choose k = 25 for our nearest neighbor classifier."
1440 | ]
1441 | },
1442 | {
1443 | "cell_type": "code",
1444 | "execution_count": 56,
1445 | "metadata": {
1446 | "collapsed": false
1447 | },
1448 | "outputs": [
1449 | {
1450 | "name": "stdout",
1451 | "output_type": "stream",
1452 | "text": [
1453 | "19 60\n"
1454 | ]
1455 | }
1456 | ],
1457 | "source": [
1458 | "print len(testStateTransitionVector), len(trainStateTransitionVector)"
1459 | ]
1460 | },
1461 | {
1462 | "cell_type": "code",
1463 | "execution_count": 57,
1464 | "metadata": {
1465 | "collapsed": false
1466 | },
1467 | "outputs": [],
1468 | "source": [
1469 | "# train = np.hstack([XTrainAllFeatures, XTestAllFeatures])\n",
1470 | "train = XTrainAllFeatures\n",
1471 | "test = XEvalAllFeatures\n",
1472 | "params = {'neighbours':25}\n",
1473 | "neighbours = params['neighbours']\n",
1474 | "neigh = KNeighborsClassifier(n_neighbors=neighbours)\n",
1475 | "YPred = neigh.fit(train, YTrain).predict(test)"
1476 | ]
1477 | },
1478 | {
1479 | "cell_type": "code",
1480 | "execution_count": 64,
1481 | "metadata": {
1482 | "collapsed": false
1483 | },
1484 | "outputs": [
1485 | {
1486 | "name": "stdout",
1487 | "output_type": "stream",
1488 | "text": [
1489 | "[9 9 9 ..., 9 9 9]\n"
1490 | ]
1491 | }
1492 | ],
1493 | "source": [
1494 | "print YPred[2:3020]"
1495 | ]
1496 | },
1497 | {
1498 | "cell_type": "markdown",
1499 | "metadata": {},
1500 | "source": [
1501 | "### Selection of Parameters for Random Forest"
1502 | ]
1503 | },
1504 | {
1505 | "cell_type": "code",
1506 | "execution_count": 59,
1507 | "metadata": {
1508 | "collapsed": false
1509 | },
1510 | "outputs": [],
1511 | "source": [
1512 | "# selectRandomForestScores = []\n",
1513 | "\n",
1514 | "# params = {'trees':500, 'criterion':'entropy','random_state':1000}\n",
1515 | "# score = stratifiedKFoldVal(train, YTrain, classifyRandomForestClassifier, params)\n",
1516 | "# print score\n",
1517 | "# selectRandomForestScores.append(score)\n",
1518 | "\n",
1519 | "# params = {'trees':1000, 'criterion':'entropy','random_state':1000}\n",
1520 | "# score = stratifiedKFoldVal(train, YTrain, classifyRandomForestClassifier, params)\n",
1521 | "# print score\n",
1522 | "# selectRandomForestScores.append(score)\n",
1523 | "\n",
1524 | "# params = {'trees':500, 'criterion':'gini','random_state':1000}\n",
1525 | "# score = stratifiedKFoldVal(train, YTrain, classifyRandomForestClassifier, params)\n",
1526 | "# print score\n",
1527 | "# selectRandomForestScores.append(score)\n",
1528 | "\n",
1529 | "# params = {'trees':1000, 'criterion':'gini','random_state':1000}\n",
1530 | "# score = stratifiedKFoldVal(train, YTrain, classifyRandomForestClassifier, params)\n",
1531 | "# print score\n",
1532 | "# selectRandomForestScores.append(score)\n",
1533 | "\n",
1534 | "# print selectRandomForestScores"
1535 | ]
1536 | },
1537 | {
1538 | "cell_type": "code",
1539 | "execution_count": 60,
1540 | "metadata": {
1541 | "collapsed": false
1542 | },
1543 | "outputs": [],
1544 | "source": [
1545 | "# #Plotting the results\n",
1546 | "# import matplotlib.pyplot as plt\n",
1547 | "# %matplotlib inline\n",
1548 | "# plt.plot(selectRandomForestScores, label = \"Random Forest Classifier\")\n",
1549 | "# plt.title(\"Random Forest Classifier\")\n",
1550 | "\n",
1551 | "# labels = ['500 Trees + entropy', '1000 Trees + entropy', '500 Trees + gini', '1000 Trees + gini']\n",
1552 | "\n",
1553 | "# # You can specify a rotation for the tick labels in degrees or with keywords.\n",
1554 | "# plt.xticks(np.arange(len(labels)), labels, rotation='vertical')\n",
1555 | "\n",
1556 | "# plt.ylabel('Scores')\n",
1557 | "# plt.xlabel('Parameters')\n",
1558 | "# # Place a legend to the right of this smaller figure.\n",
1559 | "# # plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)\n",
1560 | "# plt.show()"
1561 | ]
1562 | },
1563 | {
1564 | "cell_type": "markdown",
1565 | "metadata": {},
1566 | "source": [
1567 | "#### Hence, we choose 1000 Trees + Gini as a criterion for our Random Forest classifier."
1568 | ]
1569 | },
1570 | {
1571 | "cell_type": "code",
1572 | "execution_count": 61,
1573 | "metadata": {
1574 | "collapsed": false
1575 | },
1576 | "outputs": [],
1577 | "source": [
1578 | "params = {'trees':150, 'criterion':'entropy','random_state':None}\n",
1579 | "trees = params['trees']\n",
1580 | "crit = params['criterion']\n",
1581 | "seed = params['random_state']\n",
1582 | "clf = RandomForestClassifier(n_estimators=trees,criterion=crit,random_state=seed)\n",
1583 | "clf.fit(train, YTrain)\n",
1584 | "YPred = clf.predict(test)"
1585 | ]
1586 | },
1587 | {
1588 | "cell_type": "markdown",
1589 | "metadata": {},
1590 | "source": [
1591 | "### Selection of Kernel for Multi Class SVM"
1592 | ]
1593 | },
1594 | {
1595 | "cell_type": "code",
1596 | "execution_count": null,
1597 | "metadata": {
1598 | "collapsed": false
1599 | },
1600 | "outputs": [],
1601 | "source": [
1602 | "# selectKernelScores = []\n",
1603 | "\n",
1604 | "# params = {'kernel':'poly'}\n",
1605 | "# score = stratifiedKFoldVal(train, YTrain, classifyMultiClassSVMClassifier, params)\n",
1606 | "# print score\n",
1607 | "# selectKernelScores.append(score)\n",
1608 | "\n",
1609 | "# params = {'kernel':'linear'}\n",
1610 | "# score = stratifiedKFoldVal(train, YTrain, classifyMultiClassSVMClassifier, params)\n",
1611 | "# print score\n",
1612 | "# selectKernelScores.append(score)\n",
1613 | "\n",
1614 | "# params = {'kernel':'rbf'}\n",
1615 | "# score = stratifiedKFoldVal(train, YTrain, classifyMultiClassSVMClassifier, params)\n",
1616 | "# print score\n",
1617 | "# selectKernelScores.append(score)"
1618 | ]
1619 | },
1620 | {
1621 | "cell_type": "code",
1622 | "execution_count": null,
1623 | "metadata": {
1624 | "collapsed": false
1625 | },
1626 | "outputs": [],
1627 | "source": [
1628 | "# #Plotting the results\n",
1629 | "# import matplotlib.pyplot as plt\n",
1630 | "# %matplotlib inline\n",
1631 | "# plt.plot(selectKernelScores, label = \"Multiclass SVM Classifier\")\n",
1632 | "\n",
1633 | "# labels = ['poly','linear','rbf']\n",
1634 | "# plt.title(\"Multiclass SVM Classifier\")\n",
1635 | "# # You can specify a rotation for the tick labels in degrees or with keywords.\n",
1636 | "# plt.xticks(np.arange(len(labels)), labels, rotation='horizontal')\n",
1637 | "\n",
1638 | "# plt.ylabel('Scores')\n",
1639 | "# plt.xlabel('Kernel used')\n",
1640 | "# # Place a legend to the right of this smaller figure.\n",
1641 | "# # plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)\n",
1642 | "# plt.show()"
1643 | ]
1644 | },
1645 | {
1646 | "cell_type": "markdown",
1647 | "metadata": {
1648 | "collapsed": true
1649 | },
1650 | "source": [
1651 | "#### Hence, we choose rbf for our SVM classifier."
1652 | ]
1653 | },
1654 | {
1655 | "cell_type": "code",
1656 | "execution_count": null,
1657 | "metadata": {
1658 | "collapsed": true
1659 | },
1660 | "outputs": [],
1661 | "source": [
1662 | "params = {'kernel':'rbf'}\n",
1663 | "ker = params['kernel']\n",
1664 | "YPred = svm.SVC(kernel=ker).fit(train, YTrain).predict(test)"
1665 | ]
1666 | },
1667 | {
1668 | "cell_type": "markdown",
1669 | "metadata": {},
1670 | "source": [
1671 | "### Logistic Regression"
1672 | ]
1673 | },
1674 | {
1675 | "cell_type": "code",
1676 | "execution_count": null,
1677 | "metadata": {
1678 | "collapsed": false
1679 | },
1680 | "outputs": [],
1681 | "source": [
1682 | "# # params = {'multi':False}\n",
1683 | "# # train = tfidfTrain\n",
1684 | "# # score = stratifiedKFoldVal(train, YTrain, classifyLogisticRegression, params)\n",
1685 | "# # print score\n",
1686 | "# train = XTrainAllFeatures\n",
1687 | "# score = stratifiedKFoldVal(train, YTrain, classifyLogisticRegression, params)\n",
1688 | "# print score"
1689 | ]
1690 | },
1691 | {
1692 | "cell_type": "code",
1693 | "execution_count": null,
1694 | "metadata": {
1695 | "collapsed": false
1696 | },
1697 | "outputs": [],
1698 | "source": [
1699 | "# params = {'multi':True}\n",
1700 | "# train = trainStateTransitionVector\n",
1701 | "# score = stratifiedKFoldVal(train, YTrain, classifyLogisticRegression, params)\n",
1702 | "# print score"
1703 | ]
1704 | },
1705 | {
1706 | "cell_type": "code",
1707 | "execution_count": null,
1708 | "metadata": {
1709 | "collapsed": true
1710 | },
1711 | "outputs": [],
1712 | "source": [
1713 | "# LogReg = linear_model.LogisticRegression(penalty='l2', dual=False, tol=0.0001, C=1.0, fit_intercept=True, intercept_scaling=1, class_weight=None, random_state=None)\n",
1714 | "# LogReg.fit(trainBagVector, YTrain)\n",
1715 | "# YPred = LogReg.predict(testBagVector)\n",
1716 | "# # writePredictedLabelFile(YPred)"
1717 | ]
1718 | },
1719 | {
1720 | "cell_type": "markdown",
1721 | "metadata": {},
1722 | "source": [
1723 | "### Define the parameters for Adaboost and use it on different training dataset"
1724 | ]
1725 | },
1726 | {
1727 | "cell_type": "code",
1728 | "execution_count": null,
1729 | "metadata": {
1730 | "collapsed": false
1731 | },
1732 | "outputs": [],
1733 | "source": [
1734 | "# train = XTrainAllFeatures\n",
1735 | "# params = {'max_depth':1, 'algorithm':'SAMME', 'n_estimators':200}\n",
1736 | "# score = stratifiedKFoldVal(train, YTrain, classifyAdaboostClassifier, params)\n",
1737 | "# print score"
1738 | ]
1739 | },
1740 | {
1741 | "cell_type": "code",
1742 | "execution_count": null,
1743 | "metadata": {
1744 | "collapsed": false
1745 | },
1746 | "outputs": [],
1747 | "source": [
1748 | "# train = XTrainAllFeatures\n",
1749 | "# params = {'max_depth':10, 'algorithm':'SAMME', 'n_estimators':500}\n",
1750 | "# score = stratifiedKFoldVal(train, YTrain, classifyAdaboostClassifier, params)\n",
1751 | "# print score"
1752 | ]
1753 | },
1754 | {
1755 | "cell_type": "code",
1756 | "execution_count": null,
1757 | "metadata": {
1758 | "collapsed": false
1759 | },
1760 | "outputs": [],
1761 | "source": [
1762 | "# # Submission\n",
1763 | "# params = {'max_depth':10, 'algorithm':'SAMME', 'n_estimators':500}\n",
1764 | "# train = tfidfTrain\n",
1765 | "# test = tfidfTest\n",
1766 | "# depth = params['max_depth']\n",
1767 | "# algo = params['algorithm']\n",
1768 | "# estimators = params['n_estimators']\n",
1769 | "\n",
1770 | "# # Create and fit an AdaBoosted decision tree\n",
1771 | "# bdt = AdaBoostClassifier(DecisionTreeClassifier(max_depth = depth),\n",
1772 | "# algorithm = algo,\n",
1773 | "# n_estimators=estimators)\n",
1774 | "\n",
1775 | "# bdt.fit(train, YTrain)\n",
1776 | "# YPred = bdt.predict(test)\n",
1777 | "# # writePredictedLabelFile(YPred)"
1778 | ]
1779 | },
1780 | {
1781 | "cell_type": "markdown",
1782 | "metadata": {},
1783 | "source": [
1784 | "### Selection of Parameters for Neural Networks"
1785 | ]
1786 | },
1787 | {
1788 | "cell_type": "code",
1789 | "execution_count": null,
1790 | "metadata": {
1791 | "collapsed": false
1792 | },
1793 | "outputs": [],
1794 | "source": [
1795 | "# train = XTrainAllFeatures\n",
1796 | "# # params = {'activation':'Rectifier', 'units':100, 'rate':0.02, 'iter':10}\n",
1797 | "# params = {'activation':'Tanh', 'actLastLayer':'Softmax', 'rule':'momentum', 'units':100, 'rate':0.002, 'iter':10}\n",
1798 | "# score = stratifiedKFoldVal(train, YTrain, classifyNeuralNetworkClassifier, params)\n",
1799 | "# print score"
1800 | ]
1801 | },
1802 | {
1803 | "cell_type": "code",
1804 | "execution_count": null,
1805 | "metadata": {
1806 | "collapsed": false
1807 | },
1808 | "outputs": [],
1809 | "source": [
1810 | "# train = XTrainAllFeatures\n",
1811 | "# # params = {'activation':'Rectifier', 'units':100, 'rate':0.02, 'iter':10}\n",
1812 | "# params = {'activation':'Tanh', 'actLastLayer':'Softmax', 'rule':'sgd', 'units':100, 'rate':0.002, 'iter':10}\n",
1813 | "# score = stratifiedKFoldVal(train, YTrain, classifyNeuralNetworkClassifier, params)\n",
1814 | "# print score"
1815 | ]
1816 | },
1817 | {
1818 | "cell_type": "code",
1819 | "execution_count": null,
1820 | "metadata": {
1821 | "collapsed": false
1822 | },
1823 | "outputs": [],
1824 | "source": [
1825 | "# train = XTrainAllFeatures\n",
1826 | "# params = {'activation':'Sigmoid', 'actLastLayer':'Softmax', 'rule':'rmsprop', 'units':100, 'rate':0.002, 'iter':10}\n",
1827 | "# score = stratifiedKFoldVal(train, YTrain, classifyNeuralNetworkClassifier, params)\n",
1828 | "# print score"
1829 | ]
1830 | },
1831 | {
1832 | "cell_type": "code",
1833 | "execution_count": null,
1834 | "metadata": {
1835 | "collapsed": true
1836 | },
1837 | "outputs": [],
1838 | "source": [
1839 | "# # Submission\n",
1840 | "# tr = trainBagVector\n",
1841 | "# te = testBagVector\n",
1842 | "# params = {'activation':'Tanh', 'actLastLayer':'Softmax', 'rule':'adagrad', 'units':100, 'rate':0.002, 'iter':10}\n",
1843 | "# activation = params['activation']\n",
1844 | "# actLastLayer = params['actLastLayer']\n",
1845 | "# rule = params['rule']\n",
1846 | "# noOfUnits = params['units']\n",
1847 | "# rate = params['rate']\n",
1848 | "# noOfIter = params['iter']\n",
1849 | "# nn = Classifier(layers=[Layer(activation, units=noOfUnits),Layer(actLastLayer)], learning_rule=rule,\n",
1850 | "# learning_rate=0.02,\n",
1851 | "# n_iter=10)\n",
1852 | "# nn.fit(tr, YTrain)\n",
1853 | "# YPred = nn.predict(te)\n",
1854 | "# # writePredictedLabelFile(YPred)"
1855 | ]
1856 | },
1857 | {
1858 | "cell_type": "markdown",
1859 | "metadata": {
1860 | "collapsed": true
1861 | },
1862 | "source": [
1863 | "### Get features in format for Models of NLTK Classify "
1864 | ]
1865 | },
1866 | {
1867 | "cell_type": "code",
1868 | "execution_count": null,
1869 | "metadata": {
1870 | "collapsed": true
1871 | },
1872 | "outputs": [],
1873 | "source": [
1874 | "def featNLTKClassify(samples, phase):\n",
1875 | " featureVectors = vectorizer.get_feature_names()\n",
1876 | " nltkClassifySamples = []\n",
1877 | "\n",
1878 | " for i in xrange(len(samples)):\n",
1879 | " t = samples[i]\n",
1880 | " lstFuncCalls = t.split()\n",
1881 | " wordOccDict = {}\n",
1882 | " for j in xrange(len(featureVectors)):\n",
1883 | " wordOccDict[featureVectors[j]] = lstFuncCalls.count(featureVectors[j])\n",
1884 | " if phase == 'train':\n",
1885 | " nltkClassifySamples.append((wordOccDict, YTrain[i]))\n",
1886 | " else:\n",
1887 | " nltkClassifySamples.append(wordOccDict)\n",
1888 | "\n",
1889 | " return nltkClassifySamples"
1890 | ]
1891 | },
1892 | {
1893 | "cell_type": "code",
1894 | "execution_count": null,
1895 | "metadata": {
1896 | "collapsed": true
1897 | },
1898 | "outputs": [],
1899 | "source": [
1900 | "# nltkClassifyTrain = featNLTKClassify(trainSamples, 'train')\n",
1901 | "# nltkClassifyTest = featNLTKClassify(testSamples, 'test')"
1902 | ]
1903 | },
1904 | {
1905 | "cell_type": "markdown",
1906 | "metadata": {},
1907 | "source": [
1908 | "### Nave Baiyes Classifier"
1909 | ]
1910 | },
1911 | {
1912 | "cell_type": "code",
1913 | "execution_count": null,
1914 | "metadata": {
1915 | "collapsed": true
1916 | },
1917 | "outputs": [],
1918 | "source": [
1919 | "# tr = nltkClassifyTrain\n",
1920 | "# te = nltkClassifyTest\n",
1921 | "# classifier = nltk.classify.NaiveBayesClassifier.train(tr)\n",
1922 | "# sorted(classifier.labels())"
1923 | ]
1924 | },
1925 | {
1926 | "cell_type": "code",
1927 | "execution_count": null,
1928 | "metadata": {
1929 | "collapsed": true
1930 | },
1931 | "outputs": [],
1932 | "source": [
1933 | "# classifier.classify_many(te)\n",
1934 | "\n",
1935 | "# classifier.show_most_informative_features()\n",
1936 | "# # print nltk.classify.accuracy(classifier, te)*100"
1937 | ]
1938 | },
1939 | {
1940 | "cell_type": "markdown",
1941 | "metadata": {},
1942 | "source": [
1943 | "### Maximum Entropy Classifier"
1944 | ]
1945 | },
1946 | {
1947 | "cell_type": "code",
1948 | "execution_count": null,
1949 | "metadata": {
1950 | "collapsed": true
1951 | },
1952 | "outputs": [],
1953 | "source": [
1954 | "# from nltk.classify import maxent\n",
1955 | "# tr = nltkClassifyTrain\n",
1956 | "# te = nltkClassifyTest\n",
1957 | "# classifierME = maxent.MaxentClassifier.train(tr, bernoulli=False, encoding=encoding, trace=0)\n",
1958 | "# classifierME.classify_many(te)"
1959 | ]
1960 | },
1961 | {
1962 | "cell_type": "markdown",
1963 | "metadata": {},
1964 | "source": [
1965 | "### Decision Tree Classifier"
1966 | ]
1967 | },
1968 | {
1969 | "cell_type": "code",
1970 | "execution_count": null,
1971 | "metadata": {
1972 | "collapsed": true
1973 | },
1974 | "outputs": [],
1975 | "source": [
1976 | "# tr = nltkClassifyTrain\n",
1977 | "# te = nltkClassifyTest\n",
1978 | "\n",
1979 | "# classifier = nltk.classify.DecisionTreeClassifier.train(tr, entropy_cutoff=0,support_cutoff=0)\n",
1980 | "# sorted(classifier.labels())\n",
1981 | "# print(classifier)\n",
1982 | "# classifier.classify_many(te)"
1983 | ]
1984 | },
1985 | {
1986 | "cell_type": "markdown",
1987 | "metadata": {},
1988 | "source": [
1989 | "### Graphs depicting Categorization Accuracy scores on KFold Stratified Validation on Train data for:"
1990 | ]
1991 | },
1992 | {
1993 | "cell_type": "code",
1994 | "execution_count": null,
1995 | "metadata": {
1996 | "collapsed": false
1997 | },
1998 | "outputs": [],
1999 | "source": [
2000 | "# selectRandomForestScores = []\n",
2001 | "# selectKernelScores = []\n",
2002 | "# selectNeighbourScores = []\n",
2003 | "\n",
2004 | "# train = trainBagVector\n",
2005 | "\n",
2006 | "# params = {'trees':1000, 'criterion':'gini','random_state':1000}\n",
2007 | "# score = stratifiedKFoldVal(train, YTrain, classifyRandomForestClassifier, params)\n",
2008 | "# print score\n",
2009 | "# selectRandomForestScores.append(score)\n",
2010 | "\n",
2011 | "# params = {'neighbours':25}\n",
2012 | "# score = stratifiedKFoldVal(train, YTrain, classifyKNNClassifier, params)\n",
2013 | "# print score\n",
2014 | "# selectNeighbourScores.append(score)\n",
2015 | "\n",
2016 | "# params = {'kernel':'linear'}\n",
2017 | "# score = stratifiedKFoldVal(train, YTrain, classifyMultiClassSVMClassifier, params)\n",
2018 | "# print score\n",
2019 | "# selectKernelScores.append(score)\n",
2020 | "\n",
2021 | "\n",
2022 | "\n",
2023 | "# train = tfidfTrain\n",
2024 | "\n",
2025 | "# params = {'trees':1000, 'criterion':'gini','random_state':1000}\n",
2026 | "# score = stratifiedKFoldVal(train, YTrain, classifyRandomForestClassifier, params)\n",
2027 | "# print score\n",
2028 | "# selectRandomForestScores.append(score)\n",
2029 | "\n",
2030 | "# params = {'neighbours':25}\n",
2031 | "# score = stratifiedKFoldVal(train, YTrain, classifyKNNClassifier, params)\n",
2032 | "# print score\n",
2033 | "# selectNeighbourScores.append(score)\n",
2034 | "\n",
2035 | "# params = {'kernel':'rbf'}\n",
2036 | "# score = stratifiedKFoldVal(train, YTrain, classifyMultiClassSVMClassifier, params)\n",
2037 | "# print score\n",
2038 | "# selectKernelScores.append(score)\n",
2039 | "\n",
2040 | "\n",
2041 | "# train = np.array(trainStateTransitionVector)\n",
2042 | "\n",
2043 | "# params = {'trees':1000, 'criterion':'gini','random_state':1000}\n",
2044 | "# score = stratifiedKFoldVal(train, YTrain, classifyRandomForestClassifier, params)\n",
2045 | "# print score\n",
2046 | "# selectRandomForestScores.append(score)\n",
2047 | "\n",
2048 | "# params = {'neighbours':25}\n",
2049 | "# score = stratifiedKFoldVal(train, YTrain, classifyKNNClassifier, params)\n",
2050 | "# print score\n",
2051 | "# selectNeighbourScores.append(score)\n",
2052 | "\n",
2053 | "\n",
2054 | "# params = {'kernel':'rbf'}\n",
2055 | "# score = stratifiedKFoldVal(train, YTrain, classifyMultiClassSVMClassifier, params)\n",
2056 | "# print score\n",
2057 | "# selectKernelScores.append(score)\n",
2058 | "\n",
2059 | "\n",
2060 | "# train = np.hstack([trainBagVector, np.array(trainStateTransitionVector)])\n",
2061 | "\n",
2062 | "# params = {'trees':1000, 'criterion':'gini','random_state':1000}\n",
2063 | "# score = stratifiedKFoldVal(train, YTrain, classifyRandomForestClassifier, params)\n",
2064 | "# print score\n",
2065 | "# selectRandomForestScores.append(score)\n",
2066 | "\n",
2067 | "# params = {'neighbours':25}\n",
2068 | "# score = stratifiedKFoldVal(train, YTrain, classifyKNNClassifier, params)\n",
2069 | "# print score\n",
2070 | "# selectNeighbourScores.append(score)\n",
2071 | "\n",
2072 | "# params = {'kernel':'rbf'}\n",
2073 | "# score = stratifiedKFoldVal(train, YTrain, classifyMultiClassSVMClassifier, params)\n",
2074 | "# print score\n",
2075 | "# selectKernelScores.append(score)\n",
2076 | "\n",
2077 | "\n",
2078 | "\n",
2079 | "# print selectRandomForestScores\n",
2080 | "# print selectKernelScores\n",
2081 | "# print selectNeighbourScores"
2082 | ]
2083 | },
2084 | {
2085 | "cell_type": "code",
2086 | "execution_count": null,
2087 | "metadata": {
2088 | "collapsed": false
2089 | },
2090 | "outputs": [],
2091 | "source": [
2092 | "# #Plotting the results\n",
2093 | "# import matplotlib.pyplot as plt\n",
2094 | "# %matplotlib inline\n",
2095 | "# plt.plot(selectRandomForestScores, label = \"Random Forest Classifier\")\n",
2096 | "# plt.plot(selectKernelScores, label = \"Multiclass Linear SVM Classifier\")\n",
2097 | "# plt.plot(selectNeighbourScores, label = \"KNN Classifier\")\n",
2098 | "\n",
2099 | "# labels = ['Bag of Words', 'TF-IDF', 'State Transitions', 'Stacked Features 1 & 3']\n",
2100 | "\n",
2101 | "# # You can specify a rotation for the tick labels in degrees or with keywords.\n",
2102 | "# plt.xticks(np.arange(len(labels)), labels, rotation='vertical')\n",
2103 | "\n",
2104 | "# plt.ylabel('Scores')\n",
2105 | "# plt.xlabel('Feature Encoding used')\n",
2106 | "# # Place a legend to the right of this smaller figure.\n",
2107 | "# plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)\n",
2108 | "# plt.show()"
2109 | ]
2110 | },
2111 | {
2112 | "cell_type": "markdown",
2113 | "metadata": {},
2114 | "source": [
2115 | "## Final evaluation results"
2116 | ]
2117 | },
2118 | {
2119 | "cell_type": "markdown",
2120 | "metadata": {},
2121 | "source": [
2122 | "### Bar graph depicting Categorization Accuracy Scores on the different Models."
2123 | ]
2124 | },
2125 | {
2126 | "cell_type": "code",
2127 | "execution_count": 65,
2128 | "metadata": {
2129 | "collapsed": false
2130 | },
2131 | "outputs": [],
2132 | "source": [
2133 | "# import numpy as np\n",
2134 | "# import matplotlib.pyplot as plt\n",
2135 | "# %matplotlib inline\n",
2136 | "# N = 8\n",
2137 | "# publicScore = (80.453, 75.637, 79.887, 80.737, 81.586, 80.170, 80.170, 79.887)\n",
2138 | "# privateScore = (84.136, 80.170, 83.569, 83.003, 83.286, 83.003, 83.003, 83.569)\n",
2139 | "# modelNames = ('RF(50T, Entropy)+Bag of Words', 'RF(150T, Entropy)+TF-IDF', 'RF(50T, Entropy) + State Transition', \n",
2140 | "# 'KNN(5) +Bag of Words', 'KNN(5) + TF-IDF', 'KNN(5) + State Transition',\n",
2141 | "# 'Stack: KNN(5) + ST + BoW', 'Stack: RF(50T, Entropy) + ST + BoW')\n",
2142 | "\n",
2143 | "# ind = np.arange(N) # the x locations for the groups\n",
2144 | "# width = 0.35 # the width of the bars\n",
2145 | "\n",
2146 | "# fig, ax = plt.subplots()\n",
2147 | "# rects1 = ax.bar(ind, publicScore, width, color='m')\n",
2148 | "\n",
2149 | "# rects2 = ax.bar(ind + width, privateScore, width, color='c')\n",
2150 | "\n",
2151 | "# # add some text for labels, title and axes ticks\n",
2152 | "# ax.set_ylabel('Scores')\n",
2153 | "# ax.set_title('Evaluations of submissions using Categorization Accuracy.')\n",
2154 | "# ax.set_xticks(ind + width)\n",
2155 | "# ax.set_xticklabels(modelNames, rotation='vertical')\n",
2156 | "# ax.set_ylim(75,85)\n",
2157 | "\n",
2158 | "# # def autolabel(rects):\n",
2159 | "# # # attach some text labels\n",
2160 | "# # for rect in rects:\n",
2161 | "# # height = rect.get_height()\n",
2162 | "# # ax.text(rect.get_x() + rect.get_width()/2., 1.05*height,\n",
2163 | "# # '%d' % int(height),\n",
2164 | "# # ha='center', va='bottom')\n",
2165 | "\n",
2166 | "# # autolabel(rects1)\n",
2167 | "# # autolabel(rects2)\n",
2168 | "\n",
2169 | "# # Place a legend to the right of this smaller figure.\n",
2170 | "# ax.legend((rects1[0], rects2[0]), ('Public Scores', 'Private Scores'), bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)\n",
2171 | "\n",
2172 | "# plt.show()"
2173 | ]
2174 | },
2175 | {
2176 | "cell_type": "markdown",
2177 | "metadata": {},
2178 | "source": [
2179 | "### Hence we conclude that the best model is kNN using TF-IDF as features !"
2180 | ]
2181 | },
2182 | {
2183 | "cell_type": "markdown",
2184 | "metadata": {},
2185 | "source": [
2186 | "## Geo Visualization"
2187 | ]
2188 | },
2189 | {
2190 | "cell_type": "code",
2191 | "execution_count": 83,
2192 | "metadata": {
2193 | "collapsed": true
2194 | },
2195 | "outputs": [],
2196 | "source": [
2197 | "def reverseMapLabels(classNo):\n",
2198 | if classNo == 0:\n",
2199 | return 'Conscientiousness'\n",
2200 | elif classNo == 1:\n",
2201 | return 'Extrovert'\n",
2202 | elif classNo == 2:\n",
2203 | return 'Agreeable'\n",
2204 | elif classNo == 3:\n",
2205 | return 'Empathetic'\n",
2206 | elif classNo == 4:\n",
2207 | return 'Novelty Seeking'\n",
2208 | elif classNo == 5:\n",
2209 | return 'Perfectionist'\n",
2210 | elif classNo == 6:\n",
2211 | return 'Rigid'\n",
2212 | elif classNo == 7:\n",
2213 | return 'Impulsive'\n",
2214 | elif classNo == 8:\n",
2215 | return 'Psychopath'\n",
2216 | elif classNo == 9:\n",
2217 | " return 'Obsessive'\n",
2218 | " else:\n",
2219 | " return None\n"
2220 | ]
2221 | },
2222 | {
2223 | "cell_type": "code",
2224 | "execution_count": 84,
2225 | "metadata": {
2226 | "collapsed": false,
2227 | "scrolled": true
2228 | },
2229 | "outputs": [],
2230 | "source": [
2231 | "import string\n",
2232 | "import matplotlib.cm as cm\n",
2233 | "\n",
2234 | "from mpl_toolkits.basemap import Basemap\n",
2235 | "import matplotlib.pyplot as plt\n",
2236 | "\n",
2237 | "import pandas as pd\n",
2238 | "\n",
2239 | "def GeoPlot(geo_longitude, geo_latitude, labels):\n",
2240 | "\n",
2241 | " fig = plt.figure(figsize=(20,10))\n",
2242 | " \n",
2243 | " raw_data = {'latitude': geo_latitude,'longitude': geo_longitude}\n",
2244 | "\n",
2245 | " df = pd.DataFrame(raw_data, columns = ['latitude', 'longitude'])\n",
2246 | " \n",
2247 | " totSampleLen = len(labels)\n",
2248 | "# print totSampleLen\n",
2249 | " colors = ['blue', 'beige', 'red', 'green', 'magenta', 'yellow', 'cyan', 'aquamarine', 'azure', 'darkkhaki']\n",
2250 | " \n",
2251 | " m = Basemap(projection='gall',lon_0=0,lat_0=0,resolution='i')\n",
2252 | "# x1,y1=map(geo_longitude, geo_latitude)\n",
2253 | x1,y1 = m(df['longitude'].values, df['latitude'].values)\n",
2254 | "\n",
2255 | "\n",
2256 | " m.drawmapboundary(fill_color='black') # fill to edge\n",
2257 | " m.drawcountries()\n",
2258 | " m.fillcontinents(color='white',lake_color='black')\n",
2259 | " \n",
2260 | "# m.scatter(x1, y1, marker='D',color='m', s=2)\n",
2261 | " for i in xrange(totSampleLen):\n",
2262 | " for k in xrange(10):\n",
2263 | " if labels[i] == k:\n",
2264 | "# print x1[i], y1[i]\n",
2265 | "# print colors[k]\n",
2266 | "# m.scatter(x1[i], y1[i], marker='D',color=colors[k], s=2)\n",
2267 | " m.plot(x1[i], y1[i], 'ro', color=colors[k]) #'ro', markersize=6)\n",
2268 | "\n",
2269 | " \n",
2270 | " for k in xrange(10):\n",
2271 | " m.scatter(0,0, marker='D',color=colors[k], s=2, label=reverseMapLabels(k))\n",
2272 | " \n",
2273 | " plt.title(\"Geo-tagging Personality Types for Twitter Users\")\n",
2274 | " # Place a legend to the right of this smaller figure.\n",
2275 | " plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)\n",
2276 | " plt.show()\n"
2277 | ]
2278 | },
2279 | {
2280 | "cell_type": "markdown",
2281 | "metadata": {},
2282 | "source": [
2283 | "### Visualize Personality Types based on location of user tweets."
2284 | ]
2285 | },
2286 | {
2287 | "cell_type": "code",
2288 | "execution_count": 85,
2289 | "metadata": {
2290 | "collapsed": false
2291 | },
2292 | "outputs": [
2293 | {
2294 | "name": "stdout",
2295 | "output_type": "stream",
2296 | "text": [
2297 | "\n",
2298 | "(60,)\n"
2299 | ]
2300 | },
2301 | {
2302 | "ename": "TypeError",
2303 | "evalue": "'numpy.ndarray' object is not callable",
2304 | "output_type": "error",
2305 | "traceback": [
2306 | "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
2307 | "\u001b[1;31mTypeError\u001b[0m Traceback (most recent call last)",
2308 | "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m()\u001b[0m\n\u001b[0;32m 5\u001b[0m \u001b[1;32mprint\u001b[0m \u001b[0mtype\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mlat\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 6\u001b[0m \u001b[1;32mprint\u001b[0m \u001b[0mlat\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mshape\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 7\u001b[1;33m \u001b[0mGeoPlot\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mgeo_longitude\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mgeo_latitude\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mYTrain\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;36m0\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;36m60\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m",
2309 | "\u001b[1;32m\u001b[0m in \u001b[0;36mGeoPlot\u001b[1;34m(geo_longitude, geo_latitude, labels)\u001b[0m\n\u001b[0;32m 21\u001b[0m \u001b[0mm\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mBasemap\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mprojection\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;34m'gall'\u001b[0m\u001b[1;33m,\u001b[0m\u001b[0mlon_0\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;36m0\u001b[0m\u001b[1;33m,\u001b[0m\u001b[0mlat_0\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;36m0\u001b[0m\u001b[1;33m,\u001b[0m\u001b[0mresolution\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;34m'i'\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 22\u001b[0m \u001b[1;31m# x1,y1=map(geo_longitude, geo_latitude)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 23\u001b[1;33m \u001b[0mx1\u001b[0m\u001b[1;33m,\u001b[0m\u001b[0my1\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mmap\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdf\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;34m'longitude'\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mvalues\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mdf\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;34m'latitude'\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mvalues\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 24\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 25\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n",
2310 | "\u001b[1;31mTypeError\u001b[0m: 'numpy.ndarray' object is not callable"
2311 | ]
2312 | }
2313 | ],
2314 | "source": [
2315 | "lon = np.random.random_integers(-180,180,60)\n",
2316 | "lat = np.random.random_integers(-90,90,60)\n",
2317 | "geo_latitude = lat\n",
2318 | "geo_longitude = lon\n",
2319 | "print type(lat)\n",
2320 | "print lat.shape\n",
2321 | "GeoPlot(geo_longitude, geo_latitude, YTrain[0:60])"
2322 | ]
2323 | },
2324 | {
2325 | "cell_type": "code",
2326 | "execution_count": null,
2327 | "metadata": {
2328 | "collapsed": false
2329 | },
2330 | "outputs": [],
2331 | "source": [
2332 | "GeoPlot(eval_geo_longitude[0:1000], eval_geo_latitude[0:1000], YPred[0:1000])"
2333 | ]
2334 | },
2335 | {
2336 | "cell_type": "markdown",
2337 | "metadata": {},
2338 | "source": [
2339 | "### Geo-tagging Sentiments of Twitter Users"
2340 | ]
2341 | },
2342 | {
2343 | "cell_type": "code",
2344 | "execution_count": 86,
2345 | "metadata": {
2346 | "collapsed": true
2347 | },
2348 | "outputs": [],
2349 | "source": [
2350 | "def reverseMapSentiments(classNo):\n",
2351 | " if classNo == 0:\n",
2352 | " return 'Negative'\n",
2353 | " elif classNo == 1:\n",
2354 | " return 'Neutral'\n",
2355 | " elif classNo == 2:\n",
2356 | " return 'Positive'\n",
2357 | " else:\n",
2358 | " return None"
2359 | ]
2360 | },
2361 | {
2362 | "cell_type": "code",
2363 | "execution_count": 87,
2364 | "metadata": {
2365 | "collapsed": false
2366 | },
2367 | "outputs": [],
2368 | "source": [
2369 | "def GeoSentimentPlot(geo_longitude, geo_latitude, sentiments):\n",
2370 | "\n",
2371 | " fig = plt.figure(figsize=(20,10))\n",
2372 | " \n",
2373 | " raw_data = {'latitude': geo_latitude,\n",
2374 | " 'longitude': geo_longitude}\n",
2375 | "\n",
2376 | " df = pd.DataFrame(raw_data, columns = ['latitude', 'longitude'])\n",
2377 | "\n",
2378 | " \n",
2379 | " totSampleLen = len(sentiments)\n",
2380 | " colors = ['red', 'blue', 'green']\n",
2381 | " \n",
2382 | " negLimit = 0\n",
2383 | " posLimit = 0\n",
2384 | " \n",
2385 | " m = Basemap(projection='gall',lon_0=0,lat_0=0,resolution='i')\n",
2386 | " \n",
2387 | x1,y1 = m(df['longitude'].values, df['latitude'].values)\n",
2388 | "\n",
2389 | " m.drawmapboundary(fill_color='black')\n",
2390 | " m.drawcountries()\n",
2391 | " m.fillcontinents(color='white',lake_color='black')\n",
2392 | " \n",
2393 | " for i in xrange(totSampleLen):\n",
2394 | "# print sentiments[i]\n",
2395 | " if sentiments[i] < negLimit:\n",
2396 | " m.plot(x1[i], y1[i], 'ro', color=colors[0])\n",
2397 | " elif sentiments[i] >= negLimit and sentiments[i] <= posLimit:\n",
2398 | " m.plot(x1[i], y1[i], 'ro', color=colors[1])\n",
2399 | " elif sentiments[i] > posLimit:\n",
2400 | " m.plot(x1[i], y1[i], 'ro', color=colors[2])\n",
2401 | " \n",
2402 | " \n",
2403 | " for k in xrange(3):\n",
2404 | " m.scatter(0,0, marker='D',color=colors[k], s=2, label=reverseMapSentiments(k))\n",
2405 | " \n",
2406 | " plt.title(\"Geo-tagging Sentiments of Twitter Users\")\n",
2407 | " # Place a legend to the right of this smaller figure.\n",
2408 | " plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)\n",
2409 | " plt.show()\n"
2410 | ]
2411 | },
2412 | {
2413 | "cell_type": "markdown",
2414 | "metadata": {},
2415 | "source": [
2416 | "### Visualize Sentiment of user tweets based on location."
2417 | ]
2418 | },
2419 | {
2420 | "cell_type": "code",
2421 | "execution_count": 88,
2422 | "metadata": {
2423 | "collapsed": false
2424 | },
2425 | "outputs": [
2426 | {
2427 | "ename": "TypeError",
2428 | "evalue": "'numpy.ndarray' object is not callable",
2429 | "output_type": "error",
2430 | "traceback": [
2431 | "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
2432 | "\u001b[1;31mTypeError\u001b[0m Traceback (most recent call last)",
2433 | "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m()\u001b[0m\n\u001b[1;32m----> 1\u001b[1;33m \u001b[0mGeoSentimentPlot\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mgeo_longitude\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mgeo_latitude\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mXTrainSentiment\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;36m0\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;36m60\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m",
2434 | "\u001b[1;32m\u001b[0m in \u001b[0;36mGeoSentimentPlot\u001b[1;34m(geo_longitude, geo_latitude, sentiments)\u001b[0m\n\u001b[0;32m 17\u001b[0m \u001b[0mm\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mBasemap\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mprojection\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;34m'gall'\u001b[0m\u001b[1;33m,\u001b[0m\u001b[0mlon_0\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;36m0\u001b[0m\u001b[1;33m,\u001b[0m\u001b[0mlat_0\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;36m0\u001b[0m\u001b[1;33m,\u001b[0m\u001b[0mresolution\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;34m'i'\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 18\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 19\u001b[1;33m \u001b[0mx1\u001b[0m\u001b[1;33m,\u001b[0m\u001b[0my1\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mmap\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdf\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;34m'longitude'\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mvalues\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mdf\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;34m'latitude'\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mvalues\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 20\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 21\u001b[0m \u001b[0mm\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mdrawmapboundary\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mfill_color\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;34m'black'\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
2435 | "\u001b[1;31mTypeError\u001b[0m: 'numpy.ndarray' object is not callable"
2436 | ]
2437 | }
2438 | ],
2439 | "source": [
2440 | "GeoSentimentPlot(geo_longitude, geo_latitude, XTrainSentiment[0:60])"
2441 | ]
2442 | },
2443 | {
2444 | "cell_type": "code",
2445 | "execution_count": null,
2446 | "metadata": {
2447 | "collapsed": false
2448 | },
2449 | "outputs": [],
2450 | "source": [
2451 | "print len(eval_geo_longitude)\n",
2452 | "eval_geo_longitude = np.array(eval_geo_longitude)\n",
2453 | "eval_geo_latitude = np.array(eval_geo_latitude)\n",
2454 | "print len(eval_geo_longitude)\n",
2455 | "print eval_geo_longitude.shape\n",
2456 | "print type(eval_geo_longitude)"
2457 | ]
2458 | },
2459 | {
2460 | "cell_type": "code",
2461 | "execution_count": null,
2462 | "metadata": {
2463 | "collapsed": false
2464 | },
2465 | "outputs": [],
2466 | "source": [
2467 | "GeoSentimentPlot(eval_geo_longitude[0:1000], eval_geo_latitude[0:1000], XEvalSentiment[0:1000])"
2468 | ]
2469 | },
2470 | {
2471 | "cell_type": "code",
2472 | "execution_count": null,
2473 | "metadata": {
2474 | "collapsed": true
2475 | },
2476 | "outputs": [],
2477 | "source": []
2478 | }
2479 | ],
2480 | "metadata": {
2481 | "kernelspec": {
2482 | "display_name": "Python 2",
2483 | "language": "python",
2484 | "name": "python2"
2485 | },
2486 | "language_info": {
2487 | "codemirror_mode": {
2488 | "name": "ipython",
2489 | "version": 2
2490 | },
2491 | "file_extension": ".py",
2492 | "mimetype": "text/x-python",
2493 | "name": "python",
2494 | "nbconvert_exporter": "python",
2495 | "pygments_lexer": "ipython2",
2496 | "version": "2.7.11"
2497 | }
2498 | },
2499 | "nbformat": 4,
2500 | "nbformat_minor": 0
2501 | }
2502 |
--------------------------------------------------------------------------------
/Twitter User Personality Prediction/TwitterData/StopWords.txt:
--------------------------------------------------------------------------------
1 | video
2 | [video]
3 | URL
4 | url
5 | pic
6 | [
7 | ]
8 | (
9 | )
10 | "
--------------------------------------------------------------------------------
/Twitter User Personality Prediction/TwitterData/UserTweets.txt:
--------------------------------------------------------------------------------
1 | For #GivingTuesday, these soccer charities are very worthy, among many others: @soccerwoborders @sfw_tweets @FCHARLEM
2 | Proud to work w/ organizations that use #football to teach important messages about healthy behaviour #WorldAIDSDay
3 | Obsessed with @AnticoVinaioaFi! So glad we got there before the line. Freakin' killer #porchetta! #streetfood #Italy
4 | What an awesome #selfie @slattykat we love it! Let's see what fun #selfies you can get in … http://ift.tt/1IpwqNC
5 | Quench your thirst for beauty with the most beautiful river in the world [video] http://holykaw.alltop.com/quench-your-thirst-for-beauty-with-the-most-beautiful-river-in-the-world-video?gk1 …
--------------------------------------------------------------------------------
/Twitter User Personality Prediction/TwitterData/labeledPersonalityTweets.csv:
--------------------------------------------------------------------------------
1 | |Sentinel|,|For #GivingTuesday, these soccer charities are very worthy, among many others: @soccerwoborders @sfw_tweets @FCHARLEM|
2 | |Diplomat|,|Proud to work w/ organizations that use #football to teach important messages about healthy behaviour #WorldAIDSDay|
3 | |Explorer|,|Obsessed with @AnticoVinaioaFi! So glad we got there before the line. Freakin' killer #porchetta! #streetfood #Italy|
4 | |Analyst|,|What an awesome #selfie @slattykat we love it! Let's see what fun #selfies you can get in URL |
5 | |Explorer|,|Quench your thirst for beauty with the most beautiful river in the world [video] URL |
--------------------------------------------------------------------------------
/Twitter User Personality Prediction/mmds/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vishalbhalla/Twitter-User-Personality-Prediction/4e0f8641aaea01b550151b150bf4f54437b72179/Twitter User Personality Prediction/mmds/__init__.py
--------------------------------------------------------------------------------
/Twitter User Personality Prediction/mmds/supervised/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vishalbhalla/Twitter-User-Personality-Prediction/4e0f8641aaea01b550151b150bf4f54437b72179/Twitter User Personality Prediction/mmds/supervised/__init__.py
--------------------------------------------------------------------------------
/Twitter User Personality Prediction/mmds/supervised/classification_algos.py:
--------------------------------------------------------------------------------
1 | from sklearn import linear_model
2 | from sklearn import svm
3 | from sklearn.cross_validation import StratifiedKFold
4 | from sklearn.ensemble import AdaBoostClassifier
5 | from sklearn.ensemble import RandomForestClassifier
6 | from sklearn.neighbors import KNeighborsClassifier
7 | from sklearn.tree import DecisionTreeClassifier
8 | from sknn.mlp import Classifier, Layer
9 |
10 | import numpy as np
11 |
12 |
13 | # Random Forest Classifier
14 | # def classifyRandomForestClassifier(XTrain, XTest, YTrain, YTest,trees=100,crit='gini'):
15 | def classifyRandomForestClassifier(XTrain, XTest, YTrain, YTest, params):
16 | trees = params['trees']
17 | crit = params['criterion']
18 | seed = params['random_state']
19 | clf = RandomForestClassifier(n_estimators=trees, criterion=crit, random_state=seed)
20 | clf.fit(XTrain, YTrain)
21 | YPred = clf.predict(XTest)
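# The two lines below compute categorization accuracy: the number of predictions
# that exactly match the ground-truth labels, expressed as a percentage of all
# predictions. Every classifier in this module reuses the same scoring.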
22 | diff = YPred - YTest
23 | score = diff[diff == 0].size
24 | return (100.0 * score) / (YPred.size)
25 |
26 |
27 | # In[46]:
28 |
29 | # Multi Class SVM
30 | def classifyMultiClassSVMClassifier(XTrain, XTest, YTrain, YTest, params):
31 | ker = params['kernel']
32 | YPred = svm.SVC(kernel=ker).fit(XTrain, YTrain).predict(XTest)
33 | diff = YPred - YTest
34 | score = diff[diff == 0].size
35 | return (100.0 * score) / (YPred.size)
36 |
37 |
38 | # In[47]:
39 |
40 | # K Nearest Neighbours Classifier
41 | def classifyKNNClassifier(XTrain, XTest, YTrain, YTest, params):
42 | # print XTrain.shape, XTest.shape
43 | neighbours = params['neighbours']
44 | neigh = KNeighborsClassifier(n_neighbors=neighbours)
45 | YPred = neigh.fit(XTrain, YTrain).predict(XTest)
46 | diff = YPred - YTest
47 | score = diff[diff == 0].size
48 | return (100.0 * score) / (YPred.size)
49 |
50 |
51 | # In[48]:
52 |
53 | # Logistic Regression
54 | def classifyLogisticRegression(XTrain, XTest, YTrain, YTest, params):
55 | LogReg = linear_model.LogisticRegression(penalty='l2', dual=False, tol=0.0001, C=1.0, fit_intercept=True, intercept_scaling=1, class_weight=None, random_state=None)
56 | LogReg.fit(XTrain, YTrain)
57 | # Fits the logistic regression parameters by maximizing the regularized log-likelihood.
58 | # To get the parameter values:
59 | # LogReg.get_params()
60 | # To predict a new input XTest,
61 | YPred = LogReg.predict(XTest)
62 | diff = YPred - YTest
63 | score = diff[diff == 0].size
64 | return (100.0 * score) / (YPred.size)
65 |
66 |
67 | # In[49]:
68 |
69 | # Adaboost Classfier
70 | def classifyAdaboostClassifier(XTrain, XTest, YTrain, YTest, params):
71 | depth = params['max_depth']
72 | algo = params['algorithm']
73 | estimators = params['n_estimators']
74 |
75 | # Create and fit an AdaBoosted decision tree
76 | bdt = AdaBoostClassifier(DecisionTreeClassifier(max_depth=depth),
77 | algorithm=algo,
78 | n_estimators=estimators)
79 |
80 | bdt.fit(XTrain, YTrain)
81 | YPred = bdt.predict(XTest)
82 |
83 | diff = YPred - YTest
84 | score = diff[diff == 0].size
85 | return (100.0 * score) / (YPred.size)
86 |
87 |
88 | def classifyNeuralNetworkClassifier(XTrain, XTest, YTrain, YTest, params):
89 | activation = params['activation']
90 | actLastLayer = params['actLastLayer']
91 | rule = params['rule']
92 | noOfUnits = params['units']
93 | rate = params['rate']
94 | noOfIter = params['iter']
95 | nn = Classifier(layers=[Layer(activation, units=noOfUnits), Layer(actLastLayer)], learning_rule=rule,
96 | learning_rate=rate,
97 | n_iter=noOfIter)
98 | nn.fit(XTrain, YTrain)
99 | YPred = nn.predict(XTest)
100 | diff = YPred - YTest.reshape(YPred.shape)
101 | score = diff[diff == 0].size
102 | score = (100.0 * score) / (YPred.size)
103 | return score
104 |
105 |
106 | def featNLTKClassify(samples, phase, feature_names, YTrain):
107 | nltkClassifySamples = []
108 |
109 | for i in xrange(len(samples)):
110 | t = samples[i]
111 | lstFuncCalls = t.split()
112 | wordOccDict = {}
113 | for j in xrange(len(feature_names)):
114 | wordOccDict[feature_names[j]] = lstFuncCalls.count(feature_names[j])
115 | if phase == 'train':
116 | nltkClassifySamples.append((wordOccDict, YTrain[i]))
117 | else:
118 | nltkClassifySamples.append(wordOccDict)
119 |
120 | return nltkClassifySamples
121 |
122 |
123 | def stratifiedKFoldVal(XTrain, YTrain, classify, params):
124 | n_folds = 5
125 | score = 0.0
126 | skf = StratifiedKFold(YTrain, n_folds)
127 | try:
128 | multi = params['multi']
129 | except KeyError:
130 | multi = False
131 | for train_index, test_index in skf:
132 | y_train, y_test = YTrain[train_index], YTrain[test_index]
133 | if not multi:
134 | X_train, X_test = XTrain[train_index], XTrain[test_index]
135 | score += classify(X_train, X_test, y_train, y_test, params)
136 | else:
137 | X_train, X_test = [XTrain[i] for i in train_index], [XTrain[i] for i in test_index]
138 | score += classify(np.array(X_train), np.array(X_test), y_train, y_test, params)
139 |
140 | return score / n_folds
141 |
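# A minimal usage sketch (not part of the original module): assuming XTrain is a
# (n_samples, n_features) numpy array and YTrain a (n_samples,) numpy label array,
# each classifier above is driven through the same stratified K-fold helper by
# passing its hyper-parameters in the params dict.
#
# params = {'trees': 150, 'criterion': 'entropy', 'random_state': None}
# accuracy = stratifiedKFoldVal(XTrain, YTrain, classifyRandomForestClassifier, params)
# print accuracy  # mean categorization accuracy over the 5 folds
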
142 | def createStateTransitionVector(categoricalState, stateDict, maxLength):
143 | if categoricalState:
144 | feature = []
145 | for state in categoricalState.split(' '):
146 | try:
147 | feature.append(stateDict[state.lower()])
148 | except KeyError:
149 | pass
150 | # print state
151 | if len(feature) != maxLength:
152 | for i in xrange(maxLength - len(feature)):
153 | feature.append(0)
154 | assert(len(feature) == maxLength)
155 | return feature
156 | else:
157 | return [0] * maxLength
158 |
159 |
160 | def createStateVectors(XStates, stateDict, maxLength):
161 | XFeatures = []
162 | for state in XStates:
163 | XFeatures.append(createStateTransitionVector(state, stateDict, maxLength))
164 | return XFeatures
165 |
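# Illustrative example (not in the original file), using a hypothetical state
# dictionary: each space-separated state string is mapped to integer ids and
# right-padded with zeros up to maxLength, so every feature vector has the same
# dimension.
#
# stateDict = {'california': 1, 'texas': 2, 'nevada': 3}
# createStateVectors(['California Texas', 'Nevada'], stateDict, 4)
# # -> [[1, 2, 0, 0], [3, 0, 0, 0]]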
166 |
--------------------------------------------------------------------------------
/Twitter User Personality Prediction/mmds/supervised/feature_engineering.py:
--------------------------------------------------------------------------------
1 | import csv
2 | from nltk.corpus import stopwords
3 | from textblob.blob import TextBlob
4 |
5 | from mmds.supervised.filter_stop_words import FilterStopWords
6 | from mmds.supervised.preprocess_tweets import PreprocessTweets
7 |
8 |
9 | class FeatureEngineering:
10 |
11 | def __init__(self):
12 | self.name = 'FeatureEngineering'
13 | self.featureList = []
14 | # self.sid = SentimentIntensityAnalyzer()
15 |
16 |
17 | # start extract_features
18 | def extract_features(self, tweet):
19 | tweet_words = set(tweet)
20 | features = {}
21 | for word in self.featureList:
22 | features['contains(%s)' % word] = (word in tweet_words)
23 | return features
24 |
25 | # Create a new training set based on personality labels predicted from survey results
26 |
27 | def createNewTrainingSet(self, training_data_file):
28 | XTrain = []
29 | YTrain = []
30 | XTrainFeatures = []
31 | XTrainSentiment = []
32 | XTrainFreqTweets = []
33 | geo_latitude = []
34 | geo_longitude = []
35 |
36 | objFilterStopWords = FilterStopWords()
37 | objPreprocessTweets = PreprocessTweets()
38 |
39 | stopWords = objFilterStopWords.getStopWordList('../../TwitterData/StopWords.txt')
40 |
41 | # Read the tweets one by one and process it
42 | inpTweets = csv.reader(open(training_data_file, 'rb'), delimiter=',')
43 | inpTweets.next()
44 | tweets = []
45 | i = 0
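# Column layout assumed by the indexing below: row[1] = tweet text,
# row[2] = latitude, row[3] = longitude, row[4] = tweet count,
# row[5] = personality label from the survey.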
46 | for row in inpTweets:
47 | # print row
48 | personality = row[5]
49 | tweet = row[1]
50 | cleanTweet = tweet.replace('"",""', " ")
51 | cleanTweet = cleanTweet.replace('""', " ")
52 | processedTweet = objPreprocessTweets.processTweet(cleanTweet)
53 |
54 | XTrainFreqTweets.append(int(row[4]))
55 | wordsList = processedTweet.split()
56 |
57 | # Remove stop words
58 | filtered_words = [word for word in wordsList if word not in stopwords.words('english')]
59 | filteredTweets = ' '.join(filtered_words)
60 |
61 | featureVector = objFilterStopWords.getFeatureVector(processedTweet, stopWords)
62 |
63 | geo_latitude.append(float(row[2]))
64 | geo_longitude.append(float(row[3]))
65 |
66 | blob = TextBlob(processedTweet)
67 | sentiment = 0
68 | for sentence in blob.sentences:
69 | sentiment += sentence.sentiment.polarity
70 |
71 | totSentiment = sentiment / len(blob.sentences) if blob.sentences else 0
72 |
73 | XTrainSentiment.append(totSentiment)
74 |
75 | XTrainFeatures.append(filteredTweets)
76 |
77 | YTrain.append(personality.replace('[', '').replace('\"', '').replace(']', ''))
78 |
79 | # i+=1
80 | # if i==3:
81 | # break
82 |
83 |
84 | return XTrain, YTrain, XTrainFeatures, XTrainSentiment, XTrainFreqTweets, geo_latitude, geo_longitude
85 |
--------------------------------------------------------------------------------
/Twitter User Personality Prediction/mmds/supervised/filter_stop_words.py:
--------------------------------------------------------------------------------
1 | import re
2 |
3 | class FilterStopWords:
4 |
5 | # stopWords = []
6 | def __init__(self):
7 | self.name = 'FilterStopWords'
8 | #initialize stopWords
9 | self.stopWords = []
10 |
11 |
12 | def getStopWordList(self, stopWordListFileName):
13 | #read the stopwords file and build a list
14 | stopWords = []
15 | stopWords.append('AT_USER')
16 | stopWords.append('URL')
17 | stopWords.append('[')
18 | stopWords.append(']')
19 |
20 | fp = open(stopWordListFileName, 'r')
21 | line = fp.readline()
22 | while line:
23 | word = line.strip()
24 | stopWords.append(word)
25 | line = fp.readline()
26 | fp.close()
27 | return stopWords
28 |
29 | def getFeatureVector(self, tweet, stopWords):
30 | featureVector = []
31 | #split tweet into words
32 | words = tweet.split()
33 | for w in words:
34 | #replace two or more with two occurrences
35 | #w = replaceTwoOrMore(w)
36 | #strip punctuation
37 | w = w.strip('\'"?,.')
38 | #check if the word starts with an alphabet
39 | val = re.search(r"^[a-zA-Z][a-zA-Z0-9]*$", w)
40 | #ignore if it is a stop word
41 | if(w in stopWords or val is None):
42 | continue
43 | else:
44 | featureVector.append(w.lower())
45 | return featureVector
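
# Illustrative usage (not part of the original class): load the project stop-word
# list and reduce a tweet to its lower-cased, non-stop-word tokens.
#
# objFilter = FilterStopWords()
# stop_words = objFilter.getStopWordList('../../TwitterData/StopWords.txt')
# print objFilter.getFeatureVector('Obsessed with URL pic', stop_words)
# # -> ['obsessed', 'with'], once the stop-word list filters out URL and pic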
--------------------------------------------------------------------------------
/Twitter User Personality Prediction/mmds/supervised/personality_predictor_and_visualizer.py:
--------------------------------------------------------------------------------
1 | import logging
2 | from mpl_toolkits.basemap import Basemap
3 | from sklearn.ensemble import RandomForestClassifier
4 | from sklearn.feature_extraction.text import CountVectorizer
5 | from sklearn.neighbors import KNeighborsClassifier
6 | from mmds.supervised.feature_engineering import FeatureEngineering
7 | from sklearn import svm
8 |
9 | import matplotlib.pyplot as plt
10 | import numpy as np
11 | import pandas as pd
12 |
13 |
14 | logging.basicConfig(filename="../../supervised.log", level=logging.DEBUG, format="%(asctime)-15s %(threadName)s %(message)s")
15 |
16 | PERSONALITY_LABELS = ['Conscientiousness', 'Extrovert', 'Agreeable', 'Empathetic', 'Novelty Seeking', 'Perfectionist', 'Rigid',
17 | 'Impulsive', 'Psychopath', 'Obsessive']
18 |
19 | SENTIMENT_LABELS = ['Negative', 'Neutral', 'Positive']
20 |
21 | def mapLabels(class_name):
22 | if class_name in PERSONALITY_LABELS:
23 | return PERSONALITY_LABELS.index(class_name)
24 | else:
25 | pass
26 |
27 | def writePredictedLabelFile(YPred):
28 | f = open("../../TwitterData/Predictions.csv", "w")
29 | f.write("Id,Label" + "\n")
30 | for i in xrange(len(YPred)):
31 | f.write(str(i) + "," + str(int(YPred[i])) + "\n")
32 | f.close()
33 |
34 | def reverseMapLabels(index):
35 | if index < len(PERSONALITY_LABELS):
36 | return PERSONALITY_LABELS[index]
37 | else:
38 | return None
39 |
40 | def GeoPlot(geo_longitude, geo_latitude, labels):
41 |
42 | fig = plt.figure(figsize=(20, 10))
43 |
44 | raw_data = {'latitude': geo_latitude, 'longitude': geo_longitude}
45 |
46 | df = pd.DataFrame(raw_data, columns=['latitude', 'longitude'])
47 |
48 | totSampleLen = len(labels)
49 | # print totSampleLen
50 | colors = ['blue', 'beige', 'red', 'green', 'magenta', 'yellow', 'cyan', 'aquamarine', 'azure', 'darkkhaki']
51 |
52 | m = Basemap(projection='gall', lon_0=0, lat_0=0, resolution='i')
53 | # x1,y1=map(geo_longitude, geo_latitude)
54 | x1, y1 = m(df['longitude'].values, df['latitude'].values)
55 |
56 |
57 | m.drawmapboundary(fill_color='black') # fill to edge
58 | m.drawcountries()
59 | m.fillcontinents(color='white', lake_color='black')
60 |
61 | # m.scatter(x1, y1, marker='D',color='m', s=2)
62 | for i in xrange(totSampleLen):
63 | for k in xrange(10):
64 | if labels[i] == k:
65 | # print x1[i], y1[i]
66 | # print colors[k]
67 | # m.scatter(x1[i], y1[i], marker='D',color=colors[k], s=2)
68 | m.plot(x1[i], y1[i], 'ro', color=colors[k]) # 'ro', markersize=6)
69 |
70 |
71 | for k in xrange(10):
72 | m.scatter(0, 0, marker='D', color=colors[k], s=2, label=reverseMapLabels(k))
73 |
74 | plt.title("Geo-tagging Personality Types for Twitter Users")
75 | # Place a legend to the right of this smaller figure.
76 | plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
77 | plt.show()
78 |
79 | def reverseMapSentiments(index):
80 | if index < len(SENTIMENT_LABELS):
81 | return SENTIMENT_LABELS[index]
82 | else:
83 | return None
84 |
85 | def GeoSentimentPlot(geo_longitude, geo_latitude, sentiments):
86 |
87 | fig = plt.figure(figsize=(20, 10))
88 |
89 | raw_data = {'latitude': geo_latitude,
90 | 'longitude': geo_longitude}
91 |
92 | df = pd.DataFrame(raw_data, columns=['latitude', 'longitude'])
93 |
94 |
95 | totSampleLen = len(sentiments)
96 | colors = ['red', 'blue', 'green']
97 |
98 | negLimit = 0
99 | posLimit = 0
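# With both limits at 0, a sentiment polarity < 0 is plotted as Negative,
# exactly 0 as Neutral, and > 0 as Positive.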
100 |
101 | m = Basemap(projection='gall', lon_0=0, lat_0=0, resolution='i')
102 |
103 | x1, y1 = m(df['longitude'].values, df['latitude'].values)
104 |
105 | m.drawmapboundary(fill_color='black')
106 | m.drawcountries()
107 | m.fillcontinents(color='white', lake_color='black')
108 |
109 | for i in xrange(totSampleLen):
110 | # print sentiments[i]
111 | if sentiments[i] < negLimit:
112 | m.plot(x1[i], y1[i], 'ro', color=colors[0])
113 | elif sentiments[i] >= negLimit and sentiments[i] <= posLimit:
114 | m.plot(x1[i], y1[i], 'ro', color=colors[1])
115 | elif sentiments[i] > posLimit:
116 | m.plot(x1[i], y1[i], 'ro', color=colors[2])
117 |
118 |
119 | for k in xrange(3):
120 | m.scatter(0, 0, marker='D', color=colors[k], s=2, label=reverseMapSentiments(k))
121 |
122 | plt.title("Geo-tagging Sentiments of Twitter Users")
123 | # Place a legend to the right of this smaller figure.
124 | plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
125 | plt.show()
126 |
127 |
128 | if __name__ == "__main__":
129 | """
130 | Main script starts here.
131 | """
132 | logging.info("Inside main...")
133 | training_data_file = '../../TwitterData/survey_dump_with_tweet_count'
134 | evaluation_data_file = '../../TwitterData/survey_dump_geo_gt_8_1'
135 |
136 | objFeatureEngineering = FeatureEngineering()
137 | XTrain, YTrain, XTrainFeatures, XTrainSentiment, XTrainFreqTweets, geo_latitude, \
138 | geo_longitude = objFeatureEngineering.createNewTrainingSet(training_data_file)
139 |
140 | XEval, YEval, XEvalFeatures, XEvalSentiment, XEvalFreqTweets, eval_geo_latitude, \
141 | eval_geo_longitude = objFeatureEngineering.createNewTrainingSet(evaluation_data_file)
142 |
143 | YTrain = map(mapLabels, YTrain)
144 | YEval = map(mapLabels, YEval)
145 |
146 | XTrain = np.array(XTrainFeatures)
147 | YTrain = np.array(YTrain)
148 |
149 | logging.info("Number of training vectors XTrain:{}, target variables YTrain:{}".format(len(XTrain), len(YTrain)))
150 |
151 | XEval = np.array(XEvalFeatures)
152 | YEval = np.array(YEval)
153 |
154 | logging.info("Number of evaluation vectors XEval:{}, target variables YEval:{}".format(len(XEval), len(YEval)))
155 |
156 | # ### Split Train and Test data
157 |
158 | TRAINING_DATA_SET_SIZE = 60
159 | XTrainSamples = XTrain[0:TRAINING_DATA_SET_SIZE]
160 | YTrainSamples = YTrain[0:TRAINING_DATA_SET_SIZE]
161 |
162 | XTestSamples = XTrain[TRAINING_DATA_SET_SIZE:]
163 | YTestSamples = YTrain[TRAINING_DATA_SET_SIZE:]
164 |
165 | logging.info("No. of training samples XTrainSamples:{}, test samples XTestSamples:{}".format(len(XTrainSamples), len(XTestSamples)))
166 |
167 | trainSentimentSamples = np.array(XTrainSentiment[0:TRAINING_DATA_SET_SIZE])
168 | testSentimentSamples = np.array(XTrainSentiment[TRAINING_DATA_SET_SIZE:])
169 | trainFreqTweetSamples = np.array(XTrainFreqTweets[0:TRAINING_DATA_SET_SIZE])
170 | testFreqTweetSamples = np.array(XTrainFreqTweets[TRAINING_DATA_SET_SIZE:])
171 |
172 | vectorizer = CountVectorizer()
173 | vectorizer.fit_transform(np.array(XTrainFeatures + XEvalFeatures))
174 |
175 |     logging.info("Total features in training and evaluation data:{}".format(len(vectorizer.get_feature_names())))
176 |
177 | XTr = vectorizer.transform(XTrainSamples)
178 | trainBagVector = XTr.toarray()
179 | XTe = vectorizer.transform(XTestSamples)
180 | testBagVector = XTe.toarray()
181 |
182 | XEv = vectorizer.transform(XEval)
183 | evalBagVector = XEv.toarray()
184 |
185 |     logging.info("Dimension of training bag:{}, test bag:{}, eval bag:{}".format(trainBagVector.shape,
186 |                                                                                testBagVector.shape, evalBagVector.shape))
187 |
188 | # join word features + sentiment + tweet frequency for training samples ...
189 | XTrainAllFeatures = np.column_stack((np.column_stack((trainBagVector, trainSentimentSamples)), trainFreqTweetSamples))
190 |
191 | # join word features + sentiment + tweet frequency for testing samples ...
192 | XTestAllFeatures = np.column_stack((np.column_stack((testBagVector, testSentimentSamples)), testFreqTweetSamples))
193 |
194 |     # join word features + sentiment + tweet frequency for evaluation samples ...
195 | XEvalAllFeatures = np.column_stack((np.column_stack((evalBagVector, XEvalSentiment)), XEvalFreqTweets))
196 |
197 |     logging.info("Dim of all training samples:{}, test samples:{}, eval samples:{}, ytrain:{}".format(XTrainAllFeatures.shape,
198 |                                                                              XTestAllFeatures.shape, XEvalAllFeatures.shape, YTrainSamples.shape))
199 |
200 |     """K-Nearest Neighbours"""
201 | params = {'neighbours':25}
202 | neigh = KNeighborsClassifier(n_neighbors=params['neighbours'])
203 | YPred = neigh.fit(XTrainAllFeatures, YTrainSamples).predict(XEvalAllFeatures)
204 |
205 | """Random Forest"""
206 | params = {'trees':150, 'criterion':'entropy', 'random_state':None}
207 | clf = RandomForestClassifier(n_estimators=params['trees'], criterion=params['criterion'], random_state=params['random_state'])
208 | clf.fit(XTrainAllFeatures, YTrainSamples)
209 | YPred = clf.predict(XEvalAllFeatures)
210 |
211 |     """SVM (YPred is overwritten at each step above, so these SVM predictions are the ones plotted below)"""
212 | params = {'kernel':'rbf'}
213 | YPred = svm.SVC(kernel=params['kernel']).fit(XTrainAllFeatures, YTrainSamples).predict(XEvalAllFeatures)
214 |
215 | GeoPlot(eval_geo_longitude, eval_geo_latitude, YPred)
216 |
217 | GeoSentimentPlot(np.array(eval_geo_longitude), np.array(eval_geo_latitude), XEvalSentiment)
218 |
219 |
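
The feature join above is easier to see on toy data. Below is a minimal, self-contained sketch (invented documents, sentiment scores and tweet counts, not taken from the survey data) of how np.column_stack appends the per-user sentiment score and tweet frequency as two extra columns to the bag-of-words matrix; the KNN, Random Forest and SVM classifiers above are all trained on matrices of exactly this layout.

# Stand-alone sketch of the feature join used above (toy data, not from the repository).
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

docs = ["happy happy day", "sad rainy day"]     # one "document" of concatenated tweets per user
sentiment = [0.8, -0.4]                         # one aggregate sentiment score per user
tweet_freq = [120, 35]                          # number of tweets per user

vectorizer = CountVectorizer()
bag = vectorizer.fit_transform(docs).toarray()  # shape: (n_users, n_unigrams)

# Append sentiment and tweet frequency as extra columns, mirroring
# XTrainAllFeatures / XTestAllFeatures / XEvalAllFeatures above.
all_features = np.column_stack((np.column_stack((bag, sentiment)), tweet_freq))
print(all_features.shape)                       # (2, n_unigrams + 2)
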
--------------------------------------------------------------------------------
/Twitter User Personality Prediction/mmds/supervised/preprocess_tweets.py:
--------------------------------------------------------------------------------
1 | import re
2 |
3 |
4 | class PreprocessTweets:
5 |
6 | def __init__(self):
7 | self.name = 'PreprocessTweets'
8 |
9 | #start process_tweet
10 | def processTweet(self, tweet):
11 |
12 | #Convert to lower case
13 | tweet = tweet.lower()
14 | #Convert www.* or https?://* to URL
15 | tweet = re.sub('((www\.[^\s]+)|(https?://[^\s]+))','URL',tweet)
16 | #Convert @username to AT_USER
17 | tweet = re.sub('@[^\s]+','AT_USER',tweet)
18 | #Remove additional white spaces
19 | tweet = re.sub('[\s]+', ' ', tweet)
20 | #Remove special characters
21 | #tweet = re.sub('*\[\]%\(\)', '', tweet)
22 | #Replace #word with word
23 | tweet = re.sub(r'#([^\s]+)', r'\1', tweet)
24 | #trim
25 | tweet = tweet.strip('\'"')
26 |
27 | # Remove all Non-ASCII characters
28 | tweet = re.sub(r'[^\x00-\x7F]+',' ', tweet)
29 |
30 | return tweet
31 |
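
A minimal usage sketch of processTweet, assuming the mmds package is importable and using an invented example tweet:

# Usage sketch of PreprocessTweets.processTweet (example tweet invented for illustration).
from mmds.supervised.preprocess_tweets import PreprocessTweets

tweet = '@alice Loving the new phone!! #excited https://example.com  :)'
print(PreprocessTweets().processTweet(tweet))
# Mentions become AT_USER, URLs become URL, the '#' is dropped and the text is lower-cased,
# so the output is roughly: AT_USER loving the new phone!! excited URL :)
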
--------------------------------------------------------------------------------
/Twitter User Personality Prediction/mmds/supervised/tweet_analysis.py:
--------------------------------------------------------------------------------
1 |
2 | # import preprocess_tweets
3 | # import filter_stop_words
4 | from mmds.supervised.filter_stop_words import FilterStopWords
5 | from mmds.supervised.preprocess_tweets import PreprocessTweets
6 |
7 | #Read the tweets one by one and process them
8 | fp = open('../../TwitterData/UserTweets.txt', 'r')
9 | line = fp.readline()
10 |
11 | objFilterStopWords = FilterStopWords()
12 | objPreprocessTweets = PreprocessTweets()
13 |
14 | # Load the stop word list once; it is reused for every tweet.
15 | stopWords = objFilterStopWords.getStopWordList('../../TwitterData/StopWords.txt')
16 |
17 | while line:
18 | processedTweet = objPreprocessTweets.processTweet(line)
19 | featureVector = objFilterStopWords.getFeatureVector(processedTweet, stopWords)
20 | print featureVector
21 | line = fp.readline()
22 | #end loop
23 | fp.close()
24 |
--------------------------------------------------------------------------------
/Twitter User Personality Prediction/mmds/unsupervised/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vishalbhalla/Twitter-User-Personality-Prediction/4e0f8641aaea01b550151b150bf4f54437b72179/Twitter User Personality Prediction/mmds/unsupervised/__init__.py
--------------------------------------------------------------------------------
/Twitter User Personality Prediction/mmds/unsupervised/k_means_estimator.py:
--------------------------------------------------------------------------------
1 | import copy
2 | import csv
3 | import logging
4 | from collections import Counter
5 | from scipy.sparse import csr_matrix
6 | from scipy.sparse.coo import coo_matrix
7 | from sklearn.cluster.k_means_ import KMeans
8 | from sklearn.feature_extraction.dict_vectorizer import DictVectorizer
9 | from textblob.blob import TextBlob
10 | from textblob.en.np_extractors import ConllExtractor
11 | from textblob.en.taggers import NLTKTagger
12 | from mmds.utils.time_utils import time_it
13 |
14 |
15 |
16 | class KMeansEstimator:
17 | """
18 |     This class reads the tweets of users from a file and builds cluster centers on that data. It also provides
19 |     a method for finding the closest cluster center for unseen data.
20 | """
21 |
22 | ADJECTIVE = 'JJ'
23 |
24 | """
25 | Feature keys used in clustering...
26 | """
27 | POLARITY_FEATURE_KEY = 'polarity'
28 | SUBJECTIVITY_FEATURE_KEY = 'subjectivity'
29 | TWEET_COUNT_FEATURE_KEY = 'tweetCount'
30 | """
31 | Features not considered for clustering...
32 | """
33 | USER_ID_FEATURE_KEY = 'userId'
34 | LONGITUDE_FEATURE_KEY = 'longitude'
35 | LATITUDE_FEATURE_KEY = 'latitude'
36 |
37 |
38 | """
39 | Predicted label feature name.
40 | """
41 | LABEL_FEATURE_KEY = 'label'
42 |
43 | RELEVENT_FEATURE_LIST = [USER_ID_FEATURE_KEY, LATITUDE_FEATURE_KEY, LONGITUDE_FEATURE_KEY, LABEL_FEATURE_KEY]
44 |
45 | def __init__(self, tweet_file_path, no_of_clusters):
46 | """
47 | The constructor reads csv file and builds the data matrix.
48 | """
49 | self.np_extractor = ConllExtractor()
50 | self.pos_tagger = NLTKTagger()
51 | self.tweet_file_path = tweet_file_path
52 | self.data_matrix = self.__get_data_matrix_from_file(tweet_file_path)
53 | self.vectorizer = DictVectorizer(sparse=True)
54 | self.k_means_estimator = KMeans(init="random", n_clusters=no_of_clusters)
55 |
56 | @time_it
57 | def __get_data_matrix_from_file(self, tweet_file_path):
58 | """
59 |         Reads tweets from the csv file at path "tweet_file_path", extracts features from the tweets and returns a list
60 |         of all feature vectors.
61 | """
62 | file_reader = csv.reader(open(tweet_file_path, "rb"), delimiter=',')
63 | next(file_reader)
64 | data_matrix = []
65 | for row in file_reader:
66 | logging.info("Extracting features for user_id:%s", row[0])
67 | feature_vector = {}
68 | feature_vector[self.USER_ID_FEATURE_KEY] = int(row[0])
69 | feature_vector[self.LATITUDE_FEATURE_KEY] = float(row[2])
70 | feature_vector[self.LONGITUDE_FEATURE_KEY] = float(row[3])
71 | feature_vector[self.TWEET_COUNT_FEATURE_KEY] = int(row[4])
72 | feature_vector.update(self.__get_features_from_tweet_text(row[1].decode('utf-8')))
73 | data_matrix.append(feature_vector)
74 | logging.info("Successfully extracted features for user_id:%s", row[0])
75 | return data_matrix
76 |
77 | @time_it
78 | def __get_features_from_tweet_text(self, tweet_text):
79 | """This function returns the following features from the tweet text:
80 |          - Adjectives found in the tweet (duplicate occurrences are collapsed, so each distinct adjective becomes a separate feature).
81 | - Subjectivity and polarity as determined by TextBlob.
82 | :returns: (key,value) map of all features found.
83 | """
84 |         text_blob = TextBlob(tweet_text, np_extractor=self.np_extractor, pos_tagger=self.pos_tagger)
85 | adjective_map = dict(Counter((ele[0] for ele in set(text_blob.pos_tags) if ele[1] == self.ADJECTIVE)))
86 | polarity = text_blob.sentiment[0]
87 | subjectivity = text_blob.sentiment[1]
88 | return dict(adjective_map.items() + {self.POLARITY_FEATURE_KEY:polarity, self.SUBJECTIVITY_FEATURE_KEY:subjectivity}.items())
89 |
90 | @time_it
91 | def __get_clustering_data_matrix(self, data_matrix):
92 | """
93 |         This method removes unnecessary features (features like user_id that are not relevant for building cluster centers) from
94 | the data matrix and returns a copy of the data matrix.
95 | """
96 | data_matrix_copy = copy.deepcopy(data_matrix)
97 | for feature_vector in data_matrix_copy:
98 | feature_vector.pop(self.USER_ID_FEATURE_KEY)
99 | feature_vector.pop(self.LATITUDE_FEATURE_KEY)
100 | feature_vector.pop(self.LONGITUDE_FEATURE_KEY)
101 | return data_matrix_copy
102 |
103 |
104 | @time_it
105 | def perform_clustering(self, features_to_include=None):
106 | """
107 |         This function performs k-means clustering with "no_of_clusters" clusters on the data present in the file at
108 |         "tweet_file_path".
109 |         It returns a list of feature vectors, where each feature vector contains only "features_to_include", or all features
110 |         if "features_to_include" is None.
111 | """
112 | clustering_data_matrix = self.__get_clustering_data_matrix(self.data_matrix)
113 | transformed_data_matrix = self.vectorizer.fit_transform(clustering_data_matrix)
114 |
115 | self.k_means_estimator.fit(transformed_data_matrix, y=None)
116 | return self.__get_predicted_labels(self.data_matrix, features_to_include)
117 |
118 | @time_it
119 | def __get_predicted_labels(self, data_matrix, features_to_include):
120 | """
121 |         Finds the nearest cluster for all data points and adds a new "label" feature to every feature vector of the data matrix. The
122 |         data matrix is modified in place.
123 |         It returns a new copy of data_matrix with the "features_to_include" features.
124 | """
125 | feature_names = self.vectorizer.get_feature_names()
126 | for feature_vector in data_matrix:
127 | row = [0] * len(feature_names)
128 | column = range(len(feature_names))
129 | data = map(lambda feature_name:feature_vector[feature_name] if feature_name in feature_vector else 0, feature_names)
130 | feature_csr_matrix = csr_matrix(coo_matrix((data, (row, column))))
131 | predicted_label = self.k_means_estimator.predict(feature_csr_matrix)
132 | feature_vector[self.LABEL_FEATURE_KEY] = predicted_label[0]
133 |
134 | expanded_data_matrix = self.__get_expanded_data_matrix(data_matrix)
135 | if features_to_include:
136 | return self.__get_filtered_data_matrix(expanded_data_matrix, features_to_include)
137 | else:
138 | return expanded_data_matrix
139 |
140 | @time_it
141 | def __get_filtered_data_matrix(self, data_matrix, features_to_include):
142 | """
143 | Removes all features except features_to_include
144 | """
145 | filtered_data_matrix = []
146 | for feature_vector in data_matrix:
147 | filtered_feature_vector = {}
148 | for feature_name in features_to_include:
149 | filtered_feature_vector[feature_name] = feature_vector[feature_name]
150 | filtered_data_matrix.append(filtered_feature_vector)
151 | return filtered_data_matrix
152 |
153 | @time_it
154 | def __get_expanded_data_matrix(self, data_matrix):
155 | """
156 | Adds new keys for missing features to all feature vectors of data_matrix. The data matrix is not modified, but a new
157 | modified copy is returned.
158 | """
159 | feature_names = self.vectorizer.get_feature_names()
160 | expanded_data_matrix = copy.deepcopy(data_matrix)
161 | for feature_vector in expanded_data_matrix:
162 | for feature_name in feature_names:
163 | if feature_name not in feature_vector:
164 | feature_vector[feature_name] = 0
165 | return expanded_data_matrix
166 |
167 | @time_it
168 | def predict_labels_for_data(self, file_path, features_to_include=None):
169 | """
170 |         This function reads the tweets of different users from the file at file_path and assigns the closest
171 |         cluster center to each user.
172 |         It returns a list of feature dictionaries, one per user, typically holding (user_id, predicted label, latitude, longitude).
173 | """
174 | data_matrix = self.__get_data_matrix_from_file(file_path)
175 | return self.__get_predicted_labels(data_matrix, features_to_include)
176 |
177 |
178 | def write_dict_list_to_csv(dict_list, file_name):
179 | """
180 |     Saves the list of dictionaries to the file at "file_name". Each dictionary should have the same set of keys.
181 | """
182 | file_writer = csv.DictWriter(open(file_name, "w"), dict_list[0].keys())
183 | file_writer.writeheader()
184 | file_writer.writerows(dict_list)
185 |
186 | if __name__ == "__main__":
187 | input_file = "../../TwitterData/survey_dump_with_geo_gt_8"
188 | output_file = "../../TwitterData/k_means_geo_gt_8_out"
189 | no_of_clusters = 10
190 |     clustered_data = KMeansEstimator(input_file, no_of_clusters).perform_clustering(KMeansEstimator.RELEVENT_FEATURE_LIST)
191 |     logging.info("Input file:%s, output file:%s, no of clusters:%d", input_file, output_file, no_of_clusters)
192 |     write_dict_list_to_csv(clustered_data, output_file)
193 |     logging.info("Written predicted labels for %d users in file:%s", len(clustered_data), output_file)
194 |
195 |
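
At its core the estimator above turns one feature dictionary per user into a sparse matrix with DictVectorizer and fits KMeans on it. A minimal sketch of that idea with invented feature dictionaries, using the public scikit-learn import paths rather than the private modules imported at the top of the file:

# Sketch of the DictVectorizer + KMeans pipeline used by KMeansEstimator (toy feature dicts).
from sklearn.feature_extraction import DictVectorizer
from sklearn.cluster import KMeans

users = [
    {'polarity': 0.6, 'subjectivity': 0.9, 'tweetCount': 50, 'happy': 1},
    {'polarity': -0.2, 'subjectivity': 0.4, 'tweetCount': 12, 'tired': 1},
    {'polarity': 0.5, 'subjectivity': 0.8, 'tweetCount': 47, 'happy': 1},
]

vectorizer = DictVectorizer(sparse=True)
X = vectorizer.fit_transform(users)          # adjectives missing from a user are implicitly zero
estimator = KMeans(init="random", n_clusters=2).fit(X)
print(estimator.predict(X))                  # one cluster label per user
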
--------------------------------------------------------------------------------
/Twitter User Personality Prediction/mmds/unsupervised/k_means_plot.py:
--------------------------------------------------------------------------------
1 | import csv
2 |
3 | from mmds.utils.plot_utils import GeoMap, COLORS
4 |
5 |
6 | if __name__ == "__main__":
7 | data_file = "../../TwitterData/k_means_geo_gt_8_out"
8 | file_reader = csv.reader(open(data_file, "r"))
9 | next(file_reader)
10 | GeoMap().plot_points(file_reader, lambda row: COLORS[int(row[3])], lambda row:[row[0], row[2]])
11 |
--------------------------------------------------------------------------------
/Twitter User Personality Prediction/mmds/utils/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vishalbhalla/Twitter-User-Personality-Prediction/4e0f8641aaea01b550151b150bf4f54437b72179/Twitter User Personality Prediction/mmds/utils/__init__.py
--------------------------------------------------------------------------------
/Twitter User Personality Prediction/mmds/utils/plot_utils.py:
--------------------------------------------------------------------------------
1 | from matplotlib import pyplot
2 | from mpl_toolkits.basemap import Basemap
3 |
4 |
5 | COLORS = ['green', 'red', 'blue', 'yellow', 'purple', 'olive', 'khaki', 'indigo', 'aquamarine', 'orange']
6 | class GeoMap:
7 |
8 | def plot_points(self, data_points, color_provider, coord_mapper):
9 | """
10 |         Plots the list of data points ("data_points") on a geo map.
11 |         "color_provider" is the mapper function that maps a data row to the corresponding color of the data point.
12 |         "coord_mapper" is the mapper function that maps a data row to the [latitude, longitude] of the data point.
13 | """
14 | base_map = Basemap(projection='robin', lat_0=0, lon_0=0, resolution='l', area_thresh=1000.0)
15 | base_map.drawcoastlines()
16 | base_map.drawcountries()
17 | base_map.fillcontinents()
18 | for row in data_points:
19 | latitude, longitude = coord_mapper(row)
20 | x, y = base_map(longitude, latitude)
21 | base_map.plot(x, y, marker='o', color=color_provider(row), markersize=4)
22 | pyplot.show()
23 |
24 |
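
A minimal usage sketch of GeoMap.plot_points, assuming the mmds package is importable; the rows, coordinates and cluster labels below are invented for illustration:

# Usage sketch of GeoMap.plot_points (invented rows: latitude, user, longitude, cluster label).
from mmds.utils.plot_utils import GeoMap, COLORS

rows = [
    (51.5, 'user_a', -0.13, 0),
    (40.7, 'user_b', -74.0, 1),
    (48.9, 'user_c', 2.35, 0),
]
GeoMap().plot_points(rows,
                     color_provider=lambda row: COLORS[int(row[3])],
                     coord_mapper=lambda row: [row[0], row[2]])
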
--------------------------------------------------------------------------------
/Twitter User Personality Prediction/mmds/utils/time_utils.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import time
3 |
4 | logging.basicConfig(filename="timing.log", level=logging.DEBUG, format="%(asctime)-15s %(threadName)s %(message)s")
5 |
6 | def time_it(func):
7 | """
8 | A decorator for timing the execution time of functions.
9 | """
10 | def decorator(*args, **kwargs):
11 | start_time = time.time()
12 | result = func(*args, **kwargs)
13 | end_time = time.time()
14 | logging.info("Execution time : {}() = {}sec".format(func.__name__, end_time - start_time))
15 | return result
16 | return decorator
17 |
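
A minimal sketch of how the decorator is meant to be used; the decorated function below is a made-up example:

# Usage sketch of @time_it (the decorated function is invented for illustration).
from mmds.utils.time_utils import time_it

@time_it
def count_unigrams(tweets):
    # Any function can be wrapped; its execution time is appended to timing.log.
    return sum(len(tweet.split()) for tweet in tweets)

print(count_unigrams(["hello world", "one two three"]))  # logs: Execution time : count_unigrams() = ...sec

Because the decorator does not apply functools.wraps, the wrapped function's __name__ and docstring are replaced by those of the inner decorator function; adding @functools.wraps(func) above "def decorator" would preserve them.
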
--------------------------------------------------------------------------------