├── README.md
└── Twitter User Personality Prediction
    ├── .idea
    │   ├── .name
    │   ├── Twitter User Personality Prediction.iml
    │   ├── encodings.xml
    │   ├── misc.xml
    │   ├── modules.xml
    │   └── workspace.xml
    ├── .ipynb_checkpoints
    │   └── Personality Predictor and Visualizer-checkpoint.ipynb
    ├── Personality Prediction from Twitter Data.pdf
    ├── Personality Predictor and Visualizer.ipynb
    ├── TwitterData
    │   ├── StopWords.txt
    │   ├── UserTweets.txt
    │   ├── k_means_geo_gt_8_out
    │   ├── labeledPersonalityTweets.csv
    │   ├── survey_dump.csv
    │   ├── survey_dump_geo_gt_8_1
    │   ├── survey_dump_with_geo
    │   ├── survey_dump_with_geo_gt_8
    │   └── survey_dump_with_tweet_count
    └── mmds
        ├── __init__.py
        ├── supervised
        │   ├── __init__.py
        │   ├── classification_algos.py
        │   ├── feature_engineering.py
        │   ├── filter_stop_words.py
        │   ├── personality_predictor_and_visualizer.py
        │   ├── preprocess_tweets.py
        │   └── tweet_analysis.py
        ├── unsupervised
        │   ├── __init__.py
        │   ├── k_means_estimator.py
        │   └── k_means_plot.py
        └── utils
            ├── __init__.py
            ├── plot_utils.py
            └── time_utils.py

/README.md:
--------------------------------------------------------------------------------
1 | # Twitter User Personality Prediction
2 | Predict a person's personality from their Twitter data, using sentiment analysis and unigram word features.
--------------------------------------------------------------------------------
/Twitter User Personality Prediction/.idea/.name:
--------------------------------------------------------------------------------
1 | Twitter User Personality Prediction
--------------------------------------------------------------------------------
/Twitter User Personality Prediction/.idea/Twitter User Personality Prediction.iml:
--------------------------------------------------------------------------------
[IntelliJ IDEA module file; no readable content survives in this dump.]
--------------------------------------------------------------------------------
/Twitter User Personality Prediction/.idea/encodings.xml:
--------------------------------------------------------------------------------
[Project encoding settings; no readable content survives in this dump.]
--------------------------------------------------------------------------------
/Twitter User Personality Prediction/.idea/misc.xml:
--------------------------------------------------------------------------------
[Project settings; no readable content survives in this dump.]
--------------------------------------------------------------------------------
/Twitter User Personality Prediction/.idea/modules.xml:
--------------------------------------------------------------------------------
[Module registry; no readable content survives in this dump.]
--------------------------------------------------------------------------------
/Twitter User Personality Prediction/.idea/workspace.xml:
--------------------------------------------------------------------------------
[IDE workspace state; only stray values survive in this dump, e.g. a breakpoint at file://$PROJECT_DIR$/FeatureEngineering.py line 54 and the timestamp 1449089592019.]
--------------------------------------------------------------------------------
/Twitter User Personality Prediction/Personality Prediction from Twitter Data.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vishalbhalla/Twitter-User-Personality-Prediction/4e0f8641aaea01b550151b150bf4f54437b72179/Twitter User Personality Prediction/Personality Prediction from Twitter Data.pdf
--------------------------------------------------------------------------------
/Twitter User Personality Prediction/Personality Predictor and Visualizer.ipynb:
--------------------------------------------------------------------------------
1 | {
2 |  "cells": [
3 |   {
4 |    "cell_type": "markdown",
5 |    "metadata": {},
6 |    "source": [
7 |     "## Twitter User Personality Prediction"
8 |    ]
9 |   },
10 |   {
11 |    "cell_type": "markdown",
12 |    "metadata": {},
13 |    "source": [
14 |     "### Read the input data files."
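The cell that follows counts the rows of `TwitterData/survey_dump_with_tweet_count` with Python 2 idioms (`open(..., 'rb')`, `print i`). A minimal Python 3 sketch of the same step is shown here; the helper name `read_survey_dump` is illustrative, and the column layout (tweets in column 1, latitude and longitude in columns 2 and 3, tweet count in column 4, personality label in column 5) is inferred from how later cells index each row.

```python
import csv

def read_survey_dump(path):
    """Illustrative helper: load the survey dump as a list of rows,
    skipping the header row (later cells call inpTweets.next() for the same reason)."""
    with open(path, newline='') as f:      # Python 3: text mode instead of 'rb'
        reader = csv.reader(f, delimiter=',')
        next(reader)                       # skip the header row
        return list(reader)

# Usage sketch:
# rows = read_survey_dump('TwitterData/survey_dump_with_tweet_count')
# print(len(rows))  # 79 data rows; the raw count of 80 below includes the header
```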
15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 14, 20 | "metadata": { 21 | "collapsed": false 22 | }, 23 | "outputs": [], 24 | "source": [ 25 | "import numpy as np\n", 26 | "import os\n", 27 | "# XTrain = np.loadtxt('training_data.csv', skiprows=1, usecols=(0,1), delimiter=',', dtype='str')\n", 28 | "# print XTrain[0]\n", 29 | "# # XTest = np.loadtxt('test_data_public_new.csv', skiprows=1, usecols=(1,), delimiter=',',dtype='str')\n", 30 | "# # print XTest[0]\n", 31 | "# YTrain = XTrain[:,-1]\n", 32 | "# XTrain = XTrain[:,0]\n", 33 | "# print XTrain.shape\n", 34 | "# YTrain = YTrain.astype(np.int)\n", 35 | "# # YTrain = YTrain.reshape(-1,1)\n", 36 | "# print type(YTrain[0]),YTrain[0],YTrain.shape\n", 37 | "# # print XTest.shape" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": 15, 43 | "metadata": { 44 | "collapsed": false 45 | }, 46 | "outputs": [ 47 | { 48 | "name": "stdout", 49 | "output_type": "stream", 50 | "text": [ 51 | "80\n" 52 | ] 53 | } 54 | ], 55 | "source": [ 56 | "#Read the tweets one by one and process it\n", 57 | "import csv\n", 58 | "# inpTweets = csv.reader(open('TwitterData/survey_dump.csv', 'rb'), delimiter=',')\n", 59 | "inpTweets = csv.reader(open('TwitterData/survey_dump_with_tweet_count', 'rb'), delimiter=',')\n", 60 | "i = 0\n", 61 | "for row in inpTweets:\n", 62 | " i+=1;\n", 63 | "\n", 64 | "print i\n" 65 | ] 66 | }, 67 | { 68 | "cell_type": "markdown", 69 | "metadata": {}, 70 | "source": [ 71 | "### Pre-process Tweets" 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": 16, 77 | "metadata": { 78 | "collapsed": true 79 | }, 80 | "outputs": [], 81 | "source": [ 82 | "import re\n", 83 | "\n", 84 | "class PreprocessTweets:\n", 85 | "\n", 86 | " def __init__(self):\n", 87 | " self.name = 'PreprocessTweets'\n", 88 | "\n", 89 | " #start process_tweet\n", 90 | " def processTweet(self, tweet):\n", 91 | " \n", 92 | " #Convert to lower case\n", 93 | " tweet = tweet.lower()\n", 94 | " #Convert www.* or https?://* to URL\n", 95 | " tweet = re.sub('((www\\.[^\\s]+)|(https?://[^\\s]+))','URL',tweet)\n", 96 | " #Convert @username to AT_USER\n", 97 | " tweet = re.sub('@[^\\s]+','AT_USER',tweet)\n", 98 | " #Remove additional white spaces\n", 99 | " tweet = re.sub('[\\s]+', ' ', tweet)\n", 100 | " #Remove special characters\n", 101 | " #tweet = re.sub('*\\[\\]%\\(\\)', '', tweet)\n", 102 | " #Replace #word with word\n", 103 | " tweet = re.sub(r'#([^\\s]+)', r'\\1', tweet)\n", 104 | " #trim\n", 105 | " tweet = tweet.strip('\\'\"')\n", 106 | "\n", 107 | " # Remove all Non-ASCII characters\n", 108 | " tweet = re.sub(r'[^\\x00-\\x7F]+',' ', tweet)\n", 109 | "\n", 110 | " return tweet\n", 111 | "\n" 112 | ] 113 | }, 114 | { 115 | "cell_type": "code", 116 | "execution_count": 17, 117 | "metadata": { 118 | "collapsed": false 119 | }, 120 | "outputs": [], 121 | "source": [ 122 | "\n", 123 | "# import PreprocessTweets\n", 124 | "\n", 125 | "class FilterStopWords:\n", 126 | "\n", 127 | " # stopWords = []\n", 128 | " def __init__(self):\n", 129 | " self.name = 'FilterStopWords'\n", 130 | " #initialize stopWords\n", 131 | " self.stopWords = []\n", 132 | "\n", 133 | " #start replaceTwoOrMore\n", 134 | " # def replaceTwoOrMore(s):\n", 135 | " # #look for 2 or more repetitions of character and replace with the character itself\n", 136 | " # pattern = re.compile(r\"(.)\\1{1,}\", re.DOTALL)\n", 137 | " # return pattern.sub(r\"\\1\\1\", s)\n", 138 | " #end\n", 139 | "\n", 140 | " def getStopWordList(self, stopWordListFileName):\n", 
141 | " #read the stopwords file and build a list\n", 142 | " stopWords = []\n", 143 | " stopWords.append('AT_USER')\n", 144 | " stopWords.append('URL')\n", 145 | " stopWords.append('[')\n", 146 | " stopWords.append('[')\n", 147 | "\n", 148 | " fp = open(stopWordListFileName, 'r')\n", 149 | " line = fp.readline()\n", 150 | " while line:\n", 151 | " word = line.strip()\n", 152 | " stopWords.append(word)\n", 153 | " line = fp.readline()\n", 154 | " fp.close()\n", 155 | " return stopWords\n", 156 | " \n", 157 | " def getFeatureVector(self, tweet, stopWords):\n", 158 | " featureVector = []\n", 159 | " #split tweet into words\n", 160 | " words = tweet.split()\n", 161 | " for w in words:\n", 162 | " #replace two or more with two occurrences\n", 163 | " #w = replaceTwoOrMore(w)\n", 164 | " #strip punctuation\n", 165 | " w = w.strip('\\'\"?,.')\n", 166 | " #check if the word stats with an alphabet\n", 167 | " val = re.search(r\"^[a-zA-Z][a-zA-Z0-9]*$\", w)\n", 168 | " #ignore if it is a stop word\n", 169 | " if(w in self.stopWords or val is None):\n", 170 | " continue\n", 171 | " else:\n", 172 | " featureVector.append(w.lower())\n", 173 | " return featureVector\n" 174 | ] 175 | }, 176 | { 177 | "cell_type": "markdown", 178 | "metadata": {}, 179 | "source": [ 180 | "### Feature Engineering" 181 | ] 182 | }, 183 | { 184 | "cell_type": "code", 185 | "execution_count": 18, 186 | "metadata": { 187 | "collapsed": false 188 | }, 189 | "outputs": [], 190 | "source": [ 191 | "# XTrain = []\n", 192 | "# YTrain = []\n", 193 | "# XTrainFeatures = []\n", 194 | "# XTrainSentiment = []\n", 195 | "# XTrainFreqTweets = []\n", 196 | "# geo_latitude = []\n", 197 | "# geo_longitude = []\n", 198 | "\n", 199 | "# # sample = \"{\"\"“@AMBITIOUS_SLIM: Fresh could have deleted my one of my double haves” huh\"\",\"\"@DJFreshery cause I can see u snappin off but doing it lowkey\"\",\"\"“@RAWmartini: The #WellCumThruThenMovement” best movement ever\"\",\"\"@HennessyBronze I do what I want tho\"\"}\"\n", 200 | "# # sample = sample.replace('\"\",\"\"',\" \")\n", 201 | "# # sample = sample.replace('\"\"',\" \")\n", 202 | "# # print sample\n", 203 | "# # wordsList = sample.split()\n", 204 | "\n", 205 | "\n", 206 | "# # newwordsList = [word.split() for word in wordsList]\n", 207 | "# # print newwordsList\n", 208 | "# # filtered_words = [word for word in newwordsList if word not in stopwords.words('english')]\n", 209 | "# # print filtered_words[0]\n", 210 | "# # filteredTweets = ' '.join(filtered_words)\n", 211 | "# # print filteredTweets\n", 212 | "\n", 213 | "\n", 214 | "# # from PreprocessTweets import PreprocessTweets\n", 215 | "# # from FilterStopWords import FilterStopWords\n", 216 | "# import nltk\n", 217 | "# # from nltk.sentiment.vader import SentimentIntensityAnalyzer\n", 218 | "# from textblob import TextBlob\n", 219 | "\n", 220 | "# # nltk.download()\n", 221 | "# from nltk.corpus import stopwords\n", 222 | "\n", 223 | "# class FeatureEngineering:\n", 224 | "\n", 225 | "# def __init__(self):\n", 226 | "# self.name = 'FeatureEngineering'\n", 227 | "# self.featureList = []\n", 228 | "# # self.sid = SentimentIntensityAnalyzer()\n", 229 | "\n", 230 | "\n", 231 | "# #start extract_features\n", 232 | "# def extract_features(self,tweet):\n", 233 | "# tweet_words = set(tweet)\n", 234 | "# features = {}\n", 235 | "# for word in self.featureList:\n", 236 | "# features['contains(%s)' % word] = (word in tweet_words)\n", 237 | "# return features\n", 238 | "\n", 239 | "# ## Create New Training set based on personality labels predicted 
from Survey results\n", 240 | "\n", 241 | "# def createNewTrainingSet(self):\n", 242 | "\n", 243 | "# objFilterStopWords = FilterStopWords()\n", 244 | "# objPreprocessTweets = PreprocessTweets()\n", 245 | "\n", 246 | "# stopWords = objFilterStopWords.getStopWordList('TwitterData/StopWords.txt')\n", 247 | " \n", 248 | " \n", 249 | "# #Read the tweets one by one and process it\n", 250 | "# # inpTweets = csv.reader(open('TwitterData/survey_dump.csv', 'rb'), delimiter=',') #, quotechar='|')\n", 251 | "# inpTweets = csv.reader(open('TwitterData/survey_dump_with_tweet_count', 'rb'), delimiter=',')\n", 252 | "# inpTweets.next()\n", 253 | "# tweets = []\n", 254 | "# i = 0\n", 255 | "# for row in inpTweets:\n", 256 | "# # print row\n", 257 | "# personality = row[5]\n", 258 | "# # print personality\n", 259 | "# tweet = row[1]\n", 260 | "# cleanTweet = tweet.replace('\"\",\"\"',\" \")\n", 261 | "# cleanTweet = cleanTweet.replace('\"\"',\" \")\n", 262 | "# # print tweet\n", 263 | "# processedTweet = objPreprocessTweets.processTweet(cleanTweet)\n", 264 | "# # print processedTweet\n", 265 | "\n", 266 | "# XTrainFreqTweets.append(int(row[4]))\n", 267 | "# wordsList = processedTweet.split()\n", 268 | "# # print wordsList\n", 269 | " \n", 270 | "# # Remove stop words\n", 271 | "# # filtered_words = [word for word in processedTweet if word not in stopwords.words('english')]\n", 272 | "# filtered_words = [word for word in wordsList if word not in stopwords.words('english')]\n", 273 | "# # print filtered_words\n", 274 | "# filteredTweets = ' '.join(filtered_words)\n", 275 | " \n", 276 | "# featureVector = objFilterStopWords.getFeatureVector(processedTweet, stopWords)\n", 277 | " \n", 278 | "# geo_latitude.append(float(row[2]))\n", 279 | "# geo_longitude.append(float(row[3]))\n", 280 | " \n", 281 | "# # Append to feature list to collect total words\n", 282 | "# # for word in featureVector:\n", 283 | "# # self.featureList.append(word)\n", 284 | "# # featureList.append([featureVector[i] for i in xrange(len(featureVector))])\n", 285 | "\n", 286 | "# # Use NLTK Vader for Sentiment Analysis\n", 287 | "\n", 288 | "# # Citation: Hutto, C.J. & Gilbert, E.E. (2014). VADER: A Parsimonious Rule-based Model for Sentiment Analysis of Social Media Text.\n", 289 | "# # Eighth International Conference on Weblogs and Social Media (ICWSM-14). 
Ann Arbor, MI, June 2014.\n", 290 | "# # Extract sentiment based on the tweet.\n", 291 | "# # ss = self.sid.polarity_scores(row)\n", 292 | "# # for k in sorted(ss):\n", 293 | "# # print('{0}: {1}, '.format(k, ss[k]))\n", 294 | "# #\n", 295 | "# # totSentiment = sorted(ss)[0]\n", 296 | "\n", 297 | "# # Use TextBlog for Sentiment Analysis\n", 298 | "# # print tweet\n", 299 | "# # blob = TextBlob(tweet)\n", 300 | " \n", 301 | "# blob = TextBlob(processedTweet)\n", 302 | "# # print blob\n", 303 | "# sentiment = 0\n", 304 | "# for sentence in blob.sentences:\n", 305 | "# # print sentence\n", 306 | "# sentiment += sentence.sentiment.polarity\n", 307 | "# # print sentiment\n", 308 | "\n", 309 | "# totSentiment = sentiment/ len(blob.sentences)\n", 310 | "# # featureVector.append(totSentiment)\n", 311 | "\n", 312 | "# XTrainSentiment.append(totSentiment)\n", 313 | " \n", 314 | "# # strFeatures = [item.lower() for item in featureVector]\n", 315 | " \n", 316 | "# # XTrainFeatures.append(processedTweet)\n", 317 | "# XTrainFeatures.append(filteredTweets)\n", 318 | " \n", 319 | "# YTrain.append(personality)\n", 320 | " \n", 321 | "# tweets.append((featureVector, personality))\n", 322 | " \n", 323 | "# # i+=1\n", 324 | "# # if i==3:\n", 325 | "# # break\n", 326 | " \n", 327 | "# #end loop\n", 328 | "# # print tweets\n", 329 | "# # print self.featureList\n", 330 | "# # Remove featureList duplicates\n", 331 | "# # featureList = list(set(self.featureList))\n", 332 | "\n", 333 | "# # Extract feature vector for all tweets in one shote\n", 334 | "# training_set = nltk.classify.util.apply_features(self.extract_features, tweets)\n", 335 | "\n", 336 | "# # print self.featureList\n", 337 | "# # print training_set\n", 338 | "\n", 339 | "\n", 340 | " \n", 341 | "# return training_set\n" 342 | ] 343 | }, 344 | { 345 | "cell_type": "code", 346 | "execution_count": 19, 347 | "metadata": { 348 | "collapsed": true 349 | }, 350 | "outputs": [], 351 | "source": [ 352 | "\n", 353 | "import nltk\n", 354 | "from textblob import TextBlob\n", 355 | "# nltk.download()\n", 356 | "from nltk.corpus import stopwords\n", 357 | "\n", 358 | "class FeatureEngineering:\n", 359 | "\n", 360 | " def __init__(self):\n", 361 | " self.name = 'FeatureEngineering'\n", 362 | " self.featureList = []\n", 363 | " # self.sid = SentimentIntensityAnalyzer()\n", 364 | "\n", 365 | "\n", 366 | " #start extract_features\n", 367 | " def extract_features(self,tweet):\n", 368 | " tweet_words = set(tweet)\n", 369 | " features = {}\n", 370 | " for word in self.featureList:\n", 371 | " features['contains(%s)' % word] = (word in tweet_words)\n", 372 | " return features\n", 373 | "\n", 374 | "## Create New Training set based on personality labels predicted from Survey results\n", 375 | "\n", 376 | " def createNewTrainingSet(self, fileName):\n", 377 | " XTrain = []\n", 378 | " YTrain = []\n", 379 | " XTrainFeatures = []\n", 380 | " XTrainSentiment = []\n", 381 | " XTrainFreqTweets = []\n", 382 | " geo_latitude = []\n", 383 | " geo_longitude = []\n", 384 | " \n", 385 | " objFilterStopWords = FilterStopWords()\n", 386 | " objPreprocessTweets = PreprocessTweets()\n", 387 | "\n", 388 | " stopWords = objFilterStopWords.getStopWordList('TwitterData/StopWords.txt')\n", 389 | " \n", 390 | " #Read the tweets one by one and process it\n", 391 | " inpTweets = csv.reader(open(fileName, 'rb'), delimiter=',')\n", 392 | " inpTweets.next()\n", 393 | " tweets = []\n", 394 | " i = 0\n", 395 | " for row in inpTweets:\n", 396 | "# print row\n", 397 | " personality = row[5]\n", 398 | " 
tweet = row[1]\n", 399 | " cleanTweet = tweet.replace('\"\",\"\"',\" \")\n", 400 | " cleanTweet = cleanTweet.replace('\"\"',\" \")\n", 401 | " processedTweet = objPreprocessTweets.processTweet(cleanTweet)\n", 402 | "\n", 403 | " XTrainFreqTweets.append(int(row[4]))\n", 404 | " wordsList = processedTweet.split()\n", 405 | " \n", 406 | " # Remove stop words\n", 407 | " filtered_words = [word for word in wordsList if word not in stopwords.words('english')]\n", 408 | " filteredTweets = ' '.join(filtered_words)\n", 409 | " \n", 410 | " featureVector = objFilterStopWords.getFeatureVector(processedTweet, stopWords)\n", 411 | " \n", 412 | " geo_latitude.append(float(row[2]))\n", 413 | " geo_longitude.append(float(row[3]))\n", 414 | " \n", 415 | " blob = TextBlob(processedTweet)\n", 416 | " sentiment = 0\n", 417 | " for sentence in blob.sentences:\n", 418 | " sentiment += sentence.sentiment.polarity\n", 419 | "\n", 420 | " totSentiment = sentiment/ len(blob.sentences)\n", 421 | "\n", 422 | " XTrainSentiment.append(totSentiment)\n", 423 | "\n", 424 | " XTrainFeatures.append(filteredTweets)\n", 425 | " \n", 426 | " YTrain.append(personality)\n", 427 | " \n", 428 | "# i+=1\n", 429 | "# if i==3:\n", 430 | "# break\n", 431 | " \n", 432 | "\n", 433 | " return XTrain, YTrain, XTrainFeatures, XTrainSentiment, XTrainFreqTweets, geo_latitude, geo_longitude\n" 434 | ] 435 | }, 436 | { 437 | "cell_type": "code", 438 | "execution_count": 20, 439 | "metadata": { 440 | "collapsed": false 441 | }, 442 | "outputs": [], 443 | "source": [ 444 | "objFeatureEngineering = FeatureEngineering()\n", 445 | "fileName = 'TwitterData/survey_dump_with_tweet_count'\n", 446 | "XTrain, YTrain, XTrainFeatures, XTrainSentiment, XTrainFreqTweets, geo_latitude, geo_longitude = objFeatureEngineering.createNewTrainingSet(fileName)" 447 | ] 448 | }, 449 | { 450 | "cell_type": "code", 451 | "execution_count": 21, 452 | "metadata": { 453 | "collapsed": false 454 | }, 455 | "outputs": [], 456 | "source": [ 457 | "fileName = 'TwitterData/survey_dump_geo_gt_8_1'\n", 458 | "XEval, YEval, XEvalFeatures, XEvalSentiment, XEvalFreqTweets, eval_geo_latitude, eval_geo_longitude = objFeatureEngineering.createNewTrainingSet(fileName)" 459 | ] 460 | }, 461 | { 462 | "cell_type": "markdown", 463 | "metadata": { 464 | "collapsed": true 465 | }, 466 | "source": [ 467 | "### Get Feature vector" 468 | ] 469 | }, 470 | { 471 | "cell_type": "code", 472 | "execution_count": 22, 473 | "metadata": { 474 | "collapsed": false 475 | }, 476 | "outputs": [], 477 | "source": [ 478 | "\n", 479 | "# # from PreprocessTweets import PreprocessTweets\n", 480 | "# # from FilterStopWords import FilterStopWords\n", 481 | "# # from FeatureEngineering import FeatureEngineering\n", 482 | "# import nltk\n", 483 | "\n", 484 | "\n", 485 | "# objFilterStopWords = FilterStopWords()\n", 486 | "# objPreprocessTweets = PreprocessTweets()\n", 487 | "# objFeatureEngineering = FeatureEngineering()\n", 488 | "\n", 489 | "# #trainingSet = objFeatureEngineering.createTrainingSet()\n", 490 | "# trainingSet = objFeatureEngineering.createNewTrainingSet()\n", 491 | "\n", 492 | "# stopWordListFileName = 'TwitterData/StopWords.txt'\n", 493 | "# stopWords = objFilterStopWords.getStopWordList(stopWordListFileName)\n", 494 | "\n", 495 | "# # Train the classifier\n", 496 | "# NBClassifier = nltk.NaiveBayesClassifier.train(trainingSet)\n", 497 | "\n", 498 | "# # Test the classifier\n", 499 | "# testTweet = 'Hurray, I am working on a project on personality prediction on twitter data using sentiment 
analysis!'\n", 500 | "# processedTestTweet = objPreprocessTweets.processTweet(testTweet)\n", 501 | "# featureVector = objFilterStopWords.getFeatureVector(processedTestTweet, stopWords)\n", 502 | "# # print NBClassifier.classify(objFeatureEngineering.extract_features(featureVector))\n", 503 | "\n", 504 | "\n", 505 | "# # # print informative features about the classifier\n", 506 | "# # print NBClassifier.show_most_informative_features(10)\n", 507 | "\n", 508 | "\n", 509 | "# # testTweet = 'I have successfully completed this project.'\n", 510 | "# # processedTestTweet = objPreprocessTweets.processTweet(testTweet)\n", 511 | "# # featureVector = objFilterStopWords.getFeatureVector(processedTestTweet, stopWords)\n", 512 | "# # print NBClassifier.classify(objFeatureEngineering.extract_features(featureVector))\n" 513 | ] 514 | }, 515 | { 516 | "cell_type": "code", 517 | "execution_count": 23, 518 | "metadata": { 519 | "collapsed": false 520 | }, 521 | "outputs": [], 522 | "source": [ 523 | "newYTrain = []\n", 524 | "# print YTrain\n", 525 | "for item in YTrain:\n", 526 | " temp = item.replace('[', '')\n", 527 | " temp = temp.replace('\\\"', '')\n", 528 | " newItem = temp.replace(']', '')\n", 529 | " newYTrain.append(newItem)\n", 530 | " \n", 531 | "YTrain = newYTrain\n", 532 | "# print YTrain\n", 533 | "# print XTrainFeatures[0]" 534 | ] 535 | }, 536 | { 537 | "cell_type": "markdown", 538 | "metadata": {}, 539 | "source": [ 540 | "### Map the class labels to numbers" 541 | ] 542 | }, 543 | { 544 | "cell_type": "code", 545 | "execution_count": 24, 546 | "metadata": { 547 | "collapsed": false 548 | }, 549 | "outputs": [], 550 | "source": [ 551 | "\n", 552 | "def mapLabels(className):\n", 553 | " if className == 'Conscientiousness':\n", 554 | " return 0\n", 555 | " elif className == 'Extrovert':\n", 556 | " return 1\n", 557 | " elif className == 'Agreeable':\n", 558 | " return 2\n", 559 | " elif className == 'Empathetic':\n", 560 | " return 3\n", 561 | " elif className == 'Novelty Seeking':\n", 562 | " return 4\n", 563 | " elif className == 'Perfectionist':\n", 564 | " return 5\n", 565 | " elif className == 'Rigid':\n", 566 | " return 6\n", 567 | " elif className == 'Impulsive':\n", 568 | " return 7\n", 569 | " elif className == 'Psychopath':\n", 570 | " return 8\n", 571 | " elif className == 'Obsessive':\n", 572 | " return 9\n", 573 | "# elif className == None:\n", 574 | "# return 10\n", 575 | " else:\n", 576 | " pass\n", 577 | "\n", 578 | "YTrain = [mapLabels(x) for x in YTrain]\n", 579 | "YEval = [mapLabels(x) for x in YEval]" 580 | ] 581 | }, 582 | { 583 | "cell_type": "code", 584 | "execution_count": 25, 585 | "metadata": { 586 | "collapsed": false 587 | }, 588 | "outputs": [ 589 | { 590 | "name": "stdout", 591 | "output_type": "stream", 592 | "text": [ 593 | "[None, None, None, None]\n" 594 | ] 595 | } 596 | ], 597 | "source": [ 598 | "print YEval[1:5]" 599 | ] 600 | }, 601 | { 602 | "cell_type": "code", 603 | "execution_count": 26, 604 | "metadata": { 605 | "collapsed": false 606 | }, 607 | "outputs": [ 608 | { 609 | "name": "stdout", 610 | "output_type": "stream", 611 | "text": [ 612 | "79\n", 613 | "79\n", 614 | "{\"2 please\",\" AT_USER 1dmoviepremiere today! 
:d :d\",\"AT_USER getting follow would blessing :) love moon back <3 j\",\"AT_USER getting follow would blessing :) love moon back <3 k\",\"AT_USER thank bby :))\",\"AT_USER getting follow would blessing :) love moon back <3 37\",\"AT_USER getting follow would blessing :) love moon back <3 40\",\"AT_USER getting follow would blessing :) love moon back <3 30\",\"AT_USER idk it's little awkward tbh\",\"AT_USER *google images* big booty hoes\",\"AT_USER afternoon good sir\",\"AT_USER think ketchup enjoy it\",\"AT_USER heart aw aw aw\",\"AT_USER ugh haven't heard yet :(\",\"AT_USER getting follow would blessing :) love moon back <3 69\",\"AT_USER can't wait till come back tour us :))\",\" please follow fuckers love much AT_USER AT_USER AT_USER AT_USER 35\",\"calum hood stop dick follow please\",\"uk x factor different compared us version like it's much better lol\",\"AT_USER he's jealous\",\"AT_USER cause anxiety\",\"AT_USER you're angel sahar lol ily <3 liking new @ btw\",\"AT_USER hey calum see follow dont ignore please ily 127\",\"AT_USER dogs cats?\",\"AT_USER youtuber....\",\"AT_USER you're one us\",\"lol icon me\",\"AT_USER hi how's recording?\",\"AT_USER hi calum! hope wonderful day you're staying healthy :) please follow me, love know 8\",\"idk still \\\"what's goodie\\ ,\"AT_USER hey fuck you\",\" AT_USER please follow me! love much <33 t\",\"AT_USER lol caused enough drama flirting shit us\",\"AT_USER actually think bendy before?\",\" AT_USER we're late guys lol suck hold would've though\",\"this follow party getting hand\",\"AT_USER it's late try get follows, wait till tomorrow afternoon man\",\"AT_USER holy crap, you're gonna peel much\",\"AT_USER join marijuana movement, it's joint effort ha that's funny follow pls ily \",\"AT_USER bet could totally dude!! give twitter name stuff hopefully they'll follow you!\",\"he's holding balls\",\"AT_USER skype part hahaha i'm crying\",\" AT_USER AT_USER AT_USER AT_USER follow motherfuckers ily i'm 8\",\"ill keep eyes wide opennn\",\" AT_USER pineapple cool refreshing drink inside. turn up\",\"never forget calum's lip piercing\",\"AT_USER hey calum see follow dont dick please ily 307\",\"being added list one best feelings ( ) ily\",\"AT_USER mean better first one claps \",\"AT_USER meow bitch\",\"AT_USER hey calum see follow dont dick 218\",\"AT_USER hey calum see follow dont dick please 379\",\"i can't even watch damn hug without wanting stab throat\",\"AT_USER what's favorite thing bus?\",\"AT_USER god bless america\",\"AT_USER man things\",\"the live tweets boys friends best ones\",\"AT_USER can't hahaa\",\"AT_USER love much don't even know. thank much everything <3 goodnight boo\",\"i wonder shirt ashton ripped...\",\"AT_USER ...they already did....\",\"i'm uncomfortable luke creeps see holla me? maybe follow me? idk choice\",\"AT_USER wow you're gonna confused lol\",\"AT_USER AT_USER gonna buy strippers?\",\" AT_USER hi bby! please follow me! love much <33 138\",\" AT_USER hi bby! please follow me! love much <33 106\"}\n", 615 | "8\n", 616 | "{\"i wanna see cousin today dont feel like going mondawmin man \",\" AT_USER yo wanted smack shit freshman today yo. !!!!!!\",\" AT_USER vma's sunday!!! \",\"ya bitches asking asking nigga do\",\" AT_USER ain't get locked summer im proud \",\"AT_USER lol could've said thanks & tell ya name then.\",\" AT_USER joy luck club boringgggg! 
!!!\",\"AT_USER omg, you're going too?!\",\"ima things right time ya, born ride \",\"AT_USER AT_USER lmfao go 'head & get fcked shit.\",\"i wanna go kona grill.\",\"AT_USER sorry ya lost, keep ya head up.\",\"i cant stop smiling \",\"yo really know every vine \",\"pussy, money, weed got lat...steal ya girl call hijack\",\" AT_USER don't know wear tomorrow . goin? lol\",\"damn. ima bitch \",\"AT_USER get hair done got that\",\" AT_USER smiley fucked fucked way \",\"single & nobody attention.\",\"morning store walk..lol\",\" AT_USER could really go pizza & wings right now! !!!!\",\" AT_USER AT_USER AT_USER got lied shit crazy \",\"bad azz nigga. coming home soon nigga.\",\"yall using videos sluts.\",\"i videos doe \",\"i guess means nie go sleep lol\",\"man, miss dad everyday.\",\"worldwide coast coast love get.\",\" AT_USER AT_USER lol fine fine. lol\",\"AT_USER need ya help \",\"i really looking forward going mall\",\"AT_USER dont that? lol\",\"the closer get \",\"lol done got dropped steps,tvs fell head everything else thats act act.\",\"im like meek never sleep\",\"i feel terrible.\",\" AT_USER dfl nie flashy avi lmao already man.\",\"that's lor corey running niggas try fight \",\"va life gonna good \",\" AT_USER can't wait go home \",\"AT_USER lee coming tonight lol\",\" AT_USER AT_USER lol rude eat 20 piece mcdouble hungry ass lmao thats somebody hungry\",\"AT_USER really dont lol right, whatever \",\"AT_USER what?\",\"so ajayasia like nah? lol\",\"AT_USER picture lmao\",\"AT_USER lol smart?\",\" AT_USER bitchesssssssssss get rings next school year yaaasss lmfao remember said that?!\",\"when finally find somebody >\",\" AT_USER yup white-tee \",\"AT_USER text eddie \",\"my phone dry \",\"AT_USER goodmorning \",\" AT_USER AT_USER nah hate guess thats cool \",\"i bust nut im back thinking money\",\"he really said girls trynna freak says, \\\"wanna ride face use eats handle bars?\\\" like ?!\",\" AT_USER never apologize feel. that's like saying sorry real \",\"i wasn't even sleep long felt like forever \",\"i wanna love .. \",\" AT_USER really hate people . 
\"}\n", 617 | "7\n" 618 | ] 619 | } 620 | ], 621 | "source": [ 622 | "XTrain = np.array(XTrainFeatures)\n", 623 | "YTrain = np.array(YTrain)\n", 624 | "\n", 625 | "print len(XTrain)\n", 626 | "print len(YTrain)\n", 627 | "\n", 628 | "print XTrain[1]\n", 629 | "print YTrain[1]\n", 630 | "\n", 631 | "print XTrain[15]\n", 632 | "print YTrain[15]\n", 633 | "\n", 634 | "XEval = np.array(XEvalFeatures)\n", 635 | "YEval = np.array(YEval)" 636 | ] 637 | }, 638 | { 639 | "cell_type": "markdown", 640 | "metadata": { 641 | "collapsed": true 642 | }, 643 | "source": [ 644 | "### Split Train and Test data" 645 | ] 646 | }, 647 | { 648 | "cell_type": "code", 649 | "execution_count": 27, 650 | "metadata": { 651 | "collapsed": false 652 | }, 653 | "outputs": [ 654 | { 655 | "name": "stdout", 656 | "output_type": "stream", 657 | "text": [ 658 | "60\n", 659 | "19\n", 660 | "79\n" 661 | ] 662 | } 663 | ], 664 | "source": [ 665 | "trainSamples = XTrain[0:60]\n", 666 | "YtrainSamples = YTrain[0:60]\n", 667 | "\n", 668 | "testSamples = XTrain[60:]\n", 669 | "YtestSamples = YTrain[60:]\n", 670 | "\n", 671 | "print len(trainSamples)\n", 672 | "print len(testSamples)\n", 673 | "\n", 674 | "# print XTrain[60:63]\n", 675 | "print len(XTrain)\n", 676 | "\n", 677 | "\n", 678 | "trainSentimentSamples = np.array(XTrainSentiment[0:60])\n", 679 | "testSentimentSamples = np.array(XTrainSentiment[60:])\n", 680 | "trainFreqTweetSamples = np.array(XTrainFreqTweets[0:60])\n", 681 | "testFreqTweetSamples = np.array(XTrainFreqTweets[60:])" 682 | ] 683 | }, 684 | { 685 | "cell_type": "markdown", 686 | "metadata": {}, 687 | "source": [ 688 | "### Bag of Words as Features" 689 | ] 690 | }, 691 | { 692 | "cell_type": "code", 693 | "execution_count": 28, 694 | "metadata": { 695 | "collapsed": false 696 | }, 697 | "outputs": [ 698 | { 699 | "name": "stdout", 700 | "output_type": "stream", 701 | "text": [ 702 | "4914\n", 703 | "(60, 4914)\n", 704 | "(19, 4914)\n" 705 | ] 706 | } 707 | ], 708 | "source": [ 709 | "from sklearn.feature_extraction.text import CountVectorizer\n", 710 | "vectorizer = CountVectorizer()\n", 711 | "XTr = vectorizer.fit_transform(trainSamples)\n", 712 | "print len(vectorizer.get_feature_names())\n", 713 | "trainBagVector = XTr.toarray()\n", 714 | "print trainBagVector.shape\n", 715 | "XTe = vectorizer.transform(testSamples)\n", 716 | "testBagVector = XTe.toarray()\n", 717 | "print testBagVector.shape" 718 | ] 719 | }, 720 | { 721 | "cell_type": "code", 722 | "execution_count": 29, 723 | "metadata": { 724 | "collapsed": false 725 | }, 726 | "outputs": [ 727 | { 728 | "name": "stdout", 729 | "output_type": "stream", 730 | "text": [ 731 | "37012\n", 732 | "(3995, 37012)\n" 733 | ] 734 | } 735 | ], 736 | "source": [ 737 | "XEv = vectorizer.fit_transform(XEval)\n", 738 | "print len(vectorizer.get_feature_names())\n", 739 | "evalBagVector = XEv.toarray()\n", 740 | "print evalBagVector.shape" 741 | ] 742 | }, 743 | { 744 | "cell_type": "code", 745 | "execution_count": 30, 746 | "metadata": { 747 | "collapsed": false 748 | }, 749 | "outputs": [ 750 | { 751 | "name": "stdout", 752 | "output_type": "stream", 753 | "text": [ 754 | "(3995, 4914)\n" 755 | ] 756 | } 757 | ], 758 | "source": [ 759 | "evalBagVector = evalBagVector[:,0:4914]\n", 760 | "print evalBagVector.shape" 761 | ] 762 | }, 763 | { 764 | "cell_type": "code", 765 | "execution_count": 31, 766 | "metadata": { 767 | "collapsed": false 768 | }, 769 | "outputs": [], 770 | "source": [ 771 | "# from sklearn.decomposition import PCA as sklearnPCA\n", 772 | "# 
sklearn_pca = sklearnPCA(n_components=4914)\n", 773 | "# evalBagVectorPCA = sklearn_pca.fit_transform(evalBagVector.T)\n", 774 | "# print evalBagVectorPCA.shape" 775 | ] 776 | }, 777 | { 778 | "cell_type": "markdown", 779 | "metadata": {}, 780 | "source": [ 781 | "### TF-IDF" 782 | ] 783 | }, 784 | { 785 | "cell_type": "code", 786 | "execution_count": 32, 787 | "metadata": { 788 | "collapsed": false 789 | }, 790 | "outputs": [], 791 | "source": [ 792 | "# trainBagVector = trainSamples\n", 793 | "# testBagVector = testSamples\n", 794 | "\n", 795 | "# from sklearn.feature_extraction.text import TfidfTransformer\n", 796 | "# transformer = TfidfTransformer()\n", 797 | "# # print transformer \n", 798 | "# tfidfTrain = transformer.fit_transform(trainBagVector)\n", 799 | "# tfidfTrain = tfidfTrain.toarray()\n", 800 | "# tfidfTest = transformer.fit_transform(testBagVector)\n", 801 | "# tfidfTest = tfidfTest.toarray()\n", 802 | "# print tfidfTrain.shape, tfidfTest.shape\n", 803 | "# print tfidfTrain[0]\n", 804 | "# print tfidfTest[0]" 805 | ] 806 | }, 807 | { 808 | "cell_type": "code", 809 | "execution_count": 33, 810 | "metadata": { 811 | "collapsed": false 812 | }, 813 | "outputs": [], 814 | "source": [ 815 | "# f=open(\"trainBagVector.txt\",'w')\n", 816 | "# f.write(trainBagVector)\n", 817 | "# np.savetxt(\"trainBagVector.txt\",trainBagVector)" 818 | ] 819 | }, 820 | { 821 | "cell_type": "markdown", 822 | "metadata": {}, 823 | "source": [ 824 | "### State Transitions" 825 | ] 826 | }, 827 | { 828 | "cell_type": "code", 829 | "execution_count": 34, 830 | "metadata": { 831 | "collapsed": false 832 | }, 833 | "outputs": [ 834 | { 835 | "name": "stdout", 836 | "output_type": "stream", 837 | "text": [ 838 | "37012 37012\n" 839 | ] 840 | } 841 | ], 842 | "source": [ 843 | "stateDict = {}\n", 844 | "featureVectors = vectorizer.get_feature_names()\n", 845 | "for i in xrange(len(featureVectors)):\n", 846 | " stateDict[featureVectors[i]] = i+1\n", 847 | "print len(stateDict), len(featureVectors) #, stateDict" 848 | ] 849 | }, 850 | { 851 | "cell_type": "code", 852 | "execution_count": 35, 853 | "metadata": { 854 | "collapsed": false 855 | }, 856 | "outputs": [], 857 | "source": [ 858 | "def createStateTransitionVector(categoricalState, stateDict, maxLength):\n", 859 | " if categoricalState:\n", 860 | " feature = []\n", 861 | " for state in categoricalState.split(' '):\n", 862 | " try:\n", 863 | " feature.append(stateDict[state.lower()])\n", 864 | " except KeyError:\n", 865 | " pass\n", 866 | "# print state\n", 867 | " if len(feature) != maxLength:\n", 868 | " for i in xrange(maxLength-len(feature)):\n", 869 | " feature.append(0)\n", 870 | " assert(len(feature)==maxLength)\n", 871 | " return feature\n", 872 | " else:\n", 873 | " return [0] * maxLength" 874 | ] 875 | }, 876 | { 877 | "cell_type": "code", 878 | "execution_count": 36, 879 | "metadata": { 880 | "collapsed": true 881 | }, 882 | "outputs": [], 883 | "source": [ 884 | "def createStateVectors(XStates, stateDict, maxLength):\n", 885 | " XFeatures = []\n", 886 | " for state in XStates:\n", 887 | " XFeatures.append(createStateTransitionVector(state, stateDict, maxLength))\n", 888 | " return XFeatures" 889 | ] 890 | }, 891 | { 892 | "cell_type": "code", 893 | "execution_count": 37, 894 | "metadata": { 895 | "collapsed": false 896 | }, 897 | "outputs": [], 898 | "source": [ 899 | "trainStateTransitionVector = createStateVectors(trainSamples, stateDict,9353)\n", 900 | "testStateTransitionVector = createStateVectors(testSamples, stateDict,9353)\n", 901 | 
"# print trainStateTransitionVector[:2], testStateTransitionVector[:2]" 902 | ] 903 | }, 904 | { 905 | "cell_type": "code", 906 | "execution_count": 38, 907 | "metadata": { 908 | "collapsed": false 909 | }, 910 | "outputs": [ 911 | { 912 | "name": "stdout", 913 | "output_type": "stream", 914 | "text": [ 915 | "9353\n", 916 | "9353\n" 917 | ] 918 | } 919 | ], 920 | "source": [ 921 | "print max([len(i) for i in trainStateTransitionVector])\n", 922 | "print max([len(i) for i in testStateTransitionVector])" 923 | ] 924 | }, 925 | { 926 | "cell_type": "markdown", 927 | "metadata": {}, 928 | "source": [ 929 | "### N Grams as features" 930 | ] 931 | }, 932 | { 933 | "cell_type": "code", 934 | "execution_count": 39, 935 | "metadata": { 936 | "collapsed": false 937 | }, 938 | "outputs": [ 939 | { 940 | "name": "stdout", 941 | "output_type": "stream", 942 | "text": [ 943 | "CountVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',\n", 944 | " dtype=, encoding=u'utf-8', input=u'content',\n", 945 | " lowercase=True, max_df=1.0, max_features=None, min_df=1,\n", 946 | " ngram_range=(1, 3), preprocessor=None, stop_words=None,\n", 947 | " strip_accents=None, token_pattern=u'(?u)\\\\b\\\\w\\\\w+\\\\b',\n", 948 | " tokenizer=None, vocabulary=None)\n", 949 | "44678\n", 950 | "(60, 44678)\n", 951 | "(19, 44678)\n" 952 | ] 953 | } 954 | ], 955 | "source": [ 956 | "import scipy as sp\n", 957 | "noNGram = 3\n", 958 | "vectorizerNGram = CountVectorizer(ngram_range=(1, noNGram))\n", 959 | "XTrainNGram = vectorizerNGram.fit_transform(trainSamples)\n", 960 | "\n", 961 | "print vectorizerNGram\n", 962 | "\n", 963 | "\n", 964 | "print len(vectorizerNGram.get_feature_names())\n", 965 | "trainNGramVector = XTrainNGram.toarray()\n", 966 | "print trainNGramVector.shape\n", 967 | "XTestNGram = vectorizerNGram.transform(testSamples)\n", 968 | "testNGramVector = XTestNGram.toarray()\n", 969 | "print testNGramVector.shape" 970 | ] 971 | }, 972 | { 973 | "cell_type": "code", 974 | "execution_count": 40, 975 | "metadata": { 976 | "collapsed": false 977 | }, 978 | "outputs": [], 979 | "source": [ 980 | "# from helper import *\n", 981 | "# import utilities\n", 982 | "# from utilities import build_matrices\n", 983 | "\n", 984 | "# (matrix_train, matrix_eval, ngram_list) = build_matrices(max_ngram_length=3)\n", 985 | "# len(ngram_list)\n", 986 | "# matrix_train = sp.sparse.csr_matrix(matrix_train)\n", 987 | "# matrix_eval = sp.sparse.csr_matrix(matrix_test)" 988 | ] 989 | }, 990 | { 991 | "cell_type": "code", 992 | "execution_count": 41, 993 | "metadata": { 994 | "collapsed": false 995 | }, 996 | "outputs": [], 997 | "source": [ 998 | "# noNGram = 3\n", 999 | "# vectorizerNGram = CountVectorizer(ngram_range=(1, noNGram))\n", 1000 | "# XEvalNGram = vectorizerNGram.fit_transform(XEval)\n", 1001 | "# print vectorizerNGram\n", 1002 | "\n", 1003 | "# print len(vectorizerNGram.get_feature_names())\n", 1004 | "# evalNGramVector = XEvalNGram.toarray()\n", 1005 | "# print evalNGramVector.shape\n", 1006 | "\n", 1007 | "# matrix_eval = sp.sparse.csr_matrix(evalNGramVector)" 1008 | ] 1009 | }, 1010 | { 1011 | "cell_type": "markdown", 1012 | "metadata": {}, 1013 | "source": [ 1014 | "### Stack or concatenate all features together" 1015 | ] 1016 | }, 1017 | { 1018 | "cell_type": "code", 1019 | "execution_count": 42, 1020 | "metadata": { 1021 | "collapsed": false 1022 | }, 1023 | "outputs": [ 1024 | { 1025 | "name": "stdout", 1026 | "output_type": "stream", 1027 | "text": [ 1028 | "(60, 4914)\n", 1029 | "(60,)\n", 1030 | "(60, 
4915)\n", 1031 | "(19, 4915)\n", 1032 | "(60, 4916)\n" 1033 | ] 1034 | } 1035 | ], 1036 | "source": [ 1037 | "XTrainWordFeatures = trainBagVector #trainNGramVector\n", 1038 | "print XTrainWordFeatures.shape\n", 1039 | "print trainSentimentSamples.shape\n", 1040 | "\n", 1041 | "temp = np.column_stack((XTrainWordFeatures, trainSentimentSamples))\n", 1042 | "print temp.shape\n", 1043 | "XTrainAllFeatures = np.column_stack((temp, trainFreqTweetSamples))\n", 1044 | "\n", 1045 | "\n", 1046 | "XTestWordFeatures = testBagVector #testNGramVector\n", 1047 | "temp = np.column_stack((XTestWordFeatures, testSentimentSamples))\n", 1048 | "print temp.shape\n", 1049 | "XTestAllFeatures = np.column_stack((temp, testFreqTweetSamples))\n", 1050 | "\n", 1051 | "\n", 1052 | "print XTrainAllFeatures.shape" 1053 | ] 1054 | }, 1055 | { 1056 | "cell_type": "code", 1057 | "execution_count": 43, 1058 | "metadata": { 1059 | "collapsed": false 1060 | }, 1061 | "outputs": [ 1062 | { 1063 | "name": "stdout", 1064 | "output_type": "stream", 1065 | "text": [ 1066 | "(3995, 4916)\n" 1067 | ] 1068 | } 1069 | ], 1070 | "source": [ 1071 | "# XEvalWordFeatures = evalBagVector #evalNGramVector\n", 1072 | "# temp = np.column_stack((XEvalWordFeatures, XEvalSentiment))\n", 1073 | "XEvalAllFeatures = np.column_stack((np.column_stack((evalBagVector, XEvalSentiment)), XEvalFreqTweets))\n", 1074 | "\n", 1075 | "print XEvalAllFeatures.shape" 1076 | ] 1077 | }, 1078 | { 1079 | "cell_type": "markdown", 1080 | "metadata": {}, 1081 | "source": [ 1082 | "### Write Predicted Output Labels to File" 1083 | ] 1084 | }, 1085 | { 1086 | "cell_type": "code", 1087 | "execution_count": 44, 1088 | "metadata": { 1089 | "collapsed": true 1090 | }, 1091 | "outputs": [], 1092 | "source": [ 1093 | "def writePredictedLabelFile(YPred):\n", 1094 | " f = open(\"Predictions.csv\",\"w\")\n", 1095 | " f.write(\"Id,Label\" + \"\\n\")\n", 1096 | " for i in xrange(len(YPred)):\n", 1097 | " f.write(str(i) + \",\" + str(int(YPred[i]))+ \"\\n\")\n", 1098 | " f.close()" 1099 | ] 1100 | }, 1101 | { 1102 | "cell_type": "markdown", 1103 | "metadata": { 1104 | "collapsed": true 1105 | }, 1106 | "source": [ 1107 | "### Classifiers" 1108 | ] 1109 | }, 1110 | { 1111 | "cell_type": "code", 1112 | "execution_count": 45, 1113 | "metadata": { 1114 | "collapsed": true 1115 | }, 1116 | "outputs": [], 1117 | "source": [ 1118 | "# Random Forest Classifier\n", 1119 | "from sklearn.ensemble import RandomForestClassifier\n", 1120 | "# def classifyRandomForestClassifier(XTrain, XTest, YTrain, YTest,trees=100,crit='gini'):\n", 1121 | "def classifyRandomForestClassifier(XTrain, XTest, YTrain, YTest, params):\n", 1122 | " trees = params['trees']\n", 1123 | " crit = params['criterion']\n", 1124 | " seed = params['random_state']\n", 1125 | " clf = RandomForestClassifier(n_estimators=trees,criterion=crit,random_state=seed)\n", 1126 | " clf.fit(XTrain, YTrain)\n", 1127 | " YPred = clf.predict(XTest)\n", 1128 | " diff = YPred - YTest\n", 1129 | " score = diff[diff == 0].size\n", 1130 | " return (100.0 * score)/(YPred.size)" 1131 | ] 1132 | }, 1133 | { 1134 | "cell_type": "code", 1135 | "execution_count": 46, 1136 | "metadata": { 1137 | "collapsed": true 1138 | }, 1139 | "outputs": [], 1140 | "source": [ 1141 | "#Multi Class SVM\n", 1142 | "from sklearn import svm\n", 1143 | "def classifyMultiClassSVMClassifier(XTrain, XTest, YTrain, YTest, params):\n", 1144 | " ker = params['kernel']\n", 1145 | " YPred = svm.SVC(kernel=ker).fit(XTrain, YTrain).predict(XTest)\n", 1146 | " diff = YPred - YTest\n", 
1147 | " score = diff[diff == 0].size\n", 1148 | " return (100.0 * score)/(YPred.size)" 1149 | ] 1150 | }, 1151 | { 1152 | "cell_type": "code", 1153 | "execution_count": 47, 1154 | "metadata": { 1155 | "collapsed": true 1156 | }, 1157 | "outputs": [], 1158 | "source": [ 1159 | "#K Nearest Neighbours Classifier\n", 1160 | "from sklearn.neighbors import KNeighborsClassifier\n", 1161 | "def classifyKNNClassifier(XTrain, XTest, YTrain, YTest, params):\n", 1162 | "# print XTrain.shape, XTest.shape\n", 1163 | " neighbours = params['neighbours']\n", 1164 | " neigh = KNeighborsClassifier(n_neighbors=neighbours)\n", 1165 | " YPred = neigh.fit(XTrain, YTrain).predict(XTest)\n", 1166 | " diff = YPred - YTest\n", 1167 | " score = diff[diff == 0].size\n", 1168 | " return (100.0 * score)/(YPred.size)" 1169 | ] 1170 | }, 1171 | { 1172 | "cell_type": "code", 1173 | "execution_count": 48, 1174 | "metadata": { 1175 | "collapsed": false 1176 | }, 1177 | "outputs": [], 1178 | "source": [ 1179 | "# Logistic Regression\n", 1180 | "from sklearn import linear_model\n", 1181 | "def classifyLogisticRegression(XTrain, XTest, YTrain, YTest, params):\n", 1182 | " LogReg = linear_model.LogisticRegression(penalty='l2', dual=False, tol=0.0001, C=1.0, fit_intercept=True, intercept_scaling=1, class_weight=None, random_state=None)\n", 1183 | " LogReg.fit(XTrain, YTrain)\n", 1184 | " # Finds the optimal model parameters using a least squares method.\n", 1185 | " # To get the parameter values:\n", 1186 | " # LogReg.get_params()\n", 1187 | " # To predict a new input XTest,\n", 1188 | " YPred = LogReg.predict(XTest)\n", 1189 | " diff = YPred - YTest\n", 1190 | " score = diff[diff == 0].size\n", 1191 | " return (100.0 * score)/(YPred.size)" 1192 | ] 1193 | }, 1194 | { 1195 | "cell_type": "code", 1196 | "execution_count": 49, 1197 | "metadata": { 1198 | "collapsed": true 1199 | }, 1200 | "outputs": [], 1201 | "source": [ 1202 | "# Adaboost Classfier\n", 1203 | "from sklearn.ensemble import AdaBoostClassifier\n", 1204 | "from sklearn.tree import DecisionTreeClassifier\n", 1205 | "def classifyAdaboostClassifier(XTrain, XTest, YTrain, YTest, params):\n", 1206 | " depth = params['max_depth']\n", 1207 | " algo = params['algorithm']\n", 1208 | " estimators = params['n_estimators']\n", 1209 | " \n", 1210 | " # Create and fit an AdaBoosted decision tree\n", 1211 | " bdt = AdaBoostClassifier(DecisionTreeClassifier(max_depth = depth),\n", 1212 | " algorithm = algo,\n", 1213 | " n_estimators=estimators)\n", 1214 | "\n", 1215 | " bdt.fit(XTrain, YTrain)\n", 1216 | " YPred = bdt.predict(XTest)\n", 1217 | "\n", 1218 | " diff = YPred - YTest\n", 1219 | " score = diff[diff == 0].size\n", 1220 | " return (100.0 * score)/(YPred.size)" 1221 | ] 1222 | }, 1223 | { 1224 | "cell_type": "code", 1225 | "execution_count": 50, 1226 | "metadata": { 1227 | "collapsed": true 1228 | }, 1229 | "outputs": [], 1230 | "source": [ 1231 | "# Neural Networks\n", 1232 | "try:\n", 1233 | " from sknn.mlp import Classifier, Layer\n", 1234 | "except ImportError:\n", 1235 | " print 'Please install scikit-neuralnetwork(pip install scikit-neuralnetwork)'\n", 1236 | "\n", 1237 | "def classifyNeuralNetworkClassifier(XTrain, XTest, YTrain, YTest, params):\n", 1238 | " activation = params['activation']\n", 1239 | " actLastLayer = params['actLastLayer']\n", 1240 | " rule = params['rule']\n", 1241 | " noOfUnits = params['units']\n", 1242 | " rate = params['rate']\n", 1243 | " noOfIter = params['iter']\n", 1244 | " nn = Classifier(layers=[Layer(activation, 
units=noOfUnits),Layer(actLastLayer)], learning_rule=rule,\n", 1245 | " learning_rate=0.02,\n", 1246 | " n_iter=10)\n", 1247 | " nn.fit(XTrain, YTrain)\n", 1248 | " YPred = nn.predict(XTest)\n", 1249 | " diff = YPred - YTest.reshape(YPred.shape)\n", 1250 | " score = diff[diff == 0].size\n", 1251 | " score = (100.0 * score)/(YPred.size)\n", 1252 | " return score" 1253 | ] 1254 | }, 1255 | { 1256 | "cell_type": "markdown", 1257 | "metadata": {}, 1258 | "source": [ 1259 | "### Stratified K Fold Cross Validation" 1260 | ] 1261 | }, 1262 | { 1263 | "cell_type": "code", 1264 | "execution_count": 51, 1265 | "metadata": { 1266 | "collapsed": true 1267 | }, 1268 | "outputs": [], 1269 | "source": [ 1270 | "from sklearn.cross_validation import StratifiedKFold\n", 1271 | "def stratifiedKFoldVal(XTrain, YTrain, classify, params):\n", 1272 | " n_folds = 5\n", 1273 | " score = 0.0\n", 1274 | " skf = StratifiedKFold(YTrain, n_folds)\n", 1275 | " try:\n", 1276 | " multi = params['multi']\n", 1277 | " except KeyError:\n", 1278 | " multi = False\n", 1279 | " for train_index, test_index in skf:\n", 1280 | " y_train, y_test = YTrain[train_index], YTrain[test_index]\n", 1281 | " if not multi:\n", 1282 | " X_train, X_test = XTrain[train_index], XTrain[test_index]\n", 1283 | " score += classify(X_train, X_test, y_train, y_test, params)\n", 1284 | " else:\n", 1285 | " X_train, X_test = [XTrain[i] for i in train_index], [XTrain[i] for i in test_index]\n", 1286 | " score += classify(np.array(X_train), np.array(X_test), y_train, y_test, params)\n", 1287 | " \n", 1288 | " return score/n_folds" 1289 | ] 1290 | }, 1291 | { 1292 | "cell_type": "markdown", 1293 | "metadata": {}, 1294 | "source": [ 1295 | "### Normalisation of Feature Vectors" 1296 | ] 1297 | }, 1298 | { 1299 | "cell_type": "code", 1300 | "execution_count": 52, 1301 | "metadata": { 1302 | "collapsed": false 1303 | }, 1304 | "outputs": [], 1305 | "source": [ 1306 | "from sklearn import preprocessing\n", 1307 | "def NormalizeVector(XTestFeatures,XTrainFeatures):\n", 1308 | " XTestFeaturesNorm = preprocessing.normalize(XTestFeatures, norm='l2')\n", 1309 | " XTrainFeaturesNorm = preprocessing.normalize(XTrainFeatures, norm='l2')\n", 1310 | " print XTrainFeaturesNorm.shape,XTestFeaturesNorm.shape\n", 1311 | "# print XTrainFeaturesNorm[0],XTestFeaturesNorm[0]\n", 1312 | " return XTrainFeaturesNorm, XTestFeaturesNorm" 1313 | ] 1314 | }, 1315 | { 1316 | "cell_type": "markdown", 1317 | "metadata": {}, 1318 | "source": [ 1319 | "### Assign Train features for cross validation based on the feature encoding" 1320 | ] 1321 | }, 1322 | { 1323 | "cell_type": "code", 1324 | "execution_count": 53, 1325 | "metadata": { 1326 | "collapsed": false 1327 | }, 1328 | "outputs": [ 1329 | { 1330 | "name": "stdout", 1331 | "output_type": "stream", 1332 | "text": [ 1333 | " \n", 1334 | "(60, 4916)\n", 1335 | "(60,)\n" 1336 | ] 1337 | } 1338 | ], 1339 | "source": [ 1340 | "train = XTrainAllFeatures\n", 1341 | "# train = tfidfTrain\n", 1342 | "# train = trainStateTransitionVector\n", 1343 | "print type(trainBagVector), type(trainStateTransitionVector)\n", 1344 | "# train = []\n", 1345 | "# for i in xrange(len(trainBagVector)):\n", 1346 | "# train.append(trainBagVector[i]+trainStateTransitionVector[i])\n", 1347 | "# print len(train)\n", 1348 | "# train = np.hstack([tfidfTrain, np.array(trainStateTransitionVector)])\n", 1349 | "# train = np.hstack([trainBagVector, np.array(trainStateTransitionVector)])\n", 1350 | "\n", 1351 | "print train.shape\n", 1352 | "YTrain = YtrainSamples\n", 1353 
| "print YTrain.shape\n", 1354 | "YTest = YtestSamples" 1355 | ] 1356 | }, 1357 | { 1358 | "cell_type": "markdown", 1359 | "metadata": {}, 1360 | "source": [ 1361 | "### Selection of Nearest Neighbours for KNN" 1362 | ] 1363 | }, 1364 | { 1365 | "cell_type": "code", 1366 | "execution_count": 54, 1367 | "metadata": { 1368 | "collapsed": false 1369 | }, 1370 | "outputs": [], 1371 | "source": [ 1372 | "# selectNeighbourScores = []\n", 1373 | "\n", 1374 | "# params = {'neighbours':2}\n", 1375 | "# score = stratifiedKFoldVal(train, YTrain, classifyKNNClassifier, params)\n", 1376 | "# print score\n", 1377 | "# selectNeighbourScores.append(score)\n", 1378 | "\n", 1379 | "# params = {'neighbours':3}\n", 1380 | "# score = stratifiedKFoldVal(train, YTrain, classifyKNNClassifier, params)\n", 1381 | "# print score\n", 1382 | "# selectNeighbourScores.append(score)\n", 1383 | "\n", 1384 | "# params = {'neighbours':4}\n", 1385 | "# score = stratifiedKFoldVal(train, YTrain, classifyKNNClassifier, params)\n", 1386 | "# print score\n", 1387 | "# selectNeighbourScores.append(score)\n", 1388 | "\n", 1389 | "# params = {'neighbours':5}\n", 1390 | "# score = stratifiedKFoldVal(train, YTrain, classifyKNNClassifier, params)\n", 1391 | "# print score\n", 1392 | "# selectNeighbourScores.append(score)\n", 1393 | "\n", 1394 | "# params = {'neighbours':10}\n", 1395 | "# score = stratifiedKFoldVal(train, YTrain, classifyKNNClassifier, params)\n", 1396 | "# print score\n", 1397 | "# selectNeighbourScores.append(score)\n", 1398 | "\n", 1399 | "# params = {'neighbours':25}\n", 1400 | "# score = stratifiedKFoldVal(train, YTrain, classifyKNNClassifier, params)\n", 1401 | "# print score\n", 1402 | "# selectNeighbourScores.append(score)\n", 1403 | "\n", 1404 | "# params = {'neighbours':40}\n", 1405 | "# score = stratifiedKFoldVal(train, YTrain, classifyKNNClassifier, params)\n", 1406 | "# print score\n", 1407 | "# selectNeighbourScores.append(score)\n", 1408 | "\n", 1409 | "# print selectNeighbourScores" 1410 | ] 1411 | }, 1412 | { 1413 | "cell_type": "code", 1414 | "execution_count": 55, 1415 | "metadata": { 1416 | "collapsed": false 1417 | }, 1418 | "outputs": [], 1419 | "source": [ 1420 | "# #Plotting the results\n", 1421 | "# import matplotlib.pyplot as plt\n", 1422 | "# %matplotlib inline\n", 1423 | "# plt.plot(selectNeighbourScores, label = \"Neighbors in k-Nearest Neighbor (kNN) Classifier\")\n", 1424 | "# plt.title(\"Neighbors in k-Nearest Neighbor (kNN) Classifier\")\n", 1425 | "\n", 1426 | "# labels = [2,3,4,5,6,8,10]\n", 1427 | "# plt.xticks(np.arange(len(labels)), labels, rotation='horizontal')\n", 1428 | "# # plt.title(\"Optimal choice of Neighbors in k-Nearest Neighbor (kNN) Classifier\")\n", 1429 | "# plt.ylabel('Categorization Accuracy')\n", 1430 | "# plt.xlabel('No. of Neighbours')\n", 1431 | "# plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)\n", 1432 | "# plt.show()" 1433 | ] 1434 | }, 1435 | { 1436 | "cell_type": "markdown", 1437 | "metadata": {}, 1438 | "source": [ 1439 | "#### Hence, we choose k = 25 for our nearest neighbor classifier." 
1440 | ] 1441 | }, 1442 | { 1443 | "cell_type": "code", 1444 | "execution_count": 56, 1445 | "metadata": { 1446 | "collapsed": false 1447 | }, 1448 | "outputs": [ 1449 | { 1450 | "name": "stdout", 1451 | "output_type": "stream", 1452 | "text": [ 1453 | "19 60\n" 1454 | ] 1455 | } 1456 | ], 1457 | "source": [ 1458 | "print len(testStateTransitionVector), len(trainStateTransitionVector)" 1459 | ] 1460 | }, 1461 | { 1462 | "cell_type": "code", 1463 | "execution_count": 57, 1464 | "metadata": { 1465 | "collapsed": false 1466 | }, 1467 | "outputs": [], 1468 | "source": [ 1469 | "# train = np.hstack([XTrainAllFeatures, XTestAllFeatures])\n", 1470 | "train = XTrainAllFeatures\n", 1471 | "test = XEvalAllFeatures\n", 1472 | "params = {'neighbours':25}\n", 1473 | "neighbours = params['neighbours']\n", 1474 | "neigh = KNeighborsClassifier(n_neighbors=neighbours)\n", 1475 | "YPred = neigh.fit(train, YTrain).predict(test)" 1476 | ] 1477 | }, 1478 | { 1479 | "cell_type": "code", 1480 | "execution_count": 64, 1481 | "metadata": { 1482 | "collapsed": false 1483 | }, 1484 | "outputs": [ 1485 | { 1486 | "name": "stdout", 1487 | "output_type": "stream", 1488 | "text": [ 1489 | "[9 9 9 ..., 9 9 9]\n" 1490 | ] 1491 | } 1492 | ], 1493 | "source": [ 1494 | "print YPred[2:3020]" 1495 | ] 1496 | }, 1497 | { 1498 | "cell_type": "markdown", 1499 | "metadata": {}, 1500 | "source": [ 1501 | "### Selection of Parameters for Random Forest" 1502 | ] 1503 | }, 1504 | { 1505 | "cell_type": "code", 1506 | "execution_count": 59, 1507 | "metadata": { 1508 | "collapsed": false 1509 | }, 1510 | "outputs": [], 1511 | "source": [ 1512 | "# selectRandomForestScores = []\n", 1513 | "\n", 1514 | "# params = {'trees':500, 'criterion':'entropy','random_state':1000}\n", 1515 | "# score = stratifiedKFoldVal(train, YTrain, classifyRandomForestClassifier, params)\n", 1516 | "# print score\n", 1517 | "# selectRandomForestScores.append(score)\n", 1518 | "\n", 1519 | "# params = {'trees':1000, 'criterion':'entropy','random_state':1000}\n", 1520 | "# score = stratifiedKFoldVal(train, YTrain, classifyRandomForestClassifier, params)\n", 1521 | "# print score\n", 1522 | "# selectRandomForestScores.append(score)\n", 1523 | "\n", 1524 | "# params = {'trees':500, 'criterion':'gini','random_state':1000}\n", 1525 | "# score = stratifiedKFoldVal(train, YTrain, classifyRandomForestClassifier, params)\n", 1526 | "# print score\n", 1527 | "# selectRandomForestScores.append(score)\n", 1528 | "\n", 1529 | "# params = {'trees':1000, 'criterion':'gini','random_state':1000}\n", 1530 | "# score = stratifiedKFoldVal(train, YTrain, classifyRandomForestClassifier, params)\n", 1531 | "# print score\n", 1532 | "# selectRandomForestScores.append(score)\n", 1533 | "\n", 1534 | "# print selectRandomForestScores" 1535 | ] 1536 | }, 1537 | { 1538 | "cell_type": "code", 1539 | "execution_count": 60, 1540 | "metadata": { 1541 | "collapsed": false 1542 | }, 1543 | "outputs": [], 1544 | "source": [ 1545 | "# #Plotting the results\n", 1546 | "# import matplotlib.pyplot as plt\n", 1547 | "# %matplotlib inline\n", 1548 | "# plt.plot(selectRandomForestScores, label = \"Random Forest Classifier\")\n", 1549 | "# plt.title(\"Random Forest Classifier\")\n", 1550 | "\n", 1551 | "# labels = ['500 Trees + entropy', '1000 Trees + entropy', '500 Trees + gini', '1000 Trees + gini']\n", 1552 | "\n", 1553 | "# # You can specify a rotation for the tick labels in degrees or with keywords.\n", 1554 | "# plt.xticks(np.arange(len(labels)), labels, rotation='vertical')\n", 1555 | "\n", 1556 | "# 
plt.ylabel('Scores')\n", 1557 | "# plt.xlabel('Parameters')\n", 1558 | "# # Place a legend to the right of this smaller figure.\n", 1559 | "# # plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)\n", 1560 | "# plt.show()" 1561 | ] 1562 | }, 1563 | { 1564 | "cell_type": "markdown", 1565 | "metadata": {}, 1566 | "source": [ 1567 | "#### Hence, we choose 1000 Trees + Gini as a criterion for our Random Forest classifier." 1568 | ] 1569 | }, 1570 | { 1571 | "cell_type": "code", 1572 | "execution_count": 61, 1573 | "metadata": { 1574 | "collapsed": false 1575 | }, 1576 | "outputs": [], 1577 | "source": [ 1578 | "params = {'trees':150, 'criterion':'entropy','random_state':None}\n", 1579 | "trees = params['trees']\n", 1580 | "crit = params['criterion']\n", 1581 | "seed = params['random_state']\n", 1582 | "clf = RandomForestClassifier(n_estimators=trees,criterion=crit,random_state=seed)\n", 1583 | "clf.fit(train, YTrain)\n", 1584 | "YPred = clf.predict(test)" 1585 | ] 1586 | }, 1587 | { 1588 | "cell_type": "markdown", 1589 | "metadata": {}, 1590 | "source": [ 1591 | "### Selection of Kernel for Multi Class SVM" 1592 | ] 1593 | }, 1594 | { 1595 | "cell_type": "code", 1596 | "execution_count": null, 1597 | "metadata": { 1598 | "collapsed": false 1599 | }, 1600 | "outputs": [], 1601 | "source": [ 1602 | "# selectKernelScores = []\n", 1603 | "\n", 1604 | "# params = {'kernel':'poly'}\n", 1605 | "# score = stratifiedKFoldVal(train, YTrain, classifyMultiClassSVMClassifier, params)\n", 1606 | "# print score\n", 1607 | "# selectKernelScores.append(score)\n", 1608 | "\n", 1609 | "# params = {'kernel':'linear'}\n", 1610 | "# score = stratifiedKFoldVal(train, YTrain, classifyMultiClassSVMClassifier, params)\n", 1611 | "# print score\n", 1612 | "# selectKernelScores.append(score)\n", 1613 | "\n", 1614 | "# params = {'kernel':'rbf'}\n", 1615 | "# score = stratifiedKFoldVal(train, YTrain, classifyMultiClassSVMClassifier, params)\n", 1616 | "# print score\n", 1617 | "# selectKernelScores.append(score)" 1618 | ] 1619 | }, 1620 | { 1621 | "cell_type": "code", 1622 | "execution_count": null, 1623 | "metadata": { 1624 | "collapsed": false 1625 | }, 1626 | "outputs": [], 1627 | "source": [ 1628 | "# #Plotting the results\n", 1629 | "# import matplotlib.pyplot as plt\n", 1630 | "# %matplotlib inline\n", 1631 | "# plt.plot(selectKernelScores, label = \"Multiclass SVM Classifier\")\n", 1632 | "\n", 1633 | "# labels = ['poly','linear','rbf']\n", 1634 | "# plt.title(\"Multiclass SVM Classifier\")\n", 1635 | "# # You can specify a rotation for the tick labels in degrees or with keywords.\n", 1636 | "# plt.xticks(np.arange(len(labels)), labels, rotation='horizontal')\n", 1637 | "\n", 1638 | "# plt.ylabel('Scores')\n", 1639 | "# plt.xlabel('Kernel used')\n", 1640 | "# # Place a legend to the right of this smaller figure.\n", 1641 | "# # plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)\n", 1642 | "# plt.show()" 1643 | ] 1644 | }, 1645 | { 1646 | "cell_type": "markdown", 1647 | "metadata": { 1648 | "collapsed": true 1649 | }, 1650 | "source": [ 1651 | "#### Hence, we choose rbf for our SVM classifier." 
1652 | ] 1653 | }, 1654 | { 1655 | "cell_type": "code", 1656 | "execution_count": null, 1657 | "metadata": { 1658 | "collapsed": true 1659 | }, 1660 | "outputs": [], 1661 | "source": [ 1662 | "params = {'kernel':'rbf'}\n", 1663 | "ker = params['kernel']\n", 1664 | "YPred = svm.SVC(kernel=ker).fit(train, YTrain).predict(test)" 1665 | ] 1666 | }, 1667 | { 1668 | "cell_type": "markdown", 1669 | "metadata": {}, 1670 | "source": [ 1671 | "### Logistic Regression" 1672 | ] 1673 | }, 1674 | { 1675 | "cell_type": "code", 1676 | "execution_count": null, 1677 | "metadata": { 1678 | "collapsed": false 1679 | }, 1680 | "outputs": [], 1681 | "source": [ 1682 | "# # params = {'multi':False}\n", 1683 | "# # train = tfidfTrain\n", 1684 | "# # score = stratifiedKFoldVal(train, YTrain, classifyLogisticRegression, params)\n", 1685 | "# # print score\n", 1686 | "# train = XTrainAllFeatures\n", 1687 | "# score = stratifiedKFoldVal(train, YTrain, classifyLogisticRegression, params)\n", 1688 | "# print score" 1689 | ] 1690 | }, 1691 | { 1692 | "cell_type": "code", 1693 | "execution_count": null, 1694 | "metadata": { 1695 | "collapsed": false 1696 | }, 1697 | "outputs": [], 1698 | "source": [ 1699 | "# params = {'multi':True}\n", 1700 | "# train = trainStateTransitionVector\n", 1701 | "# score = stratifiedKFoldVal(train, YTrain, classifyLogisticRegression, params)\n", 1702 | "# print score" 1703 | ] 1704 | }, 1705 | { 1706 | "cell_type": "code", 1707 | "execution_count": null, 1708 | "metadata": { 1709 | "collapsed": true 1710 | }, 1711 | "outputs": [], 1712 | "source": [ 1713 | "# LogReg = linear_model.LogisticRegression(penalty='l2', dual=False, tol=0.0001, C=1.0, fit_intercept=True, intercept_scaling=1, class_weight=None, random_state=None)\n", 1714 | "# LogReg.fit(trainBagVector, YTrain)\n", 1715 | "# YPred = LogReg.predict(testBagVector)\n", 1716 | "# # writePredictedLabelFile(YPred)" 1717 | ] 1718 | }, 1719 | { 1720 | "cell_type": "markdown", 1721 | "metadata": {}, 1722 | "source": [ 1723 | "### Define the parameters for Adaboost and use it on different training dataset" 1724 | ] 1725 | }, 1726 | { 1727 | "cell_type": "code", 1728 | "execution_count": null, 1729 | "metadata": { 1730 | "collapsed": false 1731 | }, 1732 | "outputs": [], 1733 | "source": [ 1734 | "# train = XTrainAllFeatures\n", 1735 | "# params = {'max_depth':1, 'algorithm':'SAMME', 'n_estimators':200}\n", 1736 | "# score = stratifiedKFoldVal(train, YTrain, classifyAdaboostClassifier, params)\n", 1737 | "# print score" 1738 | ] 1739 | }, 1740 | { 1741 | "cell_type": "code", 1742 | "execution_count": null, 1743 | "metadata": { 1744 | "collapsed": false 1745 | }, 1746 | "outputs": [], 1747 | "source": [ 1748 | "# train = XTrainAllFeatures\n", 1749 | "# params = {'max_depth':10, 'algorithm':'SAMME', 'n_estimators':500}\n", 1750 | "# score = stratifiedKFoldVal(train, YTrain, classifyAdaboostClassifier, params)\n", 1751 | "# print score" 1752 | ] 1753 | }, 1754 | { 1755 | "cell_type": "code", 1756 | "execution_count": null, 1757 | "metadata": { 1758 | "collapsed": false 1759 | }, 1760 | "outputs": [], 1761 | "source": [ 1762 | "# # Submission\n", 1763 | "# params = {'max_depth':10, 'algorithm':'SAMME', 'n_estimators':500}\n", 1764 | "# train = tfidfTrain\n", 1765 | "# test = tfidfTest\n", 1766 | "# depth = params['max_depth']\n", 1767 | "# algo = params['algorithm']\n", 1768 | "# estimators = params['n_estimators']\n", 1769 | "\n", 1770 | "# # Create and fit an AdaBoosted decision tree\n", 1771 | "# bdt = 
AdaBoostClassifier(DecisionTreeClassifier(max_depth = depth),\n", 1772 | "# algorithm = algo,\n", 1773 | "# n_estimators=estimators)\n", 1774 | "\n", 1775 | "# bdt.fit(train, YTrain)\n", 1776 | "# YPred = bdt.predict(test)\n", 1777 | "# # writePredictedLabelFile(YPred)" 1778 | ] 1779 | }, 1780 | { 1781 | "cell_type": "markdown", 1782 | "metadata": {}, 1783 | "source": [ 1784 | "### Selection of Parameters for Neural Networks" 1785 | ] 1786 | }, 1787 | { 1788 | "cell_type": "code", 1789 | "execution_count": null, 1790 | "metadata": { 1791 | "collapsed": false 1792 | }, 1793 | "outputs": [], 1794 | "source": [ 1795 | "# train = XTrainAllFeatures\n", 1796 | "# # params = {'activation':'Rectifier', 'units':100, 'rate':0.02, 'iter':10}\n", 1797 | "# params = {'activation':'Tanh', 'actLastLayer':'Softmax', 'rule':'momentum', 'units':100, 'rate':0.002, 'iter':10}\n", 1798 | "# score = stratifiedKFoldVal(train, YTrain, classifyNeuralNetworkClassifier, params)\n", 1799 | "# print score" 1800 | ] 1801 | }, 1802 | { 1803 | "cell_type": "code", 1804 | "execution_count": null, 1805 | "metadata": { 1806 | "collapsed": false 1807 | }, 1808 | "outputs": [], 1809 | "source": [ 1810 | "# train = XTrainAllFeatures\n", 1811 | "# # params = {'activation':'Rectifier', 'units':100, 'rate':0.02, 'iter':10}\n", 1812 | "# params = {'activation':'Tanh', 'actLastLayer':'Softmax', 'rule':'sgd', 'units':100, 'rate':0.002, 'iter':10}\n", 1813 | "# score = stratifiedKFoldVal(train, YTrain, classifyNeuralNetworkClassifier, params)\n", 1814 | "# print score" 1815 | ] 1816 | }, 1817 | { 1818 | "cell_type": "code", 1819 | "execution_count": null, 1820 | "metadata": { 1821 | "collapsed": false 1822 | }, 1823 | "outputs": [], 1824 | "source": [ 1825 | "# train = XTrainAllFeatures\n", 1826 | "# params = {'activation':'Sigmoid', 'actLastLayer':'Softmax', 'rule':'rmsprop', 'units':100, 'rate':0.002, 'iter':10}\n", 1827 | "# score = stratifiedKFoldVal(train, YTrain, classifyNeuralNetworkClassifier, params)\n", 1828 | "# print score" 1829 | ] 1830 | }, 1831 | { 1832 | "cell_type": "code", 1833 | "execution_count": null, 1834 | "metadata": { 1835 | "collapsed": true 1836 | }, 1837 | "outputs": [], 1838 | "source": [ 1839 | "# # Submission\n", 1840 | "# tr = trainBagVector\n", 1841 | "# te = testBagVector\n", 1842 | "# params = {'activation':'Tanh', 'actLastLayer':'Softmax', 'rule':'adagrad', 'units':100, 'rate':0.002, 'iter':10}\n", 1843 | "# activation = params['activation']\n", 1844 | "# actLastLayer = params['actLastLayer']\n", 1845 | "# rule = params['rule']\n", 1846 | "# noOfUnits = params['units']\n", 1847 | "# rate = params['rate']\n", 1848 | "# noOfIter = params['iter']\n", 1849 | "# nn = Classifier(layers=[Layer(activation, units=noOfUnits),Layer(actLastLayer)], learning_rule=rule,\n", 1850 | "# learning_rate=0.02,\n", 1851 | "# n_iter=10)\n", 1852 | "# nn.fit(tr, YTrain)\n", 1853 | "# YPred = nn.predict(te)\n", 1854 | "# # writePredictedLabelFile(YPred)" 1855 | ] 1856 | }, 1857 | { 1858 | "cell_type": "markdown", 1859 | "metadata": { 1860 | "collapsed": true 1861 | }, 1862 | "source": [ 1863 | "### Get features in format for Models of NLTK Classify " 1864 | ] 1865 | }, 1866 | { 1867 | "cell_type": "code", 1868 | "execution_count": null, 1869 | "metadata": { 1870 | "collapsed": true 1871 | }, 1872 | "outputs": [], 1873 | "source": [ 1874 | "def featNLTKClassify(samples, phase):\n", 1875 | " featureVectors = vectorizer.get_feature_names()\n", 1876 | " nltkClassifySamples = []\n", 1877 | "\n", 1878 | " for i in 
xrange(len(samples)):\n", 1879 | " t = samples[i]\n", 1880 | " lstFuncCalls = t.split()\n", 1881 | " wordOccDict = {}\n", 1882 | " for j in xrange(len(featureVectors)):\n", 1883 | " wordOccDict[featureVectors[j]] = lstFuncCalls.count(featureVectors[j])\n", 1884 | " if phase == 'train':\n", 1885 | " nltkClassifySamples.append((wordOccDict, YTrain[i]))\n", 1886 | " else:\n", 1887 | " nltkClassifySamples.append(wordOccDict)\n", 1888 | "\n", 1889 | " return nltkClassifySamples" 1890 | ] 1891 | }, 1892 | { 1893 | "cell_type": "code", 1894 | "execution_count": null, 1895 | "metadata": { 1896 | "collapsed": true 1897 | }, 1898 | "outputs": [], 1899 | "source": [ 1900 | "# nltkClassifyTrain = featNLTKClassify(trainSamples, 'train')\n", 1901 | "# nltkClassifyTest = featNLTKClassify(testSamples, 'test')" 1902 | ] 1903 | }, 1904 | { 1905 | "cell_type": "markdown", 1906 | "metadata": {}, 1907 | "source": [ 1908 | "### Naive Bayes Classifier" 1909 | ] 1910 | }, 1911 | { 1912 | "cell_type": "code", 1913 | "execution_count": null, 1914 | "metadata": { 1915 | "collapsed": true 1916 | }, 1917 | "outputs": [], 1918 | "source": [ 1919 | "# tr = nltkClassifyTrain\n", 1920 | "# te = nltkClassifyTest\n", 1921 | "# classifier = nltk.classify.NaiveBayesClassifier.train(tr)\n", 1922 | "# sorted(classifier.labels())" 1923 | ] 1924 | }, 1925 | { 1926 | "cell_type": "code", 1927 | "execution_count": null, 1928 | "metadata": { 1929 | "collapsed": true 1930 | }, 1931 | "outputs": [], 1932 | "source": [ 1933 | "# classifier.classify_many(te)\n", 1934 | "\n", 1935 | "# classifier.show_most_informative_features()\n", 1936 | "# # print nltk.classify.accuracy(classifier, te)*100" 1937 | ] 1938 | }, 1939 | { 1940 | "cell_type": "markdown", 1941 | "metadata": {}, 1942 | "source": [ 1943 | "### Maximum Entropy Classifier" 1944 | ] 1945 | }, 1946 | { 1947 | "cell_type": "code", 1948 | "execution_count": null, 1949 | "metadata": { 1950 | "collapsed": true 1951 | }, 1952 | "outputs": [], 1953 | "source": [ 1954 | "# from nltk.classify import maxent\n", 1955 | "# tr = nltkClassifyTrain\n", 1956 | "# te = nltkClassifyTest\n", 1957 | "# classifierME = maxent.MaxentClassifier.train(tr, bernoulli=False, encoding=encoding, trace=0)\n", 1958 | "# classifierME.classify_many(te)" 1959 | ] 1960 | }, 1961 | { 1962 | "cell_type": "markdown", 1963 | "metadata": {}, 1964 | "source": [ 1965 | "### Decision Tree Classifier" 1966 | ] 1967 | }, 1968 | { 1969 | "cell_type": "code", 1970 | "execution_count": null, 1971 | "metadata": { 1972 | "collapsed": true 1973 | }, 1974 | "outputs": [], 1975 | "source": [ 1976 | "# tr = nltkClassifyTrain\n", 1977 | "# te = nltkClassifyTest\n", 1978 | "\n", 1979 | "# classifier = nltk.classify.DecisionTreeClassifier.train(tr, entropy_cutoff=0,support_cutoff=0)\n", 1980 | "# sorted(classifier.labels())\n", 1981 | "# print(classifier)\n", 1982 | "# classifier.classify_many(te)" 1983 | ] 1984 | }, 1985 | { 1986 | "cell_type": "markdown", 1987 | "metadata": {}, 1988 | "source": [ 1989 | "### Graphs depicting Categorization Accuracy scores on KFold Stratified Validation on Train data for:" 1990 | ] 1991 | }, 1992 | { 1993 | "cell_type": "code", 1994 | "execution_count": null, 1995 | "metadata": { 1996 | "collapsed": false 1997 | }, 1998 | "outputs": [], 1999 | "source": [ 2000 | "# selectRandomForestScores = []\n", 2001 | "# selectKernelScores = []\n", 2002 | "# selectNeighbourScores = []\n", 2003 | "\n", 2004 | "# train = trainBagVector\n", 2005 | "\n", 2006 | "# params = {'trees':1000, 
'criterion':'gini','random_state':1000}\n", 2007 | "# score = stratifiedKFoldVal(train, YTrain, classifyRandomForestClassifier, params)\n", 2008 | "# print score\n", 2009 | "# selectRandomForestScores.append(score)\n", 2010 | "\n", 2011 | "# params = {'neighbours':25}\n", 2012 | "# score = stratifiedKFoldVal(train, YTrain, classifyKNNClassifier, params)\n", 2013 | "# print score\n", 2014 | "# selectNeighbourScores.append(score)\n", 2015 | "\n", 2016 | "# params = {'kernel':'linear'}\n", 2017 | "# score = stratifiedKFoldVal(train, YTrain, classifyMultiClassSVMClassifier, params)\n", 2018 | "# print score\n", 2019 | "# selectKernelScores.append(score)\n", 2020 | "\n", 2021 | "\n", 2022 | "\n", 2023 | "# train = tfidfTrain\n", 2024 | "\n", 2025 | "# params = {'trees':1000, 'criterion':'gini','random_state':1000}\n", 2026 | "# score = stratifiedKFoldVal(train, YTrain, classifyRandomForestClassifier, params)\n", 2027 | "# print score\n", 2028 | "# selectRandomForestScores.append(score)\n", 2029 | "\n", 2030 | "# params = {'neighbours':25}\n", 2031 | "# score = stratifiedKFoldVal(train, YTrain, classifyKNNClassifier, params)\n", 2032 | "# print score\n", 2033 | "# selectNeighbourScores.append(score)\n", 2034 | "\n", 2035 | "# params = {'kernel':'rbf'}\n", 2036 | "# score = stratifiedKFoldVal(train, YTrain, classifyMultiClassSVMClassifier, params)\n", 2037 | "# print score\n", 2038 | "# selectKernelScores.append(score)\n", 2039 | "\n", 2040 | "\n", 2041 | "# train = np.array(trainStateTransitionVector)\n", 2042 | "\n", 2043 | "# params = {'trees':1000, 'criterion':'gini','random_state':1000}\n", 2044 | "# score = stratifiedKFoldVal(train, YTrain, classifyRandomForestClassifier, params)\n", 2045 | "# print score\n", 2046 | "# selectRandomForestScores.append(score)\n", 2047 | "\n", 2048 | "# params = {'neighbours':25}\n", 2049 | "# score = stratifiedKFoldVal(train, YTrain, classifyKNNClassifier, params)\n", 2050 | "# print score\n", 2051 | "# selectNeighbourScores.append(score)\n", 2052 | "\n", 2053 | "\n", 2054 | "# params = {'kernel':'rbf'}\n", 2055 | "# score = stratifiedKFoldVal(train, YTrain, classifyMultiClassSVMClassifier, params)\n", 2056 | "# print score\n", 2057 | "# selectKernelScores.append(score)\n", 2058 | "\n", 2059 | "\n", 2060 | "# train = np.hstack([trainBagVector, np.array(trainStateTransitionVector)])\n", 2061 | "\n", 2062 | "# params = {'trees':1000, 'criterion':'gini','random_state':1000}\n", 2063 | "# score = stratifiedKFoldVal(train, YTrain, classifyRandomForestClassifier, params)\n", 2064 | "# print score\n", 2065 | "# selectRandomForestScores.append(score)\n", 2066 | "\n", 2067 | "# params = {'neighbours':25}\n", 2068 | "# score = stratifiedKFoldVal(train, YTrain, classifyKNNClassifier, params)\n", 2069 | "# print score\n", 2070 | "# selectNeighbourScores.append(score)\n", 2071 | "\n", 2072 | "# params = {'kernel':'rbf'}\n", 2073 | "# score = stratifiedKFoldVal(train, YTrain, classifyMultiClassSVMClassifier, params)\n", 2074 | "# print score\n", 2075 | "# selectKernelScores.append(score)\n", 2076 | "\n", 2077 | "\n", 2078 | "\n", 2079 | "# print selectRandomForestScores\n", 2080 | "# print selectKernelScores\n", 2081 | "# print selectNeighbourScores" 2082 | ] 2083 | }, 2084 | { 2085 | "cell_type": "code", 2086 | "execution_count": null, 2087 | "metadata": { 2088 | "collapsed": false 2089 | }, 2090 | "outputs": [], 2091 | "source": [ 2092 | "# #Plotting the results\n", 2093 | "# import matplotlib.pyplot as plt\n", 2094 | "# %matplotlib inline\n", 2095 | "# 
plt.plot(selectRandomForestScores, label = \"Random Forest Classifier\")\n", 2096 | "# plt.plot(selectKernelScores, label = \"Multiclass Linear SVM Classifier\")\n", 2097 | "# plt.plot(selectNeighbourScores, label = \"KNN Classifier\")\n", 2098 | "\n", 2099 | "# labels = ['Bag of Words', 'TF-IDF', 'State Transitions', 'Stacked Features 1 & 3']\n", 2100 | "\n", 2101 | "# # You can specify a rotation for the tick labels in degrees or with keywords.\n", 2102 | "# plt.xticks(np.arange(len(labels)), labels, rotation='vertical')\n", 2103 | "\n", 2104 | "# plt.ylabel('Scores')\n", 2105 | "# plt.xlabel('Feature Encoding used')\n", 2106 | "# # Place a legend to the right of this smaller figure.\n", 2107 | "# plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)\n", 2108 | "# plt.show()" 2109 | ] 2110 | }, 2111 | { 2112 | "cell_type": "markdown", 2113 | "metadata": {}, 2114 | "source": [ 2115 | "## Final evaluation results" 2116 | ] 2117 | }, 2118 | { 2119 | "cell_type": "markdown", 2120 | "metadata": {}, 2121 | "source": [ 2122 | "### Bar graph depicting Categorization Accuracy Scores on the different Models." 2123 | ] 2124 | }, 2125 | { 2126 | "cell_type": "code", 2127 | "execution_count": 65, 2128 | "metadata": { 2129 | "collapsed": false 2130 | }, 2131 | "outputs": [], 2132 | "source": [ 2133 | "# import numpy as np\n", 2134 | "# import matplotlib.pyplot as plt\n", 2135 | "# %matplotlib inline\n", 2136 | "# N = 8\n", 2137 | "# publicScore = (80.453, 75.637, 79.887, 80.737, 81.586, 80.170, 80.170, 79.887)\n", 2138 | "# privateScore = (84.136, 80.170, 83.569, 83.003, 83.286, 83.003, 83.003, 83.569)\n", 2139 | "# modelNames = ('RF(50T, Entropy)+Bag of Words', 'RF(150T, Entropy)+TF-IDF', 'RF(50T, Entropy) + State Transition', \n", 2140 | "# 'KNN(5) +Bag of Words', 'KNN(5) + TF-IDF', 'KNN(5) + State Transition',\n", 2141 | "# 'Stack: KNN(5) + ST + BoW', 'Stack: RF(50T, Entropy) + ST + BoW')\n", 2142 | "\n", 2143 | "# ind = np.arange(N) # the x locations for the groups\n", 2144 | "# width = 0.35 # the width of the bars\n", 2145 | "\n", 2146 | "# fig, ax = plt.subplots()\n", 2147 | "# rects1 = ax.bar(ind, publicScore, width, color='m')\n", 2148 | "\n", 2149 | "# rects2 = ax.bar(ind + width, privateScore, width, color='c')\n", 2150 | "\n", 2151 | "# # add some text for labels, title and axes ticks\n", 2152 | "# ax.set_ylabel('Scores')\n", 2153 | "# ax.set_title('Evaluations of submissions using Categorization Accuracy.')\n", 2154 | "# ax.set_xticks(ind + width)\n", 2155 | "# ax.set_xticklabels(modelNames, rotation='vertical')\n", 2156 | "# ax.set_ylim(75,85)\n", 2157 | "\n", 2158 | "# # def autolabel(rects):\n", 2159 | "# # # attach some text labels\n", 2160 | "# # for rect in rects:\n", 2161 | "# # height = rect.get_height()\n", 2162 | "# # ax.text(rect.get_x() + rect.get_width()/2., 1.05*height,\n", 2163 | "# # '%d' % int(height),\n", 2164 | "# # ha='center', va='bottom')\n", 2165 | "\n", 2166 | "# # autolabel(rects1)\n", 2167 | "# # autolabel(rects2)\n", 2168 | "\n", 2169 | "# # Place a legend to the right of this smaller figure.\n", 2170 | "# ax.legend((rects1[0], rects2[0]), ('Public Scores', 'Private Scores'), bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)\n", 2171 | "\n", 2172 | "# plt.show()" 2173 | ] 2174 | }, 2175 | { 2176 | "cell_type": "markdown", 2177 | "metadata": {}, 2178 | "source": [ 2179 | "### Hence we conclude that the best model is kNN using TF-IDF as features !" 
2180 | ] 2181 | }, 2182 | { 2183 | "cell_type": "markdown", 2184 | "metadata": {}, 2185 | "source": [ 2186 | "## Geo Visualization" 2187 | ] 2188 | }, 2189 | { 2190 | "cell_type": "code", 2191 | "execution_count": 83, 2192 | "metadata": { 2193 | "collapsed": true 2194 | }, 2195 | "outputs": [], 2196 | "source": [ 2197 | "def reverseMapLabels(classNo):\n", 2198 | " if classNo == 0:\n", 2199 | " return 'Conscientiousness'\n", 2200 | " elif classNo == 1:\n", 2201 | " return 'Extrovert'\n", 2202 | " elif classNo == 2:\n", 2203 | " return 'Agreeable'\n", 2204 | " elif classNo == 3:\n", 2205 | " return 'Empathetic'\n", 2206 | " elif classNo == 4:\n", 2207 | " return 'Novelty Seeking'\n", 2208 | " elif classNo == 5:\n", 2209 | " return 'Perfectionist'\n", 2210 | " elif classNo == 6:\n", 2211 | " return 'Rigid'\n", 2212 | " elif classNo == 7:\n", 2213 | " return 'Impulsive'\n", 2214 | " elif classNo == 8:\n", 2215 | " return 'Psychopath'\n", 2216 | " elif classNo == 9:\n", 2217 | " return 'Obsessive'\n", 2218 | " else:\n", 2219 | " return None\n" 2220 | ] 2221 | }, 2222 | { 2223 | "cell_type": "code", 2224 | "execution_count": 84, 2225 | "metadata": { 2226 | "collapsed": false, 2227 | "scrolled": true 2228 | }, 2229 | "outputs": [], 2230 | "source": [ 2231 | "import string\n", 2232 | "import matplotlib.cm as cm\n", 2233 | "\n", 2234 | "from mpl_toolkits.basemap import Basemap\n", 2235 | "import matplotlib.pyplot as plt\n", 2236 | "\n", 2237 | "import pandas as pd\n", 2238 | "\n", 2239 | "def GeoPlot(geo_longitude, geo_latitude, labels):\n", 2240 | "\n", 2241 | " fig = plt.figure(figsize=(20,10))\n", 2242 | " \n", 2243 | " raw_data = {'latitude': geo_latitude,'longitude': geo_longitude}\n", 2244 | "\n", 2245 | " df = pd.DataFrame(raw_data, columns = ['latitude', 'longitude'])\n", 2246 | " \n", 2247 | " totSampleLen = len(labels)\n", 2248 | "# print totSampleLen\n", 2249 | " colors = ['blue', 'beige', 'red', 'green', 'magenta', 'yellow', 'cyan', 'aquamarine', 'azure', 'darkkhaki']\n", 2250 | " \n", 2251 | " m = Basemap(projection='gall',lon_0=0,lat_0=0,resolution='i')\n", 2252 | "# x1,y1=map(geo_longitude, geo_latitude)\n", 2253 | " x1,y1 = m(df['longitude'].values, df['latitude'].values)\n", 2254 | "\n", 2255 | "\n", 2256 | " m.drawmapboundary(fill_color='black') # fill to edge\n", 2257 | " m.drawcountries()\n", 2258 | " m.fillcontinents(color='white',lake_color='black')\n", 2259 | " \n", 2260 | "# m.scatter(x1, y1, marker='D',color='m', s=2)\n", 2261 | " for i in xrange(totSampleLen):\n", 2262 | " for k in xrange(10):\n", 2263 | " if labels[i] == k:\n", 2264 | "# print x1[i], y1[i]\n", 2265 | "# print colors[k]\n", 2266 | "# m.scatter(x1[i], y1[i], marker='D',color=colors[k], s=2)\n", 2267 | " m.plot(x1[i], y1[i], 'ro', color=colors[k]) #'ro', markersize=6)\n", 2268 | "\n", 2269 | " \n", 2270 | " for k in xrange(10):\n", 2271 | " m.scatter(0,0, marker='D',color=colors[k], s=2, label=reverseMapLabels(k))\n", 2272 | " \n", 2273 | " plt.title(\"Geo-tagging Personality Types for Twitter Users\")\n", 2274 | " # Place a legend to the right of this smaller figure.\n", 2275 | " plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)\n", 2276 | " plt.show()\n" 2277 | ] 2278 | }, 2279 | { 2280 | "cell_type": "markdown", 2281 | "metadata": {}, 2282 | "source": [ 2283 | "### Visualize Personality Types based on location of user tweets."
2284 | ] 2285 | }, 2286 | { 2287 | "cell_type": "code", 2288 | "execution_count": 85, 2289 | "metadata": { 2290 | "collapsed": false 2291 | }, 2292 | "outputs": [ 2293 | { 2294 | "name": "stdout", 2295 | "output_type": "stream", 2296 | "text": [ 2297 | "\n", 2298 | "(60,)\n" 2299 | ] 2300 | }, 2301 | { 2302 | "ename": "TypeError", 2303 | "evalue": "'numpy.ndarray' object is not callable", 2304 | "output_type": "error", 2305 | "traceback": [ 2306 | "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", 2307 | "\u001b[1;31mTypeError\u001b[0m Traceback (most recent call last)", 2308 | "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m()\u001b[0m\n\u001b[0;32m 5\u001b[0m \u001b[1;32mprint\u001b[0m \u001b[0mtype\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mlat\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 6\u001b[0m \u001b[1;32mprint\u001b[0m \u001b[0mlat\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mshape\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 7\u001b[1;33m \u001b[0mGeoPlot\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mgeo_longitude\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mgeo_latitude\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mYTrain\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;36m0\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;36m60\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m", 2309 | "\u001b[1;32m\u001b[0m in \u001b[0;36mGeoPlot\u001b[1;34m(geo_longitude, geo_latitude, labels)\u001b[0m\n\u001b[0;32m 21\u001b[0m \u001b[0mm\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mBasemap\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mprojection\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;34m'gall'\u001b[0m\u001b[1;33m,\u001b[0m\u001b[0mlon_0\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;36m0\u001b[0m\u001b[1;33m,\u001b[0m\u001b[0mlat_0\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;36m0\u001b[0m\u001b[1;33m,\u001b[0m\u001b[0mresolution\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;34m'i'\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 22\u001b[0m \u001b[1;31m# x1,y1=map(geo_longitude, geo_latitude)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 23\u001b[1;33m \u001b[0mx1\u001b[0m\u001b[1;33m,\u001b[0m\u001b[0my1\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mmap\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdf\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;34m'longitude'\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mvalues\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mdf\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;34m'latitude'\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mvalues\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 24\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 25\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n", 2310 | "\u001b[1;31mTypeError\u001b[0m: 'numpy.ndarray' object is not callable" 2311 | ] 2312 | } 2313 | ], 2314 | "source": [ 2315 | "lon = np.random.random_integers(-180,180,60)\n", 2316 | "lat = np.random.random_integers(-90,90,60)\n", 2317 | "geo_latitude = lat\n", 2318 | "geo_longitude = lon\n", 2319 | "print type(lat)\n", 2320 | "print lat.shape\n", 2321 | "GeoPlot(geo_longitude, geo_latitude, YTrain[0:60])" 2322 | ] 2323 | }, 2324 | { 2325 | "cell_type": "code", 2326 | "execution_count": null, 2327 | "metadata": { 2328 | "collapsed": false 2329 | }, 2330 | "outputs": [], 2331 | "source": [ 2332 | "GeoPlot(eval_geo_longitude[0:1000], eval_geo_latitude[0:1000], YPred[0:1000])" 2333 | ] 2334 | }, 2335 | { 2336 | 
"cell_type": "markdown", 2337 | "metadata": {}, 2338 | "source": [ 2339 | "### Geo-tagging Sentiments of Twitter Users" 2340 | ] 2341 | }, 2342 | { 2343 | "cell_type": "code", 2344 | "execution_count": 86, 2345 | "metadata": { 2346 | "collapsed": true 2347 | }, 2348 | "outputs": [], 2349 | "source": [ 2350 | "def reverseMapSentiments(classNo):\n", 2351 | " if classNo == 0:\n", 2352 | " return 'Negative'\n", 2353 | " elif classNo == 1:\n", 2354 | " return 'Neutral'\n", 2355 | " elif classNo == 2:\n", 2356 | " return 'Positive'\n", 2357 | " else:\n", 2358 | " return None" 2359 | ] 2360 | }, 2361 | { 2362 | "cell_type": "code", 2363 | "execution_count": 87, 2364 | "metadata": { 2365 | "collapsed": false 2366 | }, 2367 | "outputs": [], 2368 | "source": [ 2369 | "def GeoSentimentPlot(geo_longitude, geo_latitude, sentiments):\n", 2370 | "\n", 2371 | " fig = plt.figure(figsize=(20,10))\n", 2372 | " \n", 2373 | " raw_data = {'latitude': geo_latitude,\n", 2374 | " 'longitude': geo_longitude}\n", 2375 | "\n", 2376 | " df = pd.DataFrame(raw_data, columns = ['latitude', 'longitude'])\n", 2377 | "\n", 2378 | " \n", 2379 | " totSampleLen = len(sentiments)\n", 2380 | " colors = ['red', 'blue', 'green']\n", 2381 | " \n", 2382 | " negLimit = 0\n", 2383 | " posLimit = 0\n", 2384 | " \n", 2385 | " m = Basemap(projection='gall',lon_0=0,lat_0=0,resolution='i')\n", 2386 | " \n", 2387 | " x1,y1 = map(df['longitude'].values, df['latitude'].values)\n", 2388 | "\n", 2389 | " m.drawmapboundary(fill_color='black')\n", 2390 | " m.drawcountries()\n", 2391 | " m.fillcontinents(color='white',lake_color='black')\n", 2392 | " \n", 2393 | " for i in xrange(totSampleLen):\n", 2394 | "# print sentiments[i]\n", 2395 | " if sentiments[i] < negLimit:\n", 2396 | " m.plot(x1[i], y1[i], 'ro', color=colors[0])\n", 2397 | " elif sentiments[i] >= negLimit and sentiments[i] <= posLimit:\n", 2398 | " m.plot(x1[i], y1[i], 'ro', color=colors[1])\n", 2399 | " elif sentiments[i] > posLimit:\n", 2400 | " m.plot(x1[i], y1[i], 'ro', color=colors[2])\n", 2401 | " \n", 2402 | " \n", 2403 | " for k in xrange(3):\n", 2404 | " m.scatter(0,0, marker='D',color=colors[k], s=2, label=reverseMapSentiments(k))\n", 2405 | " \n", 2406 | " plt.title(\"Geo-tagging Sentiments of Twitter Users\")\n", 2407 | " # Place a legend to the right of this smaller figure.\n", 2408 | " plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)\n", 2409 | " plt.show()\n" 2410 | ] 2411 | }, 2412 | { 2413 | "cell_type": "markdown", 2414 | "metadata": {}, 2415 | "source": [ 2416 | "### Visualize Sentiment of user tweets based on location." 
2417 | ] 2418 | }, 2419 | { 2420 | "cell_type": "code", 2421 | "execution_count": 88, 2422 | "metadata": { 2423 | "collapsed": false 2424 | }, 2425 | "outputs": [ 2426 | { 2427 | "ename": "TypeError", 2428 | "evalue": "'numpy.ndarray' object is not callable", 2429 | "output_type": "error", 2430 | "traceback": [ 2431 | "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", 2432 | "\u001b[1;31mTypeError\u001b[0m Traceback (most recent call last)", 2433 | "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m()\u001b[0m\n\u001b[1;32m----> 1\u001b[1;33m \u001b[0mGeoSentimentPlot\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mgeo_longitude\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mgeo_latitude\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mXTrainSentiment\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;36m0\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;36m60\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m", 2434 | "\u001b[1;32m\u001b[0m in \u001b[0;36mGeoSentimentPlot\u001b[1;34m(geo_longitude, geo_latitude, sentiments)\u001b[0m\n\u001b[0;32m 17\u001b[0m \u001b[0mm\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mBasemap\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mprojection\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;34m'gall'\u001b[0m\u001b[1;33m,\u001b[0m\u001b[0mlon_0\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;36m0\u001b[0m\u001b[1;33m,\u001b[0m\u001b[0mlat_0\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;36m0\u001b[0m\u001b[1;33m,\u001b[0m\u001b[0mresolution\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;34m'i'\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 18\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 19\u001b[1;33m \u001b[0mx1\u001b[0m\u001b[1;33m,\u001b[0m\u001b[0my1\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mmap\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdf\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;34m'longitude'\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mvalues\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mdf\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;34m'latitude'\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mvalues\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 20\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 21\u001b[0m \u001b[0mm\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mdrawmapboundary\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mfill_color\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;34m'black'\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", 2435 | "\u001b[1;31mTypeError\u001b[0m: 'numpy.ndarray' object is not callable" 2436 | ] 2437 | } 2438 | ], 2439 | "source": [ 2440 | "GeoSentimentPlot(geo_longitude, geo_latitude, XTrainSentiment[0:60])" 2441 | ] 2442 | }, 2443 | { 2444 | "cell_type": "code", 2445 | "execution_count": null, 2446 | "metadata": { 2447 | "collapsed": false 2448 | }, 2449 | "outputs": [], 2450 | "source": [ 2451 | "print len(eval_geo_longitude)\n", 2452 | "eval_geo_longitude = np.array(eval_geo_longitude)\n", 2453 | "eval_geo_latitude = np.array(eval_geo_latitude)\n", 2454 | "print len(eval_geo_longitude)\n", 2455 | "print eval_geo_longitude.shape\n", 2456 | "print type(eval_geo_longitude)" 2457 | ] 2458 | }, 2459 | { 2460 | "cell_type": "code", 2461 | "execution_count": null, 2462 | "metadata": { 2463 | "collapsed": false 2464 | }, 2465 | "outputs": [], 2466 | "source": [ 2467 | "GeoSentimentPlot(eval_geo_longitude[0:1000], eval_geo_latitude[0:1000], XEvalSentiment[0:1000])" 2468 | ] 2469 | }, 2470 | { 2471 | "cell_type": "code", 2472 | 
"execution_count": null, 2473 | "metadata": { 2474 | "collapsed": true 2475 | }, 2476 | "outputs": [], 2477 | "source": [] 2478 | } 2479 | ], 2480 | "metadata": { 2481 | "kernelspec": { 2482 | "display_name": "Python 2", 2483 | "language": "python", 2484 | "name": "python2" 2485 | }, 2486 | "language_info": { 2487 | "codemirror_mode": { 2488 | "name": "ipython", 2489 | "version": 2 2490 | }, 2491 | "file_extension": ".py", 2492 | "mimetype": "text/x-python", 2493 | "name": "python", 2494 | "nbconvert_exporter": "python", 2495 | "pygments_lexer": "ipython2", 2496 | "version": "2.7.11" 2497 | } 2498 | }, 2499 | "nbformat": 4, 2500 | "nbformat_minor": 0 2501 | } 2502 | -------------------------------------------------------------------------------- /Twitter User Personality Prediction/TwitterData/StopWords.txt: -------------------------------------------------------------------------------- 1 | video 2 | [video] 3 | URL 4 | url 5 | pic 6 | [ 7 | ] 8 | ( 9 | ) 10 | " -------------------------------------------------------------------------------- /Twitter User Personality Prediction/TwitterData/UserTweets.txt: -------------------------------------------------------------------------------- 1 | For #GivingTuesday, these soccer charities are very worthy, among many others: @soccerwoborders @sfw_tweets @FCHARLEM 2 | Proud to work w/ organizations that use #football to teach important messages about healthy behaviour #WorldAIDSDay 3 | Obsessed with @AnticoVinaioaFi! So glad we got there before the line. Freakin' killer #porchetta! #streetfood #Italy 4 | What an awesome #selfie @slattykat we love it! Let's see what fun #selfies you can get in … http://ift.tt/1IpwqNC 5 | Quench your thirst for beauty with the most beautiful river in the world [video] http://holykaw.alltop.com/quench-your-thirst-for-beauty-with-the-most-beautiful-river-in-the-world-video?gk1 … -------------------------------------------------------------------------------- /Twitter User Personality Prediction/TwitterData/labeledPersonalityTweets.csv: -------------------------------------------------------------------------------- 1 | |Sentinel|,|For #GivingTuesday, these soccer charities are very worthy, among many others: @soccerwoborders @sfw_tweets @FCHARLEM| 2 | |Diplomat|,|Proud to work w/ organizations that use #football to teach important messages about healthy behaviour #WorldAIDSDay| 3 | |Explorer|,|Obsessed with @AnticoVinaioaFi! So glad we got there before the line. Freakin' killer #porchetta! #streetfood #Italy| 4 | |Analyst|,|What an awesome #selfie @slattykat we love it! 
Let's see what fun #selfies you can get in URL | 5 | |Explorer|,|Quench your thirst for beauty with the most beautiful river in the world [video] URL | -------------------------------------------------------------------------------- /Twitter User Personality Prediction/mmds/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vishalbhalla/Twitter-User-Personality-Prediction/4e0f8641aaea01b550151b150bf4f54437b72179/Twitter User Personality Prediction/mmds/__init__.py -------------------------------------------------------------------------------- /Twitter User Personality Prediction/mmds/supervised/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vishalbhalla/Twitter-User-Personality-Prediction/4e0f8641aaea01b550151b150bf4f54437b72179/Twitter User Personality Prediction/mmds/supervised/__init__.py -------------------------------------------------------------------------------- /Twitter User Personality Prediction/mmds/supervised/classification_algos.py: -------------------------------------------------------------------------------- 1 | from sklearn import linear_model 2 | from sklearn import svm 3 | from sklearn.cross_validation import StratifiedKFold 4 | from sklearn.ensemble import AdaBoostClassifier 5 | from sklearn.ensemble import RandomForestClassifier 6 | from sklearn.neighbors import KNeighborsClassifier 7 | from sklearn.tree import DecisionTreeClassifier 8 | from sknn.mlp import Classifier, Layer 9 | 10 | import numpy as np 11 | 12 | 13 | # Random Forest Classifier 14 | # def classifyRandomForestClassifier(XTrain, XTest, YTrain, YTest,trees=100,crit='gini'): 15 | def classifyRandomForestClassifier(XTrain, XTest, YTrain, YTest, params): 16 | trees = params['trees'] 17 | crit = params['criterion'] 18 | seed = params['random_state'] 19 | clf = RandomForestClassifier(n_estimators=trees, criterion=crit, random_state=seed) 20 | clf.fit(XTrain, YTrain) 21 | YPred = clf.predict(XTest) 22 | diff = YPred - YTest 23 | score = diff[diff == 0].size 24 | return (100.0 * score) / (YPred.size) 25 | 26 | 27 | # In[46]: 28 | 29 | # Multi Class SVM 30 | def classifyMultiClassSVMClassifier(XTrain, XTest, YTrain, YTest, params): 31 | ker = params['kernel'] 32 | YPred = svm.SVC(kernel=ker).fit(XTrain, YTrain).predict(XTest) 33 | diff = YPred - YTest 34 | score = diff[diff == 0].size 35 | return (100.0 * score) / (YPred.size) 36 | 37 | 38 | # In[47]: 39 | 40 | # K Nearest Neighbours Classifier 41 | def classifyKNNClassifier(XTrain, XTest, YTrain, YTest, params): 42 | # print XTrain.shape, XTest.shape 43 | neighbours = params['neighbours'] 44 | neigh = KNeighborsClassifier(n_neighbors=neighbours) 45 | YPred = neigh.fit(XTrain, YTrain).predict(XTest) 46 | diff = YPred - YTest 47 | score = diff[diff == 0].size 48 | return (100.0 * score) / (YPred.size) 49 | 50 | 51 | # In[48]: 52 | 53 | # Logistic Regression 54 | def classifyLogisticRegression(XTrain, XTest, YTrain, YTest, params): 55 | LogReg = linear_model.LogisticRegression(penalty='l2', dual=False, tol=0.0001, C=1.0, fit_intercept=True, intercept_scaling=1, class_weight=None, random_state=None) 56 | LogReg.fit(XTrain, YTrain) 57 | # Finds the optimal model parameters using a least squares method. 
58 | # To get the parameter values: 59 | # LogReg.get_params() 60 | # To predict a new input XTest, 61 | YPred = LogReg.predict(XTest) 62 | diff = YPred - YTest 63 | score = diff[diff == 0].size 64 | return (100.0 * score) / (YPred.size) 65 | 66 | 67 | # In[49]: 68 | 69 | # Adaboost Classifier 70 | def classifyAdaboostClassifier(XTrain, XTest, YTrain, YTest, params): 71 | depth = params['max_depth'] 72 | algo = params['algorithm'] 73 | estimators = params['n_estimators'] 74 | 75 | # Create and fit an AdaBoosted decision tree 76 | bdt = AdaBoostClassifier(DecisionTreeClassifier(max_depth=depth), 77 | algorithm=algo, 78 | n_estimators=estimators) 79 | 80 | bdt.fit(XTrain, YTrain) 81 | YPred = bdt.predict(XTest) 82 | 83 | diff = YPred - YTest 84 | score = diff[diff == 0].size 85 | return (100.0 * score) / (YPred.size) 86 | 87 | 88 | def classifyNeuralNetworkClassifier(XTrain, XTest, YTrain, YTest, params): 89 | activation = params['activation'] 90 | actLastLayer = params['actLastLayer'] 91 | rule = params['rule'] 92 | noOfUnits = params['units'] 93 | rate = params['rate'] 94 | noOfIter = params['iter'] 95 | nn = Classifier(layers=[Layer(activation, units=noOfUnits), Layer(actLastLayer)], learning_rule=rule, 96 | learning_rate=rate, 97 | n_iter=noOfIter) 98 | nn.fit(XTrain, YTrain) 99 | YPred = nn.predict(XTest) 100 | diff = YPred - YTest.reshape(YPred.shape) 101 | score = diff[diff == 0].size 102 | score = (100.0 * score) / (YPred.size) 103 | return score 104 | 105 | 106 | def featNLTKClassify(samples, phase, feature_names, YTrain): 107 | nltkClassifySamples = [] 108 | 109 | for i in xrange(len(samples)): 110 | t = samples[i] 111 | lstFuncCalls = t.split() 112 | wordOccDict = {} 113 | for j in xrange(len(feature_names)): 114 | wordOccDict[feature_names[j]] = lstFuncCalls.count(feature_names[j]) 115 | if phase == 'train': 116 | nltkClassifySamples.append((wordOccDict, YTrain[i])) 117 | else: 118 | nltkClassifySamples.append(wordOccDict) 119 | 120 | return nltkClassifySamples 121 | 122 | 123 | def stratifiedKFoldVal(XTrain, YTrain, classify, params): 124 | n_folds = 5 125 | score = 0.0 126 | skf = StratifiedKFold(YTrain, n_folds) 127 | try: 128 | multi = params['multi'] 129 | except KeyError: 130 | multi = False 131 | for train_index, test_index in skf: 132 | y_train, y_test = YTrain[train_index], YTrain[test_index] 133 | if not multi: 134 | X_train, X_test = XTrain[train_index], XTrain[test_index] 135 | score += classify(X_train, X_test, y_train, y_test, params) 136 | else: 137 | X_train, X_test = [XTrain[i] for i in train_index], [XTrain[i] for i in test_index] 138 | score += classify(np.array(X_train), np.array(X_test), y_train, y_test, params) 139 | 140 | return score / n_folds 141 | 142 | def createStateTransitionVector(categoricalState, stateDict, maxLength): 143 | if categoricalState: 144 | feature = [] 145 | for state in categoricalState.split(' '): 146 | try: 147 | feature.append(stateDict[state.lower()]) 148 | except KeyError: 149 | pass 150 | # print state 151 | if len(feature) != maxLength: 152 | for i in xrange(maxLength - len(feature)): 153 | feature.append(0) 154 | assert(len(feature) == maxLength) 155 | return feature 156 | else: 157 | return [0] * maxLength 158 | 159 | 160 | def createStateVectors(XStates, stateDict, maxLength): 161 | XFeatures = [] 162 | for state in XStates: 163 | XFeatures.append(createStateTransitionVector(state, stateDict, maxLength)) 164 | return XFeatures 165 | 166 | -------------------------------------------------------------------------------- /Twitter User 
Personality Prediction/mmds/supervised/feature_engineering.py: -------------------------------------------------------------------------------- 1 | import csv 2 | from nltk.corpus import stopwords 3 | from textblob.blob import TextBlob 4 | 5 | from mmds.supervised.filter_stop_words import FilterStopWords 6 | from mmds.supervised.preprocess_tweets import PreprocessTweets 7 | 8 | 9 | class FeatureEngineering: 10 | 11 | def __init__(self): 12 | self.name = 'FeatureEngineering' 13 | self.featureList = [] 14 | # self.sid = SentimentIntensityAnalyzer() 15 | 16 | 17 | # start extract_features 18 | def extract_features(self, tweet): 19 | tweet_words = set(tweet) 20 | features = {} 21 | for word in self.featureList: 22 | features['contains(%s)' % word] = (word in tweet_words) 23 | return features 24 | 25 | # # Create New Training set based on personality labels predicted from Survey results 26 | 27 | def createNewTrainingSet(self, training_data_file): 28 | XTrain = [] 29 | YTrain = [] 30 | XTrainFeatures = [] 31 | XTrainSentiment = [] 32 | XTrainFreqTweets = [] 33 | geo_latitude = [] 34 | geo_longitude = [] 35 | 36 | objFilterStopWords = FilterStopWords() 37 | objPreprocessTweets = PreprocessTweets() 38 | 39 | stopWords = objFilterStopWords.getStopWordList('../../TwitterData/StopWords.txt') 40 | 41 | # Read the tweets one by one and process it 42 | inpTweets = csv.reader(open(training_data_file, 'rb'), delimiter=',') 43 | inpTweets.next() 44 | tweets = [] 45 | i = 0 46 | for row in inpTweets: 47 | # print row 48 | personality = row[5] 49 | tweet = row[1] 50 | cleanTweet = tweet.replace('"",""', " ") 51 | cleanTweet = cleanTweet.replace('""', " ") 52 | processedTweet = objPreprocessTweets.processTweet(cleanTweet) 53 | 54 | XTrainFreqTweets.append(int(row[4])) 55 | wordsList = processedTweet.split() 56 | 57 | # Remove stop words 58 | filtered_words = [word for word in wordsList if word not in stopwords.words('english')] 59 | filteredTweets = ' '.join(filtered_words) 60 | 61 | featureVector = objFilterStopWords.getFeatureVector(processedTweet, stopWords) 62 | 63 | geo_latitude.append(float(row[2])) 64 | geo_longitude.append(float(row[3])) 65 | 66 | blob = TextBlob(processedTweet) 67 | sentiment = 0 68 | for sentence in blob.sentences: 69 | sentiment += sentence.sentiment.polarity 70 | 71 | totSentiment = sentiment / len(blob.sentences) 72 | 73 | XTrainSentiment.append(totSentiment) 74 | 75 | XTrainFeatures.append(filteredTweets) 76 | 77 | YTrain.append(personality.replace('[', '').replace('\"', '').replace(']', '')) 78 | 79 | # i+=1 80 | # if i==3: 81 | # break 82 | 83 | 84 | return XTrain, YTrain, XTrainFeatures, XTrainSentiment, XTrainFreqTweets, geo_latitude, geo_longitude 85 | -------------------------------------------------------------------------------- /Twitter User Personality Prediction/mmds/supervised/filter_stop_words.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | class FilterStopWords: 4 | 5 | # stopWords = [] 6 | def __init__(self): 7 | self.name = 'FilterStopWords' 8 | #initialize stopWords 9 | self.stopWords = [] 10 | 11 | 12 | def getStopWordList(self, stopWordListFileName): 13 | #read the stopwords file and build a list 14 | stopWords = [] 15 | stopWords.append('AT_USER') 16 | stopWords.append('URL') 17 | stopWords.append('[') 18 | stopWords.append('[') 19 | 20 | fp = open(stopWordListFileName, 'r') 21 | line = fp.readline() 22 | while line: 23 | word = line.strip() 24 | stopWords.append(word) 25 | line = fp.readline() 26 | 
fp.close() 27 | return stopWords 28 | 29 | def getFeatureVector(self, tweet, stopWords): 30 | featureVector = [] 31 | #split tweet into words 32 | words = tweet.split() 33 | for w in words: 34 | #replace two or more with two occurrences 35 | #w = replaceTwoOrMore(w) 36 | #strip punctuation 37 | w = w.strip('\'"?,.') 38 | #check if the word starts with an alphabet 39 | val = re.search(r"^[a-zA-Z][a-zA-Z0-9]*$", w) 40 | #ignore if it is a stop word 41 | if(w in stopWords or val is None): 42 | continue 43 | else: 44 | featureVector.append(w.lower()) 45 | return featureVector -------------------------------------------------------------------------------- /Twitter User Personality Prediction/mmds/supervised/personality_predictor_and_visualizer.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from mpl_toolkits.basemap import Basemap 3 | from sklearn.ensemble import RandomForestClassifier 4 | from sklearn.feature_extraction.text import CountVectorizer 5 | from sklearn.neighbors import KNeighborsClassifier 6 | from mmds.supervised.feature_engineering import FeatureEngineering 7 | from sklearn import svm 8 | 9 | import matplotlib.pyplot as plt 10 | import numpy as np 11 | import pandas as pd 12 | 13 | 14 | logging.basicConfig(filename="../../supervised.log", level=logging.DEBUG, format="%(asctime)-15s %(threadName)s %(message)s") 15 | 16 | PERSONALITY_LABELS = ['Conscientiousness', 'Extrovert', 'Agreeable', 'Empathetic', 'Novelty Seeking', 'Perfectionist', 'Rigid', 17 | 'Impulsive', 'Psychopath', 'Obsessive'] 18 | 19 | SENTIMENT_LABELS = ['Negative', 'Neutral', 'Positive'] 20 | 21 | def mapLabels(class_name): 22 | if class_name in PERSONALITY_LABELS: 23 | return PERSONALITY_LABELS.index(class_name) 24 | else: 25 | pass 26 | 27 | def writePredictedLabelFile(YPred): 28 | f = open("../../TwitterData/Predictions.csv", "w") 29 | f.write("Id,Label" + "\n") 30 | for i in xrange(len(YPred)): 31 | f.write(str(i) + "," + str(int(YPred[i])) + "\n") 32 | f.close() 33 | 34 | def reverseMapLabels(index): 35 | if index < len(PERSONALITY_LABELS): 36 | return PERSONALITY_LABELS[index] 37 | else: 38 | return None 39 | 40 | def GeoPlot(geo_longitude, geo_latitude, labels): 41 | 42 | fig = plt.figure(figsize=(20, 10)) 43 | 44 | raw_data = {'latitude': geo_latitude, 'longitude': geo_longitude} 45 | 46 | df = pd.DataFrame(raw_data, columns=['latitude', 'longitude']) 47 | 48 | totSampleLen = len(labels) 49 | # print totSampleLen 50 | colors = ['blue', 'beige', 'red', 'green', 'magenta', 'yellow', 'cyan', 'aquamarine', 'azure', 'darkkhaki'] 51 | 52 | m = Basemap(projection='gall', lon_0=0, lat_0=0, resolution='i') 53 | # x1,y1=map(geo_longitude, geo_latitude) 54 | x1, y1 = m(df['longitude'].values, df['latitude'].values) 55 | 56 | 57 | m.drawmapboundary(fill_color='black') # fill to edge 58 | m.drawcountries() 59 | m.fillcontinents(color='white', lake_color='black') 60 | 61 | # m.scatter(x1, y1, marker='D',color='m', s=2) 62 | for i in xrange(totSampleLen): 63 | for k in xrange(10): 64 | if labels[i] == k: 65 | # print x1[i], y1[i] 66 | # print colors[k] 67 | # m.scatter(x1[i], y1[i], marker='D',color=colors[k], s=2) 68 | m.plot(x1[i], y1[i], 'ro', color=colors[k]) # 'ro', markersize=6) 69 | 70 | 71 | for k in xrange(10): 72 | m.scatter(0, 0, marker='D', color=colors[k], s=2, label=reverseMapLabels(k)) 73 | 74 | plt.title("Geo-tagging Personality Types for Twitter Users") 75 | # Place a legend to the right of this smaller figure. 
76 | plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.) 77 | plt.show() 78 | 79 | def reverseMapSentiments(index): 80 | if index < len(SENTIMENT_LABELS): 81 | return SENTIMENT_LABELS[index] 82 | else: 83 | return None 84 | 85 | def GeoSentimentPlot(geo_longitude, geo_latitude, sentiments): 86 | 87 | fig = plt.figure(figsize=(20, 10)) 88 | 89 | raw_data = {'latitude': geo_latitude, 90 | 'longitude': geo_longitude} 91 | 92 | df = pd.DataFrame(raw_data, columns=['latitude', 'longitude']) 93 | 94 | 95 | totSampleLen = len(sentiments) 96 | colors = ['red', 'blue', 'green'] 97 | 98 | negLimit = 0 99 | posLimit = 0 100 | 101 | m = Basemap(projection='gall', lon_0=0, lat_0=0, resolution='i') 102 | 103 | x1, y1 = m(df['longitude'].values, df['latitude'].values) 104 | 105 | m.drawmapboundary(fill_color='black') 106 | m.drawcountries() 107 | m.fillcontinents(color='white', lake_color='black') 108 | 109 | for i in xrange(totSampleLen): 110 | # print sentiments[i] 111 | if sentiments[i] < negLimit: 112 | m.plot(x1[i], y1[i], 'ro', color=colors[0]) 113 | elif sentiments[i] >= negLimit and sentiments[i] <= posLimit: 114 | m.plot(x1[i], y1[i], 'ro', color=colors[1]) 115 | elif sentiments[i] > posLimit: 116 | m.plot(x1[i], y1[i], 'ro', color=colors[2]) 117 | 118 | 119 | for k in xrange(3): 120 | m.scatter(0, 0, marker='D', color=colors[k], s=2, label=reverseMapSentiments(k)) 121 | 122 | plt.title("Geo-tagging Sentiments of Twitter Users") 123 | # Place a legend to the right of this smaller figure. 124 | plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.) 125 | plt.show() 126 | 127 | 128 | if __name__ == "__main__": 129 | """ 130 | Main script starts here. 131 | """ 132 | logging.info("Inside main...") 133 | training_data_file = '../../TwitterData/survey_dump_with_tweet_count' 134 | evauluation_data_file = '../../TwitterData/survey_dump_geo_gt_8_1' 135 | 136 | objFeatureEngineering = FeatureEngineering() 137 | XTrain, YTrain, XTrainFeatures, XTrainSentiment, XTrainFreqTweets, geo_latitude, \ 138 | geo_longitude = objFeatureEngineering.createNewTrainingSet(training_data_file) 139 | 140 | XEval, YEval, XEvalFeatures, XEvalSentiment, XEvalFreqTweets, eval_geo_latitude, \ 141 | eval_geo_longitude = objFeatureEngineering.createNewTrainingSet(evauluation_data_file) 142 | 143 | YTrain = map(mapLabels, YTrain) 144 | YEval = map(mapLabels, YEval) 145 | 146 | XTrain = np.array(XTrainFeatures) 147 | YTrain = np.array(YTrain) 148 | 149 | logging.info("Number of training vectors XTrain:{}, target variables YTrain:{}".format(len(XTrain), len(YTrain))) 150 | 151 | XEval = np.array(XEvalFeatures) 152 | YEval = np.array(YEval) 153 | 154 | logging.info("Number of evaluation vectors XEval:{}, target variables YEval:{}".format(len(XEval), len(YEval))) 155 | 156 | # ### Split Train and Test data 157 | 158 | TRAINING_DATA_SET_SIZE = 60 159 | XTrainSamples = XTrain[0:TRAINING_DATA_SET_SIZE] 160 | YTrainSamples = YTrain[0:TRAINING_DATA_SET_SIZE] 161 | 162 | XTestSamples = XTrain[TRAINING_DATA_SET_SIZE:] 163 | YTestSamples = YTrain[TRAINING_DATA_SET_SIZE:] 164 | 165 | logging.info("No. 
of training samples XTrainSamples:{}, test samples XTestSamples:{}".format(len(XTrainSamples), len(XTestSamples))) 166 | 167 | trainSentimentSamples = np.array(XTrainSentiment[0:TRAINING_DATA_SET_SIZE]) 168 | testSentimentSamples = np.array(XTrainSentiment[TRAINING_DATA_SET_SIZE:]) 169 | trainFreqTweetSamples = np.array(XTrainFreqTweets[0:TRAINING_DATA_SET_SIZE]) 170 | testFreqTweetSamples = np.array(XTrainFreqTweets[TRAINING_DATA_SET_SIZE:]) 171 | 172 | vectorizer = CountVectorizer() 173 | vectorizer.fit_transform(np.array(XTrainFeatures + XEvalFeatures)) 174 | 175 | logging.info("Total features in training and evalution data:{}".format(len(vectorizer.get_feature_names()))) 176 | 177 | XTr = vectorizer.transform(XTrainSamples) 178 | trainBagVector = XTr.toarray() 179 | XTe = vectorizer.transform(XTestSamples) 180 | testBagVector = XTe.toarray() 181 | 182 | XEv = vectorizer.transform(XEval) 183 | evalBagVector = XEv.toarray() 184 | 185 | logging.info("Dimension of training bag:{}, test bag:{}, eval bag".format(trainBagVector.shape, 186 | testBagVector.shape, evalBagVector.shape)) 187 | 188 | # join word features + sentiment + tweet frequency for training samples ... 189 | XTrainAllFeatures = np.column_stack((np.column_stack((trainBagVector, trainSentimentSamples)), trainFreqTweetSamples)) 190 | 191 | # join word features + sentiment + tweet frequency for testing samples ... 192 | XTestAllFeatures = np.column_stack((np.column_stack((testBagVector, testSentimentSamples)), testFreqTweetSamples)) 193 | 194 | # join word features + sentiment + tweet frequency for evalution samples ... 195 | XEvalAllFeatures = np.column_stack((np.column_stack((evalBagVector, XEvalSentiment)), XEvalFreqTweets)) 196 | 197 | logging.info("Dim of all training samples:{}, test samples:{}, eval samples, ytrain :{}".format(XTrainAllFeatures.shape, 198 | XTestAllFeatures.shape, XEvalAllFeatures.shape, YTrainSamples.shape)) 199 | 200 | """K Nearest Neighbourhood""" 201 | params = {'neighbours':25} 202 | neigh = KNeighborsClassifier(n_neighbors=params['neighbours']) 203 | YPred = neigh.fit(XTrainAllFeatures, YTrainSamples).predict(XEvalAllFeatures) 204 | 205 | """Random Forest""" 206 | params = {'trees':150, 'criterion':'entropy', 'random_state':None} 207 | clf = RandomForestClassifier(n_estimators=params['trees'], criterion=params['criterion'], random_state=params['random_state']) 208 | clf.fit(XTrainAllFeatures, YTrainSamples) 209 | YPred = clf.predict(XEvalAllFeatures) 210 | 211 | """SVM""" 212 | params = {'kernel':'rbf'} 213 | YPred = svm.SVC(kernel=params['kernel']).fit(XTrainAllFeatures, YTrainSamples).predict(XEvalAllFeatures) 214 | 215 | GeoPlot(eval_geo_longitude, eval_geo_latitude, YPred) 216 | 217 | GeoSentimentPlot(np.array(eval_geo_longitude), np.array(eval_geo_latitude), XEvalSentiment) 218 | 219 | -------------------------------------------------------------------------------- /Twitter User Personality Prediction/mmds/supervised/preprocess_tweets.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | 4 | class PreprocessTweets: 5 | 6 | def __init__(self): 7 | self.name = 'PreprocessTweets' 8 | 9 | #start process_tweet 10 | def processTweet(self, tweet): 11 | 12 | #Convert to lower case 13 | tweet = tweet.lower() 14 | #Convert www.* or https?://* to URL 15 | tweet = re.sub('((www\.[^\s]+)|(https?://[^\s]+))','URL',tweet) 16 | #Convert @username to AT_USER 17 | tweet = re.sub('@[^\s]+','AT_USER',tweet) 18 | #Remove additional white spaces 19 | tweet = 
re.sub('[\s]+', ' ', tweet) 20 | #Remove special characters 21 | #tweet = re.sub('*\[\]%\(\)', '', tweet) 22 | #Replace #word with word 23 | tweet = re.sub(r'#([^\s]+)', r'\1', tweet) 24 | #trim 25 | tweet = tweet.strip('\'"') 26 | 27 | # Remove all Non-ASCII characters 28 | tweet = re.sub(r'[^\x00-\x7F]+',' ', tweet) 29 | 30 | return tweet 31 | -------------------------------------------------------------------------------- /Twitter User Personality Prediction/mmds/supervised/tweet_analysis.py: -------------------------------------------------------------------------------- 1 | 2 | # import preprocess_tweets 3 | # import filter_stop_words 4 | from mmds.supervised.filter_stop_words import FilterStopWords 5 | from mmds.supervised.preprocess_tweets import PreprocessTweets 6 | 7 | #Read the tweets one by one and process it 8 | fp = open('../../TwitterData/UserTweets.txt', 'r') 9 | line = fp.readline() 10 | 11 | objFilterStopWords = FilterStopWords() 12 | objPreprocessTweets = PreprocessTweets() 13 | 14 | st = open('../../TwitterData/StopWords.txt', 'r') 15 | stopWords = objFilterStopWords.getStopWordList('../../TwitterData/StopWords.txt') 16 | 17 | while line: 18 | processedTweet = objPreprocessTweets.processTweet(line) 19 | featureVector = objFilterStopWords.getFeatureVector(processedTweet, stopWords) 20 | print featureVector 21 | line = fp.readline() 22 | #end loop 23 | fp.close() 24 | -------------------------------------------------------------------------------- /Twitter User Personality Prediction/mmds/unsupervised/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vishalbhalla/Twitter-User-Personality-Prediction/4e0f8641aaea01b550151b150bf4f54437b72179/Twitter User Personality Prediction/mmds/unsupervised/__init__.py -------------------------------------------------------------------------------- /Twitter User Personality Prediction/mmds/unsupervised/k_means_estimator.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import csv 3 | import logging 4 | from collections import Counter 5 | from scipy.sparse import csr_matrix 6 | from scipy.sparse.coo import coo_matrix 7 | from sklearn.cluster.k_means_ import KMeans 8 | from sklearn.feature_extraction.dict_vectorizer import DictVectorizer 9 | from textblob.blob import TextBlob 10 | from textblob.en.np_extractors import ConllExtractor 11 | from textblob.en.taggers import NLTKTagger 12 | from mmds.utils.time_utils import time_it 13 | 14 | 15 | 16 | class KMeansEstimator: 17 | """ 18 | This class reads the tweets of users from a file and builds cluster centers on that data. It also provides 19 | method for finding the closest cluster center of unseen data. 20 | """ 21 | 22 | ADJECTIVE = 'JJ' 23 | 24 | """ 25 | Feature keys used in clustering... 26 | """ 27 | POLARITY_FEATURE_KEY = 'polarity' 28 | SUBJECTIVITY_FEATURE_KEY = 'subjectivity' 29 | TWEET_COUNT_FEATURE_KEY = 'tweetCount' 30 | """ 31 | Features not considered for clustering... 32 | """ 33 | USER_ID_FEATURE_KEY = 'userId' 34 | LONGITUDE_FEATURE_KEY = 'longitude' 35 | LATITUDE_FEATURE_KEY = 'latitude' 36 | 37 | 38 | """ 39 | Predicted label feature name. 40 | """ 41 | LABEL_FEATURE_KEY = 'label' 42 | 43 | RELEVENT_FEATURE_LIST = [USER_ID_FEATURE_KEY, LATITUDE_FEATURE_KEY, LONGITUDE_FEATURE_KEY, LABEL_FEATURE_KEY] 44 | 45 | def __init__(self, tweet_file_path, no_of_clusters): 46 | """ 47 | The constructor reads csv file and builds the data matrix. 
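The csv file at "tweet_file_path" is expected to contain a header row followed by one row per user with the columns (user_id, tweet_text, latitude, longitude, tweet_count), matching the column indices read in __get_data_matrix_from_file below.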
48 | """ 49 | self.np_extractor = ConllExtractor() 50 | self.pos_tagger = NLTKTagger() 51 | self.tweet_file_path = tweet_file_path 52 | self.data_matrix = self.__get_data_matrix_from_file(tweet_file_path) 53 | self.vectorizer = DictVectorizer(sparse=True) 54 | self.k_means_estimator = KMeans(init="random", n_clusters=no_of_clusters) 55 | 56 | @time_it 57 | def __get_data_matrix_from_file(self, tweet_file_path): 58 | """ 59 | Reads tweets from csv file at path "tweet_file_path", extracts features from the tweets and returns list 60 | of all feature vectors. 61 | """ 62 | file_reader = csv.reader(open(tweet_file_path, "rb"), delimiter=',') 63 | next(file_reader) 64 | data_matrix = [] 65 | for row in file_reader: 66 | logging.info("Extracting features for user_id:%s", row[0]) 67 | feature_vector = {} 68 | feature_vector[self.USER_ID_FEATURE_KEY] = int(row[0]) 69 | feature_vector[self.LATITUDE_FEATURE_KEY] = float(row[2]) 70 | feature_vector[self.LONGITUDE_FEATURE_KEY] = float(row[3]) 71 | feature_vector[self.TWEET_COUNT_FEATURE_KEY] = int(row[4]) 72 | feature_vector.update(self.__get_features_from_tweet_text(row[1].decode('utf-8'))) 73 | data_matrix.append(feature_vector) 74 | logging.info("Successfully extracted features for user_id:%s", row[0]) 75 | return data_matrix 76 | 77 | @time_it 78 | def __get_features_from_tweet_text(self, tweet_text): 79 | """This function returns the following features from the tweet text: 80 | - Adjectives and their corresponding frequencies found in the tweet. Each adjective is a separate feature. 81 | - Subjectivity and polarity as determined by TextBlob. 82 | :returns: (key,value) map of all features found. 83 | """ 84 | text_blob = TextBlob(tweet_text, np_extractor=self.np_extractor, pos_tagger=self.pos_tagger); 85 | adjective_map = dict(Counter((ele[0] for ele in set(text_blob.pos_tags) if ele[1] == self.ADJECTIVE))) 86 | polarity = text_blob.sentiment[0] 87 | subjectivity = text_blob.sentiment[1] 88 | return dict(adjective_map.items() + {self.POLARITY_FEATURE_KEY:polarity, self.SUBJECTIVITY_FEATURE_KEY:subjectivity}.items()) 89 | 90 | @time_it 91 | def __get_clustering_data_matrix(self, data_matrix): 92 | """ 93 | This method removes unnecessary features(features like user_id which are not relevant for building cluster centers) from 94 | the data matrix and returns a copy of the data matrix. 95 | """ 96 | data_matrix_copy = copy.deepcopy(data_matrix) 97 | for feature_vector in data_matrix_copy: 98 | feature_vector.pop(self.USER_ID_FEATURE_KEY) 99 | feature_vector.pop(self.LATITUDE_FEATURE_KEY) 100 | feature_vector.pop(self.LONGITUDE_FEATURE_KEY) 101 | return data_matrix_copy 102 | 103 | 104 | @time_it 105 | def perform_clustering(self, features_to_include=None): 106 | """ 107 | This function performs k-means clustering with "no_of_clusters" clusters of the data present in file at 108 | "tweet_file_path". 109 | It returns list of feature vector, where each feature vector contains only "features_to_include" or all features 110 | if "features_to_include" is None. 
111 | """ 112 | clustering_data_matrix = self.__get_clustering_data_matrix(self.data_matrix) 113 | transformed_data_matrix = self.vectorizer.fit_transform(clustering_data_matrix) 114 | 115 | self.k_means_estimator.fit(transformed_data_matrix, y=None) 116 | return self.__get_predicted_labels(self.data_matrix, features_to_include) 117 | 118 | @time_it 119 | def __get_predicted_labels(self, data_matrix, features_to_include): 120 | """ 121 | Finds the nearest cluster for all data points and adds a new feature label in all feature vectors of data matrix. The 122 | data matrix is modified in place. 123 | It returns a new copy of data_matrix with "features_to_include" features. 124 | """ 125 | feature_names = self.vectorizer.get_feature_names() 126 | for feature_vector in data_matrix: 127 | row = [0] * len(feature_names) 128 | column = range(len(feature_names)) 129 | data = map(lambda feature_name:feature_vector[feature_name] if feature_name in feature_vector else 0, feature_names) 130 | feature_csr_matrix = csr_matrix(coo_matrix((data, (row, column)))) 131 | predicted_label = self.k_means_estimator.predict(feature_csr_matrix) 132 | feature_vector[self.LABEL_FEATURE_KEY] = predicted_label[0] 133 | 134 | expanded_data_matrix = self.__get_expanded_data_matrix(data_matrix) 135 | if features_to_include: 136 | return self.__get_filtered_data_matrix(expanded_data_matrix, features_to_include) 137 | else: 138 | return expanded_data_matrix 139 | 140 | @time_it 141 | def __get_filtered_data_matrix(self, data_matrix, features_to_include): 142 | """ 143 | Removes all features except features_to_include 144 | """ 145 | filtered_data_matrix = [] 146 | for feature_vector in data_matrix: 147 | filtered_feature_vector = {} 148 | for feature_name in features_to_include: 149 | filtered_feature_vector[feature_name] = feature_vector[feature_name] 150 | filtered_data_matrix.append(filtered_feature_vector) 151 | return filtered_data_matrix 152 | 153 | @time_it 154 | def __get_expanded_data_matrix(self, data_matrix): 155 | """ 156 | Adds new keys for missing features to all feature vectors of data_matrix. The data matrix is not modified, but a new 157 | modified copy is returned. 158 | """ 159 | feature_names = self.vectorizer.get_feature_names() 160 | expanded_data_matrix = copy.deepcopy(data_matrix) 161 | for feature_vector in expanded_data_matrix: 162 | for feature_name in feature_names: 163 | if feature_name not in feature_vector: 164 | feature_vector[feature_name] = 0 165 | return expanded_data_matrix 166 | 167 | @time_it 168 | def predict_labels_for_data(self, file_path, features_to_include=None): 169 | """ 170 | This function reads the tweets of different users from the file at file_path and assigns the closest 171 | cluster center to each user. 172 | It returns list of tuples of (user_id,predicted_label,latitude, longitude). 173 | """ 174 | data_matrix = self.__get_data_matrix_from_file(file_path) 175 | return self.__get_predicted_labels(data_matrix, features_to_include) 176 | 177 | 178 | def write_dict_list_to_csv(dict_list, file_name): 179 | """ 180 | Saves the list of dictionaries to file at "file_name". Each dictionary should have same set of keys. 
181 | """ 182 | file_writer = csv.DictWriter(open(file_name, "w"), dict_list[0].keys()) 183 | file_writer.writeheader() 184 | file_writer.writerows(dict_list) 185 | 186 | if __name__ == "__main__": 187 | input_file = "../../TwitterData/survey_dump_with_geo_gt_8" 188 | output_file = "../../TwitterData/k_means_geo_gt_8_out" 189 | no_of_clusters = 10 190 | clusterd_data = KMeansEstimator(input_file, no_of_clusters).perform_clustering(KMeansEstimator.RELEVENT_FEATURE_LIST) 191 | logging.info("Input file:%s, output file:%s, no of clusters:%d", input_file, output_file, no_of_clusters) 192 | write_dict_list_to_csv(clusterd_data, output_file) 193 | logging.info("Written predicted labels for %d users in file:%s", len(clusterd_data), output_file) 194 | 195 | -------------------------------------------------------------------------------- /Twitter User Personality Prediction/mmds/unsupervised/k_means_plot.py: -------------------------------------------------------------------------------- 1 | import csv 2 | 3 | from mmds.utils.plot_utils import GeoMap, COLORS 4 | 5 | 6 | if __name__ == "__main__": 7 | data_file = "../../TwitterData/k_means_geo_gt_8_out" 8 | file_reader = csv.reader(open(data_file, "r")) 9 | next(file_reader) 10 | GeoMap().plot_points(file_reader, lambda row: COLORS[int(row[3])], lambda row:[row[0], row[2]]) 11 | -------------------------------------------------------------------------------- /Twitter User Personality Prediction/mmds/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vishalbhalla/Twitter-User-Personality-Prediction/4e0f8641aaea01b550151b150bf4f54437b72179/Twitter User Personality Prediction/mmds/utils/__init__.py -------------------------------------------------------------------------------- /Twitter User Personality Prediction/mmds/utils/plot_utils.py: -------------------------------------------------------------------------------- 1 | from matplotlib import pyplot 2 | from mpl_toolkits.basemap import Basemap 3 | 4 | 5 | COLORS = ['green', 'red', 'blue', 'yellow', 'purple', 'olive', 'khaki', 'indigo', 'aquamarine', 'orange'] 6 | class GeoMap: 7 | 8 | def plot_points(self, data_points, color_provider, coord_mapper): 9 | """ 10 | Plots the list of data point("data_points") on geo map. 11 | "color_provider" is the mapper function to map a data row to the corresponding color of the data point. 12 | "coord_mapper" is the mapper function to map a data row to the [latitude, langitude] of the data point. 13 | """ 14 | base_map = Basemap(projection='robin', lat_0=0, lon_0=0, resolution='l', area_thresh=1000.0) 15 | base_map.drawcoastlines() 16 | base_map.drawcountries() 17 | base_map.fillcontinents() 18 | for row in data_points: 19 | latitude, longitude = coord_mapper(row) 20 | x, y = base_map(longitude, latitude) 21 | base_map.plot(x, y, marker='o', color=color_provider(row), markersize=4) 22 | pyplot.show() 23 | 24 | -------------------------------------------------------------------------------- /Twitter User Personality Prediction/mmds/utils/time_utils.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import time 3 | 4 | logging.basicConfig(filename="timing.log", level=logging.DEBUG, format="%(asctime)-15s %(threadName)s %(message)s") 5 | 6 | def time_it(func): 7 | """ 8 | A decorator for timing the execution time of functions. 
9 | """ 10 | def decorator(*args, **kwargs): 11 | start_time = time.time() 12 | result = func(*args, **kwargs) 13 | end_time = time.time() 14 | logging.info("Execution time : {}() = {}sec".format(func.__name__, end_time - start_time)) 15 | return result 16 | return decorator 17 | --------------------------------------------------------------------------------