├── src ├── __init__.py ├── load_data.py ├── tweetstorm.py ├── tweetokenizer.py ├── vader_sentiment.py ├── part_of_speech.py ├── text_emotion.py ├── time_of_day.py ├── ridge_grid_scan.py ├── feature_pipeline.py └── style.py ├── images ├── 1.png ├── 2.png ├── 3.png ├── 4.png ├── ridge.png ├── trump.png ├── neutral.png ├── trump2.png ├── not_trump.png ├── ridge_sns.png ├── flynn_tweet.png ├── oval_office.jpg ├── trump_blank.png ├── trump_robot.jpg ├── trump_ticker.gif ├── trump_clapping.jpg ├── trump_thumbs_up.jpg ├── flynn_tweet_poll.png ├── trump_clapping_lg.jpg └── trump_tweet_birds.gif ├── requirements.txt ├── .gitignore ├── penn_part_of_speech_tags.txt ├── twitterbot.py ├── twitterbot_rf.py ├── twitterbot_knn.py ├── twitterbot_mini_ensemble.py ├── README.md └── TweetAuthorshipPredictor.py /src/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /images/1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/raffg/trump-tweet-author-identification/HEAD/images/1.png -------------------------------------------------------------------------------- /images/2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/raffg/trump-tweet-author-identification/HEAD/images/2.png -------------------------------------------------------------------------------- /images/3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/raffg/trump-tweet-author-identification/HEAD/images/3.png -------------------------------------------------------------------------------- /images/4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/raffg/trump-tweet-author-identification/HEAD/images/4.png -------------------------------------------------------------------------------- /images/ridge.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/raffg/trump-tweet-author-identification/HEAD/images/ridge.png -------------------------------------------------------------------------------- /images/trump.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/raffg/trump-tweet-author-identification/HEAD/images/trump.png -------------------------------------------------------------------------------- /images/neutral.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/raffg/trump-tweet-author-identification/HEAD/images/neutral.png -------------------------------------------------------------------------------- /images/trump2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/raffg/trump-tweet-author-identification/HEAD/images/trump2.png -------------------------------------------------------------------------------- /images/not_trump.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/raffg/trump-tweet-author-identification/HEAD/images/not_trump.png -------------------------------------------------------------------------------- /images/ridge_sns.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/raffg/trump-tweet-author-identification/HEAD/images/ridge_sns.png -------------------------------------------------------------------------------- /images/flynn_tweet.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/raffg/trump-tweet-author-identification/HEAD/images/flynn_tweet.png -------------------------------------------------------------------------------- /images/oval_office.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/raffg/trump-tweet-author-identification/HEAD/images/oval_office.jpg -------------------------------------------------------------------------------- /images/trump_blank.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/raffg/trump-tweet-author-identification/HEAD/images/trump_blank.png -------------------------------------------------------------------------------- /images/trump_robot.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/raffg/trump-tweet-author-identification/HEAD/images/trump_robot.jpg -------------------------------------------------------------------------------- /images/trump_ticker.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/raffg/trump-tweet-author-identification/HEAD/images/trump_ticker.gif -------------------------------------------------------------------------------- /images/trump_clapping.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/raffg/trump-tweet-author-identification/HEAD/images/trump_clapping.jpg -------------------------------------------------------------------------------- /images/trump_thumbs_up.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/raffg/trump-tweet-author-identification/HEAD/images/trump_thumbs_up.jpg -------------------------------------------------------------------------------- /images/flynn_tweet_poll.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/raffg/trump-tweet-author-identification/HEAD/images/flynn_tweet_poll.png -------------------------------------------------------------------------------- /images/trump_clapping_lg.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/raffg/trump-tweet-author-identification/HEAD/images/trump_clapping_lg.jpg -------------------------------------------------------------------------------- /images/trump_tweet_birds.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/raffg/trump-tweet-author-identification/HEAD/images/trump_tweet_birds.gif -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | matplotlib==2.1.0 2 | tweepy==3.5.0 3 | nltk==3.2.1 4 | requests==2.18.4 5 | pandas==0.20.3 6 | numpy==1.13.3 7 | Pillow==5.0.0 8 | beautifulsoup4==4.6.0 9 | secrets==1.0.2 10 | scikit_learn==0.19.1 11 | vaderSentiment==2.5 12 | 
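Note: the NLTK-based feature code in src/ (word_tokenize and pos_tag in src/part_of_speech.py and src/text_emotion.py) also needs NLTK's data packages, which installing the pinned requirements above does not fetch. A minimal one-time setup sketch, assuming the nltk version listed in requirements.txt:

    import nltk

    # tokenizer models used by word_tokenize throughout src/
    nltk.download('punkt')
    # tagger model used by pos_tag in src/part_of_speech.py
    nltk.download('averaged_perceptron_tagger')

The Stanford NER jar and the NRC Emotion Lexicon referenced in .gitignore below still have to be downloaded separately.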
-------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Raw data 2 | data/ 3 | 4 | # .env 5 | .env/ 6 | 7 | # Jupyter Notebooks 8 | *.ipynb_checkpoints 9 | *.ipynb 10 | 11 | # .pyc files 12 | *.pyc 13 | 14 | # twitterscraper query 15 | twitterscraper_query.txt 16 | 17 | # Stanford Named Entity Recognition (NER) library 18 | stanford-ner/ 19 | 20 | # NRC Sentiment-Emotion Lexicons 21 | NRC-Sentiment-Emotion-Lexicons/ 22 | 23 | # pickle files 24 | *.pkl 25 | 26 | # .npy and .npz files 27 | *.npy 28 | *.npz 29 | 30 | # presentation folder 31 | presentation/ 32 | 33 | # ensemble pickles 34 | ensemble/ 35 | -------------------------------------------------------------------------------- /src/load_data.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | 4 | def load_json_list(list): 5 | ''' 6 | Takes a list of json files, loads them, and concatenates them 7 | INPUT: a list of json files 8 | OUTPUT: a single concatenated DataFrame 9 | ''' 10 | 11 | files = [] 12 | for file in list: 13 | df = pd.read_json(file) 14 | files.append(df) 15 | return pd.concat(files) 16 | 17 | 18 | def apply_date_mask(df, date_column, start_date, end_date): 19 | ''' 20 | applies mask to a df to include only dates within the given date range 21 | INPUT: a DataFrame, the name of the datetime column, start and end dates 22 | OUTPUT: a DataFrame with a datetime index, sorted by datetime 23 | ''' 24 | 25 | mask = (df[date_column] > start_date) & (df[date_column] <= end_date) 26 | return df.loc[mask] 27 | -------------------------------------------------------------------------------- /src/tweetstorm.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | 4 | def tweetstorm(df, tweet, source, timestamp, time_threshold): 5 | ''' 6 | Takes a DataFrame with a specified column containing tweets, a specified 7 | column identifying the source of the tweet, a specified column indicating 8 | the timestamp of the tweet, and a threshold in seconds defining the 9 | maximimum time which can pass between tweets to define a tweetstorm 10 | INPUT: DataFrame, string, string, string, int 11 | OUTPUT: the original DataFrame with one new column 12 | ''' 13 | 14 | temp = pd.DataFrame() 15 | df = df.copy() 16 | temp['time_diff'] = df.groupby(source)[timestamp].diff().dt.total_seconds() 17 | temp['time_diff_prev'] = temp['time_diff'].shift(-1) 18 | df['tweetstorm'] = temp.eval('time_diff < @time_threshold | \ 19 | time_diff_prev < @time_threshold') 20 | return df 21 | -------------------------------------------------------------------------------- /src/tweetokenizer.py: -------------------------------------------------------------------------------- 1 | from tweetokenize import Tokenizer 2 | 3 | 4 | def tweet_tokens(tweet): 5 | ''' 6 | Takes a tweet and replaces mentions, hashtags, urls, times, and numbers 7 | with a generic label 8 | INPUT: string 9 | OUTPUT: string 10 | ''' 11 | 12 | gettokens = Tokenizer(usernames='USER', urls='URL', 13 | hashtags='HASHTAG', times='TIME', 14 | numbers='NUMBER', allcapskeep=True, 15 | lowercase=False) 16 | tokens = gettokens.tokenize(tweet) 17 | tweet = ' '.join(tokens) 18 | 19 | return tweet 20 | 21 | 22 | def tweet_tokenize(df, column): 23 | ''' 24 | Takes a Data Frame and a specified column of tweets and creates a new 25 | column with the tweetokenized tweet 26 | 
INPUT: DateFrame, string 27 | OUTPUT: the original DataFrame with one new column 28 | ''' 29 | 30 | new_df = df.copy() 31 | new_df['tweetokenize'] = new_df['text'].apply(tweet_tokens) 32 | return new_df 33 | -------------------------------------------------------------------------------- /penn_part_of_speech_tags.txt: -------------------------------------------------------------------------------- 1 | 1. CC Coordinating conjunction 2 | 2. CD Cardinal number 3 | 3. DT Determiner 4 | 4. EX Existential there 5 | 5. FW Foreign word 6 | 6. IN Preposition or subordinating conjunction 7 | 7. JJ Adjective 8 | 8. JJR Adjective, comparative 9 | 9. JJS Adjective, superlative 10 | 10. LS List item marker 11 | 11. MD Modal 12 | 12. NN Noun, singular or mass 13 | 13. NNS Noun, plural 14 | 14. NNP Proper noun, singular 15 | 15. NNPS Proper noun, plural 16 | 16. PDT Predeterminer 17 | 17. POS Possessive ending 18 | 18. PRP Personal pronoun 19 | 19. PRP$ Possessive pronoun 20 | 20. RB Adverb 21 | 21. RBR Adverb, comparative 22 | 22. RBS Adverb, superlative 23 | 23. RP Particle 24 | 24. SYM Symbol 25 | 25. TO to 26 | 26. UH Interjection 27 | 27. VB Verb, base form 28 | 28. VBD Verb, past tense 29 | 29. VBG Verb, gerund or present participle 30 | 30. VBN Verb, past participle 31 | 31. VBP Verb, non-3rd person singular present 32 | 32. VBZ Verb, 3rd person singular present 33 | 33. WDT Wh-determiner 34 | 34. WP Wh-pronoun 35 | 35. WP$ Possessive wh-pronoun 36 | 36. WRB Wh-adverb 37 | -------------------------------------------------------------------------------- /src/vader_sentiment.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer 3 | from sklearn.preprocessing import normalize 4 | from sklearn import preprocessing 5 | 6 | 7 | def get_vader_scores(text): 8 | ''' 9 | Takes a string of text and outputs four values for Vader's negative, 10 | neutral, positive, and compound (normalized) sentiment scores 11 | INPUT: a string 12 | OUTPUT: a dictionary of four sentiment scores 13 | ''' 14 | 15 | analyser = SentimentIntensityAnalyzer() 16 | return analyser.polarity_scores(text) 17 | 18 | 19 | def apply_vader(df, column): 20 | ''' 21 | Takes a DataFrame with a specified column of text and adds four new columns 22 | to the DataFrame, corresponding to the Vader sentiment scores 23 | INPUT: DataFrame, string 24 | OUTPUT: the original DataFrame with four additional columns 25 | ''' 26 | 27 | sentiment = pd.DataFrame(df[column].apply(get_vader_scores)) 28 | unpacked = pd.DataFrame([d for idx, d in sentiment[column].iteritems()], 29 | index=sentiment.index) 30 | unpacked['compound'] += 1 31 | columns = {'neu': 'v_neutral', 'pos': 'v_positive', 'neg': 'v_negative'} 32 | unpacked.rename(columns=columns, inplace=True) 33 | return pd.concat([df, unpacked], axis=1) 34 | -------------------------------------------------------------------------------- /src/part_of_speech.py: -------------------------------------------------------------------------------- 1 | from nltk import word_tokenize, pos_tag 2 | from nltk.tag import StanfordNERTagger 3 | 4 | 5 | def pos_tagging(text): 6 | ''' 7 | Takes a string of words and returns a string with parts-of-speech of words 8 | INPUT: string 9 | OUTPUT: string 10 | ''' 11 | pos = pos_tag(word_tokenize(text)) 12 | string = "" 13 | for item in pos: 14 | string += item[1] + " " 15 | return string 16 | 17 | 18 | def ner_tagging(text): 19 | ''' 20 | Takes a 
tweetokenized string of words and uses the Stanford NER Tagger to 21 | replace names, places, and organizations with a standard token 22 | INPUT: string 23 | OUTPUT: string 24 | ''' 25 | st = StanfordNERTagger('stanford-ner/classifiers/english.all.3class.' 26 | 'distsim.crf.ser.gz', 'stanford-ner/stanford-ner.' 27 | 'jar', encoding='utf-8') 28 | ner = st.tag(word_tokenize(text)) 29 | string = "" 30 | for item in ner: 31 | if item[1] == 'O': 32 | if item[0] == '<' or item[0] == '@': 33 | string += item[0] 34 | elif item[0] == '>': 35 | string = string[:-1] + item[0] + ' ' 36 | else: 37 | string += item[0] + ' ' 38 | else: 39 | string += item[1] + ' ' 40 | tweet = '' 41 | for word in string.split(): 42 | if word.isupper(): 43 | tweet += word + ' ' 44 | else: 45 | tweet += word.lower() + ' ' 46 | return tweet 47 | -------------------------------------------------------------------------------- /src/text_emotion.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from nltk import word_tokenize 3 | from nltk.stem.snowball import SnowballStemmer 4 | 5 | 6 | def text_emotion(df, column): 7 | ''' 8 | Takes a DataFrame and a specified column of text and adds 10 columns to the 9 | DataFrame for each of the 10 emotions in the NRC Emotion Lexicon, with each 10 | column containing the value of the text in that emotions 11 | INPUT: DataFrame, string 12 | OUTPUT: the original DataFrame with ten new columns 13 | ''' 14 | 15 | new_df = df.copy() 16 | 17 | filepath = ('NRC-Sentiment-Emotion-Lexicons/' 18 | 'NRC-Emotion-Lexicon-v0.92/' 19 | 'NRC-Emotion-Lexicon-Wordlevel-v0.92.txt') 20 | emolex_df = pd.read_csv(filepath, 21 | names=["word", "emotion", "association"], 22 | sep='\t') 23 | emolex_words = emolex_df.pivot(index='word', 24 | columns='emotion', 25 | values='association').reset_index() 26 | emotions = emolex_words.columns.drop('word') 27 | emo_df = pd.DataFrame(0, index=df.index, columns=emotions) 28 | 29 | stemmer = SnowballStemmer("english") 30 | 31 | for i, row in new_df.iterrows(): 32 | document = word_tokenize(new_df.loc[i][column]) 33 | for word in document: 34 | word = stemmer.stem(word.lower()) 35 | emo_score = emolex_words[emolex_words.word == word] 36 | if not emo_score.empty: 37 | for emotion in list(emotions): 38 | emo_df.at[i, emotion] += emo_score[emotion] 39 | 40 | new_df = pd.concat([new_df, emo_df], axis=1) 41 | 42 | return new_df 43 | -------------------------------------------------------------------------------- /src/time_of_day.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | 4 | 5 | def time_of_day(df, timestamp): 6 | ''' 7 | Takes a DataFrame and a specified column containing a timestamp and creates 8 | a new column indicating the hour of the day 9 | INPUT: DataFrame, string 10 | OUTPUT: the original DataFrame with one new column 11 | ''' 12 | 13 | new_df = df.copy() 14 | new_df['hour'] = new_df[timestamp].dt.hour 15 | return new_df 16 | 17 | 18 | def period_of_day(df, timestamp): 19 | ''' 20 | Takes a DataFrame and a specified column containing a timestamp and creates 21 | a new column indicating the period of the day in 6-hour increments 22 | INPUT: DataFrame, string 23 | OUTPUT: the original DataFrame with one new column 24 | ''' 25 | 26 | new_df = df.copy() 27 | new_df['hour_20_02'] = np.where(((new_df['created_at'].dt.hour >= 20) | 28 | (new_df['created_at'].dt.hour < 2)), 29 | True, False) 30 | new_df['hour_14_20'] = 
np.where(((new_df['created_at'].dt.hour >= 14) & 31 | (new_df['created_at'].dt.hour < 20)), 32 | True, False) 33 | new_df['hour_08_14'] = np.where(((new_df['created_at'].dt.hour >= 8) & 34 | (new_df['created_at'].dt.hour < 14)), 35 | True, False) 36 | new_df['hour_02_08'] = np.where(((new_df['created_at'].dt.hour >= 2) & 37 | (new_df['created_at'].dt.hour < 8)), 38 | True, False) 39 | return new_df 40 | 41 | 42 | def day_of_week(df, timestamp): 43 | ''' 44 | Takes a DataFrame and a specified column containing a timestamp and creates 45 | a new column indicating the day of the week 46 | INPUT: DataFrame, string 47 | OUTPUT: the original DataFrame with one new column 48 | ''' 49 | new_df = df.copy() 50 | new_df['day_of_week'] = new_df[timestamp].dt.weekday 51 | 52 | return new_df 53 | 54 | 55 | def weekend(df, day_of_week): 56 | ''' 57 | Takes a DataFrame and a specified column containing a day of the week and 58 | creates a new column indicating if the day occurs on a weekend 59 | INPUT: DataFrame, string 60 | OUTPUT: the original DataFrame with one new column 61 | ''' 62 | new_df = df.copy() 63 | new_df['weekend'] = new_df[day_of_week].apply(lambda x: 1 if x in [5, 6] else 0) 64 | 65 | return new_df 66 | -------------------------------------------------------------------------------- /src/ridge_grid_scan.py: -------------------------------------------------------------------------------- 1 | from sklearn.linear_model import RidgeClassifier 2 | 3 | 4 | def ridge_grid_scan(X_train, y_train, n=100): 5 | ''' 6 | Recursively performs ridge regression to sort all features in order of 7 | importance and return the top n features 8 | INPUT: X DataFrame, y DataFrame, int 9 | OUTPUT: list of feature importances 10 | ''' 11 | 12 | scan = GridScan(X_train, y_train, n) 13 | return scan.feature_importances 14 | 15 | 16 | class GridScan(object): 17 | ''' 18 | Grid scan object to track alpha levels of ridge regression and features 19 | which are driven out of the model at each alpha levels 20 | ''' 21 | 22 | def __init__(self, X_train, y_train, n): 23 | self.X_train = X_train 24 | self.y_train = y_train 25 | self.n = n 26 | alpha_min = 1e-8 27 | alpha_max = 1e24 28 | self.alpha_levels = {} 29 | self.feature_importances = [] 30 | 31 | self.ridge(alpha_min) 32 | self.ridge(alpha_max) 33 | while len(self.alpha_levels[alpha_max]) < len(self.X_train.columns): 34 | print('alpha too low; increasing value') 35 | alpha_max *= 2 36 | self.ridge(alpha_max) 37 | 38 | self.scan(alpha_min, alpha_max) 39 | 40 | self.feature_importances.sort(key=lambda feature_alpha: 41 | -feature_alpha[1]) 42 | 43 | def scan(self, lower, upper): 44 | ''' 45 | Takes a lower and upper bound for alpha levels and recursively runs 46 | ridge regression until only one feature is eliminated from the model 47 | INPUT: int, int 48 | OUTPUT: 49 | ''' 50 | 51 | mid = (lower + upper) / 2 52 | 53 | if len(self.alpha_levels[upper]) <= (len(self.X_train.columns) - 54 | self.n): 55 | return 56 | 57 | diff = self.alpha_levels[upper] - self.alpha_levels[lower] 58 | if not diff: 59 | return 60 | 61 | if len(diff) == 1: 62 | for feature in diff: 63 | self.feature_importances.append((feature, mid)) 64 | print('========') 65 | print(len(self.feature_importances), feature) 66 | print('{:0.1f}% complete'.format(( 67 | len(self.feature_importances) / self.n) * 100)) 68 | print('========') 69 | return 70 | 71 | self.ridge(mid) 72 | 73 | self.scan(lower, mid) 74 | self.scan(mid, upper) 75 | 76 | def ridge(self, alpha): 77 | ''' 78 | Takes an alpha level and runs 
ridge regression 79 | INPUT: float 80 | OUTPUT: 81 | ''' 82 | 83 | print(alpha) 84 | model = RidgeClassifier(alpha=alpha) 85 | 86 | model.fit(self.X_train, self.y_train) 87 | 88 | feat_coef = list(zip(self.X_train.columns, model.coef_[0])) 89 | 90 | features = set() 91 | for element in feat_coef: 92 | if abs(element[1]) < 1e-24: 93 | features.add(element[0]) 94 | 95 | self.alpha_levels[alpha] = features 96 | -------------------------------------------------------------------------------- /twitterbot.py: -------------------------------------------------------------------------------- 1 | import json 2 | import tweepy 3 | import pandas as pd 4 | import pickle 5 | from time import sleep 6 | from TweetAuthorshipPredictor import TweetAuthorshipPredictor 7 | 8 | 9 | credentials = json.load(open('.env/twitter_credentials.json')) 10 | 11 | consumer_key = credentials['consumer_key'] 12 | consumer_secret = credentials['consumer_secret'] 13 | access_token = credentials['access_token'] 14 | access_token_secret = credentials['access_token_secret'] 15 | 16 | auth = tweepy.OAuthHandler(consumer_key, consumer_secret) 17 | auth.set_access_token(access_token, access_token_secret) 18 | api = tweepy.API(auth) 19 | 20 | realDonaldTrump = '25073877' 21 | # realDonaldTrump = '14649582' # test 22 | 23 | with open('twitterbot_pickles/trump.pkl', 'rb') as trump: 24 | print('Loading model...') 25 | model = pickle.load(trump) 26 | 27 | 28 | class TrumpStreamListener(tweepy.StreamListener): 29 | 30 | def on_status(self, status): 31 | if status.author.id_str == realDonaldTrump: 32 | tweet = pd.DataFrame(columns=['created_at', 33 | 'favorite_count', 34 | 'id_str', 35 | 'in_reply_to_user_id_str', 36 | 'is_retweet', 37 | 'retweet_count', 38 | 'source', 39 | 'text']) 40 | tweet.loc[0] = [status.created_at, 41 | status.favorite_count, 42 | status.id_str, 43 | status.in_reply_to_user_id_str, 44 | status.retweeted, 45 | status.retweet_count, 46 | status.source, 47 | status.text] 48 | prediction = predict_author(tweet) 49 | post_tweet(status, prediction) 50 | 51 | def on_error(self, status_code): 52 | if status_code == 420: 53 | # returning False in on_data disconnects the stream 54 | print('Hit rate limit, pausing 60 seconds') 55 | sleep(60) 56 | return True 57 | 58 | 59 | def post_tweet(status, prediction): 60 | '''Takes a tweet, formats the response, and posts to Twitter 61 | INPUT: string 62 | OUTPUT: 63 | ''' 64 | url = ('https://twitter.com/' + status.user.screen_name + 65 | '/status/' + status.id_str) 66 | text = str(status.text) 67 | if len(text) >= 114: 68 | text = text[:114] + '…' 69 | 70 | proba = .99 if prediction[1] > .99 else prediction[1] 71 | 72 | if prediction[0] == 0: 73 | tweet = ('I am {0:.0%} confident an aide wrote this:\n' 74 | '"{1}"\n' 75 | '@realDonaldTrump {2}'. 76 | format((1 - proba), text, url)) 77 | else: 78 | tweet = ('I am {0:.0%} confident Trump wrote this:\n' 79 | '"{1}"\n' 80 | '@realDonaldTrump {2}'. 
81 | format(proba, text, url)) 82 | print(tweet) 83 | print() 84 | api.update_status(tweet) 85 | 86 | 87 | def predict_author(tweet): 88 | return model.predict(tweet) 89 | 90 | 91 | def first_tweet(api): 92 | api.update_with_media('images/trump_ticker.gif', 93 | status="Stay tuned!...") 94 | 95 | 96 | def start_stream(): 97 | while True: 98 | try: 99 | trumpstream = tweepy.Stream(auth, trumpstreamlistener) 100 | trumpstream.filter(follow=[realDonaldTrump]) 101 | except: 102 | continue 103 | 104 | 105 | trumpstreamlistener = TrumpStreamListener() 106 | print('Ready!') 107 | start_stream() 108 | -------------------------------------------------------------------------------- /src/feature_pipeline.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | from src.vader_sentiment import apply_vader 4 | from src.text_emotion import text_emotion 5 | from src.style import apply_avg_lengths, tweet_length, punctuation_columns, \ 6 | quoted_retweet, apply_all_caps, mention_hashtag_url, \ 7 | mention_start, random_capitalization 8 | from src.tweetstorm import tweetstorm 9 | from src.time_of_day import time_of_day, period_of_day, day_of_week, weekend 10 | from src.part_of_speech import pos_tagging, ner_tagging 11 | from src.tweetokenizer import tweet_tokenize, tweet_tokens 12 | 13 | 14 | def feature_pipeline(df, verbose=False): 15 | # ========================================================================= 16 | # Feature engineering 17 | # ========================================================================= 18 | if verbose: 19 | print() 20 | print('Feature engineering') 21 | 22 | # Dummify is_reply column 23 | if verbose: 24 | print(' dummifying is_reply column') 25 | df['in_reply_to_user_id_str'].fillna(0, inplace=True) 26 | df['is_reply'] = np.where(df['in_reply_to_user_id_str'], 1, 0) 27 | 28 | # Create columns for vader sentiment 29 | if verbose: 30 | print(' calculating vader sentiment') 31 | df = apply_vader(df, 'text') 32 | 33 | # Create columns for NRC Emotion Lexicon 34 | if verbose: 35 | print(' calculating NRC Emotion Lexicon score') 36 | df = text_emotion(df, 'text') 37 | 38 | # Create columns for average tweet, sentence, and word length of tweet 39 | if verbose: 40 | print(' calculating average sentence and word length') 41 | df = apply_avg_lengths(df, 'text') 42 | 43 | # Create columns for counts of punctuation 44 | if verbose: 45 | print(' calculating punctuation counts') 46 | punctuation_dict = {'commas': ',', 'semicolons': ';', 'exclamations': '!', 47 | 'periods': '.', 'questions': '?', 'quotes': '"', 48 | 'ellipses': '...'} 49 | 50 | df = punctuation_columns(df, 'text', punctuation_dict) 51 | 52 | # Create columns for counts of @mentions, #hashtags, and urls 53 | if verbose: 54 | print(' calculating mentions, hashtags, and url counts') 55 | df = mention_hashtag_url(df, 'text') 56 | 57 | # Create column identifying if the tweet is surrounding by quote marks 58 | if verbose: 59 | print(' calculating quoted retweet') 60 | df = quoted_retweet(df, 'text') 61 | 62 | # Create column indicating the count of fully capitalized words in a tweet 63 | if verbose: 64 | print(' calculating fully capitalized word counts') 65 | df = apply_all_caps(df, 'text') 66 | 67 | # Create column identifying if the tweet is part of a tweetstorm 68 | # if verbose: 69 | # print(' calculating tweetstorm') 70 | # df = tweetstorm(df, 'text', 'source', 'created_at', 600) 71 | 72 | # Create column identifying the hour of the day that 
the tweet was posted 73 | if verbose: 74 | print(' calculating time of day') 75 | df = time_of_day(df, 'created_at') 76 | 77 | # Create column identifying the day of the week that the tweet was posted 78 | if verbose: 79 | print(' calculating day of week') 80 | df = day_of_week(df, 'created_at') 81 | 82 | # Create column identifying if the day of the week occurred on a weekend 83 | if verbose: 84 | print(' calculating weekend') 85 | df = weekend(df, 'day_of_week') 86 | 87 | # Create column identifying the period of the day, in 6-hour increments 88 | if verbose: 89 | print(' calculating period of day') 90 | df = period_of_day(df, 'created_at') 91 | 92 | # Create column finding the number of randomly capitalized words 93 | if verbose: 94 | print(' calculating randomly capitalized words') 95 | df = random_capitalization(df, 'text') 96 | 97 | # Create column of tweetokenize tweets 98 | if verbose: 99 | print(' calculating tweetokenize tweets') 100 | df = tweet_tokenize(df, 'text') 101 | 102 | # Create column identifying if the tweet begins with an @mentions 103 | if verbose: 104 | print(' calculating @mention beginnings') 105 | df['start_mention'] = df['tweetokenize'].apply(mention_start) 106 | 107 | # Part of speech tagging 108 | if verbose: 109 | print(' calculating part of speech') 110 | df['pos'] = df['tweetokenize'].apply(pos_tagging) 111 | 112 | # Create ner column for Name Entity Recognition 113 | if verbose: 114 | print() 115 | print('Performing NER') 116 | df['ner'] = df['tweetokenize'].apply(ner_tagging) 117 | 118 | return df.drop(['source'], axis=1) 119 | -------------------------------------------------------------------------------- /src/style.py: -------------------------------------------------------------------------------- 1 | import re 2 | import pandas as pd 3 | from src import tweetokenizer as t 4 | 5 | 6 | def sentence_word_length(text): 7 | ''' 8 | Finds the average length of sentences and words in a given text 9 | INPUT: string 10 | OUTPUT: float(average sentence length), float(average word length) 11 | ''' 12 | 13 | sentence_lengths = [] 14 | word_lengths = [] 15 | sentences = [s.strip() for s in re.split('[\.\?!]', text) if s] 16 | for sentence in sentences: 17 | words = sentence.split() 18 | word_lengths = word_lengths + [len(word) for word in words] 19 | sentence_length = len(words) 20 | sentence_lengths.append(sentence_length) 21 | return (sum(sentence_lengths) / float(len(sentence_lengths)), 22 | sum(word_lengths) / float(len(word_lengths))) 23 | 24 | 25 | def apply_avg_lengths(df, column): 26 | ''' 27 | Takes a DataFrame with a specified column of text and adds two new columns 28 | to the DataFrame, corresponding to the average sentence and word lengths 29 | INPUT: DataFrame, string 30 | OUTPUT: the original DataFrame with two additional columns 31 | ''' 32 | 33 | avg_lengths = pd.DataFrame(df[column].apply(sentence_word_length)) 34 | unpacked = pd.DataFrame([d for idx, d in avg_lengths[column].iteritems()], 35 | index=avg_lengths.index) 36 | unpacked.columns = ['avg_sentence_length', 'avg_word_length'] 37 | return pd.concat([df, unpacked], axis=1) 38 | 39 | 40 | def tweet_length(df, column): 41 | ''' 42 | Takes a DataFrame and the name of a column of text and creates a new 43 | column containing the count of characters of the text 44 | INPUT: DataFrame, string 45 | OUTPUT: the original DataFrame, with one new column 46 | ''' 47 | 48 | new_df = df.copy() 49 | new_df['tweet_length'] = new_df[column].str.len() 50 | return new_df 51 | 52 | 53 | def 
count_character(text, character): 54 | ''' 55 | Takes a text string and a character and outputs the number of occurances 56 | of that character in the text 57 | INPUT: text string, character string 58 | OUTPUT: int 59 | ''' 60 | 61 | return text.count(character) 62 | 63 | 64 | def punctuation_columns(df, column, punctuation_dict): 65 | ''' 66 | Takes a DataFrame, a column of text, and a dictionary with keys = character 67 | names and values = character, for example {'comma':','}. Creates new 68 | columns containing the number of occurances specified punctuation 69 | INPUT: DataFrame, string of column name, dictionary 70 | OUTPUT: original DataFrame with new columns 71 | ''' 72 | 73 | new_df = df.copy() 74 | for idx in range(len(punctuation_dict)): 75 | col = pd.DataFrame(df[column].apply(count_character, 76 | character=list(punctuation_dict.values())[idx])) 77 | col.columns = [list(punctuation_dict.keys())[idx]] 78 | new_df = pd.concat([new_df, col], axis=1) 79 | 80 | return new_df 81 | 82 | 83 | def mention_hashtag_url(df, column): 84 | ''' 85 | Takes a DataFrame and a specified column of tweetokenized tweets and 86 | creates new columns containing the count of @mentions, #hashtags, and URLs 87 | in the tweet 88 | INPUT: DataFrame, string 89 | OUTPUT: the original DataFrame with four new columns 90 | ''' 91 | 92 | new_df = t.tweet_tokenize(df, 'text') 93 | new_df['mentions'] = new_df['tweetokenize'].apply( 94 | lambda x: x.count('')) 95 | new_df['hashtags'] = new_df['tweetokenize'].apply( 96 | lambda x: x.count('')) 97 | new_df['urls'] = new_df['tweetokenize'].apply( 98 | lambda x: x.count('')) 99 | return new_df 100 | 101 | 102 | def identify_quoted_retweet(text): 103 | ''' 104 | Takes a string of text and returns 1 if the text begins with '"@' and a 0 105 | if not 106 | INPUT: string 107 | OUTPUT: int 108 | ''' 109 | 110 | return (0 if re.match('^"@', text) is None else 1) 111 | 112 | 113 | def quoted_retweet(df, column): 114 | ''' 115 | Takes a DataFrame and a column of text and creates a new colun with 1 if 116 | the text is fully surrounded by quote marks and a 0 if not 117 | INPUT: DataFrame, String of column name 118 | OUPUT: original DataFrame with one new column 119 | ''' 120 | 121 | quote = pd.DataFrame(df[column].apply(identify_quoted_retweet), 122 | index=df.index) 123 | quote.columns = ['is_quoted_retweet'] 124 | return pd.concat([df, quote], axis=1) 125 | 126 | 127 | def all_caps(text): 128 | ''' 129 | Takes a string of text and counts the number of ALL UPPERCASE words 130 | INPUT: string 131 | OUTPUT: int 132 | ''' 133 | 134 | return (len(re.findall('\s([A-Z][A-Z]+)', text))) 135 | 136 | 137 | def apply_all_caps(df, column): 138 | ''' 139 | Takes a DataFrame and a specified column of text and creates a new column 140 | with the count of fully capitalized words in the text 141 | INPUT: DataFrame, string 142 | OUTPUT: the original DataFrame with one new column 143 | ''' 144 | 145 | new_df = df.copy() 146 | new_df['all_caps'] = new_df[column].apply(all_caps) 147 | return new_df 148 | 149 | 150 | def random_capitalization(df, column): 151 | ''' 152 | Takes a DataFrame and a specified column of text and creates a new column 153 | with the count of randomly capitalized words in the text 154 | INPUT: DataFrame, string 155 | OUTPUT: the original DataFrame with one new column 156 | ''' 157 | 158 | new_df = df.copy() 159 | exp = r"(?" and 168 | 0 if not. 
169 | INPUT: string 170 | OUTPUT: int 171 | ''' 172 | return 1 if text[:6] == '' else 0 173 | -------------------------------------------------------------------------------- /twitterbot_rf.py: -------------------------------------------------------------------------------- 1 | import json 2 | import tweepy 3 | import pandas as pd 4 | import pickle 5 | from TweetAuthorshipPredictor import TweetAuthorshipPredictor 6 | from src.feature_pipeline import feature_pipeline 7 | 8 | 9 | credentials = json.load(open('.env/twitter_credentials.json')) 10 | 11 | consumer_key = credentials['consumer_key'] 12 | consumer_secret = credentials['consumer_secret'] 13 | access_token = credentials['access_token'] 14 | access_token_secret = credentials['access_token_secret'] 15 | 16 | auth = tweepy.OAuthHandler(consumer_key, consumer_secret) 17 | auth.set_access_token(access_token, access_token_secret) 18 | api = tweepy.API(auth) 19 | 20 | realDonaldTrump = '25073877' 21 | # realDonaldTrump = '14649582' 22 | 23 | with open('twitterbot_pickles/rf.pkl', 'rb') as trump: 24 | model = pickle.load(trump) 25 | 26 | std = ['compound', 'anger', 'anticipation', 'disgust', 'fear', 27 | 'joy', 'negative', 'positive', 'sadness', 'surprise', 28 | 'trust', 'tweet_length', 'avg_sentence_length', 29 | 'avg_word_length', 'commas', 'semicolons', 'exclamations', 30 | 'periods', 'questions', 'quotes', 'ellipses', 'mentions', 31 | 'hashtags', 'urls', 'all_caps', 'hour', 'random_caps'] 32 | 33 | feat = ['created_at', 'is_retweet', 'text', 'is_reply', 34 | 'compound', 'v_negative', 'v_neutral', 'v_positive', 35 | 'anger', 'anticipation', 'disgust', 'fear', 'joy', 36 | 'negative', 'positive', 'sadness', 'surprise', 'trust', 37 | 'tweet_length', 'avg_sentence_length', 'avg_word_length', 38 | 'commas', 'semicolons', 'exclamations', 'periods', 39 | 'questions', 'quotes', 'ellipses', 'mentions', 'hashtags', 40 | 'urls', 'is_quoted_retweet', 'all_caps', 'tweetstorm', 41 | 'hour', 'hour_20_02', 'hour_14_20', 'hour_08_14', 42 | 'hour_02_08', 'day_of_week', 'weekend', 'random_caps', 43 | 'start_mention', 'ner', 'pos'] 44 | 45 | 46 | def load_pickle(filename): 47 | # Open pickle filename 48 | print('Pickle load', filename) 49 | with open(filename, 'rb') as f: 50 | return pickle.load(f) 51 | 52 | 53 | tfidf_pos = load_pickle('twitterbot_pickles/tfidf_pos.pkl') 54 | tfidf_ner = load_pickle('twitterbot_pickles/tfidf_ner.pkl') 55 | tfidf_text = load_pickle('twitterbot_pickles/tfidf_text.pkl') 56 | text_cols = tfidf_text.get_feature_names() 57 | ner_cols = tfidf_ner.get_feature_names() 58 | pos_cols = tfidf_pos.get_feature_names() 59 | scaler = load_pickle('twitterbot_pickles/scaler.pkl') 60 | top_feats = load_pickle('twitterbot_pickles/top_feats.pkl') 61 | 62 | 63 | class TrumpStreamListener(tweepy.StreamListener): 64 | 65 | def on_status(self, status): 66 | if status.author.id_str == realDonaldTrump: 67 | tweet = df = pd.DataFrame(columns=['created_at', 68 | 'favorite_count', 69 | 'id_str', 70 | 'in_reply_to_user_id_str', 71 | 'is_retweet', 72 | 'retweet_count', 73 | 'source', 74 | 'text']) 75 | 76 | tweet.loc[0] = [status.created_at, 77 | status.favorite_count, 78 | status.id_str, 79 | status.in_reply_to_user_id_str, 80 | status.retweeted, 81 | status.retweet_count, 82 | status.source, 83 | status.text] 84 | prediction = predict_author(tweet) 85 | post_tweet(status, prediction) 86 | 87 | 88 | def post_tweet(status, prediction): 89 | '''Takes a tweet, formats the response, and posts to Twitter 90 | INPUT: string 91 | OUTPUT: 92 | ''' 93 | url = 
str('https://twitter.com/realDonaldTrump/status/' + status.id_str) 94 | url = ('https://twitter.com/' + status.user.screen_name + 95 | '/status/' + status.id_str) 96 | text = str(status.text) 97 | 98 | if prediction[0] == 0: 99 | proba = .99 if prediction[1][0][0] > .99 else prediction[1][0][0] 100 | tweet = ('I am {0:.0%} confident an aide wrote this:\n"{1}..."' 101 | '\n@realDonaldTrump\n' 102 | '{2}'. 103 | format(proba, text[:150], url)) 104 | else: 105 | proba = .99 if prediction[1][0][1] > .99 else prediction[1][0][1] 106 | tweet = ('I am {0:.0%} confident Trump wrote this:\n"{1}..."' 107 | '\n@realDonaldTrump\n' 108 | '{2}'. 109 | format(proba, text[:150], url)) 110 | print(tweet) 111 | print() 112 | api.update_status(tweet) 113 | 114 | 115 | def predict_author(tweet): 116 | X, X_std = prepare_data_for_predict(tweet) 117 | X = X[top_feats[:200]] 118 | return model.predict(X), model.predict_proba(X) 119 | 120 | 121 | def first_tweet(api): 122 | api.update_with_media('images/trump_ticker.gif', 123 | status="Stay tuned!...") 124 | 125 | 126 | def prepare_data_for_predict(X): 127 | ''' Processes the X data with all features and standardizes. 128 | ''' 129 | # Create new feature columns 130 | X = feature_pipeline(X) 131 | X = tfidf_transform(X[feat]) 132 | X_std = standardize(X) 133 | return X, X_std 134 | 135 | 136 | def tfidf_transform(X): 137 | '''Performs a tf-idf transform on the given column of data 138 | ''' 139 | X.reset_index(drop=True, inplace=True) 140 | _tfidf_text = tfidf_text.transform(X['text']) 141 | _tfidf_text = pd.DataFrame(_tfidf_text.todense(), 142 | columns=[text_cols]) 143 | 144 | _tfidf_ner = tfidf_ner.transform(X['ner']) 145 | _tfidf_ner = pd.DataFrame(_tfidf_ner.todense(), 146 | columns=[ner_cols]) 147 | 148 | _tfidf_pos = tfidf_pos.transform(X['pos']) 149 | _tfidf_pos = pd.DataFrame(_tfidf_pos.todense(), 150 | columns=[pos_cols]) 151 | 152 | X = tfidf_remove_dups(X, _tfidf_text, _tfidf_pos, _tfidf_ner) 153 | 154 | return X 155 | 156 | 157 | def tfidf_remove_dups(X, tfidf_text, tfidf_pos, tfidf_ner): 158 | '''Removes columns in tfidf_pos and tfidf_ner that are duplicates from 159 | tfidf_text, and concatentates the DataFrames 160 | ''' 161 | # Drop ner columns also present in tfidf_text 162 | columns_to_keep = [x for x in tfidf_ner 163 | if x not in tfidf_text] 164 | tfidf_ner = tfidf_ner[columns_to_keep] 165 | 166 | # Drop pos columns also present in ner 167 | columns_to_keep = [x for x in tfidf_pos 168 | if x not in tfidf_ner] 169 | tfidf_pos = tfidf_pos[columns_to_keep] 170 | 171 | X = pd.concat([X, tfidf_text, tfidf_pos, tfidf_ner], axis=1) 172 | return X 173 | 174 | 175 | def standardize(X): 176 | print('Performing Standardization') 177 | X_std = X.copy() 178 | cols = X[std].columns 179 | X_std[std] = pd.DataFrame(scaler.transform( 180 | X[std]), 181 | index=X.index, 182 | columns=cols) 183 | return X_std 184 | 185 | 186 | def start_stream(): 187 | while True: 188 | try: 189 | trumpstream = tweepy.Stream(auth, trumpstreamlistener) 190 | trumpstream.filter(follow=[realDonaldTrump]) 191 | except: 192 | continue 193 | 194 | 195 | trumpstreamlistener = TrumpStreamListener() 196 | start_stream() 197 | -------------------------------------------------------------------------------- /twitterbot_knn.py: -------------------------------------------------------------------------------- 1 | import json 2 | import tweepy 3 | import pandas as pd 4 | import pickle 5 | from TweetAuthorshipPredictor import TweetAuthorshipPredictor 6 | from src.feature_pipeline import 
feature_pipeline 7 | 8 | 9 | credentials = json.load(open('.env/twitter_credentials.json')) 10 | 11 | consumer_key = credentials['consumer_key'] 12 | consumer_secret = credentials['consumer_secret'] 13 | access_token = credentials['access_token'] 14 | access_token_secret = credentials['access_token_secret'] 15 | 16 | auth = tweepy.OAuthHandler(consumer_key, consumer_secret) 17 | auth.set_access_token(access_token, access_token_secret) 18 | api = tweepy.API(auth) 19 | 20 | realDonaldTrump = '25073877' 21 | # realDonaldTrump = '14649582' 22 | 23 | with open('pickle/ensemble_knn.pkl', 'rb') as trump: 24 | model = pickle.load(trump) 25 | 26 | std = ['compound', 'anger', 'anticipation', 'disgust', 'fear', 27 | 'joy', 'negative', 'positive', 'sadness', 'surprise', 28 | 'trust', 'tweet_length', 'avg_sentence_length', 29 | 'avg_word_length', 'commas', 'semicolons', 'exclamations', 30 | 'periods', 'questions', 'quotes', 'ellipses', 'mentions', 31 | 'hashtags', 'urls', 'all_caps', 'hour', 'random_caps'] 32 | 33 | feat = ['created_at', 'is_retweet', 'text', 'is_reply', 34 | 'compound', 'v_negative', 'v_neutral', 'v_positive', 35 | 'anger', 'anticipation', 'disgust', 'fear', 'joy', 36 | 'negative', 'positive', 'sadness', 'surprise', 'trust', 37 | 'tweet_length', 'avg_sentence_length', 'avg_word_length', 38 | 'commas', 'semicolons', 'exclamations', 'periods', 39 | 'questions', 'quotes', 'ellipses', 'mentions', 'hashtags', 40 | 'urls', 'is_quoted_retweet', 'all_caps', 'tweetstorm', 41 | 'hour', 'hour_20_02', 'hour_14_20', 'hour_08_14', 42 | 'hour_02_08', 'day_of_week', 'weekend', 'random_caps', 43 | 'start_mention', 'ner', 'pos'] 44 | 45 | 46 | def load_pickle(filename): 47 | # Open pickle filename 48 | print('Pickle load', filename) 49 | with open(filename, 'rb') as f: 50 | return pickle.load(f) 51 | 52 | 53 | knn_pca = load_pickle('twitterbot_pickles/knn_pca.pkl') 54 | tfidf_pos = load_pickle('twitterbot_pickles/tfidf_pos.pkl') 55 | tfidf_ner = load_pickle('twitterbot_pickles/tfidf_ner.pkl') 56 | tfidf_text = load_pickle('twitterbot_pickles/tfidf_text.pkl') 57 | text_cols = tfidf_text.get_feature_names() 58 | ner_cols = tfidf_ner.get_feature_names() 59 | pos_cols = tfidf_pos.get_feature_names() 60 | scaler = load_pickle('twitterbot_pickles/scaler.pkl') 61 | top_feats = load_pickle('twitterbot_pickles/top_feats.pkl') 62 | 63 | 64 | class TrumpStreamListener(tweepy.StreamListener): 65 | 66 | def on_status(self, status): 67 | if status.author.id_str == realDonaldTrump: 68 | tweet = df = pd.DataFrame(columns=['created_at', 69 | 'favorite_count', 70 | 'id_str', 71 | 'in_reply_to_user_id_str', 72 | 'is_retweet', 73 | 'retweet_count', 74 | 'source', 75 | 'text']) 76 | 77 | tweet.loc[0] = [status.created_at, 78 | status.favorite_count, 79 | status.id_str, 80 | status.in_reply_to_user_id_str, 81 | status.retweeted, 82 | status.retweet_count, 83 | status.source, 84 | status.text] 85 | prediction = predict_author(tweet) 86 | post_tweet(status, prediction) 87 | 88 | 89 | def post_tweet(status, prediction): 90 | '''Takes a tweet, formats the response, and posts to Twitter 91 | INPUT: string 92 | OUTPUT: 93 | ''' 94 | url = str('https://twitter.com/realDonaldTrump/status/' + status.id_str) 95 | url = ('https://twitter.com/' + status.user.screen_name + 96 | '/status/' + status.id_str) 97 | text = str(status.text) 98 | 99 | if prediction[0] == 0: 100 | proba = .99 if prediction[1][0][0] > .99 else prediction[1][0][0] 101 | tweet = ('I am {0:.0%} confident an aide wrote this:\n"{1}..."' 102 | '\n@realDonaldTrump\n' 
103 | '{2}'. 104 | format(proba, text[:150], url)) 105 | else: 106 | proba = .99 if prediction[1][0][1] > .99 else prediction[1][0][1] 107 | tweet = ('I am {0:.0%} confident Trump wrote this:\n"{1}..."' 108 | '\n@realDonaldTrump\n' 109 | '{2}'. 110 | format(proba, text[:150], url)) 111 | print(tweet) 112 | print() 113 | api.update_status(tweet) 114 | 115 | 116 | def predict_author(tweet): 117 | X, X_std = prepare_data_for_predict(tweet) 118 | X_knn = knn_pca.transform(X_std[top_feats[:13]]) 119 | return model.predict(X_knn), model.predict_proba(X_knn) 120 | 121 | 122 | def first_tweet(api): 123 | api.update_with_media('images/trump_ticker.gif', 124 | status="Stay tuned!...") 125 | 126 | 127 | def prepare_data_for_predict(X): 128 | ''' Processes the X data with all features and standardizes. 129 | ''' 130 | # Create new feature columns 131 | X = feature_pipeline(X) 132 | X = tfidf_transform(X[feat]) 133 | X_std = standardize(X) 134 | return X, X_std 135 | 136 | 137 | def tfidf_transform(X): 138 | '''Performs a tf-idf transform on the given column of data 139 | ''' 140 | X.reset_index(drop=True, inplace=True) 141 | _tfidf_text = tfidf_text.transform(X['text']) 142 | _tfidf_text = pd.DataFrame(_tfidf_text.todense(), 143 | columns=[text_cols]) 144 | 145 | _tfidf_ner = tfidf_ner.transform(X['ner']) 146 | _tfidf_ner = pd.DataFrame(_tfidf_ner.todense(), 147 | columns=[ner_cols]) 148 | 149 | _tfidf_pos = tfidf_pos.transform(X['pos']) 150 | _tfidf_pos = pd.DataFrame(_tfidf_pos.todense(), 151 | columns=[pos_cols]) 152 | 153 | X = tfidf_remove_dups(X, _tfidf_text, _tfidf_pos, _tfidf_ner) 154 | 155 | return X 156 | 157 | 158 | def tfidf_remove_dups(X, tfidf_text, tfidf_pos, tfidf_ner): 159 | '''Removes columns in tfidf_pos and tfidf_ner that are duplicates from 160 | tfidf_text, and concatentates the DataFrames 161 | ''' 162 | # Drop ner columns also present in tfidf_text 163 | columns_to_keep = [x for x in tfidf_ner 164 | if x not in tfidf_text] 165 | tfidf_ner = tfidf_ner[columns_to_keep] 166 | 167 | # Drop pos columns also present in ner 168 | columns_to_keep = [x for x in tfidf_pos 169 | if x not in tfidf_ner] 170 | tfidf_pos = tfidf_pos[columns_to_keep] 171 | 172 | X = pd.concat([X, tfidf_text, tfidf_pos, tfidf_ner], axis=1) 173 | return X 174 | 175 | 176 | def standardize(X): 177 | print('Performing Standardization') 178 | X_std = X.copy() 179 | cols = X[std].columns 180 | X_std[std] = pd.DataFrame(scaler.transform( 181 | X[std]), 182 | index=X.index, 183 | columns=cols) 184 | return X_std 185 | 186 | 187 | def start_stream(): 188 | while True: 189 | try: 190 | trumpstream = tweepy.Stream(auth, trumpstreamlistener) 191 | trumpstream.filter(follow=[realDonaldTrump]) 192 | except: 193 | continue 194 | 195 | 196 | trumpstreamlistener = TrumpStreamListener() 197 | start_stream() 198 | -------------------------------------------------------------------------------- /twitterbot_mini_ensemble.py: -------------------------------------------------------------------------------- 1 | import json 2 | import tweepy 3 | import pandas as pd 4 | import numpy as np 5 | import pickle 6 | from src.feature_pipeline import feature_pipeline 7 | 8 | 9 | credentials = json.load(open('.env/twitter_credentials.json')) 10 | 11 | consumer_key = credentials['consumer_key'] 12 | consumer_secret = credentials['consumer_secret'] 13 | access_token = credentials['access_token'] 14 | access_token_secret = credentials['access_token_secret'] 15 | 16 | auth = tweepy.OAuthHandler(consumer_key, consumer_secret) 17 | 
auth.set_access_token(access_token, access_token_secret) 18 | api = tweepy.API(auth) 19 | 20 | realDonaldTrump = '25073877' 21 | # realDonaldTrump = '14649582' 22 | 23 | std = ['compound', 'anger', 'anticipation', 'disgust', 'fear', 24 | 'joy', 'negative', 'positive', 'sadness', 'surprise', 25 | 'trust', 'tweet_length', 'avg_sentence_length', 26 | 'avg_word_length', 'commas', 'semicolons', 'exclamations', 27 | 'periods', 'questions', 'quotes', 'ellipses', 'mentions', 28 | 'hashtags', 'urls', 'all_caps', 'hour', 'random_caps'] 29 | 30 | feat = ['created_at', 'is_retweet', 'text', 'is_reply', 31 | 'compound', 'v_negative', 'v_neutral', 'v_positive', 32 | 'anger', 'anticipation', 'disgust', 'fear', 'joy', 33 | 'negative', 'positive', 'sadness', 'surprise', 'trust', 34 | 'tweet_length', 'avg_sentence_length', 'avg_word_length', 35 | 'commas', 'semicolons', 'exclamations', 'periods', 36 | 'questions', 'quotes', 'ellipses', 'mentions', 'hashtags', 37 | 'urls', 'is_quoted_retweet', 'all_caps', 'tweetstorm', 38 | 'hour', 'hour_20_02', 'hour_14_20', 'hour_08_14', 39 | 'hour_02_08', 'day_of_week', 'weekend', 'random_caps', 40 | 'start_mention', 'ner', 'pos'] 41 | 42 | 43 | def load_pickle(filename): 44 | # Open pickle filename 45 | print('Pickle load', filename) 46 | with open(filename, 'rb') as f: 47 | return pickle.load(f) 48 | 49 | 50 | rf = load_pickle('twitterbot_pickles/rf.pkl') 51 | gb = load_pickle('twitterbot_pickles/gb.pkl') 52 | knn = load_pickle('twitterbot_pickles/knn.pkl') 53 | knn_pca = load_pickle('twitterbot_pickles/knn_pca.pkl') 54 | tfidf_pos = load_pickle('twitterbot_pickles/tfidf_pos.pkl') 55 | tfidf_ner = load_pickle('twitterbot_pickles/tfidf_ner.pkl') 56 | tfidf_text = load_pickle('twitterbot_pickles/tfidf_text.pkl') 57 | text_cols = tfidf_text.get_feature_names() 58 | ner_cols = tfidf_ner.get_feature_names() 59 | pos_cols = tfidf_pos.get_feature_names() 60 | scaler = load_pickle('twitterbot_pickles/scaler.pkl') 61 | top_feats = load_pickle('twitterbot_pickles/top_feats.pkl') 62 | 63 | 64 | class TrumpStreamListener(tweepy.StreamListener): 65 | 66 | def on_status(self, status): 67 | if status.author.id_str == realDonaldTrump: 68 | tweet = df = pd.DataFrame(columns=['created_at', 69 | 'favorite_count', 70 | 'id_str', 71 | 'in_reply_to_user_id_str', 72 | 'is_retweet', 73 | 'retweet_count', 74 | 'source', 75 | 'text']) 76 | 77 | tweet.loc[0] = [status.created_at, 78 | status.favorite_count, 79 | status.id_str, 80 | status.in_reply_to_user_id_str, 81 | status.retweeted, 82 | status.retweet_count, 83 | status.source, 84 | status.text] 85 | prediction = predict_author(tweet) 86 | post_tweet(status, prediction) 87 | 88 | 89 | def post_tweet(status, prediction): 90 | '''Takes a tweet, formats the response, and posts to Twitter 91 | INPUT: string 92 | OUTPUT: 93 | ''' 94 | url = str('https://twitter.com/realDonaldTrump/status/' + status.id_str) 95 | url = ('https://twitter.com/' + status.user.screen_name + 96 | '/status/' + status.id_str) 97 | text = str(status.text) 98 | 99 | if prediction[0] == 0: 100 | proba = .99 if prediction[1][0][0] > .99 else prediction[1][0][0] 101 | tweet = ('I am {0:.0%} confident an aide wrote this:\n"{1}..."' 102 | '\n@realDonaldTrump\n' 103 | '{2}'. 104 | format(proba, text[:150], url)) 105 | else: 106 | proba = .99 if prediction[1][0][1] > .99 else prediction[1][0][1] 107 | tweet = ('I am {0:.0%} confident Trump wrote this:\n"{1}..."' 108 | '\n@realDonaldTrump\n' 109 | '{2}'. 
110 | format(proba, text[:150], url)) 111 | print(tweet) 112 | print() 113 | api.update_status(tweet) 114 | 115 | 116 | def predict_author(tweet): 117 | X, X_std = prepare_data_for_predict(tweet) 118 | X_rf = X[top_feats[:200]] 119 | X_gb = X_std[top_feats[:300]] 120 | X_knn = knn_pca.transform(X_std[top_feats[:13]]) 121 | 122 | rf_results = rf.predict(X_rf), rf.predict_proba(X_rf) 123 | gb_results = gb.predict(X_gb), gb.predict_proba(X_gb) 124 | knn_results = knn.predict(X_knn), knn.predict_proba(X_knn) 125 | 126 | print(rf_results) 127 | print(gb_results) 128 | print(knn_results) 129 | 130 | total = sum([rf_results[0], gb_results[0], knn_results[0]]) 131 | majority = 1 if total > 1 else 0 132 | 133 | zero = -(rf_results[1][0][0] * (rf_results[0] - 1) + 134 | gb_results[1][0][0] * (gb_results[0] - 1) + 135 | knn_results[1][0][0] * (knn_results[0] - 1)) 136 | one = (rf_results[1][0][1] * rf_results[0] + 137 | gb_results[1][0][1] * gb_results[0] + 138 | knn_results[1][0][1] * knn_results[0]) 139 | 140 | proba0 = zero / (3 - total) if total != 3 else 0 141 | proba1 = one / total if total != 0 else 0 142 | 143 | return (np.array([majority]), np.array([[float(proba0), float(proba1)]])) 144 | 145 | 146 | def first_tweet(api): 147 | api.update_with_media('images/trump_ticker.gif', 148 | status="Stay tuned!...") 149 | 150 | 151 | def prepare_data_for_predict(X): 152 | ''' Processes the X data with all features and standardizes. 153 | ''' 154 | # Create new feature columns 155 | X = feature_pipeline(X) 156 | X = tfidf_transform(X[feat]) 157 | X_std = standardize(X) 158 | return X, X_std 159 | 160 | 161 | def tfidf_transform(X): 162 | '''Performs a tf-idf transform on the given column of data 163 | ''' 164 | X.reset_index(drop=True, inplace=True) 165 | _tfidf_text = tfidf_text.transform(X['text']) 166 | _tfidf_text = pd.DataFrame(_tfidf_text.todense(), 167 | columns=[text_cols]) 168 | 169 | _tfidf_ner = tfidf_ner.transform(X['ner']) 170 | _tfidf_ner = pd.DataFrame(_tfidf_ner.todense(), 171 | columns=[ner_cols]) 172 | 173 | _tfidf_pos = tfidf_pos.transform(X['pos']) 174 | _tfidf_pos = pd.DataFrame(_tfidf_pos.todense(), 175 | columns=[pos_cols]) 176 | 177 | X = tfidf_remove_dups(X, _tfidf_text, _tfidf_pos, _tfidf_ner) 178 | 179 | return X 180 | 181 | 182 | def tfidf_remove_dups(X, tfidf_text, tfidf_pos, tfidf_ner): 183 | '''Removes columns in tfidf_pos and tfidf_ner that are duplicates from 184 | tfidf_text, and concatentates the DataFrames 185 | ''' 186 | # Drop ner columns also present in tfidf_text 187 | columns_to_keep = [x for x in tfidf_ner 188 | if x not in tfidf_text] 189 | tfidf_ner = tfidf_ner[columns_to_keep] 190 | 191 | # Drop pos columns also present in ner 192 | columns_to_keep = [x for x in tfidf_pos 193 | if x not in tfidf_ner] 194 | tfidf_pos = tfidf_pos[columns_to_keep] 195 | 196 | X = pd.concat([X, tfidf_text, tfidf_pos, tfidf_ner], axis=1) 197 | return X 198 | 199 | 200 | def standardize(X): 201 | X_std = X.copy() 202 | cols = X[std].columns 203 | X_std[std] = pd.DataFrame(scaler.transform( 204 | X[std]), 205 | index=X.index, 206 | columns=cols) 207 | return X_std 208 | 209 | 210 | def start_stream(): 211 | while True: 212 | try: 213 | trumpstream = tweepy.Stream(auth, trumpstreamlistener) 214 | trumpstream.filter(follow=[realDonaldTrump]) 215 | except: 216 | continue 217 | 218 | 219 | trumpstreamlistener = TrumpStreamListener() 220 | start_stream() 221 | -------------------------------------------------------------------------------- /README.md: 
--------------------------------------------------------------------------------
1 | # Trump-Tweet Author Identification
2 | 
3 | This project is an attempt to build a model that can identify whether Trump is truly the author of any given tweet on his account, or whether it was written and posted by an aide. It is widely reported in the news that many of his tweets are actually written by staff. With particularly controversial tweets, and especially given that he sometimes makes presidential decrees through his Twitter account, it can be enlightening to have greater certainty in the authorship of his tweets.
4 | 
5 | I have written some Medium posts about this project that go into greater detail about my procedure and results. Those posts can be accessed here:
6 | - [Who’s Tweeting from the Oval Office?](https://towardsdatascience.com/whos-tweeting-from-the-oval-office-96ea5b60c03)
7 | - [Who’s Tweeting from the Oval Office? - Building a Twitter bot](https://towardsdatascience.com/whos-tweeting-from-the-oval-office-building-a-twitter-bot-9c602edf91dd)
8 | 
9 | ![Did Trump Tweet It?](images/trump_ticker.gif)
10 | 
11 | Did Trump tweet it? Or just an aide in Trump clothing?
12 | 
13 | ## Background
14 | On December 1st, 2017, Michael Flynn pleaded guilty to lying to the FBI. The next day, Trump’s personal Twitter account tweeted:
15 | 
16 | ![I had to fire General Flynn because he lied to the Vice President and the FBI. He has pled guilty to those lies. It is a shame because his actions during the transition were lawful. There was nothing to hide!](images/flynn_tweet.png)
17 | 
18 | The controversy arises because on February 14th of that year, the day after Flynn resigned, Trump had asked James Comey, then the director of the FBI, to back off any investigations of Flynn. If Trump knew at the time of his request to Comey that Flynn had indeed lied to the FBI, then Trump's tweet could be seen as evidence that Trump attempted to obstruct justice. After several legal experts argued this point, Trump defended himself by claiming that his lawyer John Dowd wrote and posted the tweet. But did he really?
19 | 
20 | ***
21 | 
22 | Forensic text analysis was an early field in machine learning and has been used in cases as varied as identifying the Unabomber, revealing J.K. Rowling as the true identity of the author Robert Galbraith, and determining the specific authors of each of the Federalist Papers. This project is an effort to use machine learning and these same techniques to identify tweets on [@realDonaldTrump](https://twitter.com/realdonaldtrump) as written by Trump himself or by his staff when using his account. This task, however, is particularly challenging due to the short nature of a tweet--there just isn't much signal to pick up in such a short text.
23 | 
24 | Prior to March 26, 2017, Trump was tweeting using a Samsung Galaxy device while his staff were tweeting using an iPhone. From this information provided in the metadata of each tweet, we know whether it was Trump himself or his staff tweeting (see [Further Reading](#further-reading) below for some articles discussing this assumption). After March, however, Trump switched to using an iPhone as well, so identification of the tweeter cannot come from the metadata alone and must be deduced from the content of the tweet.
25 | 
26 | ### Potential Tweeters
27 | 
28 | These individuals have been reported in the news as possible tweeters on Trump's Twitter account.
The Start Date is the date their association with the Trump Campaign or Administration was announced, and the End Date is the date their positions were terminated. 29 | 30 | |Name|Start Date|End Date|Twitter Handle| 31 | |----|----------|--------|--------------| 32 | |Donald Trump|2009-05-04|present|@realDonaldTrump| 33 | |Sean Spicer|2016-12-22|2017-07-21|@seanspicer| 34 | |Reince Priebus|2016-11-13|2017-07-27|@Reince| 35 | |Steve Bannon|2016-08-17|2017-08-18|@SteveKBannon| 36 | |Kellyanne Conway|2016-07-01|present|@KellyannePolls| 37 | |Anthony Scaramucci|2017-07-21|2017-07-31|@Scaramucci| 38 | |Dan Scavino|2015-06-01|present|@DanScavino| 39 | |John Dowd|2017-07-16|present|N/A| 40 | 41 | 42 | ## Data 43 | 44 | I used Brendan Brown's [Trump Tweet Data Archive](https://github.com/bpb27/trump_tweet_data_archive) to collect all tweets from the beginning of Trump's account in mid-2009 up until the end of 2017. This set consists of nearly 33,000 tweets. Even though I know from whose device each tweet originated, there is still some ambiguity around authorship: Trump is known to dictate tweets to assistants, so a tweet may have Trump's characteristics but be posted from a non-Trump device, and (especially during the campaign) he is known to have written tweets collaboratively with aides, making true authorship unclear. 45 | 46 | ## Feature engineering 47 | 48 | ### Style 49 | I looked at the style of each tweet by counting various punctuation marks (the number of exclamation marks, for example), the number of @mentions and #hashtags, and average tweet/sentence/word length. 50 | 51 | ### Trump quirks 52 | I also created features for what I have recognized as Trump's distinctive Twitter behavior. These features include the "quoted retweet" (where Trump copies and pastes another user's tweet onto his own timeline and surrounds it in quotation marks), words written in ALL CAPS or followed by several exclamation points!!!, and middle-of-the-night tweeting. 53 | 54 | ### Sentiment 55 | I used C.J. Hutto's [VADER](https://github.com/cjhutto/vaderSentiment) package to extract the sentiment of each tweet. VADER, which stands for Valence Aware Dictionary and sEntiment Reasoner, is a lexicon and rule-based tool that is specifically tuned to social media. Given a string of text, it outputs a number between 0 and 1 for negativity, positivity, and neutrality 56 | for the text, as well as a compound score from -1 to 1 which is an aggregate measure. 57 | 58 | ### Emotion 59 | The National Research Council of Canada created a [lexicon](http://saifmohammad.com/WebPages/NRC-Emotion-Lexicon.htm) of over 14,000 words, each rated as belonging to any of 10 emotion classes. For each tweet, I counted the number of words belonging to each emotion class and assigned the tweet that count as its score for that emotion. 60 | 61 | ### Word choice 62 | I performed TF-IDF on the text of each tweet in order to pick up vocabulary unique to Trump or his staff. 63 | 64 | ### Grammatical structure 65 | I knew the phrasing of Trump's tweets would stand out from that of his staff, so in order to capture this I performed part-of-speech replacement on each tweet, reducing it to a string of its parts of speech. For example, the phrase "Hello. This is a tweet which has been parsed for parts of speech" would be replaced with "UH . DT BZ DT NN WDT VBZ VBN VBN IN NNS IN NN ", using the [Penn part of speech tags](https://cs.nyu.edu/grishman/jet/guide/PennPOS.html).
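As a concrete illustration of two of the feature extractors above (sentiment and grammatical structure), here is a minimal sketch using the `vaderSentiment` and `nltk` packages listed in `requirements.txt`. It is illustrative only; the project's own implementations live in `src/vader_sentiment.py` and `src/part_of_speech.py` and may differ in their tokenization and preprocessing details.

```python
# Illustrative sketch only (not the project's own feature code).
# Assumes the vaderSentiment SentimentIntensityAnalyzer interface and that the
# NLTK 'punkt' and 'averaged_perceptron_tagger' data packages are installed.
import nltk
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

analyzer = SentimentIntensityAnalyzer()

def vader_scores(text):
    """Return VADER's neg/neu/pos/compound scores for a tweet."""
    return analyzer.polarity_scores(text)

def pos_replace(text):
    """Replace each token in a tweet with its Penn Treebank POS tag."""
    tokens = nltk.word_tokenize(text)
    return ' '.join(tag for _, tag in nltk.pos_tag(tokens))

tweet = "Hello. This is a tweet which has been parsed for parts of speech"
print(vader_scores(tweet))  # e.g. {'neg': 0.0, 'neu': ..., 'pos': ..., 'compound': ...}
print(pos_replace(tweet))   # roughly: UH . DT VBZ DT NN WDT VBZ VBN VBN IN NNS IN NN
```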
66 | 67 | 68 | ## Models 69 | 70 | I created models for Naive Bayes, SVM, Logistic Regression with Ridge Regularization, KNN, and the ensemble methods of Random Forest, Gradient Boosting, and AdaBoost. All models achieved accuracy, precision, and recall rates in the low to mid 90%s, except for Naive Bayes, which was in the mid 80%s. For my final model, I found that an ensemble of these individual models worked best. 71 | 72 | Additionally, I used Ridge Regularization to iteratively drive each of the roughly 900 feature coefficients to zero with ever-increasing alpha values. This allowed me to rank each feature in order of its importance to the logistic regression model. At an alpha-level of 3e22, the first feature dropped out when its regression coefficient was driven to zero. Slowly, more features dropped out until an alpha-level of about 10e25, when the feature dropout rapidly increased. Above an alpha-level of 10e26, the dropout rate slowed down, and the features still remaining at that point are the most influential features in the model. 73 | 74 | ![Ridge Regularization](images/ridge.png) 75 | 76 | ## Results 77 | 78 | One of the most interesting results from my analysis is the set of characteristics which identifies a tweet as coming from Trump or from someone else. From my Ridge analysis, the top Trump features are: 79 | 80 | * Quoted retweet 81 | * @mentions 82 | * Between 10pm and 10am 83 | * Exclamations!!! 84 | * ALL CAPS 85 | * Tweet length: 114 characters 86 | * @realDonaldTrump 87 | 88 | The top features of non-Trump tweets are: 89 | 90 | * True retweets 91 | * The word “via” 92 | * Between 10am and 4pm 93 | * Semicolons 94 | * Periods 95 | * Tweet length: 103 characters 96 | * @BarackObama 97 | 98 | Trump's tweets are in general more emotive than his aides' tweets, exhibiting high scores for the emotions surprise, anger, negativity, disgust, joy, sadness, and fear. Non-Trump tweets, in contrast, are relatively unemotional, and feature many URLs, hashtags, and organization names. 99 | 100 | As for the individual models, Gradient Boosting performed the best on its own, with Random Forest a close second. Naive Bayes performed most poorly of the models tested. 101 | 102 | | |Gradient Boosting|Random Forest|AdaBoost|Logistic Regression|KNN|SVM|Naive Bayes| 103 | |---:|:--------------:|:-----------:|:------:|:-------------:|:---:|:---:|:---------:| 104 | |Accuracy|95%|94%|92%|90%|90%|90%|84%| 105 | |Precision|95%|94%|92%|90%|91%|90%|83%| 106 | |Recall|95%|95%|90%|88%|89%|90%|82%| 107 | 108 | For my final model, I created an ensemble of all seven models, using the majority class as my predictor. 109 | 110 | ## The Flynn Tweet 111 | 112 | And as for that Flynn Tweet? My analysis indicates it was most likely not written by Trump. However, my individual models are split evenly on this one: some predict Trump, others not Trump. The Logistic Regression outputs a probability estimate of 97%, and Naive Bayes of 94%, that it did indeed come from Trump. Correspondingly, the [@RPMMAS](https://twitter.com/RPMMAS) Twitter account performed an informal poll of its users and received almost 2000 responses, with 96% indicating they believed the tweet to have come from Trump: 113 | 114 | ![WH claims his lawyer wrote this tweet: "I had to fire General Flynn because he lied to the Vice President and the FBI. He has pled guilty to those lies. It is a shame because his actions during the transition were lawful. There was nothing to hide!"
Do you believe that's true?](images/flynn_tweet_poll.png) 115 | 116 | A word of caution though: not all of my models individually agreed on this one. Specifically, AdaBoost, KNN, and SVM indicated that it is a non-Trump tweet. Random Forest, Naive Bayes, and Logistic Regression all output Trump as the author. In my opinion, after reviewing thousands of Trump tweets throughout this project and evaluating all features which describe his tweets, I find the topic, sentiment, and emotion very much to be Trumpian, while the phrasing, grammar, and style all indicate another author. I believe the tweet was written collaboratively, with Trump providing the topical features of the tweet and an unknown author actually composing it. 117 | 118 | 119 | ## Sources 120 | 121 | *Many thanks to the following packages and lexicons!* 122 | 123 | Trump's tweet data is from Brendan Brown's [Trump Tweet Data Archive](https://github.com/bpb27/trump_tweet_data_archive) 124 | 125 | Trump aide data was scraped from Twitter using Ahmet Taspinar's [twitterscraper](https://github.com/taspinar/twitterscraper) with the query "twitterscraper 'from:twitter_handle since:2009-01-01 until:2017-12-31' -o scraped_tweets.json" 126 | 127 | VADER sentiment analysis was performed using [C.J. Hutto's VADER package](https://github.com/cjhutto/vaderSentiment) 128 | 129 | The National Research Council of Canada kindly gave me access to the [NRC Word-Emotion Association Lexicon](http://saifmohammad.com/WebPages/NRC-Emotion-Lexicon.htm). Contact: Saif Mohammad (saif.mohammad@nrc-cnrc.gc.ca) 130 | 131 | Lastly, I used Jared Suttles' [Tweetokenize](https://github.com/jaredks/tweetokenize) to aid in my part-of-speech analysis. An updated version of the package which works with Python 3 can be found in my fork [here](https://github.com/raffg/tweetokenize/tree/Python-3). 
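For reference, the twitterscraper query quoted above can be run directly from a shell, once per handle in the table of potential tweeters; the handle shown here is only an example drawn from that table:

```
twitterscraper 'from:DanScavino since:2009-01-01 until:2017-12-31' -o scraped_tweets.json
```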
132 | 133 | 134 | ## Further Reading 135 | 136 | * [Text analysis of Trump's tweets confirms he writes only the (angrier) Android half](http://varianceexplained.org/r/trump-tweets/) 137 | * [How to tell when someone else tweets from @realDonaldTrump](https://www.wired.com/story/tell-when-someone-else-tweets-from-realdonaldtrump/) 138 | * [All the president’s tweets: Fox News enjoys considerable influence over the world’s most important Twitter account](https://www.economist.com/blogs/graphicdetail/2018/01/daily-chart-9) 139 | * [Is Trump's Campaign Locking Him Out of Twitter?](https://www.theatlantic.com/politics/archive/2016/08/donald-trump-twitter-iphone-android/495239/) 140 | * [Timestamp analysis confirms Trump is the author of Android tweets](http://didtrumptweetit.com/timestamp-analysis-trump-android-phone/) 141 | -------------------------------------------------------------------------------- /TweetAuthorshipPredictor.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import pickle 4 | from src.ridge_grid_scan import ridge_grid_scan 5 | from src.feature_pipeline import feature_pipeline 6 | from src.load_data import load_json_list, apply_date_mask 7 | from sklearn.feature_extraction.text import TfidfVectorizer 8 | from sklearn.preprocessing import StandardScaler 9 | from sklearn.model_selection import train_test_split 10 | from sklearn.decomposition import PCA 11 | from sklearn.ensemble import RandomForestClassifier 12 | from sklearn.ensemble import AdaBoostClassifier 13 | from sklearn.ensemble import GradientBoostingClassifier 14 | from sklearn.neighbors import KNeighborsClassifier 15 | from sklearn.naive_bayes import MultinomialNB 16 | from sklearn.naive_bayes import GaussianNB 17 | from sklearn.svm import SVC 18 | from sklearn.linear_model import SGDClassifier 19 | from sklearn.linear_model import LogisticRegression 20 | from sklearn.metrics import accuracy_score, precision_score, recall_score, \ 21 | f1_score 22 | 23 | 24 | def main(): 25 | with open('labeled_data_through_mar_11.pkl', 'rb') as f: 26 | df = pickle.load(f) 27 | 28 | print('Loading data...') 29 | df = apply_date_mask(df, 'created_at', '2009-01-01', '2018-12-31') 30 | 31 | y = pd.DataFrame(np.where(df['label'] == 1, 1, 0)) 32 | X = df.drop(['label'], axis=1) 33 | save_pickle(y, 'ensemble/y_train.pkl') 34 | 35 | trump = TweetAuthorshipPredictor() 36 | trump.fit(X, y) 37 | 38 | save_pickle(trump, 'ensemble/trump.pkl') 39 | 40 | 41 | class TweetAuthorshipPredictor(object): 42 | ''' This class represents the ensemble of models for tweet authorship 43 | prediction 44 | 45 | Parameters 46 | ---------- 47 | featurized: boolean, optional (default=False) 48 | Boolean indicating if the X data has already been featurized. 49 | Use True if sending featurized data to the class. 50 | 51 | Methods 52 | ------- 53 | fit : fit the model to X and y data 54 | predict : predict the authorship of an unlabeled tweet 55 | get_top_features : returns a list of the features ordered by influence 56 | 57 | Attributes 58 | ---------- 59 | top_feats : Array of the features sorted by influence 60 | 61 | Returns 62 | ------- 63 | self: 64 | The initialized TweetAuthorshipPredictor object.
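    Example
    -------
    A minimal usage sketch (assuming `X` is a DataFrame of raw tweets in the
    format expected by feature_pipeline and `y` holds the 0/1 author labels,
    as in main() above; `X_new` is a hypothetical DataFrame of new tweets):

    >>> predictor = TweetAuthorshipPredictor()
    >>> predictor.fit(X, y)
    >>> label, proba = predictor.predict(X_new)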
65 | ''' 66 | 67 | def __init__(self): 68 | ''' Initialize the ensemble object 69 | ''' 70 | # Save the individual ensemble models 71 | self.rf = None 72 | self.ab = None 73 | self.gb = None 74 | self.knn = None 75 | self.nb = None 76 | self.gnb = None 77 | self.svc = None 78 | self.svm = None 79 | self.lr = None 80 | self.ridge = None 81 | 82 | # Save the data processing objects 83 | self.top_feats = None 84 | self.scaler = None 85 | self.knn_pca = None 86 | self.gnb_pca = None 87 | self.tfidf_text = None 88 | self.tfidf_ner = None 89 | self.tfidf_pos = None 90 | 91 | # Columns to standardize 92 | self.std = ['compound', 'anger', 'anticipation', 'disgust', 'fear', 93 | 'joy', 'negative', 'positive', 'sadness', 'surprise', 94 | 'trust', 'avg_sentence_length', 'avg_word_length', 95 | 'commas', 'semicolons', 'exclamations', 'periods', 96 | 'questions', 'quotes', 'ellipses', 'mentions', 97 | 'hashtags', 'urls', 'all_caps', 'random_caps'] 98 | 99 | # Columns to train on prior to tf-idf 100 | self.feat = ['created_at', 'is_retweet', 'text', 'is_reply', 101 | 'compound', 'v_negative', 'v_neutral', 'v_positive', 102 | 'anger', 'anticipation', 'disgust', 'fear', 'joy', 103 | 'negative', 'positive', 'sadness', 'surprise', 'trust', 104 | 'avg_sentence_length', 'avg_word_length', 'commas', 105 | 'semicolons', 'exclamations', 'periods', 'questions', 106 | 'quotes', 'ellipses', 'mentions', 'hashtags', 'urls', 107 | 'is_quoted_retweet', 'all_caps', 'hour_20_02', 108 | 'hour_14_20', 'hour_08_14', 'hour_02_08', 'weekend', 109 | 'random_caps', 'start_mention', 'ner', 'pos'] 110 | 111 | # tf-idf column names 112 | self.text_cols = None 113 | self.pos_cols = None 114 | self.ner_cols = None 115 | 116 | # Set the number of features for each model 117 | self.rf_feats = 200 118 | self.ab_feats = 300 119 | self.gb_feats = 300 120 | self.knn_feats = 13 121 | self.nb_feats = 5 122 | self.gnb_feats = 13 123 | self.svc_feats = 50 124 | self.svm_feats = 300 125 | self.lr_feats = 200 126 | 127 | def fit(self, X_train, y_train): 128 | ''' Train the ensemble with X and y data 129 | 130 | Parameters 131 | ---------- 132 | X: Pandas DataFrame, shape (n_samples, n_features) 133 | The training data. 134 | y: Pandas DataFrame, shape (n_samples, ). 135 | The training response for the optimization. 136 | 137 | Returns 138 | ------- 139 | self: 140 | The fit Ensemble object. 
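    Notes
    -----
    Fitting happens in two stages: nine base classifiers (rf, ab, gb, knn,
    nb, gnb, svc, svm, lr) are trained on subsets of the ridge-ranked
    features in _first_stage_train, and a logistic-regression meta-model
    (self.ridge) is then fit on their predicted probabilities, forming a
    stacking-style ensemble.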
141 | ''' 142 | # Featurize the X data 143 | X_train, X_std_train = self._prepare_data_for_fit(X_train) 144 | 145 | save_pickle(X_train, 'ensemble/X_train.pkl') 146 | save_pickle(X_std_train, 'ensemble/X_std_train.pkl') 147 | # X_train = load_pickle('twitterbot_pickles/X_train.pkl') 148 | # X_std_train = load_pickle('twitterbot_pickles/X_std_train.pkl') 149 | 150 | drop = ['created_at', 'text', 'pos', 'ner'] 151 | 152 | # self.tfidf_pos = load_pickle('twitterbot_pickles/tfidf_pos.pkl') 153 | # self.tfidf_ner = load_pickle('twitterbot_pickles/tfidf_ner.pkl') 154 | # self.tfidf_text = load_pickle('twitterbot_pickles/tfidf_text.pkl') 155 | # self.scaler = load_pickle('twitterbot_pickles/scaler.pkl') 156 | self.text_cols = self.tfidf_text.get_feature_names() 157 | self.ner_cols = self.tfidf_ner.get_feature_names() 158 | self.pos_cols = self.tfidf_pos.get_feature_names() 159 | 160 | # Remove non-numeric features 161 | X_train = X_train.drop(drop, axis=1) 162 | X_std_train = X_std_train.drop(drop, axis=1) 163 | 164 | # Load the feature sets 165 | feature_list = ridge_grid_scan(X_train, 166 | np.array(y_train).ravel(), 167 | n=len(X_train.columns)) 168 | self.top_feats = [(x[0]) for x in list(feature_list)] 169 | save_pickle(self.top_feats, 'ensemble/top_feats.pkl') 170 | # self.top_feats = load_pickle('twitterbot_pickles/top_feats.pkl') 171 | 172 | # Train the PCA objects 173 | self._gnb_pca_calc(X_std_train[self.top_feats[:13]]) 174 | self._knn_pca_calc(X_std_train[self.top_feats[:13]]) 175 | save_pickle(self.gnb_pca, 'ensemble/gnb_pca.pkl') 176 | save_pickle(self.knn_pca, 'ensemble/knn_pca.pkl') 177 | # self.gnb_pca = load_pickle('twitterbot_pickles/gnb_pca.pkl') 178 | # self.knn_pca = load_pickle('twitterbot_pickles/knn_pca.pkl') 179 | 180 | # Train the individual models 181 | data, probabilities = self._first_stage_train(X_train, X_std_train, 182 | np.array(y_train). 
183 | ravel()) 184 | X_train_ridge = pd.DataFrame(probabilities) 185 | 186 | save_pickle(X_train_ridge, 'ensemble/X_train_ridge.pkl') 187 | # X_train_ridge = load_pickle('twitterbot_pickles/X_train_ridge.pkl') 188 | 189 | self.ridge = self._ridge(X_train_ridge, np.array(y_train).ravel()) 190 | 191 | return self 192 | 193 | def predict(self, X): 194 | '''Return a label for prediction of the authoriship of the tweet X 195 | 196 | Parameters 197 | ---------- 198 | X: 2d Pandas DataFrame 199 | The feature matrix 200 | 201 | Returns 202 | ------- 203 | y: (1 or 0, probability) 204 | Predicted label, probabilities 205 | ''' 206 | X, X_std = self._prepare_data_for_predict(X) 207 | data, probabilities = self._first_stage_predict(X, X_std) 208 | X_ridge = pd.DataFrame(probabilities) 209 | 210 | prediction = self.ridge.predict(X_ridge) 211 | proba_list = [] 212 | for key, value in probabilities.items(): 213 | if data[key] == prediction: 214 | proba_list.append(probabilities[key]) 215 | proba = np.mean(proba_list) 216 | 217 | return prediction, proba 218 | 219 | def get_top_features(self): 220 | '''Returns a list of the features ordered by influence 221 | ''' 222 | return self.top_feats 223 | 224 | def _standard_scaler(self, X): 225 | # Standardize features 226 | print('Calculating standardization') 227 | self.scaler = StandardScaler() 228 | cols = X.columns 229 | self.scaler.fit(X) 230 | save_pickle(self.scaler, 'ensemble/scaler.pkl') 231 | # self.scaler = load_pickle('twitterbot_pickles/scaler.pkl') 232 | 233 | def _standardize(self, X): 234 | X_std = X.copy() 235 | cols = X[self.std].columns 236 | X_std[self.std] = pd.DataFrame(self.scaler.transform( 237 | X[self.std]), 238 | index=X.index, 239 | columns=cols) 240 | return X_std 241 | 242 | def _prepare_data_for_fit(self, X): 243 | ''' Processes the X data with all features, saves tf-idf vectorizers, 244 | and standardizes. 245 | ''' 246 | # Create new feature columns 247 | # X = feature_pipeline(X, verbose=True) 248 | # save_pickle(X, 'ensemble/X.pkl') 249 | X = load_pickle('X.pkl') 250 | X = apply_date_mask(X, 'created_at', '2009-01-01', '2018-12-31') 251 | 252 | X = self._tfidf_fit_transform(X[self.feat]) 253 | self._standard_scaler(X[self.std]) 254 | X_std = self._standardize(X) 255 | 256 | return X, X_std 257 | 258 | def _prepare_data_for_predict(self, X): 259 | ''' Processes the X data with all features and standardizes. 
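    Unlike _prepare_data_for_fit, this reuses the tf-idf vectorizers and the
    scaler fit during training rather than refitting them on the new data.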
260 | ''' 261 | # Create new feature columns 262 | X = feature_pipeline(X) 263 | X = self._tfidf_transform(X[self.feat]) 264 | X_std = self._standardize(X) 265 | 266 | return X, X_std 267 | 268 | def _first_stage_train(self, X_train, X_std_train, y_train): 269 | '''Train models in first stage of 9 models 270 | ''' 271 | rf_feat = self.top_feats[:self.rf_feats] 272 | ab_feat = self.top_feats[:self.ab_feats] 273 | gb_feat = self.top_feats[:self.gb_feats] 274 | knn_feat = self.top_feats[:self.knn_feats] 275 | nb_feat = self.top_feats[:self.nb_feats] 276 | gnb_feat = self.top_feats[:self.gnb_feats] 277 | svc_feat = self.top_feats[:self.svc_feats] 278 | svm_feat = self.top_feats[:self.svm_feats] 279 | lr_feat = self.top_feats[:self.lr_feats] 280 | 281 | rf_results = self._random_forest(X_train[rf_feat], y_train) 282 | ab_results = self._adaboost(X_std_train[ab_feat], y_train) 283 | gb_results = self._gradient_boosting(X_std_train[gb_feat], y_train) 284 | knn_results = self._knn(X_std_train[knn_feat], y_train) 285 | nb_results = self._naive_bayes(X_train[nb_feat], y_train) 286 | gnb_results = self._gaussian_naive_bayes(X_std_train[gnb_feat], 287 | y_train) 288 | svc_results = self._svc(X_std_train[svc_feat], y_train) 289 | svm_results = self._svm(X_std_train[svm_feat], y_train) 290 | lr_results = self._logistic_regression(X_std_train[lr_feat], y_train) 291 | 292 | data = {'rf': rf_results[0], 'ab': ab_results[0], 293 | 'gb': gb_results[0], 'knn': knn_results[0], 294 | 'nb': nb_results[0], 'gnb': gnb_results[0], 295 | 'svc': svc_results[0], 'svm': svm_results[0], 296 | 'lr': lr_results[0]} 297 | 298 | probabilities = {'rf': rf_results[1], 'ab': ab_results[1], 299 | 'gb': gb_results[1], 'knn': knn_results[1], 300 | 'nb': nb_results[1], 'gnb': gnb_results[1], 301 | 'svc': svc_results[1], 'svm': svm_results[1], 302 | 'lr': lr_results[1]} 303 | 304 | for key, value in probabilities.items(): 305 | probabilities[key] = [item[1] for item in probabilities[key]] 306 | 307 | return data, probabilities 308 | 309 | def _first_stage_predict(self, X, X_std): 310 | '''Calculate predictions for first stage of 9 models 311 | ''' 312 | rf_feat = self.top_feats[:self.rf_feats] 313 | ab_feat = self.top_feats[:self.ab_feats] 314 | gb_feat = self.top_feats[:self.gb_feats] 315 | knn_feat = self.top_feats[:self.knn_feats] 316 | nb_feat = self.top_feats[:self.nb_feats] 317 | gnb_feat = self.top_feats[:self.gnb_feats] 318 | svc_feat = self.top_feats[:self.svc_feats] 319 | svm_feat = self.top_feats[:self.svm_feats] 320 | lr_feat = self.top_feats[:self.lr_feats] 321 | 322 | X_knn = self.knn_pca.transform(X_std[knn_feat]) 323 | X_gnb = self.gnb_pca.transform(X_std[gnb_feat]) 324 | 325 | rf_results = (self.rf.predict(X[rf_feat]), 326 | self.rf.predict_proba(X[rf_feat])) 327 | ab_results = (self.ab.predict(X_std[ab_feat]), 328 | self.ab.predict_proba(X_std[ab_feat])) 329 | gb_results = (self.gb.predict(X_std[gb_feat]), 330 | self.gb.predict_proba(X_std[gb_feat])) 331 | knn_results = (self.knn.predict(X_knn), 332 | self.knn.predict_proba(X_knn)) 333 | nb_results = (self.nb.predict(X[nb_feat]), 334 | self.nb.predict_proba(X[nb_feat])) 335 | gnb_results = (self.gnb.predict(X_gnb), 336 | self.gnb.predict_proba(X_gnb)) 337 | svc_results = (self.svc.predict(X_std[svc_feat]), 338 | self.svc.predict_proba(X_std[svc_feat])) 339 | svm_results = (self.svm.predict(X_std[svm_feat]), 340 | self.svm.predict_proba(X_std[svm_feat])) 341 | lr_results = (self.lr.predict(X_std[lr_feat]), 342 | self.lr.predict_proba(X_std[lr_feat])) 343 | 344 | 
data = {'rf': rf_results[0], 'ab': ab_results[0], 345 | 'gb': gb_results[0], 'knn': knn_results[0], 346 | 'nb': nb_results[0], 'gnb': gnb_results[0], 347 | 'svc': svc_results[0], 'svm': svm_results[0], 348 | 'lr': lr_results[0]} 349 | 350 | probabilities = {'rf': rf_results[1], 'ab': ab_results[1], 351 | 'gb': gb_results[1], 'knn': knn_results[1], 352 | 'nb': nb_results[1], 'gnb': gnb_results[1], 353 | 'svc': svc_results[1], 'svm': svm_results[1], 354 | 'lr': lr_results[1]} 355 | 356 | for key, value in probabilities.items(): 357 | probabilities[key] = [item[1] for item in probabilities[key]] 358 | 359 | for key, value in probabilities.items(): 360 | print(key, value) 361 | 362 | for key, value in data.items(): 363 | print(key, value) 364 | 365 | return data, probabilities 366 | 367 | def _random_forest(self, X_train, y_train): 368 | print('Running Random Forest') 369 | rf = RandomForestClassifier(max_depth=20, 370 | max_features='sqrt', 371 | max_leaf_nodes=None, 372 | min_samples_leaf=2, 373 | min_samples_split=2, 374 | n_estimators=1000, 375 | n_jobs=-1).fit(X_train, y_train) 376 | save_pickle(rf, 'ensemble/rf.pkl') 377 | # rf = load_pickle('twitterbot_pickles/rf.pkl') 378 | predicted = rf.predict(X_train) 379 | proba = rf.predict_proba(X_train) 380 | self.rf = rf 381 | return predicted, proba 382 | 383 | def _adaboost(self, X_train, y_train): 384 | print('Running AdaBoost') 385 | ab = AdaBoostClassifier(learning_rate=1.25, 386 | n_estimators=40).fit(X_train, y_train) 387 | save_pickle(ab, 'ensemble/ab.pkl') 388 | # ab = load_pickle('twitterbot_pickles/ab.pkl') 389 | predicted = ab.predict(X_train) 390 | proba = ab.predict_proba(X_train) 391 | self.ab = ab 392 | return predicted, proba 393 | 394 | def _gradient_boosting(self, X_train, y_train): 395 | print('Running Gradient Boosting') 396 | gb = GradientBoostingClassifier(n_estimators=200, 397 | learning_rate=.1, 398 | max_depth=6, 399 | min_samples_split=2, 400 | min_samples_leaf=1, 401 | subsample=1, 402 | max_features=None 403 | ).fit(X_train, y_train) 404 | save_pickle(gb, 'ensemble/gb.pkl') 405 | # gb = load_pickle('twitterbot_pickles/gb.pkl') 406 | predicted = gb.predict(X_train) 407 | proba = gb.predict_proba(X_train) 408 | self.gb = gb 409 | return predicted, proba 410 | 411 | def _knn(self, X_train, y_train): 412 | print('Running K Nearest Neighbors') 413 | X_train = self.knn_pca.transform(X_train) 414 | knn = KNeighborsClassifier(n_neighbors=7).fit(X_train, y_train) 415 | save_pickle(knn, 'ensemble/knn.pkl') 416 | # knn = load_pickle('twitterbot_pickles/knn.pkl') 417 | predicted = knn.predict(X_train) 418 | proba = knn.predict_proba(X_train) 419 | self.knn = knn 420 | return predicted, proba 421 | 422 | def _knn_pca_calc(self, X_train): 423 | # Perform Principle Component Analysis 424 | print('Performing PCA on K Nearest Neighbors') 425 | pca = PCA(n_components=12) 426 | pca.fit(X_train) 427 | self.knn_pca = pca 428 | 429 | def _naive_bayes(self, X_train, y_train): 430 | print('Running Multinomial Naive Bayes') 431 | nb = MultinomialNB(alpha=10).fit(X_train, y_train) 432 | save_pickle(nb, 'ensemble/nb.pkl') 433 | # nb = load_pickle('twitterbot_pickles/nb.pkl') 434 | predicted = nb.predict(X_train) 435 | proba = nb.predict_proba(X_train) 436 | self.nb = nb 437 | return predicted, proba 438 | 439 | def _gaussian_naive_bayes(self, X_train, y_train): 440 | print('Running Gaussian Naive Bayes') 441 | X_train = self.gnb_pca.transform(X_train) 442 | gnb = GaussianNB().fit(X_train, y_train) 443 | save_pickle(gnb, 'ensemble/gnb.pkl') 
444 | # gnb = load_pickle('twitterbot_pickles/gnb.pkl') 445 | predicted = gnb.predict(X_train) 446 | proba = gnb.predict_proba(X_train) 447 | self.gnb = gnb 448 | return predicted, proba 449 | 450 | def _gnb_pca_calc(self, X_train): 451 | # Perform Principle Component Analysis 452 | print('Performing PCA on Gaussian Naive Bayes') 453 | pca = PCA(n_components=10) 454 | pca.fit(X_train) 455 | self.gnb_pca = pca 456 | 457 | def _svc(self, X_train, y_train): 458 | print('Running Support Vector Classifier') 459 | svc = SVC(C=100, 460 | coef0=1, 461 | degree=2, 462 | gamma='auto', 463 | kernel='poly', 464 | shrinking=False, 465 | probability=True).fit(X_train, y_train) 466 | save_pickle(svc, 'ensemble/svc.pkl') 467 | # svc = load_pickle('twitterbot_pickles/svc.pkl') 468 | predicted = svc.predict(X_train) 469 | proba = svc.predict_proba(X_train) 470 | self.svc = svc 471 | return predicted, proba 472 | 473 | def _svm(self, X_train, y_train): 474 | print('Running Support Vector Machine') 475 | svm = SGDClassifier(loss='modified_huber', penalty='l2', 476 | alpha=0.0001, max_iter=10).fit(X_train, y_train) 477 | save_pickle(svm, 'ensemble/svm.pkl') 478 | # svm = load_pickle('twitterbot_pickles/svm.pkl') 479 | predicted = svm.predict(X_train) 480 | proba = svm.predict_proba(X_train) 481 | self.svm = svm 482 | return predicted, proba 483 | 484 | def _logistic_regression(self, X_train, y_train): 485 | print('Running Logistic Regression') 486 | lr = LogisticRegression(C=.05).fit(X_train, y_train) 487 | save_pickle(lr, 'ensemble/lr.pkl') 488 | # lr = load_pickle('twitterbot_pickles/lr.pkl') 489 | predicted = lr.predict(X_train) 490 | proba = lr.predict_proba(X_train) 491 | self.lr = lr 492 | return predicted, proba 493 | 494 | def _ridge(self, X_train, y_train): 495 | print('Running Ridge Regression') 496 | ridge = LogisticRegression(penalty='l2', C=10000000) 497 | save_pickle(ridge, 'ensemble/ridge.pkl') 498 | # ridge = load_pickle('twitterbot_pickles/ridge.pkl') 499 | ridge.fit(X_train, y_train) 500 | self.ridge = ridge 501 | return ridge 502 | 503 | def _tfidf_fit_transform(self, X): 504 | '''Fits and concatenates tf-idf columns to X for text, pos, and ner 505 | ''' 506 | print('Calculating TF-IDF') 507 | # Perform TF-IDF on text column 508 | print(' on text column') 509 | self.tfidf_text = TfidfVectorizer(ngram_range=(1, 2), 510 | lowercase=False, 511 | token_pattern='\w+|\@\w+', 512 | norm='l2', 513 | max_df=.99, 514 | min_df=.01) 515 | tfidf_text = self.tfidf_text.fit_transform(X['text']) 516 | self.text_cols = self.tfidf_text.get_feature_names() 517 | idx = X.index 518 | tfidf_text = pd.DataFrame(tfidf_text.todense(), 519 | columns=[self.text_cols], 520 | index=idx) 521 | save_pickle(self.tfidf_text, 'ensemble/tfidf_text.pkl') 522 | # self.tfidf_text = load_pickle('twitterbot_pickles/tfidf_text.pkl') 523 | 524 | # Perform TF-IDF on ner column 525 | print(' on ner column') 526 | self.tfidf_ner = TfidfVectorizer(ngram_range=(1, 2), 527 | lowercase=False, 528 | norm='l2', 529 | max_df=.99, 530 | min_df=.01) 531 | tfidf_ner = self.tfidf_ner.fit_transform(X['ner']) 532 | self.ner_cols = self.tfidf_ner.get_feature_names() 533 | tfidf_ner = pd.DataFrame(tfidf_ner.todense(), 534 | columns=[self.ner_cols], 535 | index=idx) 536 | save_pickle(self.tfidf_ner, 'ensemble/tfidf_ner.pkl') 537 | # self.tfidf_ner = load_pickle('twitterbot_pickles/tfidf_ner.pkl') 538 | 539 | # Perform TF-IDF on pos column 540 | print(' on pos column') 541 | self.tfidf_pos = TfidfVectorizer(ngram_range=(2, 3), 542 | lowercase=False, 
543 | norm='l2', 544 | max_df=.99, 545 | min_df=.01) 546 | tfidf_pos = self.tfidf_pos.fit_transform(X['pos']) 547 | self.pos_cols = self.tfidf_pos.get_feature_names() 548 | tfidf_pos = pd.DataFrame(tfidf_pos.todense(), 549 | columns=[self.pos_cols], 550 | index=idx) 551 | save_pickle(self.tfidf_pos, 'ensemble/tfidf_pos.pkl') 552 | # self.tfidf_pos = load_pickle('twitterbot_pickles/tfidf_pos.pkl') 553 | 554 | X = self._tfidf_remove_dups(X, tfidf_text, tfidf_pos, tfidf_ner) 555 | 556 | return X 557 | 558 | def _tfidf_transform(self, X): 559 | '''Performs a tf-idf transform on the given column of data 560 | ''' 561 | X.reset_index(drop=True, inplace=True) 562 | tfidf_text = self.tfidf_text.transform(X['text']) 563 | tfidf_text = pd.DataFrame(tfidf_text.todense(), 564 | columns=[self.text_cols]) 565 | 566 | tfidf_ner = self.tfidf_ner.transform(X['ner']) 567 | tfidf_ner = pd.DataFrame(tfidf_ner.todense(), 568 | columns=[self.ner_cols]) 569 | 570 | tfidf_pos = self.tfidf_pos.transform(X['pos']) 571 | tfidf_pos = pd.DataFrame(tfidf_pos.todense(), 572 | columns=[self.pos_cols]) 573 | 574 | X = self._tfidf_remove_dups(X, tfidf_text, tfidf_pos, tfidf_ner) 575 | 576 | return X 577 | 578 | def _tfidf_remove_dups(self, X, tfidf_text, tfidf_pos, tfidf_ner): 579 | '''Removes columns in tfidf_pos and tfidf_ner that are duplicates from 580 | tfidf_text, and concatentates the DataFrames 581 | ''' 582 | # Drop ner columns also present in tfidf_text 583 | columns_to_keep = [x for x in tfidf_ner 584 | if x not in tfidf_text] 585 | tfidf_ner = tfidf_ner[columns_to_keep] 586 | 587 | # Drop pos columns also present in ner 588 | columns_to_keep = [x for x in tfidf_pos 589 | if x not in tfidf_ner] 590 | tfidf_pos = tfidf_pos[columns_to_keep] 591 | 592 | X = pd.concat([X, tfidf_text, tfidf_pos, tfidf_ner], axis=1) 593 | return X 594 | 595 | 596 | def save_pickle(item, filename): 597 | # Save pickle file 598 | output = open(filename, 'wb') 599 | print(' Pickle dump', filename) 600 | pickle.dump(item, output, protocol=4) 601 | output.close() 602 | 603 | 604 | def load_pickle(filename): 605 | # Open pickle filename 606 | print(' Pickle load', filename) 607 | with open(filename, 'rb') as f: 608 | return pickle.load(f) 609 | 610 | 611 | if __name__ == '__main__': 612 | main() 613 | --------------------------------------------------------------------------------